Merge 089e4e3 into dd46393

cmmorrow · Jan 2, 2019 · 1655ccb
2 parents dd46393 + 089e4e3
commit 1655ccb
Show file tree

Hide file tree

Showing 10 changed files with 481 additions and 163 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,9 +1,12 @@
 sudo: required
+os: linux
+dist: xenial
 language: python
 python:
 - '2.7'
 - '3.5'
 - '3.6'
+- '3.7'
 before_install:
 - export DISPLAY=:99.0
 - sh -e /etc/init.d/xvfb start

diff --git a/sci_analysis/analysis/__init__.py b/sci_analysis/analysis/__init__.py
@@ -8,7 +8,7 @@
 from .stats import VectorStatistics, GroupStatistics, GroupStatisticsStacked, CategoricalStatistics
 
 
-def determine_analysis_type(data, other=None, groups=None):
+def determine_analysis_type(data, other=None, groups=None, labels=None):
     """Attempts to determine the type of data and returns the corresponding sci_analysis Data object.
 
     Parameters
@@ -19,6 +19,8 @@ def determine_analysis_type(data, other=None, groups=None):
         A second sequence of unknown data type.
     groups : array-like or None
         The group names to include if data is determined to be a Vector.
+    labels : array-like or None
+        The sequence of data point labels.
 
     Returns
     -------
@@ -32,8 +34,7 @@ def determine_analysis_type(data, other=None, groups=None):
     from pandas import Series
     from ..data import is_iterable, is_vector, is_categorical, Vector, Categorical
     from .exc import NoDataError
-    numeric_types = [float16, float32, float64,
-                     int8, int16, int32, int64]
+    numeric_types = [float16, float32, float64, int8, int16, int32, int64]
     if not is_iterable(data):
         raise ValueError('data cannot be a scalar value.')
     elif len(data) == 0:
@@ -51,19 +52,19 @@ def determine_analysis_type(data, other=None, groups=None):
         if data.dtype in numeric_types:
             if other is not None and other.dtype in numeric_types:
                 if groups is not None:
-                    return Vector(data, other=other, groups=groups)
+                    return Vector(data, other=other, groups=groups, labels=labels)
                 else:
-                    return Vector(data, other=other)
+                    return Vector(data, other=other, labels=labels)
             else:
                 if groups is not None:
-                    return Vector(data, groups=groups)
+                    return Vector(data, groups=groups, labels=labels)
                 else:
-                    return Vector(data)
+                    return Vector(data, labels=labels)
         else:
             return Categorical(data)
 
 
-def analyse(xdata, ydata=None, groups=None, **kwargs):
+def analyse(xdata, ydata=None, groups=None, labels=None, **kwargs):
     """
     Alias for analyze.
 
@@ -75,6 +76,8 @@ def analyse(xdata, ydata=None, groups=None, **kwargs):
         The response or secondary set of data.
     groups : array-like
         The group names used for location testing or Bivariate analysis.
+    labels : array-like or None
+        The sequence of data point labels.
     alpha : float
         The sensitivity to use for hypothesis tests.
 
@@ -94,10 +97,10 @@ def analyse(xdata, ydata=None, groups=None, **kwargs):
     xdata : dict(array-like(num)), ydata : None --- Location Test(unstacked)
     xdata : array-like(num), ydata : None, groups : array-like --- Location Test(stacked)
     """
-    return analyze(xdata, ydata=ydata, groups=groups, **kwargs)
+    return analyze(xdata, ydata=ydata, groups=groups, labels=labels, **kwargs)
 
 
-def analyze(xdata, ydata=None, groups=None, alpha=0.05, **kwargs):
+def analyze(xdata, ydata=None, groups=None, labels=None, alpha=0.05, **kwargs):
     """
     Automatically performs a statistical analysis based on the input arguments.
 
@@ -109,6 +112,8 @@ def analyze(xdata, ydata=None, groups=None, alpha=0.05, **kwargs):
         The response or secondary set of data.
     groups : array-like
         The group names used for location testing or Bivariate analysis.
+    labels : array-like or None
+        The sequence of data point labels.
     alpha : float
         The sensitivity to use for hypothesis tests.
 
@@ -185,9 +190,9 @@ def analyze(xdata, ydata=None, groups=None, alpha=0.05, **kwargs):
         return tested if debug else None
 
     if ydata is not None:
-        _data = determine_analysis_type(xdata, other=ydata, groups=groups)
+        _data = determine_analysis_type(xdata, other=ydata, groups=groups, labels=labels)
     else:
-        _data = determine_analysis_type(xdata, groups=groups)
+        _data = determine_analysis_type(xdata, groups=groups, labels=labels)
 
     if is_vector(_data) and not _data.other.empty:
         # Correlation and Linear Regression

diff --git a/sci_analysis/analysis/comparison.py b/sci_analysis/analysis/comparison.py
@@ -235,7 +235,10 @@ def run(self):
 
 class GroupCorrelation(GroupComparison):
 
-    _names = {'pearson': 'Pearson Correlation Coefficient', 'spearman': 'Spearman Correlation Coefficient'}
+    _names = {
+        'pearson': 'Pearson Correlation Coefficient',
+        'spearman': 'Spearman Correlation Coefficient',
+    }
     _min_size = 2
     _r_value = 'r value'
     _p_value = 'p value'

diff --git a/sci_analysis/data/numeric.py b/sci_analysis/data/numeric.py
@@ -1,11 +1,10 @@
 # Import packages
 import pandas as pd
 import numpy as np
-import datetime
 
 # Import from local
 from .data import Data, is_data
-from .data_operations import is_iterable, flatten
+from .data_operations import flatten
 
 
 class EmptyVectorError(Exception):
@@ -62,9 +61,10 @@ class Numeric(Data):
     _ind = 'ind'
     _dep = 'dep'
     _grp = 'grp'
-    _col_names = (_ind, _dep, _grp)
+    _lbl = 'lbl'
+    _col_names = (_ind, _dep, _grp, _lbl)
 
-    def __init__(self, sequence=None, other=None, groups=None, name=None):
+    def __init__(self, sequence=None, other=None, groups=None, labels=None, name=None):
         """Takes an array-like object and converts it to a pandas Series with any non-numeric values converted to NaN.
 
         Parameters
@@ -75,6 +75,8 @@ def __init__(self, sequence=None, other=None, groups=None, name=None):
             The secondary input object
         groups : list | set | tuple | np.array | pd.Series, optional
             The sequence of group names for sub-arrays
+        labels : list | set | tuple | np.array | pd.Series, optional
+            The sequence of data point labels
         name : str, optional
             The name of the Numeric object
         """
@@ -100,6 +102,8 @@ def __init__(self, sequence=None, other=None, groups=None, name=None):
                 self._values[self._dep] = other
                 self._values[self._grp] = groups
                 self._values.loc[:, self._grp] = self._values[self._grp].astype('category')
+                if labels is not None:
+                    self._values[self._lbl] = labels
             except ValueError:
                 raise UnequalVectorLengthError('length of data does not match length of other.')
             if any(self._values[self._dep].notnull()):
@@ -142,7 +146,6 @@ def drop_nan(self):
         arr : pandas.DataFrame
             A copy of the Numeric object's internal Series with all NaN values removed.
         """
-        self._dropped_vals = self._values[self._ind].isnull()
         return self._values.dropna(how='any', subset=[self._ind])
 
     def drop_nan_intersect(self):
@@ -155,7 +158,6 @@ def drop_nan_intersect(self):
         arr : pandas.DataFrame
             A tuple of numpy Arrays corresponding to the internal Vector and seq with all nan values removed.
         """
-        self._dropped_vals = (self._values[self._dep].isnull() | self._values[self._ind].isnull())
         return self._values.dropna(how='any', subset=[self._ind, self._dep])
 
     @property
@@ -172,14 +174,22 @@ def other(self):
 
     @property
     def groups(self):
-        return {grp: seq[self._ind].rename(grp)
-                for grp, seq in self._values.groupby(self._grp)
-                if not seq.empty}
+        groups = self._values.groupby(self._grp)
+        return {grp: seq[self._ind].rename(grp) for grp, seq in groups if not seq.empty}
+
+    @property
+    def labels(self):
+        return self._values[self._lbl].fillna('None')
 
     @property
     def paired_groups(self):
-        return {grp: (df[self._ind], df[self._dep])
-                for grp, df in self._values.groupby(self._grp) if not df.empty}
+        groups = self._values.groupby(self._grp)
+        return {grp: (df[self._ind], df[self._dep]) for grp, df in groups if not df.empty}
+
+    @property
+    def group_labels(self):
+        groups = self._values.groupby(self._grp)
+        return {grp: df[self._lbl] for grp, df in groups if not df.empty}
 
     @property
     def values(self):
@@ -189,13 +199,17 @@ def values(self):
     def auto_groups(self):
         return self._auto_groups
 
+    @property
+    def has_labels(self):
+        return any(pd.notna(self._values[self._lbl]))
+
 
 class Vector(Numeric):
     """
     The sci_analysis representation of continuous, numeric data.
     """
 
-    def __init__(self, sequence=None, other=None, groups=None, name=None):
+    def __init__(self, sequence=None, other=None, groups=None, labels=None, name=None):
         """
         Takes an array-like object and converts it to a pandas Series of
         dtype float64, with any non-numeric values converted to NaN.
@@ -208,11 +222,13 @@ def __init__(self, sequence=None, other=None, groups=None, name=None):
             The secondary input object
         groups : array-like
             The sequence of group names for sub-arrays
+        labels : list | set | tuple | np.array | pd.Series, optional
+            The sequence of data point labels
         name : str, optional
             The name of the Vector object
         """
 
-        super(Vector, self).__init__(sequence=sequence, other=other, groups=groups, name=name)
+        super(Vector, self).__init__(sequence=sequence, other=other, groups=groups, labels=labels, name=name)
         if not self._values.empty:
             self._values[self._ind] = self._values[self._ind].astype('float')
             self._values[self._dep] = self._values[self._dep].astype('float')