Skip to content

Commit

Permalink
Merge 089e4e3 into dd46393
Browse files Browse the repository at this point in the history
  • Loading branch information
cmmorrow committed Jan 2, 2019
2 parents dd46393 + 089e4e3 commit 1655ccb
Show file tree
Hide file tree
Showing 10 changed files with 481 additions and 163 deletions.
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
sudo: required
os: linux
dist: xenial
language: python
python:
- '2.7'
- '3.5'
- '3.6'
- '3.7'
before_install:
- export DISPLAY=:99.0
- sh -e /etc/init.d/xvfb start
Expand Down
29 changes: 17 additions & 12 deletions sci_analysis/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from .stats import VectorStatistics, GroupStatistics, GroupStatisticsStacked, CategoricalStatistics


def determine_analysis_type(data, other=None, groups=None):
def determine_analysis_type(data, other=None, groups=None, labels=None):
"""Attempts to determine the type of data and returns the corresponding sci_analysis Data object.
Parameters
Expand All @@ -19,6 +19,8 @@ def determine_analysis_type(data, other=None, groups=None):
A second sequence of unknown data type.
groups : array-like or None
The group names to include if data is determined to be a Vector.
labels : array-like or None
The sequence of data point labels.
Returns
-------
Expand All @@ -32,8 +34,7 @@ def determine_analysis_type(data, other=None, groups=None):
from pandas import Series
from ..data import is_iterable, is_vector, is_categorical, Vector, Categorical
from .exc import NoDataError
numeric_types = [float16, float32, float64,
int8, int16, int32, int64]
numeric_types = [float16, float32, float64, int8, int16, int32, int64]
if not is_iterable(data):
raise ValueError('data cannot be a scalar value.')
elif len(data) == 0:
Expand All @@ -51,19 +52,19 @@ def determine_analysis_type(data, other=None, groups=None):
if data.dtype in numeric_types:
if other is not None and other.dtype in numeric_types:
if groups is not None:
return Vector(data, other=other, groups=groups)
return Vector(data, other=other, groups=groups, labels=labels)
else:
return Vector(data, other=other)
return Vector(data, other=other, labels=labels)
else:
if groups is not None:
return Vector(data, groups=groups)
return Vector(data, groups=groups, labels=labels)
else:
return Vector(data)
return Vector(data, labels=labels)
else:
return Categorical(data)


def analyse(xdata, ydata=None, groups=None, **kwargs):
def analyse(xdata, ydata=None, groups=None, labels=None, **kwargs):
"""
Alias for analyze.
Expand All @@ -75,6 +76,8 @@ def analyse(xdata, ydata=None, groups=None, **kwargs):
The response or secondary set of data.
groups : array-like
The group names used for location testing or Bivariate analysis.
labels : array-like or None
The sequence of data point labels.
alpha : float
The sensitivity to use for hypothesis tests.
Expand All @@ -94,10 +97,10 @@ def analyse(xdata, ydata=None, groups=None, **kwargs):
xdata : dict(array-like(num)), ydata : None --- Location Test(unstacked)
xdata : array-like(num), ydata : None, groups : array-like --- Location Test(stacked)
"""
return analyze(xdata, ydata=ydata, groups=groups, **kwargs)
return analyze(xdata, ydata=ydata, groups=groups, labels=labels, **kwargs)


def analyze(xdata, ydata=None, groups=None, alpha=0.05, **kwargs):
def analyze(xdata, ydata=None, groups=None, labels=None, alpha=0.05, **kwargs):
"""
Automatically performs a statistical analysis based on the input arguments.
Expand All @@ -109,6 +112,8 @@ def analyze(xdata, ydata=None, groups=None, alpha=0.05, **kwargs):
The response or secondary set of data.
groups : array-like
The group names used for location testing or Bivariate analysis.
labels : array-like or None
The sequence of data point labels.
alpha : float
The sensitivity to use for hypothesis tests.
Expand Down Expand Up @@ -185,9 +190,9 @@ def analyze(xdata, ydata=None, groups=None, alpha=0.05, **kwargs):
return tested if debug else None

if ydata is not None:
_data = determine_analysis_type(xdata, other=ydata, groups=groups)
_data = determine_analysis_type(xdata, other=ydata, groups=groups, labels=labels)
else:
_data = determine_analysis_type(xdata, groups=groups)
_data = determine_analysis_type(xdata, groups=groups, labels=labels)

if is_vector(_data) and not _data.other.empty:
# Correlation and Linear Regression
Expand Down
5 changes: 4 additions & 1 deletion sci_analysis/analysis/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,10 @@ def run(self):

class GroupCorrelation(GroupComparison):

_names = {'pearson': 'Pearson Correlation Coefficient', 'spearman': 'Spearman Correlation Coefficient'}
_names = {
'pearson': 'Pearson Correlation Coefficient',
'spearman': 'Spearman Correlation Coefficient',
}
_min_size = 2
_r_value = 'r value'
_p_value = 'p value'
Expand Down
42 changes: 29 additions & 13 deletions sci_analysis/data/numeric.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
# Import packages
import pandas as pd
import numpy as np
import datetime

# Import from local
from .data import Data, is_data
from .data_operations import is_iterable, flatten
from .data_operations import flatten


class EmptyVectorError(Exception):
Expand Down Expand Up @@ -62,9 +61,10 @@ class Numeric(Data):
_ind = 'ind'
_dep = 'dep'
_grp = 'grp'
_col_names = (_ind, _dep, _grp)
_lbl = 'lbl'
_col_names = (_ind, _dep, _grp, _lbl)

def __init__(self, sequence=None, other=None, groups=None, name=None):
def __init__(self, sequence=None, other=None, groups=None, labels=None, name=None):
"""Takes an array-like object and converts it to a pandas Series with any non-numeric values converted to NaN.
Parameters
Expand All @@ -75,6 +75,8 @@ def __init__(self, sequence=None, other=None, groups=None, name=None):
The secondary input object
groups : list | set | tuple | np.array | pd.Series, optional
The sequence of group names for sub-arrays
labels : list | set | tuple | np.array | pd.Series, optional
The sequence of data point labels
name : str, optional
The name of the Numeric object
"""
Expand All @@ -100,6 +102,8 @@ def __init__(self, sequence=None, other=None, groups=None, name=None):
self._values[self._dep] = other
self._values[self._grp] = groups
self._values.loc[:, self._grp] = self._values[self._grp].astype('category')
if labels is not None:
self._values[self._lbl] = labels
except ValueError:
raise UnequalVectorLengthError('length of data does not match length of other.')
if any(self._values[self._dep].notnull()):
Expand Down Expand Up @@ -142,7 +146,6 @@ def drop_nan(self):
arr : pandas.DataFrame
A copy of the Numeric object's internal DataFrame with every row removed whose 'ind' value is NaN.
"""
self._dropped_vals = self._values[self._ind].isnull()
return self._values.dropna(how='any', subset=[self._ind])

def drop_nan_intersect(self):
Expand All @@ -155,7 +158,6 @@ def drop_nan_intersect(self):
arr : pandas.DataFrame
A copy of the internal DataFrame with every row removed whose 'ind' or 'dep' value is NaN.
"""
self._dropped_vals = (self._values[self._dep].isnull() | self._values[self._ind].isnull())
return self._values.dropna(how='any', subset=[self._ind, self._dep])

@property
Expand All @@ -172,14 +174,22 @@ def other(self):

@property
def groups(self):
return {grp: seq[self._ind].rename(grp)
for grp, seq in self._values.groupby(self._grp)
if not seq.empty}
groups = self._values.groupby(self._grp)
return {grp: seq[self._ind].rename(grp) for grp, seq in groups if not seq.empty}

@property
def labels(self):
    """Return the data point labels with missing entries filled in.

    Returns
    -------
    pandas.Series
        The label column, where every null entry is replaced by the string 'None'.
    """
    label_column = self._values[self._lbl]
    return label_column.fillna('None')

@property
def paired_groups(self):
return {grp: (df[self._ind], df[self._dep])
for grp, df in self._values.groupby(self._grp) if not df.empty}
groups = self._values.groupby(self._grp)
return {grp: (df[self._ind], df[self._dep]) for grp, df in groups if not df.empty}

@property
def group_labels(self):
    """Collect the data point labels of each non-empty group.

    Returns
    -------
    dict
        Maps each group name to the label column of the rows belonging to that group.
        Groups without any rows are left out.
    """
    labels_by_group = {}
    for name, frame in self._values.groupby(self._grp):
        if frame.empty:
            # Skip groups that contain no rows.
            continue
        labels_by_group[name] = frame[self._lbl]
    return labels_by_group

@property
def values(self):
Expand All @@ -189,13 +199,17 @@ def values(self):
def auto_groups(self):
return self._auto_groups

@property
def has_labels(self):
    """Report whether at least one data point carries a label.

    Returns
    -------
    bool
        True if any entry of the label column is not null, otherwise False
        (including when the label column is empty).
    """
    # Vectorized null check instead of iterating the Series with the builtin any();
    # bool() keeps the return value a plain Python bool rather than a numpy bool.
    return bool(self._values[self._lbl].notnull().any())


class Vector(Numeric):
"""
The sci_analysis representation of continuous, numeric data.
"""

def __init__(self, sequence=None, other=None, groups=None, name=None):
def __init__(self, sequence=None, other=None, groups=None, labels=None, name=None):
"""
Takes an array-like object and converts it to a pandas Series of
dtype float64, with any non-numeric values converted to NaN.
Expand All @@ -208,11 +222,13 @@ def __init__(self, sequence=None, other=None, groups=None, name=None):
The secondary input object
groups : array-like
The sequence of group names for sub-arrays
labels : list | set | tuple | np.array | pd.Series, optional
The sequence of data point labels
name : str, optional
The name of the Vector object
"""

super(Vector, self).__init__(sequence=sequence, other=other, groups=groups, name=name)
super(Vector, self).__init__(sequence=sequence, other=other, groups=groups, labels=labels, name=name)
if not self._values.empty:
self._values[self._ind] = self._values[self._ind].astype('float')
self._values[self._dep] = self._values[self._dep].astype('float')
Expand Down

0 comments on commit 1655ccb

Please sign in to comment.