Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mathematical/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@
__version__ = "0.1.11"
__email__ = "dominic@davis-foster.co.uk"

# this package
from . import data_frames, outliers, stats, utils
50 changes: 25 additions & 25 deletions mathematical/data_frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,20 @@
#
#

# stdlib
from typing import List, Optional, Sequence

# 3rd party
from pandas import Series # type: ignore

# Outlier Modes

MAD = 1
QUARTILES = 2
STDEV2 = 3


def df_mean(row, column_label_list=None):
def df_mean(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float:
"""
Calculate the mean of each row for the specified columns of a data frame

Expand All @@ -45,21 +52,20 @@ def df_mean(row, column_label_list=None):
:param row: row of the data frame
:type row: pandas.core.series.Series
:param column_label_list: list of column labels to calculate mean for
:type column_label_list: list

:return: Mean
:rtype: float
"""

from numpy import nanmean
from numpy import nanmean # type: ignore

if column_label_list is None:
column_label_list = list(row.index)

return nanmean(row[column_label_list])
return float(nanmean(row[column_label_list]))


def df_median(row, column_label_list=None):
def df_median(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float:
"""
Calculate the median of each row for the specified columns of a data frame

Expand All @@ -71,7 +77,6 @@ def df_median(row, column_label_list=None):
:param row: row of the data frame
:type row: pandas.core.series.Series
:param column_label_list: list of column labels to calculate median for
:type column_label_list: list

:return: Median
:rtype: float
Expand All @@ -82,10 +87,10 @@ def df_median(row, column_label_list=None):
if column_label_list is None:
column_label_list = list(row.index)

return nanmedian(row[column_label_list])
return float(nanmedian(row[column_label_list]))


def df_stdev(row, column_label_list=None):
def df_stdev(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float:
"""
Calculate the standard deviation of each row for the specified columns of a data frame

Expand All @@ -97,7 +102,6 @@ def df_stdev(row, column_label_list=None):
:param row: row of the data frame
:type row: pandas.core.series.Series
:param column_label_list: list of column labels to calculate standard deviation for
:type column_label_list: list

:return: Standard deviation
:rtype: float
Expand All @@ -108,10 +112,10 @@ def df_stdev(row, column_label_list=None):
if column_label_list is None:
column_label_list = list(row.index)

return nanstd(row[column_label_list])
return float(nanstd(row[column_label_list]))


def df_log_stdev(row, column_label_list=None):
def df_log_stdev(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float:
"""
Calculate the standard deviation of the log10 values in each row for the specified columns of a data frame

Expand All @@ -123,7 +127,6 @@ def df_log_stdev(row, column_label_list=None):
:param row: row of the data frame
:type row: pandas.core.series.Series
:param column_label_list: list of column labels to calculate standard deviation for
:type column_label_list: list

:return: Standard deviation
:rtype: float
Expand All @@ -135,10 +138,10 @@ def df_log_stdev(row, column_label_list=None):
if column_label_list is None:
column_label_list = list(row.index)

return nanstd([log10(x) if x > 0.0 else nan for x in row[column_label_list]])
return float(nanstd([log10(x) if x > 0.0 else nan for x in row[column_label_list]]))


def df_percentage(row, column_label, total):
def df_percentage(row: Series, column_label: str, total: float) -> float:
"""
Returns the value of the specified column as a percentage of the given total
The total is usually the sum of the specified column
Expand All @@ -153,7 +156,7 @@ def df_percentage(row, column_label, total):
:param column_label: column label to calculate percentage for
:type column_label: str
:param total: total value
:type column_label: str
:type total: float

:return: Percentage * 100
:rtype: float
Expand All @@ -162,7 +165,7 @@ def df_percentage(row, column_label, total):
return (row[column_label] / float(total)) * 100.0


def df_log(row, column_label_list, base=10):
def df_log(row: Series, column_label_list: Sequence[str], base: float = 10) -> float:
"""
Calculate the logarithm of the values in each row for the specified columns of a data frame

Expand All @@ -174,7 +177,6 @@ def df_log(row, column_label_list, base=10):
:param row: row of the data frame
:type row: pandas.core.series.Series
:param column_label_list: list of column labels to calculate log for
:type column_label_list: list
:param base: logarithmic base
:type base: float

Expand All @@ -190,7 +192,7 @@ def df_log(row, column_label_list, base=10):
return 0


def df_data_points(row, column_label_list):
def df_data_points(row: Series, column_label_list: Sequence[str]) -> List:
"""
Compile the values for the specified columns in each row into a list

Expand All @@ -202,7 +204,6 @@ def df_data_points(row, column_label_list):
:param row: row of the data frame
:type row: pandas.core.series.Series
:param column_label_list: list of column labels to compile the data points for
:type column_label_list: list

:return: data points
:rtype: list
Expand All @@ -211,7 +212,7 @@ def df_data_points(row, column_label_list):
return [row[column_label] for column_label in column_label_list]


def df_outliers(row, column_label_list=None, outlier_mode=MAD):
def df_outliers(row: Series, column_label_list: Sequence[str] = None, outlier_mode: int = MAD) -> Series:
"""
Identify outliers in each row

Expand All @@ -223,7 +224,6 @@ def df_outliers(row, column_label_list=None, outlier_mode=MAD):
:param row: row of the data frame
:type row: pandas.core.series.Series
:param column_label_list: list of column labels to determine outliers for
:type column_label_list: list
:param outlier_mode: outlier detection method to use
:type outlier_mode: int

Expand All @@ -246,14 +246,14 @@ def df_outliers(row, column_label_list=None, outlier_mode=MAD):
elif outlier_mode == QUARTILES:
x = outliers.quartile_outliers(data)
elif outlier_mode == STDEV2:
x = outliers.stdev_outlier(data, 2) # outlier classed as more than 2 stdev away from mean
x = outliers.stdev_outlier(data, rng=2) # outlier classed as more than 2 stdev away from mean
else:
return None
raise ValueError("Unknown outlier mode.")

return pd.Series(list(x))
return Series(list(x))


def df_count(row, column_label_list=None):
def df_count(row: Series, column_label_list: Optional[Sequence[str]] = None) -> int:
"""
Count the number of occurrences of a non-NaN value in the specified columns of a data frame

Expand Down
33 changes: 23 additions & 10 deletions mathematical/linear_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,22 @@
# | DOI: `10.1021/acs.jproteome.8b00717 <http://dx.doi.org/10.1021/acs.jproteome.8b00717>`_
#

# stdlib
from typing import Optional, Sequence, Tuple, Union

# 3rd party
import numpy
from domdf_python_tools.doctools import is_documented_by
import numpy # type: ignore
from domdf_python_tools.doctools import is_documented_by # type: ignore

ArrayLike_Float = Union[Sequence[float], numpy.ndarray]


def linear_regression_vertical(x, y=None, a=None, b=None):
def linear_regression_vertical(
x: ArrayLike_Float,
y: Optional[ArrayLike_Float] = None,
a: Optional[float] = None,
b: Optional[float] = None,
) -> Tuple[float, float, float, float]:
"""
Calculate coefficients of a linear regression y = a * x + b.
The fit minimizes *vertical* distances between the points and the line.
Expand All @@ -73,7 +83,7 @@ def linear_regression_vertical(x, y=None, a=None, b=None):
:return: (a, b, r, stderr), where
a -- slope coefficient,
b -- free term,
r -- Peason correlation coefficient,
r -- Pearson correlation coefficient,
stderr -- standard deviation.
:rtype: tuple
"""
Expand All @@ -83,7 +93,7 @@ def linear_regression_vertical(x, y=None, a=None, b=None):
y = numpy.array(y, copy=False)
else:
if len(x.shape) != 2 or x.shape[-1] != 2:
raise TypeError('If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape))
raise TypeError(f'If `y` is not given, x.shape should be (N, 2), given: {x.shape}')
y = x[:, 1]
x = x[:, 0]
if a is not None and b is None:
Expand All @@ -96,15 +106,16 @@ def linear_regression_vertical(x, y=None, a=None, b=None):
r = numpy.corrcoef(x, y)[0, 1]
stderr = (y - a * x - b).std()

return a, b, r, stderr
return a, b, r, stderr # type: ignore # TODO


@is_documented_by(linear_regression_vertical)
def linear_regression(x, y=None, a=None, b=None):
return linear_regression_vertical(x, y, a, b)
linear_regression = linear_regression_vertical


def linear_regression_perpendicular(x, y=None):
def linear_regression_perpendicular(
x: ArrayLike_Float,
y: Optional[ArrayLike_Float] = None,
) -> Tuple[float, float, float, float]:
"""
Calculate coefficients of a linear regression y = a * x + b.
The fit minimizes *perpendicular* distances between the points and the line.
Expand All @@ -127,13 +138,15 @@ def linear_regression_perpendicular(x, y=None):
"""

x = numpy.array(x, copy=False)

if y is not None:
y = numpy.array(y, copy=False)
data = numpy.hstack((x.reshape((-1, 1)), y.reshape((-1, 1))))
else:
if len(x.shape) != 2 or x.shape[-1] != 2:
raise TypeError('If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape))
data = x

mu = data.mean(axis=0)
eigenvectors, eigenvalues, V = numpy.linalg.svd((data - mu).T, full_matrices=False)
a = eigenvectors[0][1] / eigenvectors[0][0]
Expand Down
56 changes: 40 additions & 16 deletions mathematical/outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,27 @@
#
#

import numpy
from . import utils
from . import stats
# stdlib
from typing import List, Sequence, Tuple

# 3rd party
import numpy # type: ignore

# this package
from . import stats, utils


def mad_outliers(
dataset,
strip_zero=True,
threshold=3,
):
dataset: Sequence[float],
strip_zero: bool = True,
threshold: int = 3,
) -> Tuple[List[float], List[float]]:
"""
Using the Median Absolute Deviation to Find Outliers

:param dataset:
:type dataset: list
:param strip_zero:
:type strip_zero: bool
:param threshold: The multiple of MAD above which values are considered to be outliers
Leys et al (2013) make the following recommendations:
1 In univariate statistics, the Median Absolute Deviation is the most robust
Expand Down Expand Up @@ -88,26 +94,35 @@ def mad_outliers(
return outliers, data_exc_outliers


def two_stdev(dataset, strip_zero=True):
def two_stdev(dataset: Sequence[float], strip_zero: bool = True) -> Tuple[List[float], List[float]]:
"""
Outliers are greater than 2x stdev from mean

:param dataset:
:param strip_zero:
:type strip_zero: bool

:return:
:return: # TODO
"""

return stdev_outlier(dataset, strip_zero=strip_zero)


def stdev_outlier(dataset, strip_zero=True, rng=int(2)):
def stdev_outlier(
dataset: Sequence[float],
strip_zero: bool = True,
rng: int = 2,
) -> Tuple[List[float], List[float]]:
"""
Outliers are greater than rng*stdev from mean

:param dataset:
:param strip_zero:
:type strip_zero: bool
:param rng:
:type rng: int

:return:
:return: # TODO
"""

dataset = utils.strip_none_bool_string(dataset)
Expand All @@ -133,11 +148,13 @@ def stdev_outlier(dataset, strip_zero=True, rng=int(2)):
return outliers, data_exc_outliers


def quartile_outliers(dataset, strip_zero=True):
def quartile_outliers(dataset: Sequence[float], strip_zero: bool = True) -> Tuple[List[float], List[float]]:
"""
outliers are more than 3x inter-quartile range from upper or lower quartile

:param dataset:
:param strip_zero:
:type strip_zero: bool

:return:
"""
Expand Down Expand Up @@ -171,14 +188,20 @@ def quartile_outliers(dataset, strip_zero=True):
return outliers, data_exc_outliers


def spss_outliers(dataset, strip_zero=True, mode="all"):
def spss_outliers(
dataset: Sequence[float],
strip_zero: bool = True,
mode: str = "all",
): # TODO: -> Tuple[List[float], List[float], List[float]]
"""
Based on IBM SPSS method for detecting outliers
Based on IBM SPSS method for detecting outliers.

Outliers more than 1.5*IQR from Q1 or Q3

"Extreme values" more than 3*IQR from Q1 or Q3

:param dataset:
:param mode:
:param mode:
:type mode: str

:return:
"""
Expand All @@ -194,6 +217,7 @@ def spss_outliers(dataset, strip_zero=True, mode="all"):
for val in dataset:
if val in ['', 0.0, 0]:
dataset.remove(val)

if len(dataset) == 0:
return float('nan')
elif dataset == [None]:
Expand Down
Loading