domdfcoding · domdfcoding · Jun 12, 2020 · Jun 3, 2020 · Jun 4, 2020 · Jun 4, 2020
diff --git a/mathematical/__init__.py b/mathematical/__init__.py
@@ -32,4 +32,5 @@
 __version__ = "0.1.11"
 __email__ = "dominic@davis-foster.co.uk"
 
+# this package
 from . import data_frames, outliers, stats, utils
diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py
@@ -27,13 +27,20 @@
 #
 #
 
+# stdlib
+from typing import List, Optional, Sequence
+
+# 3rd party
+from pandas import Series  # type: ignore
+
 # Outlier Modes
+
 MAD = 1
 QUARTILES = 2
 STDEV2 = 3
 
 
-def df_mean(row, column_label_list=None):
+def df_mean(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float:
 	"""
 	Calculate the mean of each row for the specified columns of a data frame
 
@@ -45,21 +52,20 @@ def df_mean(row, column_label_list=None):
 	:param row: row of the data frame
 	:type row: pandas.core.series.Series
 	:param column_label_list: list of column labels to calculate mean for
-	:type column_label_list: list
 
 	:return: Mean
 	:rtype: float
 	"""
 
-	from numpy import nanmean
+	from numpy import nanmean  # type: ignore
 
 	if column_label_list is None:
 		column_label_list = list(row.index)
 
-	return nanmean(row[column_label_list])
+	return float(nanmean(row[column_label_list]))
 
 
-def df_median(row, column_label_list=None):
+def df_median(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float:
 	"""
 	Calculate the median of each row for the specified columns of a data frame
 
@@ -71,7 +77,6 @@ def df_median(row, column_label_list=None):
 	:param row: row of the data frame
 	:type row: pandas.core.series.Series
 	:param column_label_list: list of column labels to calculate median for
-	:type column_label_list: list
 
 	:return: Median
 	:rtype: float
@@ -82,10 +87,10 @@ def df_median(row, column_label_list=None):
 	if column_label_list is None:
 		column_label_list = list(row.index)
 
-	return nanmedian(row[column_label_list])
+	return float(nanmedian(row[column_label_list]))
 
 
-def df_stdev(row, column_label_list=None):
+def df_stdev(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float:
 	"""
 	Calculate the standard deviation of each row for the specified columns of a data frame
 
@@ -97,7 +102,6 @@ def df_stdev(row, column_label_list=None):
 	:param row: row of the data frame
 	:type row: pandas.core.series.Series
 	:param column_label_list: list of column labels to calculate standard deviation for
-	:type column_label_list: list
 
 	:return: Standard deviation
 	:rtype: float
@@ -108,10 +112,10 @@ def df_stdev(row, column_label_list=None):
 	if column_label_list is None:
 		column_label_list = list(row.index)
 
-	return nanstd(row[column_label_list])
+	return float(nanstd(row[column_label_list]))
 
 
-def df_log_stdev(row, column_label_list=None):
+def df_log_stdev(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float:
 	"""
 	Calculate the standard deviation of the log10 values in each row for the specified columns of a data frame
 
@@ -123,7 +127,6 @@ def df_log_stdev(row, column_label_list=None):
 	:param row: row of the data frame
 	:type row: pandas.core.series.Series
 	:param column_label_list: list of column labels to calculate standard deviation for
-	:type column_label_list: list
 
 	:return: Standard deviation
 	:rtype: float
@@ -135,10 +138,10 @@ def df_log_stdev(row, column_label_list=None):
 	if column_label_list is None:
 		column_label_list = list(row.index)
 
-	return nanstd([log10(x) if x > 0.0 else nan for x in row[column_label_list]])
+	return float(nanstd([log10(x) if x > 0.0 else nan for x in row[column_label_list]]))
 
 
-def df_percentage(row, column_label, total):
+def df_percentage(row: Series, column_label: str, total: float) -> float:
 	"""
 	Returns the value of the specified column as a percentage of the given total
 	The total is usually the sum of the specified column
@@ -153,7 +156,7 @@ def df_percentage(row, column_label, total):
 	:param column_label: column label to calculate percentage for
 	:type column_label: str
 	:param total: total value
-	:type column_label: str
+	:type total: float
 
 	:return: Percentage * 100
 	:rtype: float
@@ -162,7 +165,7 @@ def df_percentage(row, column_label, total):
 	return (row[column_label] / float(total)) * 100.0
 
 
-def df_log(row, column_label_list, base=10):
+def df_log(row: Series, column_label_list: Sequence[str], base: float = 10) -> float:
 	"""
 	Calculate the logarithm of the values in each row for the specified columns of a data frame
 
@@ -174,7 +177,6 @@ def df_log(row, column_label_list, base=10):
 	:param row: row of the data frame
 	:type row: pandas.core.series.Series
 	:param column_label_list: list of column labels to calculate log for
-	:type column_label_list: list
 	:param base: logarithmic base
 	:type base: float
 
@@ -190,7 +192,7 @@ def df_log(row, column_label_list, base=10):
 		return 0
 
 
-def df_data_points(row, column_label_list):
+def df_data_points(row: Series, column_label_list: Sequence[str]) -> List:
 	"""
 	Compile the values for the specified columns in each row into a list
 
@@ -202,7 +204,6 @@ def df_data_points(row, column_label_list):
 	:param row: row of the data frame
 	:type row: pandas.core.series.Series
 	:param column_label_list: list of column labels to calculate standard deviation for
-	:type column_label_list: list
 
 	:return: data points
 	:rtype: list
@@ -211,7 +212,7 @@ def df_data_points(row, column_label_list):
 	return [row[column_label] for column_label in column_label_list]
 
 
-def df_outliers(row, column_label_list=None, outlier_mode=MAD):
+def df_outliers(row: Series, column_label_list: Sequence[str] = None, outlier_mode: int = MAD) -> Series:
 	"""
 	Identify outliers in each row
 
@@ -223,7 +224,6 @@ def df_outliers(row, column_label_list=None, outlier_mode=MAD):
 	:param row: row of the data frame
 	:type row: pandas.core.series.Series
 	:param column_label_list: list of column labels to determine outliers for
-	:type column_label_list: list
 	:param outlier_mode: outlier detection method to use
 	:type outlier_mode: int
 
@@ -246,14 +246,14 @@ def df_outliers(row, column_label_list=None, outlier_mode=MAD):
 	elif outlier_mode == QUARTILES:
 		x = outliers.quartile_outliers(data)
 	elif outlier_mode == STDEV2:
-		x = outliers.stdev_outlier(data, 2)  # outlier classed as more than 2 stdev away from mean
+		x = outliers.stdev_outlier(data, rng=2)  # outlier classed as more than 2 stdev away from mean
 	else:
-		return None
+		raise ValueError("Unknown outlier mode.")
 
-	return pd.Series(list(x))
+	return Series(list(x))
 
 
-def df_count(row, column_label_list=None):
+def df_count(row: Series, column_label_list: Optional[Sequence[str]] = None) -> int:
 	"""
 	Count the number of occurrences of a non-NaN value in the specified columns of a data frame
 

diff --git a/mathematical/linear_regression.py b/mathematical/linear_regression.py
@@ -49,12 +49,22 @@
 #  |  DOI: `10.1021/acs.jproteome.8b00717 <http://dx.doi.org/10.1021/acs.jproteome.8b00717>`_
 #
 
+# stdlib
+from typing import Optional, Sequence, Tuple, Union
+
 # 3rd party
-import numpy
-from domdf_python_tools.doctools import is_documented_by
+import numpy  # type: ignore
+from domdf_python_tools.doctools import is_documented_by  # type: ignore
+
+ArrayLike_Float = Union[Sequence[float], numpy.ndarray]
 
 
-def linear_regression_vertical(x, y=None, a=None, b=None):
+def linear_regression_vertical(
+		x: ArrayLike_Float,
+		y: Optional[ArrayLike_Float] = None,
+		a: Optional[float] = None,
+		b: Optional[float] = None,
+		) -> Tuple[float, float, float, float]:
 	"""
 	Calculate coefficients of a linear regression y = a * x + b.
 	The fit minimizes *vertical* distances between the points and the line.
@@ -73,7 +83,7 @@ def linear_regression_vertical(x, y=None, a=None, b=None):
 	:return: (a, b, r, stderr), where
 		a -- slope coefficient,
 		b -- free term,
-		r -- Peason correlation coefficient,
+		r -- Pearson correlation coefficient,
 		stderr -- standard deviation.
 	:rtype: tuple
 	"""
@@ -83,7 +93,7 @@ def linear_regression_vertical(x, y=None, a=None, b=None):
 		y = numpy.array(y, copy=False)
 	else:
 		if len(x.shape) != 2 or x.shape[-1] != 2:
-			raise TypeError('If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape))
+			raise TypeError(f'If `y` is not given, x.shape should be (N, 2), given: {x.shape}')
 		y = x[:, 1]
 		x = x[:, 0]
 	if a is not None and b is None:
@@ -96,15 +106,16 @@ def linear_regression_vertical(x, y=None, a=None, b=None):
 	r = numpy.corrcoef(x, y)[0, 1]
 	stderr = (y - a * x - b).std()
 
-	return a, b, r, stderr
+	return a, b, r, stderr  # type: ignore  # TODO
 
 
-@is_documented_by(linear_regression_vertical)
-def linear_regression(x, y=None, a=None, b=None):
-	return linear_regression_vertical(x, y, a, b)
+linear_regression = linear_regression_vertical
 
 
-def linear_regression_perpendicular(x, y=None):
+def linear_regression_perpendicular(
+		x: ArrayLike_Float,
+		y: Optional[ArrayLike_Float] = None,
+		) -> Tuple[float, float, float, float]:
 	"""
 	Calculate coefficients of a linear regression y = a * x + b.
 	The fit minimizes *perpendicular* distances between the points and the line.
@@ -127,13 +138,15 @@ def linear_regression_perpendicular(x, y=None):
 	"""
 
 	x = numpy.array(x, copy=False)
+
 	if y is not None:
 		y = numpy.array(y, copy=False)
 		data = numpy.hstack((x.reshape((-1, 1)), y.reshape((-1, 1))))
 	else:
 		if len(x.shape) != 2 or x.shape[-1] != 2:
 			raise TypeError('If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape))
 		data = x
+
 	mu = data.mean(axis=0)
 	eigenvectors, eigenvalues, V = numpy.linalg.svd((data - mu).T, full_matrices=False)
 	a = eigenvectors[0][1] / eigenvectors[0][0]

diff --git a/mathematical/outliers.py b/mathematical/outliers.py
@@ -32,21 +32,27 @@
 #
 #
 
-import numpy
-from . import utils
-from . import stats
+# stdlib
+from typing import List, Sequence, Tuple
+
+# 3rd party
+import numpy  # type: ignore
+
+# this package
+from . import stats, utils
 
 
 def mad_outliers(
-		dataset,
-		strip_zero=True,
-		threshold=3,
-		):
+		dataset: Sequence[float],
+		strip_zero: bool = True,
+		threshold: int = 3,
+		) -> Tuple[List[float], List[float]]:
 	"""
 	Using the Median Absolute Deviation to Find Outliers
 
 	:param dataset:
-	:type dataset: list
+	:param strip_zero:
+	:type strip_zero: bool
 	:param threshold: The multiple of MAD above which values are considered to be outliers
 		Leys et al (2013) make the following recommendations:
 			1 In univariate statistics, the Median Absolute Deviation is the most robust
@@ -88,26 +94,35 @@ def mad_outliers(
 	return outliers, data_exc_outliers
 
 
-def two_stdev(dataset, strip_zero=True):
+def two_stdev(dataset: Sequence[float], strip_zero: bool = True) -> Tuple[List[float], List[float]]:
 	"""
 	Outliers are greater than 2x stdev from mean
 
 	:param dataset:
+	:param strip_zero:
+	:type strip_zero: bool
 
-	:return:
+	:return: Tuple of ``(outliers, data_exc_outliers)``, as returned by ``stdev_outlier``.
 	"""
 
 	return stdev_outlier(dataset, strip_zero=strip_zero)
 
 
-def stdev_outlier(dataset, strip_zero=True, rng=int(2)):
+def stdev_outlier(
+		dataset: Sequence[float],
+		strip_zero: bool = True,
+		rng: int = 2,
+		) -> Tuple[List[float], List[float]]:
 	"""
 	Outliers are greater than rng*stdev from mean
 
 	:param dataset:
+	:param strip_zero:
+	:type strip_zero: bool
 	:param rng:
+	:type rng: int
 
-	:return:
+	:return: Tuple of ``(outliers, data_exc_outliers)``.
 	"""
 
 	dataset = utils.strip_none_bool_string(dataset)
@@ -133,11 +148,13 @@ def stdev_outlier(dataset, strip_zero=True, rng=int(2)):
 	return outliers, data_exc_outliers
 
 
-def quartile_outliers(dataset, strip_zero=True):
+def quartile_outliers(dataset: Sequence[float], strip_zero: bool = True) -> Tuple[List[float], List[float]]:
 	"""
 	outliers are more than 3x inter-quartile range from upper or lower quartile
 
 	:param dataset:
+	:param strip_zero:
+	:type strip_zero: bool
 
 	:return:
 	"""
@@ -171,14 +188,20 @@ def quartile_outliers(dataset, strip_zero=True):
 	return outliers, data_exc_outliers
 
 
-def spss_outliers(dataset, strip_zero=True, mode="all"):
+def spss_outliers(
+		dataset: Sequence[float],
+		strip_zero: bool = True,
+		mode: str = "all",
+		):  # TODO:  -> Tuple[List[float], List[float], List[float]]
 	"""
-	Based on IBM SPSS method for detecting outliers
+	Based on IBM SPSS method for detecting outliers.
+
 	Outliers more than 1.5*IQR from Q1 or Q3
+
 	"Extreme values" more than 3*IQR from Q1 or Q3
 
 	:param dataset:
-	:param mode:
+	:param str mode:
 
 	:return:
 	"""
@@ -194,6 +217,7 @@ def spss_outliers(dataset, strip_zero=True, mode="all"):
 		for val in dataset:
 			if val in ['', 0.0, 0]:
 				dataset.remove(val)
+
 	if len(dataset) == 0:
 		return float('nan')
 	elif dataset == [None]: