From ac4af5a6b7f360b2a3cf5c6e1e32d9bbd30f399b Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 3 Jun 2020 21:06:31 +0100 Subject: [PATCH 01/11] Added type annotations --- mathematical/data_frames.py | 24 +++++++------ mathematical/linear_regression.py | 2 +- mathematical/outliers.py | 28 +++++++-------- mathematical/stats.py | 34 +++++++++--------- mathematical/utils.py | 57 ++++++++++++++++--------------- 5 files changed, 76 insertions(+), 69 deletions(-) diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index 87e61bd..1b4b67c 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -28,12 +28,16 @@ # # Outlier Modes +from typing import List + +from pandas import Series + MAD = 1 QUARTILES = 2 STDEV2 = 3 -def df_mean(row, column_label_list=None): +def df_mean(row: Series, column_label_list: List[str] = None) -> float: #TODO """ Calculate the mean of each row for the specified columns of a data frame @@ -59,7 +63,7 @@ def df_mean(row, column_label_list=None): return nanmean(row[column_label_list]) -def df_median(row, column_label_list=None): +def df_median(row: Series, column_label_list: List[str] = None) -> float: #TODO """ Calculate the median of each row for the specified columns of a data frame @@ -85,7 +89,7 @@ def df_median(row, column_label_list=None): return nanmedian(row[column_label_list]) -def df_stdev(row, column_label_list=None): +def df_stdev(row: Series, column_label_list: List[str] = None) -> float: #TODO """ Calculate the standard deviation of each row for the specified columns of a data frame @@ -111,7 +115,7 @@ def df_stdev(row, column_label_list=None): return nanstd(row[column_label_list]) -def df_log_stdev(row, column_label_list=None): +def df_log_stdev(row: Series, column_label_list: List[str] = None) -> float: #TODO """ Calculate the standard deviation of the log10 values in each row for the specified columns of a data frame @@ -138,7 +142,7 @@ def df_log_stdev(row, column_label_list=None): return nanstd([log10(x) if x > 0.0 else nan for x in row[column_label_list]]) -def df_percentage(row, column_label, total): +def df_percentage(row: Series, column_label: str, total: float) -> float: """ Returns the value of the specified column as a percentage of the given total The total is usually the sum of the specified column @@ -153,7 +157,7 @@ def df_percentage(row, column_label, total): :param column_label: column label to calculate percentage for :type column_label: str :param total: total value - :type column_label: str + :type total: float :return: Percentage * 100 :rtype: float @@ -162,7 +166,7 @@ def df_percentage(row, column_label, total): return (row[column_label] / float(total)) * 100.0 -def df_log(row, column_label_list, base=10): +def df_log(row: Series, column_label_list: List[str], base = 10) -> float: """ Calculate the logarithm of the values in each row for the specified columns of a data frame @@ -190,7 +194,7 @@ def df_log(row, column_label_list, base=10): return 0 -def df_data_points(row, column_label_list): +def df_data_points(row: Series, column_label_list: List[str]) -> list: """ Compile the values for the specified columns in each row into a list @@ -211,7 +215,7 @@ def df_data_points(row, column_label_list): return [row[column_label] for column_label in column_label_list] -def df_outliers(row, column_label_list=None, outlier_mode=MAD): +def df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode=MAD): #TODO """ Identify outliers in each row @@ -253,7 +257,7 @@ def df_outliers(row, 
column_label_list=None, outlier_mode=MAD): return pd.Series(list(x)) -def df_count(row, column_label_list=None): +def df_count(row: Series, column_label_list: [str] = None) -> int: """ Count the number of occurrences of a non-NaN value in the specified columns of a data frame diff --git a/mathematical/linear_regression.py b/mathematical/linear_regression.py index 6966cc2..9d53a58 100644 --- a/mathematical/linear_regression.py +++ b/mathematical/linear_regression.py @@ -54,7 +54,7 @@ from domdf_python_tools.doctools import is_documented_by -def linear_regression_vertical(x, y=None, a=None, b=None): +def linear_regression_vertical(x, y=None, a=None, b=None) -> tuple: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *vertical* distances between the points and the line. diff --git a/mathematical/outliers.py b/mathematical/outliers.py index d7f7b27..5669e31 100644 --- a/mathematical/outliers.py +++ b/mathematical/outliers.py @@ -38,9 +38,9 @@ def mad_outliers( - dataset, - strip_zero=True, - threshold=3, + dataset: list, + strip_zero: bool = True, + threshold: int = 3, ): """ Using the Median Absolute Deviation to Find Outliers @@ -63,7 +63,7 @@ def mad_outliers( See https://dipot.ulb.ac.be/dspace/bitstream/2013/139499/1/Leys_MAD_final-libre.pdf :type threshold: int - :return: + :return: #TODO """ dataset = utils.strip_none_bool_string(dataset) @@ -88,26 +88,26 @@ def mad_outliers( return outliers, data_exc_outliers -def two_stdev(dataset, strip_zero=True): +def two_stdev(dataset, strip_zero: bool = True): """ Outliers are greater than 2x stdev from mean :param dataset: - :return: + :return: # TODO """ return stdev_outlier(dataset, strip_zero=strip_zero) -def stdev_outlier(dataset, strip_zero=True, rng=int(2)): +def stdev_outlier(dataset, strip_zero: bool = True, rng=int(2)): """ Outliers are greater than rng*stdev from mean :param dataset: :param rng: - :return: + :return: 'TODO """ dataset = utils.strip_none_bool_string(dataset) @@ -133,13 +133,13 @@ def stdev_outlier(dataset, strip_zero=True, rng=int(2)): return outliers, data_exc_outliers -def quartile_outliers(dataset, strip_zero=True): +def quartile_outliers(dataset, strip_zero: bool = True): """ outliers are more than 3x inter-quartile range from upper or lower quartile - :param dataset: + :param dataset: # - :return: + :return: #TODO """ dataset = utils.strip_none_bool_string(dataset) @@ -171,16 +171,16 @@ def quartile_outliers(dataset, strip_zero=True): return outliers, data_exc_outliers -def spss_outliers(dataset, strip_zero=True, mode="all"): +def spss_outliers(dataset, strip_zero: bool = True, mode: str = "all"): """ Based on IBM SPSS method for detecting outliers Outliers more than 1.5*IQR from Q1 or Q3 "Extreme values" more than 3*IQR from Q1 or Q3 :param dataset: - :param mode: + :param mode: str - :return: + :return: # TODO """ if len(dataset) < 2: diff --git a/mathematical/stats.py b/mathematical/stats.py index 29bcc5a..ce26a49 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -46,13 +46,15 @@ import warnings # 3rd party +from typing import List + import numpy # this package from . 
import utils -def mean_none(dataset): +def mean_none(dataset: List[str]): """ Calculate the mean, excluding NaN, strings, boolean values, and zeros @@ -69,7 +71,7 @@ def mean_none(dataset): return numpy.nanmean(dataset) -def std_none(dataset, ddof=1): +def std_none(dataset: List[str], ddof: int = 1): """ Calculate the standard deviation, excluding NaN, strings, boolean values, and zeros @@ -85,10 +87,10 @@ def std_none(dataset, ddof=1): dataset = utils.remove_zero(dataset) print(dataset) - return numpy.nanstd(dataset, ddof=ddof) + return numpy.nanstd(dataset, ddof = ddof) -def median_none(dataset): +def median_none(dataset:List[str]): """ Calculate the median, excluding NaN, strings, boolean values, and zeros @@ -105,7 +107,7 @@ def median_none(dataset): return numpy.nanmedian(dataset) -def iqr_none(dataset): +def iqr_none(dataset:List[str]) -> float: """ Calculate the interquartile range, excluding NaN, strings, boolean values, and zeros @@ -123,7 +125,7 @@ def iqr_none(dataset): return iq -def percentile_none(dataset, percentage): +def percentile_none(dataset: List[str], percentage: float) -> float: """ Calculate the given percentile, excluding NaN, strings, boolean values, and zeros @@ -149,7 +151,7 @@ def percentile_none(dataset, percentage): return numpy.percentile(dataset, percentage) -def pooled_sd(sample1, sample2, weighted=False): +def pooled_sd(sample1: List, sample2: List, weighted: List = False) -> float: """ Pooled Standard Deviation @@ -176,7 +178,7 @@ def pooled_sd(sample1, sample2, weighted=False): return numpy.sqrt(((sd1**2) + (sd2**2)) / 2) -def d_cohen(sample1, sample2, sd=1, tail=1, pooled=False): +def d_cohen(sample1: List, sample2: List, sd: int = 1, tail = 1, pooled = False) -> float: """ Cohen's d-Statistic @@ -212,7 +214,7 @@ def d_cohen(sample1, sample2, sd=1, tail=1, pooled=False): return (mean1 - mean2) / sd -def g_hedge(sample1, sample2): +def g_hedge(sample1: List, sample2: List): #TODO """ Hedge's g-Statistic @@ -229,7 +231,7 @@ def g_hedge(sample1, sample2): return (mean1 - mean2) / pooled_sd(sample1, sample2, True) -def g_durlak_bias(g, n): +def g_durlak_bias(g :float, n: float) -> float: #TODO """ Application of Durlak's bias correction to the Hedge's g statistic. Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm @@ -249,7 +251,7 @@ def g_durlak_bias(g, n): return g * Durlak -def interpret_d(d_or_g): +def interpret_d(d_or_g: float) -> float: """ Interpret Cohen's d or Hedge's g values using Table 1 from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3444174/ @@ -273,7 +275,7 @@ def interpret_d(d_or_g): return "Large Effect" -def _contains_nan(a, nan_policy='propagate'): +def _contains_nan(a, nan_policy = 'propagate'): policies = ['propagate', 'raise', 'omit'] if nan_policy not in policies: raise ValueError("nan_policy must be one of {%s}" % ', '.join(f"'{s}'" for s in policies)) @@ -304,7 +306,7 @@ def _contains_nan(a, nan_policy='propagate'): return contains_nan, nan_policy -def median_absolute_deviation(x, axis=0, center=numpy.median, scale=1.4826, nan_policy='propagate'): +def median_absolute_deviation(x, axis: int = 0, center = numpy.median, scale: int = 1.4826, nan_policy = 'propagate'): #TODO """ Compute the median absolute deviation of the data along the given axis. 
The median absolute deviation (MAD, [1]_) computes the median over the @@ -387,7 +389,7 @@ def median_absolute_deviation(x, axis=0, center=numpy.median, scale=1.4826, nan_ return scale * mad -def absolute_deviation(x, axis=0, center=numpy.median, nan_policy='propagate'): +def absolute_deviation(x, axis: int = 0, center=numpy.median, nan_policy='propagate'): #TODO """ Compute the absolute deviations from the median of the data along the given axis. @@ -447,7 +449,7 @@ def absolute_deviation(x, axis=0, center=numpy.median, nan_policy='propagate'): return ad -def absolute_deviation_from_median(x, axis=0, center=numpy.median, scale=1.4826, nan_policy='propagate'): +def absolute_deviation_from_median(x, axis: int = 0, center = numpy.median, scale: int = 1.4826, nan_policy = 'propagate'): """ Compute the absolute deviation from the median of each point in the data along the given axis, given in terms of the MAD. @@ -499,7 +501,7 @@ def absolute_deviation_from_median(x, axis=0, center=numpy.median, scale=1.4826, return ad_from_median -def within1min(value1, value2): +def within1min(value1: float, value2: float): if value1 not in [0, None, ''] and value2 not in [0, None, '']: return (float(value1) - 1) < (float(value2)) < (float(value1) + 1) else: diff --git a/mathematical/utils.py b/mathematical/utils.py index 6fd0c1d..854506c 100644 --- a/mathematical/utils.py +++ b/mathematical/utils.py @@ -78,6 +78,7 @@ # # stdlib +import decimal import math from operator import eq, ge, gt, le, lt, ne @@ -85,7 +86,7 @@ import numpy -def intdiv(p, q): +def intdiv(p: float, q: float) -> int: """ Integer divsions which rounds toward zero @@ -105,7 +106,7 @@ def intdiv(p, q): return r -def roman(num): +def roman(num: float) -> str: """ Examples -------- @@ -125,7 +126,7 @@ def roman(num): return result -def magnitude(x): +def magnitude(x: float) -> int: """ Determine the magnitude of the given value @@ -146,7 +147,7 @@ def magnitude(x): # return int(math.floor(math.log10(abs(num)))) -def remove_zero(inputlist): +def remove_zero(inputlist: list)-> list: """ Remove zero values from the given list Also removes False and None @@ -162,7 +163,7 @@ def remove_zero(inputlist): return list(inputlist[numpy.nonzero(inputlist)]) -def isint(num): # Only works with floating point numbers +def isint(num: float) -> bool: # Only works with floating point numbers """ Checks whether a float is an integer value @@ -175,7 +176,7 @@ def isint(num): # Only works with floating point numbers return num == int(num) -def RepresentsInt(s): +def RepresentsInt(s: bool): """ Checks whether a value can be converted to int @@ -190,7 +191,7 @@ def RepresentsInt(s): return False -def rounders(val_to_round, round_format): +def rounders(val_to_round: int, round_format: str) -> decimal: """ Round a value to the specified number format, e.g. 
"0.000" for three decimal places @@ -206,7 +207,7 @@ def rounders(val_to_round, round_format): return Decimal(Decimal(val_to_round).quantize(Decimal(str(round_format)), rounding=ROUND_HALF_UP)) -def strip_strings(ls): +def strip_strings(ls: list) -> list: """ Remove strings from a list @@ -220,7 +221,7 @@ def strip_strings(ls): return [x for x in ls if not isinstance(x, str)] -def strip_booleans(ls): +def strip_booleans(ls: list) -> list: """ Remove booleans from a list @@ -234,7 +235,7 @@ def strip_booleans(ls): return [x for x in ls if not isinstance(x, bool)] -def strip_nonetype(ls): +def strip_nonetype(ls: list) -> list: """ Remove None from a list @@ -248,7 +249,7 @@ def strip_nonetype(ls): return [x for x in ls if x is not None] -def strip_none_bool_string(ls): +def strip_none_bool_string(ls: list) -> list: """ Remove None, Boolean and strings from a list @@ -264,7 +265,7 @@ def strip_none_bool_string(ls): return ls -def gcd(a, b): +def gcd(a: float, b: float) -> float: """ Returns the GCD (HCF) of a and b using Euclid's Algorithm @@ -280,7 +281,7 @@ def gcd(a, b): return math.gcd(a, b) -def gcd_array(array): +def gcd_array(array) -> float: """ Returns the GCD for an array of numbers using Euclid's Algorithm @@ -289,7 +290,7 @@ def gcd_array(array): :param array: :type array: :return: - :rtype: + :rtype: float """ a = array[0] @@ -302,13 +303,13 @@ def gcd_array(array): return x -def gcd2(numbers): +def gcd2(numbers: float) -> float: """ Returns the GCD (HCF) of a list of numbers using Euclid's Algorithm :param numbers: - :return: + :return:float """ c = numbers[0] @@ -317,12 +318,12 @@ def gcd2(numbers): return c -def lcm(numbers): +def lcm(numbers: float) -> float: """ Returns the LCM of a list of numbers using Euclid's Algorithm :param numbers: - :return: + :return: float """ product = numbers[0] @@ -336,30 +337,30 @@ def lcm(numbers): return product -def hcf(a, b): +def hcf(a: float, b: float): """ :param a: :param b: - :return: + :return:float """ gcd(a, b) -def hcf2(numbers): +def hcf2(numbers: float) -> float: """ :param numbers: - :return: + :return:float """ gcd2(numbers) -def modInverse(a, m): +def modInverse(a: float, m: float): """ Returns the modular inverse of a % m, which is the number x such that a*x % m = 1 :param a: @@ -385,7 +386,7 @@ def modInverse(a, m): _precalc_fact = numpy.log([math.factorial(n) for n in range(20)]) -def log_factorial(x): +def log_factorial(x: float)-> float: x = numpy.array(x) pf = _precalc_fact m = (x >= pf.size) @@ -396,15 +397,15 @@ def log_factorial(x): return out -def _log_pi_r(d, k, p=0.5): +def _log_pi_r(d: float, k: float, p: float = 0.5) -> float: return k * math.log(p) + log_factorial(k + d) - log_factorial(k) - log_factorial(d) -def _log_pi(d, k, p=0.5): +def _log_pi(d: float, k: float, p: float = 0.5) -> float: return _log_pi_r(d, k, p) + (d + 1) * math.log(1 - p) -def _expectation(d, T, p=0.5): +def _expectation(d: float, T: float, p: float = 0.5): if T is None: return d + 1 T = numpy.array(T, dtype=int) @@ -413,7 +414,7 @@ def _expectation(d, T, p=0.5): return ((m * pi).cumsum() / pi.cumsum())[T] -def _confidence_value(conf, d, T, p=0.5): +def _confidence_value(conf: float, d: float, T: float, p: float = 0.5) : if T is not None: T = numpy.array(T, dtype=int) m = numpy.arange(T.max() + 1, dtype=int) From 5a9b9bc77c98dcc53f38c3f3e22384a4b174027f Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 4 Jun 2020 09:01:24 +0100 Subject: [PATCH 02/11] Added type annotations --- mathematical/data_frames.py | 24 ++++++++++++------------ 
mathematical/linear_regression.py | 2 +- mathematical/outliers.py | 2 +- mathematical/stats.py | 22 +++++++++++----------- mathematical/utils.py | 18 ++++++++++++------ 5 files changed, 37 insertions(+), 31 deletions(-) diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index 1b4b67c..6705f71 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -37,7 +37,7 @@ STDEV2 = 3 -def df_mean(row: Series, column_label_list: List[str] = None) -> float: #TODO +def df_mean(row: Series, column_label_list: List[str] = None) -> float: """ Calculate the mean of each row for the specified columns of a data frame @@ -60,10 +60,10 @@ def df_mean(row: Series, column_label_list: List[str] = None) -> float: #TODO if column_label_list is None: column_label_list = list(row.index) - return nanmean(row[column_label_list]) + return float(nanmean(row[column_label_list])) -def df_median(row: Series, column_label_list: List[str] = None) -> float: #TODO +def df_median(row: Series, column_label_list: List[str] = None) -> float: """ Calculate the median of each row for the specified columns of a data frame @@ -86,10 +86,10 @@ def df_median(row: Series, column_label_list: List[str] = None) -> float: #TODO if column_label_list is None: column_label_list = list(row.index) - return nanmedian(row[column_label_list]) + return float(nanmedian(row[column_label_list])) -def df_stdev(row: Series, column_label_list: List[str] = None) -> float: #TODO +def df_stdev(row: Series, column_label_list: List[str] = None) -> float: """ Calculate the standard deviation of each row for the specified columns of a data frame @@ -112,10 +112,10 @@ def df_stdev(row: Series, column_label_list: List[str] = None) -> float: #TODO if column_label_list is None: column_label_list = list(row.index) - return nanstd(row[column_label_list]) + return float(nanstd(row[column_label_list])) -def df_log_stdev(row: Series, column_label_list: List[str] = None) -> float: #TODO +def df_log_stdev(row: Series, column_label_list: List[str] = None) -> float: """ Calculate the standard deviation of the log10 values in each row for the specified columns of a data frame @@ -139,7 +139,7 @@ def df_log_stdev(row: Series, column_label_list: List[str] = None) -> float: #TO if column_label_list is None: column_label_list = list(row.index) - return nanstd([log10(x) if x > 0.0 else nan for x in row[column_label_list]]) + return float(nanstd([log10(x) if x > 0.0 else nan for x in row[column_label_list]])) def df_percentage(row: Series, column_label: str, total: float) -> float: @@ -166,7 +166,7 @@ def df_percentage(row: Series, column_label: str, total: float) -> float: return (row[column_label] / float(total)) * 100.0 -def df_log(row: Series, column_label_list: List[str], base = 10) -> float: +def df_log(row: Series, column_label_list: List[str], base: float = 10) -> float: """ Calculate the logarithm of the values in each row for the specified columns of a data frame @@ -194,7 +194,7 @@ def df_log(row: Series, column_label_list: List[str], base = 10) -> float: return 0 -def df_data_points(row: Series, column_label_list: List[str]) -> list: +def df_data_points(row: Series, column_label_list: List[str]) -> List: """ Compile the values for the specified columns in each row into a list @@ -215,7 +215,7 @@ def df_data_points(row: Series, column_label_list: List[str]) -> list: return [row[column_label] for column_label in column_label_list] -def df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode=MAD): #TODO +def 
df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode: int = MAD) -> Series: """ Identify outliers in each row @@ -254,7 +254,7 @@ def df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode=M else: return None - return pd.Series(list(x)) + return Series(list(x)) def df_count(row: Series, column_label_list: [str] = None) -> int: diff --git a/mathematical/linear_regression.py b/mathematical/linear_regression.py index 9d53a58..e0c6ec0 100644 --- a/mathematical/linear_regression.py +++ b/mathematical/linear_regression.py @@ -54,7 +54,7 @@ from domdf_python_tools.doctools import is_documented_by -def linear_regression_vertical(x, y=None, a=None, b=None) -> tuple: +def linear_regression_vertical(x, y = None, a = None, b = None) -> tuple: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *vertical* distances between the points and the line. diff --git a/mathematical/outliers.py b/mathematical/outliers.py index 5669e31..ff513f3 100644 --- a/mathematical/outliers.py +++ b/mathematical/outliers.py @@ -38,7 +38,7 @@ def mad_outliers( - dataset: list, + dataset: List, strip_zero: bool = True, threshold: int = 3, ): diff --git a/mathematical/stats.py b/mathematical/stats.py index ce26a49..17237c3 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -54,7 +54,7 @@ from . import utils -def mean_none(dataset: List[str]): +def mean_none(dataset: List) -> float: """ Calculate the mean, excluding NaN, strings, boolean values, and zeros @@ -68,10 +68,10 @@ def mean_none(dataset: List[str]): dataset = utils.strip_none_bool_string(dataset) dataset = utils.remove_zero(dataset) - return numpy.nanmean(dataset) + return float(numpy.nanmean(dataset)) -def std_none(dataset: List[str], ddof: int = 1): +def std_none(dataset: List[str], ddof: int = 1) -> float: """ Calculate the standard deviation, excluding NaN, strings, boolean values, and zeros @@ -87,7 +87,7 @@ def std_none(dataset: List[str], ddof: int = 1): dataset = utils.remove_zero(dataset) print(dataset) - return numpy.nanstd(dataset, ddof = ddof) + return float(numpy.nanstd(dataset, ddof=ddof)) def median_none(dataset:List[str]): @@ -122,7 +122,7 @@ def iqr_none(dataset:List[str]) -> float: q3 = percentile_none(dataset, 75) iq = q3 - q1 - return iq + return float(iq) def percentile_none(dataset: List[str], percentage: float) -> float: @@ -148,7 +148,7 @@ def percentile_none(dataset: List[str], percentage: float) -> float: if len(dataset) < 2: raise ValueError("Dataset too small") - return numpy.percentile(dataset, percentage) + return float(numpy.percentile(dataset, percentage)) def pooled_sd(sample1: List, sample2: List, weighted: List = False) -> float: @@ -275,7 +275,7 @@ def interpret_d(d_or_g: float) -> float: return "Large Effect" -def _contains_nan(a, nan_policy = 'propagate'): +def _contains_nan(a, nan_policy:str = 'propagate'): policies = ['propagate', 'raise', 'omit'] if nan_policy not in policies: raise ValueError("nan_policy must be one of {%s}" % ', '.join(f"'{s}'" for s in policies)) @@ -306,7 +306,7 @@ def _contains_nan(a, nan_policy = 'propagate'): return contains_nan, nan_policy -def median_absolute_deviation(x, axis: int = 0, center = numpy.median, scale: int = 1.4826, nan_policy = 'propagate'): #TODO +def median_absolute_deviation(x, axis: int = 0, center = numpy.median, scale: int = 1.4826, nan_policy: str = 'propagate'): #TODO """ Compute the median absolute deviation of the data along the given axis. 
The median absolute deviation (MAD, [1]_) computes the median over the @@ -389,7 +389,7 @@ def median_absolute_deviation(x, axis: int = 0, center = numpy.median, scale: in return scale * mad -def absolute_deviation(x, axis: int = 0, center=numpy.median, nan_policy='propagate'): #TODO +def absolute_deviation(x, axis: int = 0, center=numpy.median, nan_policy: str = 'propagate'): #TODO """ Compute the absolute deviations from the median of the data along the given axis. @@ -449,7 +449,7 @@ def absolute_deviation(x, axis: int = 0, center=numpy.median, nan_policy='propag return ad -def absolute_deviation_from_median(x, axis: int = 0, center = numpy.median, scale: int = 1.4826, nan_policy = 'propagate'): +def absolute_deviation_from_median(x, axis: int = 0, center = numpy.median, scale: int = 1.4826, nan_policy: str = 'propagate'): """ Compute the absolute deviation from the median of each point in the data along the given axis, given in terms of the MAD. @@ -501,7 +501,7 @@ def absolute_deviation_from_median(x, axis: int = 0, center = numpy.median, scal return ad_from_median -def within1min(value1: float, value2: float): +def within1min(value1: float, value2: float) -> bool: if value1 not in [0, None, ''] and value2 not in [0, None, '']: return (float(value1) - 1) < (float(value2)) < (float(value1) + 1) else: diff --git a/mathematical/utils.py b/mathematical/utils.py index 854506c..0099bc9 100644 --- a/mathematical/utils.py +++ b/mathematical/utils.py @@ -83,6 +83,8 @@ from operator import eq, ge, gt, le, lt, ne # 3rd party +from typing import List + import numpy @@ -147,7 +149,7 @@ def magnitude(x: float) -> int: # return int(math.floor(math.log10(abs(num)))) -def remove_zero(inputlist: list)-> list: +def remove_zero(inputlist: List)-> List: """ Remove zero values from the given list Also removes False and None @@ -191,7 +193,11 @@ def RepresentsInt(s: bool): return False -def rounders(val_to_round: int, round_format: str) -> decimal: +class Decimal(object): + pass + + +def rounders(val_to_round: int, round_format: str) -> Decimal: """ Round a value to the specified number format, e.g. 
"0.000" for three decimal places @@ -207,7 +213,7 @@ def rounders(val_to_round: int, round_format: str) -> decimal: return Decimal(Decimal(val_to_round).quantize(Decimal(str(round_format)), rounding=ROUND_HALF_UP)) -def strip_strings(ls: list) -> list: +def strip_strings(ls: List) -> List: """ Remove strings from a list @@ -221,7 +227,7 @@ def strip_strings(ls: list) -> list: return [x for x in ls if not isinstance(x, str)] -def strip_booleans(ls: list) -> list: +def strip_booleans(ls: List) -> List: """ Remove booleans from a list @@ -235,7 +241,7 @@ def strip_booleans(ls: list) -> list: return [x for x in ls if not isinstance(x, bool)] -def strip_nonetype(ls: list) -> list: +def strip_nonetype(ls: List) -> List: """ Remove None from a list @@ -249,7 +255,7 @@ def strip_nonetype(ls: list) -> list: return [x for x in ls if x is not None] -def strip_none_bool_string(ls: list) -> list: +def strip_none_bool_string(ls: List) -> List: """ Remove None, Boolean and strings from a list From 2a8d795c2e473642da1da3510e13ff373ab953b0 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 4 Jun 2020 19:51:40 +0100 Subject: [PATCH 03/11] Added type annotations --- mathematical/data_frames.py | 2 +- mathematical/linear_regression.py | 4 +++- mathematical/outliers.py | 13 +++++++------ mathematical/stats.py | 28 ++++++++++++++-------------- mathematical/utils.py | 19 +++++++++---------- 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index 6705f71..47c72be 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -257,7 +257,7 @@ def df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode: return Series(list(x)) -def df_count(row: Series, column_label_list: [str] = None) -> int: +def df_count(row: Series, column_label_list: List[str] = None) -> int: """ Count the number of occurrences of a non-NaN value in the specified columns of a data frame diff --git a/mathematical/linear_regression.py b/mathematical/linear_regression.py index e0c6ec0..442c115 100644 --- a/mathematical/linear_regression.py +++ b/mathematical/linear_regression.py @@ -50,11 +50,13 @@ # # 3rd party +from typing import Tuple + import numpy from domdf_python_tools.doctools import is_documented_by -def linear_regression_vertical(x, y = None, a = None, b = None) -> tuple: +def linear_regression_vertical(x: numpy.ndarray, y: numpy.ndarray = None, a = None, b = None) -> Tuple[float, float, float, float]: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *vertical* distances between the points and the line. diff --git a/mathematical/outliers.py b/mathematical/outliers.py index ff513f3..19a397b 100644 --- a/mathematical/outliers.py +++ b/mathematical/outliers.py @@ -31,6 +31,7 @@ # MA 02110-1301, USA. # # +from typing import Sequence, Tuple, List import numpy from . 
import utils @@ -38,10 +39,10 @@ def mad_outliers( - dataset: List, + dataset: Sequence[float], strip_zero: bool = True, threshold: int = 3, - ): + ) -> Tuple[List[float], List[float]]: """ Using the Median Absolute Deviation to Find Outliers @@ -88,7 +89,7 @@ def mad_outliers( return outliers, data_exc_outliers -def two_stdev(dataset, strip_zero: bool = True): +def two_stdev(dataset: Sequence[float], strip_zero: bool = True): """ Outliers are greater than 2x stdev from mean @@ -100,7 +101,7 @@ def two_stdev(dataset, strip_zero: bool = True): return stdev_outlier(dataset, strip_zero=strip_zero) -def stdev_outlier(dataset, strip_zero: bool = True, rng=int(2)): +def stdev_outlier(dataset: Sequence[float], strip_zero: bool = True, rng=int(2)): """ Outliers are greater than rng*stdev from mean @@ -133,7 +134,7 @@ def stdev_outlier(dataset, strip_zero: bool = True, rng=int(2)): return outliers, data_exc_outliers -def quartile_outliers(dataset, strip_zero: bool = True): +def quartile_outliers(dataset: Sequence[float], strip_zero: bool = True): """ outliers are more than 3x inter-quartile range from upper or lower quartile @@ -171,7 +172,7 @@ def quartile_outliers(dataset, strip_zero: bool = True): return outliers, data_exc_outliers -def spss_outliers(dataset, strip_zero: bool = True, mode: str = "all"): +def spss_outliers(dataset: Sequence[float], strip_zero: bool = True, mode: str = "all"): """ Based on IBM SPSS method for detecting outliers Outliers more than 1.5*IQR from Q1 or Q3 diff --git a/mathematical/stats.py b/mathematical/stats.py index 17237c3..828824b 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -46,7 +46,7 @@ import warnings # 3rd party -from typing import List +from typing import List, Sequence, Callable import numpy @@ -54,7 +54,7 @@ from . 
import utils -def mean_none(dataset: List) -> float: +def mean_none(dataset: Sequence[float]) -> float: """ Calculate the mean, excluding NaN, strings, boolean values, and zeros @@ -71,7 +71,7 @@ def mean_none(dataset: List) -> float: return float(numpy.nanmean(dataset)) -def std_none(dataset: List[str], ddof: int = 1) -> float: +def std_none(dataset: Sequence[float], ddof: int = 1) -> float: """ Calculate the standard deviation, excluding NaN, strings, boolean values, and zeros @@ -90,7 +90,7 @@ def std_none(dataset: List[str], ddof: int = 1) -> float: return float(numpy.nanstd(dataset, ddof=ddof)) -def median_none(dataset:List[str]): +def median_none(dataset: Sequence[float]): """ Calculate the median, excluding NaN, strings, boolean values, and zeros @@ -107,7 +107,7 @@ def median_none(dataset:List[str]): return numpy.nanmedian(dataset) -def iqr_none(dataset:List[str]) -> float: +def iqr_none(dataset: Sequence[float]) -> float: """ Calculate the interquartile range, excluding NaN, strings, boolean values, and zeros @@ -125,7 +125,7 @@ def iqr_none(dataset:List[str]) -> float: return float(iq) -def percentile_none(dataset: List[str], percentage: float) -> float: +def percentile_none(dataset: Sequence[float], percentage: float) -> float: """ Calculate the given percentile, excluding NaN, strings, boolean values, and zeros @@ -151,7 +151,7 @@ def percentile_none(dataset: List[str], percentage: float) -> float: return float(numpy.percentile(dataset, percentage)) -def pooled_sd(sample1: List, sample2: List, weighted: List = False) -> float: +def pooled_sd(sample1: Sequence[float], sample2: Sequence[float], weighted: bool = False) -> float: """ Pooled Standard Deviation @@ -178,7 +178,7 @@ def pooled_sd(sample1: List, sample2: List, weighted: List = False) -> float: return numpy.sqrt(((sd1**2) + (sd2**2)) / 2) -def d_cohen(sample1: List, sample2: List, sd: int = 1, tail = 1, pooled = False) -> float: +def d_cohen(sample1: Sequence[float], sample2:Sequence[float], sd: int = 1, tail = 1, pooled: bool = False) -> float: """ Cohen's d-Statistic @@ -214,7 +214,7 @@ def d_cohen(sample1: List, sample2: List, sd: int = 1, tail = 1, pooled = False) return (mean1 - mean2) / sd -def g_hedge(sample1: List, sample2: List): #TODO +def g_hedge(sample1: Sequence[float], sample2: Sequence[float]) -> float: """ Hedge's g-Statistic @@ -231,7 +231,7 @@ def g_hedge(sample1: List, sample2: List): #TODO return (mean1 - mean2) / pooled_sd(sample1, sample2, True) -def g_durlak_bias(g :float, n: float) -> float: #TODO +def g_durlak_bias(g: float, n: float) -> float: """ Application of Durlak's bias correction to the Hedge's g statistic. Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm @@ -275,7 +275,7 @@ def interpret_d(d_or_g: float) -> float: return "Large Effect" -def _contains_nan(a, nan_policy:str = 'propagate'): +def _contains_nan(a, nan_policy: str = 'propagate'): policies = ['propagate', 'raise', 'omit'] if nan_policy not in policies: raise ValueError("nan_policy must be one of {%s}" % ', '.join(f"'{s}'" for s in policies)) @@ -306,7 +306,7 @@ def _contains_nan(a, nan_policy:str = 'propagate'): return contains_nan, nan_policy -def median_absolute_deviation(x, axis: int = 0, center = numpy.median, scale: int = 1.4826, nan_policy: str = 'propagate'): #TODO +def median_absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, scale: int = 1.4826, nan_policy: str = 'propagate'): #TODO """ Compute the median absolute deviation of the data along the given axis. 
The median absolute deviation (MAD, [1]_) computes the median over the @@ -389,7 +389,7 @@ def median_absolute_deviation(x, axis: int = 0, center = numpy.median, scale: in return scale * mad -def absolute_deviation(x, axis: int = 0, center=numpy.median, nan_policy: str = 'propagate'): #TODO +def absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, nan_policy: str = 'propagate'): #TODO """ Compute the absolute deviations from the median of the data along the given axis. @@ -449,7 +449,7 @@ def absolute_deviation(x, axis: int = 0, center=numpy.median, nan_policy: str = return ad -def absolute_deviation_from_median(x, axis: int = 0, center = numpy.median, scale: int = 1.4826, nan_policy: str = 'propagate'): +def absolute_deviation_from_median(x, axis: int = 0, center: Callable = numpy.median, scale: int = 1.4826, nan_policy: str = 'propagate'): """ Compute the absolute deviation from the median of each point in the data along the given axis, given in terms of the MAD. diff --git a/mathematical/utils.py b/mathematical/utils.py index 0099bc9..90606d7 100644 --- a/mathematical/utils.py +++ b/mathematical/utils.py @@ -83,7 +83,7 @@ from operator import eq, ge, gt, le, lt, ne # 3rd party -from typing import List +from typing import List, Sequence, Any, Union, Optional import numpy @@ -149,7 +149,7 @@ def magnitude(x: float) -> int: # return int(math.floor(math.log10(abs(num)))) -def remove_zero(inputlist: List)-> List: +def remove_zero(inputlist: Sequence[float]) -> List[float]: """ Remove zero values from the given list Also removes False and None @@ -178,7 +178,7 @@ def isint(num: float) -> bool: # Only works with floating point numbers return num == int(num) -def RepresentsInt(s: bool): +def RepresentsInt(s: Any) -> bool: """ Checks whether a value can be converted to int @@ -192,12 +192,11 @@ def RepresentsInt(s: bool): except (ValueError, TypeError) as e: return False +from decimal import Decimal -class Decimal(object): - pass -def rounders(val_to_round: int, round_format: str) -> Decimal: +def rounders(val_to_round: Union[str, float, Decimal], round_format: str) -> Decimal: """ Round a value to the specified number format, e.g. 
"0.000" for three decimal places @@ -309,7 +308,7 @@ def gcd_array(array) -> float: return x -def gcd2(numbers: float) -> float: +def gcd2(numbers: Sequence[float]) -> float: """ Returns the GCD (HCF) of a list of numbers using Euclid's Algorithm @@ -324,7 +323,7 @@ def gcd2(numbers: float) -> float: return c -def lcm(numbers: float) -> float: +def lcm(numbers:Sequence[float]) -> float: """ Returns the LCM of a list of numbers using Euclid's Algorithm :param numbers: @@ -343,7 +342,7 @@ def lcm(numbers: float) -> float: return product -def hcf(a: float, b: float): +def hcf(a: float, b: float) -> float: """ :param a: @@ -366,7 +365,7 @@ def hcf2(numbers: float) -> float: gcd2(numbers) -def modInverse(a: float, m: float): +def modInverse(a: float, m: float) -> Optional[float]: """ Returns the modular inverse of a % m, which is the number x such that a*x % m = 1 :param a: From 1e50a3ec66c33136cdf5c525ca9af1b778a3ee03 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 4 Jun 2020 22:03:49 +0100 Subject: [PATCH 04/11] Added type annotations --- mathematical/data_frames.py | 4 ++-- mathematical/linear_regression.py | 4 ++-- mathematical/outliers.py | 2 +- mathematical/stats.py | 4 ++-- mathematical/utils.py | 2 +- tests/test_data_frames.py | 4 ++-- tests/test_linear_regression.py | 4 ++-- tests/test_stats.py | 4 ++-- tests/test_utils.py | 4 ++-- 9 files changed, 16 insertions(+), 16 deletions(-) diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index 47c72be..7d27cae 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -30,7 +30,7 @@ # Outlier Modes from typing import List -from pandas import Series +from pandas import Series # type: ignore MAD = 1 QUARTILES = 2 @@ -55,7 +55,7 @@ def df_mean(row: Series, column_label_list: List[str] = None) -> float: :rtype: float """ - from numpy import nanmean + from numpy import nanmean # type: ignore if column_label_list is None: column_label_list = list(row.index) diff --git a/mathematical/linear_regression.py b/mathematical/linear_regression.py index 442c115..60d4908 100644 --- a/mathematical/linear_regression.py +++ b/mathematical/linear_regression.py @@ -52,8 +52,8 @@ # 3rd party from typing import Tuple -import numpy -from domdf_python_tools.doctools import is_documented_by +import numpy # type: ignore +from domdf_python_tools.doctools import is_documented_by # type: ignore def linear_regression_vertical(x: numpy.ndarray, y: numpy.ndarray = None, a = None, b = None) -> Tuple[float, float, float, float]: diff --git a/mathematical/outliers.py b/mathematical/outliers.py index 19a397b..baf6f75 100644 --- a/mathematical/outliers.py +++ b/mathematical/outliers.py @@ -33,7 +33,7 @@ # from typing import Sequence, Tuple, List -import numpy +import numpy # type: ignore from . import utils from . import stats diff --git a/mathematical/stats.py b/mathematical/stats.py index 828824b..82707a3 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -43,12 +43,12 @@ # # stdlib -import warnings +import warnings # type: ignore # 3rd party from typing import List, Sequence, Callable -import numpy +import numpy # type: ignore # this package from . 
import utils diff --git a/mathematical/utils.py b/mathematical/utils.py index 90606d7..1203978 100644 --- a/mathematical/utils.py +++ b/mathematical/utils.py @@ -85,7 +85,7 @@ # 3rd party from typing import List, Sequence, Any, Union, Optional -import numpy +import numpy # type: ignore def intdiv(p: float, q: float) -> int: diff --git a/tests/test_data_frames.py b/tests/test_data_frames.py index 419edd2..e43c348 100644 --- a/tests/test_data_frames.py +++ b/tests/test_data_frames.py @@ -9,8 +9,8 @@ import copy -import pandas -import pytest +import pandas # type: ignore +import pytest # type: ignore from mathematical.data_frames import ( df_count, diff --git a/tests/test_linear_regression.py b/tests/test_linear_regression.py index e67031d..c0afc82 100644 --- a/tests/test_linear_regression.py +++ b/tests/test_linear_regression.py @@ -50,8 +50,8 @@ from itertools import count # 3rd party -import numpy -import pytest +import numpy # type: ignore +import pytest # type: ignore # this package from mathematical import linear_regression diff --git a/tests/test_stats.py b/tests/test_stats.py index 9bae0fc..0e0fd8e 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -6,7 +6,7 @@ Test functions in stats.py """ -import numpy +import numpy # type: ignore from mathematical import stats data = [1, 2, 3, 4, 5, 0, "abc", False, None, numpy.nan] @@ -43,7 +43,7 @@ def test_iqr_none(): def test_mad(): # Based on example from scipy.median_absolute_deviation docstring - import scipy.stats + import scipy.stats # type: ignore x = scipy.stats.norm.rvs(size=100, scale=1, random_state=123456) assert isinstance(stats.median_absolute_deviation(x), float) assert stats.median_absolute_deviation(x) == 1.2280762773108278 diff --git a/tests/test_utils.py b/tests/test_utils.py index da657c4..15e9ac0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -50,8 +50,8 @@ # | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -import decimal -import numpy +import decimal # type: ignore +import numpy # type: ignore from mathematical import utils From d96b0173cbdb76e0bce0a86ce93ff2bcedd2f836 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 6 Jun 2020 19:05:50 +0100 Subject: [PATCH 05/11] Added type annotations --- mathematical/data_frames.py | 2 +- mathematical/stats.py | 19 ++++++++++--------- mathematical/utils.py | 26 +++++++++++++------------- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index 7d27cae..70835ca 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -250,7 +250,7 @@ def df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode: elif outlier_mode == QUARTILES: x = outliers.quartile_outliers(data) elif outlier_mode == STDEV2: - x = outliers.stdev_outlier(data, 2) # outlier classed as more than 2 stdev away from mean + x = outliers.stdev_outlier(data, rng=2) # outlier classed as more than 2 stdev away from mean else: return None diff --git a/mathematical/stats.py b/mathematical/stats.py index 82707a3..4e15902 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -46,7 +46,7 @@ import warnings # type: ignore # 3rd party -from typing import List, Sequence, Callable +from typing import List, Sequence, Callable, Union, Optional import numpy # type: ignore @@ -54,7 +54,7 @@ from . 
import utils -def mean_none(dataset: Sequence[float]) -> float: +def mean_none(dataset: Sequence[Union[float, bool, None]]) -> float: """ Calculate the mean, excluding NaN, strings, boolean values, and zeros @@ -71,7 +71,7 @@ def mean_none(dataset: Sequence[float]) -> float: return float(numpy.nanmean(dataset)) -def std_none(dataset: Sequence[float], ddof: int = 1) -> float: +def std_none(dataset: Sequence[Union[float, bool, None]], ddof: int = 1) -> float: """ Calculate the standard deviation, excluding NaN, strings, boolean values, and zeros @@ -90,7 +90,7 @@ def std_none(dataset: Sequence[float], ddof: int = 1) -> float: return float(numpy.nanstd(dataset, ddof=ddof)) -def median_none(dataset: Sequence[float]): +def median_none(dataset: Sequence[Union[float, bool, None]]): """ Calculate the median, excluding NaN, strings, boolean values, and zeros @@ -107,7 +107,7 @@ def median_none(dataset: Sequence[float]): return numpy.nanmedian(dataset) -def iqr_none(dataset: Sequence[float]) -> float: +def iqr_none(dataset: Sequence[Union[float, bool, None]]) -> float: """ Calculate the interquartile range, excluding NaN, strings, boolean values, and zeros @@ -125,7 +125,7 @@ def iqr_none(dataset: Sequence[float]) -> float: return float(iq) -def percentile_none(dataset: Sequence[float], percentage: float) -> float: +def percentile_none(dataset: Sequence[Union[float, bool, None]], percentage: float) -> float: """ Calculate the given percentile, excluding NaN, strings, boolean values, and zeros @@ -251,7 +251,7 @@ def g_durlak_bias(g: float, n: float) -> float: return g * Durlak -def interpret_d(d_or_g: float) -> float: +def interpret_d(d_or_g: float) -> Optional[str]: """ Interpret Cohen's d or Hedge's g values using Table 1 from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3444174/ @@ -273,6 +273,7 @@ def interpret_d(d_or_g: float) -> float: return "Intermediate Effect" elif 0.8 <= d_or_g: return "Large Effect" + return None def _contains_nan(a, nan_policy: str = 'propagate'): @@ -306,7 +307,7 @@ def _contains_nan(a, nan_policy: str = 'propagate'): return contains_nan, nan_policy -def median_absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, scale: int = 1.4826, nan_policy: str = 'propagate'): #TODO +def median_absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate'): #TODO """ Compute the median absolute deviation of the data along the given axis. The median absolute deviation (MAD, [1]_) computes the median over the @@ -449,7 +450,7 @@ def absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, nan_po return ad -def absolute_deviation_from_median(x, axis: int = 0, center: Callable = numpy.median, scale: int = 1.4826, nan_policy: str = 'propagate'): +def absolute_deviation_from_median(x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate'): """ Compute the absolute deviation from the median of each point in the data along the given axis, given in terms of the MAD. 
diff --git a/mathematical/utils.py b/mathematical/utils.py index 1203978..c35793b 100644 --- a/mathematical/utils.py +++ b/mathematical/utils.py @@ -105,7 +105,7 @@ def intdiv(p: float, q: float) -> int: r = p // q if r < 0 and q * r != p: r += 1 - return r + return int(r) def roman(num: float) -> str: @@ -149,7 +149,7 @@ def magnitude(x: float) -> int: # return int(math.floor(math.log10(abs(num)))) -def remove_zero(inputlist: Sequence[float]) -> List[float]: +def remove_zero(inputlist: Sequence[Union[float, bool, None]]) -> List[float]: """ Remove zero values from the given list Also removes False and None @@ -212,7 +212,7 @@ def rounders(val_to_round: Union[str, float, Decimal], round_format: str) -> Dec return Decimal(Decimal(val_to_round).quantize(Decimal(str(round_format)), rounding=ROUND_HALF_UP)) -def strip_strings(ls: List) -> List: +def strip_strings(ls: Sequence[Any]) -> List: """ Remove strings from a list @@ -226,7 +226,7 @@ def strip_strings(ls: List) -> List: return [x for x in ls if not isinstance(x, str)] -def strip_booleans(ls: List) -> List: +def strip_booleans(ls: Sequence[Any]) -> List: """ Remove booleans from a list @@ -240,7 +240,7 @@ def strip_booleans(ls: List) -> List: return [x for x in ls if not isinstance(x, bool)] -def strip_nonetype(ls: List) -> List: +def strip_nonetype(ls: Sequence[Any]) -> List: """ Remove None from a list @@ -254,7 +254,7 @@ def strip_nonetype(ls: List) -> List: return [x for x in ls if x is not None] -def strip_none_bool_string(ls: List) -> List: +def strip_none_bool_string(ls: Sequence[Any]) -> List: """ Remove None, Boolean and strings from a list @@ -270,7 +270,7 @@ def strip_none_bool_string(ls: List) -> List: return ls -def gcd(a: float, b: float) -> float: +def gcd(a: int, b: int) -> int: """ Returns the GCD (HCF) of a and b using Euclid's Algorithm @@ -308,7 +308,7 @@ def gcd_array(array) -> float: return x -def gcd2(numbers: Sequence[float]) -> float: +def gcd2(numbers: int) -> int: """ Returns the GCD (HCF) of a list of numbers using Euclid's Algorithm @@ -342,30 +342,30 @@ def lcm(numbers:Sequence[float]) -> float: return product -def hcf(a: float, b: float) -> float: +def hcf(a: int, b: int) -> int: """ :param a: :param b: - :return:float + :return:int """ gcd(a, b) -def hcf2(numbers: float) -> float: +def hcf2(numbers: int) -> int: """ :param numbers: - :return:float + :return:int """ gcd2(numbers) -def modInverse(a: float, m: float) -> Optional[float]: +def modInverse(a: int, m: int) -> Optional[float]: """ Returns the modular inverse of a % m, which is the number x such that a*x % m = 1 :param a: From 493642e0c4fecff9286b40ff3c269b9a479710ba Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 6 Jun 2020 19:49:51 +0100 Subject: [PATCH 06/11] Added type annotations --- mathematical/__init__.py | 1 + mathematical/data_frames.py | 2 ++ mathematical/linear_regression.py | 8 ++++++-- mathematical/outliers.py | 9 ++++++--- mathematical/stats.py | 21 +++++++++++++++------ mathematical/utils.py | 13 +++++-------- tests/test_data_frames.py | 12 ++++-------- tests/test_stats.py | 3 +++ tests/test_utils.py | 4 ++++ 9 files changed, 46 insertions(+), 27 deletions(-) diff --git a/mathematical/__init__.py b/mathematical/__init__.py index a095050..bf46fba 100644 --- a/mathematical/__init__.py +++ b/mathematical/__init__.py @@ -32,4 +32,5 @@ __version__ = "0.1.11" __email__ = "dominic@davis-foster.co.uk" +# this package from . 
import data_frames, outliers, stats, utils diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index 70835ca..0696979 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -27,9 +27,11 @@ # # +# stdlib # Outlier Modes from typing import List +# 3rd party from pandas import Series # type: ignore MAD = 1 diff --git a/mathematical/linear_regression.py b/mathematical/linear_regression.py index 60d4908..e56bcc1 100644 --- a/mathematical/linear_regression.py +++ b/mathematical/linear_regression.py @@ -49,14 +49,18 @@ # | DOI: `10.1021/acs.jproteome.8b00717 `_ # -# 3rd party +# stdlib from typing import Tuple +# 3rd party import numpy # type: ignore from domdf_python_tools.doctools import is_documented_by # type: ignore -def linear_regression_vertical(x: numpy.ndarray, y: numpy.ndarray = None, a = None, b = None) -> Tuple[float, float, float, float]: +def linear_regression_vertical(x: numpy.ndarray, + y: numpy.ndarray = None, + a=None, + b=None) -> Tuple[float, float, float, float]: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *vertical* distances between the points and the line. diff --git a/mathematical/outliers.py b/mathematical/outliers.py index baf6f75..937e911 100644 --- a/mathematical/outliers.py +++ b/mathematical/outliers.py @@ -29,13 +29,16 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. +# stdlib # # -from typing import Sequence, Tuple, List +from typing import List, Sequence, Tuple +# 3rd party import numpy # type: ignore -from . import utils -from . import stats + +# this package +from . import stats, utils def mad_outliers( diff --git a/mathematical/stats.py b/mathematical/stats.py index 4e15902..b9e831b 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -44,10 +44,9 @@ # stdlib import warnings # type: ignore +from typing import Callable, List, Optional, Sequence, Union # 3rd party -from typing import List, Sequence, Callable, Union, Optional - import numpy # type: ignore # this package @@ -178,7 +177,9 @@ def pooled_sd(sample1: Sequence[float], sample2: Sequence[float], weighted: bool return numpy.sqrt(((sd1**2) + (sd2**2)) / 2) -def d_cohen(sample1: Sequence[float], sample2:Sequence[float], sd: int = 1, tail = 1, pooled: bool = False) -> float: +def d_cohen( + sample1: Sequence[float], sample2: Sequence[float], sd: int = 1, tail=1, pooled: bool = False + ) -> float: """ Cohen's d-Statistic @@ -307,7 +308,13 @@ def _contains_nan(a, nan_policy: str = 'propagate'): return contains_nan, nan_policy -def median_absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate'): #TODO +def median_absolute_deviation( + x, + axis: int = 0, + center: Callable = numpy.median, + scale: float = 1.4826, + nan_policy: str = 'propagate' + ): #TODO """ Compute the median absolute deviation of the data along the given axis. The median absolute deviation (MAD, [1]_) computes the median over the @@ -390,7 +397,7 @@ def median_absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, return scale * mad -def absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, nan_policy: str = 'propagate'): #TODO +def absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, nan_policy: str = 'propagate'): #TODO """ Compute the absolute deviations from the median of the data along the given axis. 
@@ -450,7 +457,9 @@ def absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, nan_po return ad -def absolute_deviation_from_median(x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate'): +def absolute_deviation_from_median( + x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate' + ): """ Compute the absolute deviation from the median of each point in the data along the given axis, given in terms of the MAD. diff --git a/mathematical/utils.py b/mathematical/utils.py index c35793b..9c475ff 100644 --- a/mathematical/utils.py +++ b/mathematical/utils.py @@ -80,11 +80,11 @@ # stdlib import decimal import math +from decimal import Decimal from operator import eq, ge, gt, le, lt, ne +from typing import Any, List, Optional, Sequence, Union # 3rd party -from typing import List, Sequence, Any, Union, Optional - import numpy # type: ignore @@ -192,9 +192,6 @@ def RepresentsInt(s: Any) -> bool: except (ValueError, TypeError) as e: return False -from decimal import Decimal - - def rounders(val_to_round: Union[str, float, Decimal], round_format: str) -> Decimal: """ @@ -323,7 +320,7 @@ def gcd2(numbers: int) -> int: return c -def lcm(numbers:Sequence[float]) -> float: +def lcm(numbers: Sequence[float]) -> float: """ Returns the LCM of a list of numbers using Euclid's Algorithm :param numbers: @@ -391,7 +388,7 @@ def modInverse(a: int, m: int) -> Optional[float]: _precalc_fact = numpy.log([math.factorial(n) for n in range(20)]) -def log_factorial(x: float)-> float: +def log_factorial(x: float) -> float: x = numpy.array(x) pf = _precalc_fact m = (x >= pf.size) @@ -419,7 +416,7 @@ def _expectation(d: float, T: float, p: float = 0.5): return ((m * pi).cumsum() / pi.cumsum())[T] -def _confidence_value(conf: float, d: float, T: float, p: float = 0.5) : +def _confidence_value(conf: float, d: float, T: float, p: float = 0.5): if T is not None: T = numpy.array(T, dtype=int) m = numpy.arange(T.max() + 1, dtype=int) diff --git a/tests/test_data_frames.py b/tests/test_data_frames.py index e43c348..a9a8249 100644 --- a/tests/test_data_frames.py +++ b/tests/test_data_frames.py @@ -7,20 +7,16 @@ """ +# stdlib import copy +# 3rd party import pandas # type: ignore import pytest # type: ignore +# this package from mathematical.data_frames import ( - df_count, - df_data_points, - df_log, - df_log_stdev, - df_mean, - df_median, - df_percentage, - df_stdev, + df_count, df_data_points, df_log, df_log_stdev, df_mean, df_median, df_percentage, df_stdev ) diff --git a/tests/test_stats.py b/tests/test_stats.py index 0e0fd8e..d6b8fa6 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -6,7 +6,10 @@ Test functions in stats.py """ +# 3rd party import numpy # type: ignore + +# this package from mathematical import stats data = [1, 2, 3, 4, 5, 0, "abc", False, None, numpy.nan] diff --git a/tests/test_utils.py b/tests/test_utils.py index 15e9ac0..a80220a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -50,9 +50,13 @@ # | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# +# stdlib import decimal # type: ignore + +# 3rd party import numpy # type: ignore +# this package from mathematical import utils data = [1, 2, 3, 4, 5, 0, "abc", False, None, numpy.nan] From ee1671ea60e5ebfdf61bbaab9f8854ea95178995 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 6 Jun 2020 22:30:09 +0100 Subject: [PATCH 07/11] Added type annotations --- mathematical/data_frames.py | 17 +++++++++-------- mathematical/linear_regression.py | 10 ++++++---- mathematical/outliers.py | 3 ++- mathematical/stats.py | 17 +++++++++++++---- tests/test_utils.py | 2 +- tox.ini | 4 ++-- 6 files changed, 33 insertions(+), 20 deletions(-) diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index 0696979..9d3ed36 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -28,18 +28,19 @@ # # stdlib -# Outlier Modes -from typing import List +from typing import List, Optional, Sequence # 3rd party from pandas import Series # type: ignore +# Outlier Modes + MAD = 1 QUARTILES = 2 STDEV2 = 3 -def df_mean(row: Series, column_label_list: List[str] = None) -> float: +def df_mean(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float: """ Calculate the mean of each row for the specified columns of a data frame @@ -65,7 +66,7 @@ def df_mean(row: Series, column_label_list: List[str] = None) -> float: return float(nanmean(row[column_label_list])) -def df_median(row: Series, column_label_list: List[str] = None) -> float: +def df_median(row: Series, column_label_list: Optional[Sequence[list]] = None) -> float: """ Calculate the median of each row for the specified columns of a data frame @@ -91,7 +92,7 @@ def df_median(row: Series, column_label_list: List[str] = None) -> float: return float(nanmedian(row[column_label_list])) -def df_stdev(row: Series, column_label_list: List[str] = None) -> float: +def df_stdev(row: Series, column_label_list: Optional[Sequence[list]] = None) -> float: """ Calculate the standard deviation of each row for the specified columns of a data frame @@ -117,7 +118,7 @@ def df_stdev(row: Series, column_label_list: List[str] = None) -> float: return float(nanstd(row[column_label_list])) -def df_log_stdev(row: Series, column_label_list: List[str] = None) -> float: +def df_log_stdev(row: Series, column_label_list: Optional[Sequence[list]] = None) -> float: """ Calculate the standard deviation of the log10 values in each row for the specified columns of a data frame @@ -196,7 +197,7 @@ def df_log(row: Series, column_label_list: List[str], base: float = 10) -> float return 0 -def df_data_points(row: Series, column_label_list: List[str]) -> List: +def df_data_points(row: Series, column_label_list: Optional[Sequence[list]]) -> List: """ Compile the values for the specified columns in each row into a list @@ -259,7 +260,7 @@ def df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode: return Series(list(x)) -def df_count(row: Series, column_label_list: List[str] = None) -> int: +def df_count(row: Series, column_label_list: Optional[Sequence[list]] = None) -> int: """ Count the number of occurrences of a non-NaN value in the specified columns of a data frame diff --git a/mathematical/linear_regression.py b/mathematical/linear_regression.py index e56bcc1..619a9cf 100644 --- a/mathematical/linear_regression.py +++ b/mathematical/linear_regression.py @@ -57,10 +57,12 @@ from domdf_python_tools.doctools import is_documented_by # type: ignore -def linear_regression_vertical(x: numpy.ndarray, - y: numpy.ndarray = None, - a=None, - 
b=None) -> Tuple[float, float, float, float]: +def linear_regression_vertical( + x: numpy.ndarray, + y: numpy.ndarray = None, + a=None, + b=None, + ) -> Tuple[float, float, float, float]: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *vertical* distances between the points and the line. diff --git a/mathematical/outliers.py b/mathematical/outliers.py index 937e911..6933c02 100644 --- a/mathematical/outliers.py +++ b/mathematical/outliers.py @@ -29,6 +29,7 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. + # stdlib # # @@ -104,7 +105,7 @@ def two_stdev(dataset: Sequence[float], strip_zero: bool = True): return stdev_outlier(dataset, strip_zero=strip_zero) -def stdev_outlier(dataset: Sequence[float], strip_zero: bool = True, rng=int(2)): +def stdev_outlier(dataset: Sequence[float], strip_zero: bool = True, rng: int = 2): """ Outliers are greater than rng*stdev from mean diff --git a/mathematical/stats.py b/mathematical/stats.py index b9e831b..841edfe 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -178,7 +178,11 @@ def pooled_sd(sample1: Sequence[float], sample2: Sequence[float], weighted: bool def d_cohen( - sample1: Sequence[float], sample2: Sequence[float], sd: int = 1, tail=1, pooled: bool = False + sample1: Sequence[float], + sample2: Sequence[float], + sd: int = 1, + tail=1, + pooled: bool = False, ) -> float: """ Cohen's d-Statistic @@ -313,7 +317,7 @@ def median_absolute_deviation( axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, - nan_policy: str = 'propagate' + nan_policy: str = 'propagate,' ): #TODO """ Compute the median absolute deviation of the data along the given axis. @@ -397,7 +401,12 @@ def median_absolute_deviation( return scale * mad -def absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, nan_policy: str = 'propagate'): #TODO +def absolute_deviation( + x, + axis: int = 0, + center: Callable = numpy.median, + nan_policy: str = 'propagate', + ): #TODO """ Compute the absolute deviations from the median of the data along the given axis. 
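As a concrete reference for the two helpers re-wrapped just above, a small sketch of ``median_absolute_deviation`` on made-up data. Illustrative only; it assumes ``mathematical.stats`` as it stands once the whole series is applied:

    import numpy
    from mathematical.stats import median_absolute_deviation

    x = numpy.array([1.0, 2.0, 3.0, 4.0, 100.0])
    # With the default scale of 1.4826 the result is a robust stand-in for the
    # standard deviation of normally distributed data, insensitive to the 100.0.
    print(median_absolute_deviation(x))
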
@@ -458,7 +467,7 @@ def absolute_deviation(x, axis: int = 0, center: Callable = numpy.median, nan_po def absolute_deviation_from_median( - x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate' + x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate,' ): """ Compute the absolute deviation from the median of each point in the data diff --git a/tests/test_utils.py b/tests/test_utils.py index a80220a..8081da3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -51,7 +51,7 @@ # # stdlib -import decimal # type: ignore +import decimal # 3rd party import numpy # type: ignore diff --git a/tox.ini b/tox.ini index 4179b34..0045943 100644 --- a/tox.ini +++ b/tox.ini @@ -66,7 +66,7 @@ commands = flake8 mathematical tests [testenv:yapf] -basepython = python3.7 +basepython = python3.8 changedir={toxinidir} skip_install = true ignore_errors=true @@ -75,7 +75,7 @@ commands = yapf -i --recursive mathematical tests [testenv:isort] -basepython = python3.6 +basepython = python3.8 skip_install = true ignore_errors=true changedir={toxinidir} From fc30b42c75bbeae60770cce5cf6242ef23516f04 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 7 Jun 2020 18:39:18 +0100 Subject: [PATCH 08/11] added type annotations --- mathematical/stats.py | 31 ++++++++++++++--------------- mathematical/utils.py | 46 ++++++++++++------------------------------- 2 files changed, 28 insertions(+), 49 deletions(-) diff --git a/mathematical/stats.py b/mathematical/stats.py index 841edfe..5d0525e 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -180,7 +180,7 @@ def pooled_sd(sample1: Sequence[float], sample2: Sequence[float], weighted: bool def d_cohen( sample1: Sequence[float], sample2: Sequence[float], - sd: int = 1, + which: int = 1, tail=1, pooled: bool = False, ) -> float: @@ -193,8 +193,8 @@ def d_cohen( :type sample1: list :param sample2: datapoints for second sample :type sample2: list - :param sd: Use the standard deviation of the first sample (1) or the second sample (2) - :type sd: int + :param which: Use the standard deviation of the first sample (1) or the second sample (2) + :type which: int :param tail: :param pooled: @@ -205,18 +205,18 @@ def d_cohen( mean1 = numpy.mean(sample1) mean2 = numpy.mean(sample2) - if sd == 1: - sd = numpy.std(sample1) + if which == 1: + stdev = numpy.std(sample1) else: - sd = numpy.std(sample2) + stdev = numpy.std(sample2) if pooled: - sd = pooled_sd(sample1, sample2) + stdev = pooled_sd(sample1, sample2) if tail == 2: - return numpy.abs(mean1 - mean2) / sd + return numpy.abs(mean1 - mean2) / stdev - return (mean1 - mean2) / sd + return (mean1 - mean2) / stdev def g_hedge(sample1: Sequence[float], sample2: Sequence[float]) -> float: @@ -256,7 +256,7 @@ def g_durlak_bias(g: float, n: float) -> float: return g * Durlak -def interpret_d(d_or_g: float) -> Optional[str]: +def interpret_d(d_or_g: float) -> str: """ Interpret Cohen's d or Hedge's g values using Table 1 from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3444174/ @@ -268,9 +268,7 @@ def interpret_d(d_or_g: float) -> Optional[str]: :rtype: """ - if d_or_g < 0: - return f"{interpret_d(numpy.abs(d_or_g)).split(' ')[0]} Adverse Effect" - elif 0.0 <= d_or_g < 0.2: + if 0.0 <= d_or_g < 0.2: return "No Effect" elif 0.2 <= d_or_g < 0.5: return "Small Effect" @@ -278,7 +276,8 @@ def interpret_d(d_or_g: float) -> Optional[str]: return "Intermediate Effect" elif 0.8 <= d_or_g: return "Large Effect" - return None + else: # d_or_g < 
0 + return f"{interpret_d(numpy.abs(d_or_g)).split(' ')[0]} Adverse Effect" def _contains_nan(a, nan_policy: str = 'propagate'): @@ -317,7 +316,7 @@ def median_absolute_deviation( axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, - nan_policy: str = 'propagate,' + nan_policy: str = 'propagate' ): #TODO """ Compute the median absolute deviation of the data along the given axis. @@ -467,7 +466,7 @@ def absolute_deviation( def absolute_deviation_from_median( - x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate,' + x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate', ): """ Compute the absolute deviation from the median of each point in the data diff --git a/mathematical/utils.py b/mathematical/utils.py index 9c475ff..d494c5f 100644 --- a/mathematical/utils.py +++ b/mathematical/utils.py @@ -122,7 +122,7 @@ def roman(num: float) -> str: values = 1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1 result = '' for t, v in zip(tokens, values): - cnt = num // v + cnt = int(num // v) result += t * cnt num -= v * cnt return result @@ -305,7 +305,7 @@ def gcd_array(array) -> float: return x -def gcd2(numbers: int) -> int: +def gcd2(numbers: Sequence[int]) -> int: """ Returns the GCD (HCF) of a list of numbers using Euclid's Algorithm @@ -320,7 +320,7 @@ def gcd2(numbers: int) -> int: return c -def lcm(numbers: Sequence[float]) -> float: +def lcm(numbers: Sequence[int]) -> float: """ Returns the LCM of a list of numbers using Euclid's Algorithm :param numbers: @@ -348,10 +348,10 @@ def hcf(a: int, b: int) -> int: :return:int """ - gcd(a, b) + return gcd(a, b) -def hcf2(numbers: int) -> int: +def hcf2(numbers: Sequence[int]) -> int: """ :param numbers: @@ -359,7 +359,7 @@ def hcf2(numbers: int) -> int: :return:int """ - gcd2(numbers) + return gcd2(numbers) def modInverse(a: int, m: int) -> Optional[float]: @@ -389,14 +389,14 @@ def modInverse(a: int, m: int) -> Optional[float]: def log_factorial(x: float) -> float: - x = numpy.array(x) + arr = numpy.array(x) pf = _precalc_fact - m = (x >= pf.size) - out = numpy.empty(x.shape) - out[~m] = pf[x[~m].astype(int)] - x = x[m] - out[m] = x * numpy.log(x) - x + 0.5 * numpy.log(2 * numpy.pi * x) - return out + m: bool = (arr >= pf.size) + out = numpy.empty(arr.shape) + out[~m] = pf[arr[~m].astype(int)] + arr = arr[m] + out[m] = arr * numpy.log(arr) - arr + 0.5 * numpy.log(2 * numpy.pi * arr) + return float(out) def _log_pi_r(d: float, k: float, p: float = 0.5) -> float: @@ -405,23 +405,3 @@ def _log_pi_r(d: float, k: float, p: float = 0.5) -> float: def _log_pi(d: float, k: float, p: float = 0.5) -> float: return _log_pi_r(d, k, p) + (d + 1) * math.log(1 - p) - - -def _expectation(d: float, T: float, p: float = 0.5): - if T is None: - return d + 1 - T = numpy.array(T, dtype=int) - m = numpy.arange(T.max() + 1, dtype=int) - pi = numpy.exp(_log_pi(d, m, p)) - return ((m * pi).cumsum() / pi.cumsum())[T] - - -def _confidence_value(conf: float, d: float, T: float, p: float = 0.5): - if T is not None: - T = numpy.array(T, dtype=int) - m = numpy.arange(T.max() + 1, dtype=int) - else: - m = numpy.arange(max(50 * d, 10000)) - log_pi = _log_pi(d, m, p) - pics = numpy.exp(log_pi).cumsum() - return numpy.searchsorted(pics, conf * (pics[T] if T is not None else 1)) From d88633e5fecfa9c248f5d1a3f359476e30043fbc Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 7 Jun 2020 21:48:55 +0100 Subject: [PATCH 09/11] Added type annotations --- 
mathematical/data_frames.py | 2 +- mathematical/stats.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index 9d3ed36..d85407a 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -197,7 +197,7 @@ def df_log(row: Series, column_label_list: List[str], base: float = 10) -> float return 0 -def df_data_points(row: Series, column_label_list: Optional[Sequence[list]]) -> List: +def df_data_points(row: Series, column_label_list: Sequence[str]) -> List: """ Compile the values for the specified columns in each row into a list diff --git a/mathematical/stats.py b/mathematical/stats.py index 5d0525e..a4a0a61 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -466,8 +466,12 @@ def absolute_deviation( def absolute_deviation_from_median( - x, axis: int = 0, center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate', - ): + x, + axis: int = 0, + center: Callable = numpy.median, + scale: float = 1.4826, + nan_policy: str = 'propagate', + ) -> numpy.ndarray: """ Compute the absolute deviation from the median of each point in the data along the given axis, given in terms of the MAD. From 3c1b0d3a3cc7a65df2a7dd50de35dd22334a8e95 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 8 Jun 2020 12:20:04 +0100 Subject: [PATCH 10/11] Added type annotations --- mathematical/data_frames.py | 21 ++++-------- mathematical/linear_regression.py | 29 +++++++++------- mathematical/outliers.py | 41 ++++++++++++++++------- mathematical/stats.py | 55 ++++++++++++++++--------------- mathematical/utils.py | 17 ++++++---- 5 files changed, 91 insertions(+), 72 deletions(-) diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index d85407a..0c0f005 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -52,7 +52,6 @@ def df_mean(row: Series, column_label_list: Optional[Sequence[str]] = None) -> f :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate mean for - :type column_label_list: list :return: Mean :rtype: float @@ -66,7 +65,7 @@ def df_mean(row: Series, column_label_list: Optional[Sequence[str]] = None) -> f return float(nanmean(row[column_label_list])) -def df_median(row: Series, column_label_list: Optional[Sequence[list]] = None) -> float: +def df_median(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float: """ Calculate the median of each row for the specified columns of a data frame @@ -78,7 +77,6 @@ def df_median(row: Series, column_label_list: Optional[Sequence[list]] = None) - :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate median for - :type column_label_list: list :return: Median :rtype: float @@ -92,7 +90,7 @@ def df_median(row: Series, column_label_list: Optional[Sequence[list]] = None) - return float(nanmedian(row[column_label_list])) -def df_stdev(row: Series, column_label_list: Optional[Sequence[list]] = None) -> float: +def df_stdev(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float: """ Calculate the standard deviation of each row for the specified columns of a data frame @@ -104,7 +102,6 @@ def df_stdev(row: Series, column_label_list: Optional[Sequence[list]] = None) -> :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate standard deviation 
for - :type column_label_list: list :return: Standard deviation :rtype: float @@ -118,7 +115,7 @@ def df_stdev(row: Series, column_label_list: Optional[Sequence[list]] = None) -> return float(nanstd(row[column_label_list])) -def df_log_stdev(row: Series, column_label_list: Optional[Sequence[list]] = None) -> float: +def df_log_stdev(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float: """ Calculate the standard deviation of the log10 values in each row for the specified columns of a data frame @@ -130,7 +127,6 @@ def df_log_stdev(row: Series, column_label_list: Optional[Sequence[list]] = None :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate standard deviation for - :type column_label_list: list :return: Standard deviation :rtype: float @@ -169,7 +165,7 @@ def df_percentage(row: Series, column_label: str, total: float) -> float: return (row[column_label] / float(total)) * 100.0 -def df_log(row: Series, column_label_list: List[str], base: float = 10) -> float: +def df_log(row: Series, column_label_list: Sequence[str], base: float = 10) -> float: """ Calculate the logarithm of the values in each row for the specified columns of a data frame @@ -181,7 +177,6 @@ def df_log(row: Series, column_label_list: List[str], base: float = 10) -> float :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate log for - :type column_label_list: list :param base: logarithmic base :type base: float @@ -209,7 +204,6 @@ def df_data_points(row: Series, column_label_list: Sequence[str]) -> List: :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate standard deviation for - :type column_label_list: list :return: data points :rtype: list @@ -218,7 +212,7 @@ def df_data_points(row: Series, column_label_list: Sequence[str]) -> List: return [row[column_label] for column_label in column_label_list] -def df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode: int = MAD) -> Series: +def df_outliers(row: Series, column_label_list: Sequence[str] = None, outlier_mode: int = MAD) -> Series: """ Identify outliers in each row @@ -230,7 +224,6 @@ def df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode: :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to determine outliers for - :type column_label_list: list :param outlier_mode: outlier detection method to use :type outlier_mode: int @@ -255,12 +248,12 @@ def df_outliers(row: Series, column_label_list: List[str] = None, outlier_mode: elif outlier_mode == STDEV2: x = outliers.stdev_outlier(data, rng=2) # outlier classed as more than 2 stdev away from mean else: - return None + raise ValueError("Unknown outlier mode.") return Series(list(x)) -def df_count(row: Series, column_label_list: Optional[Sequence[list]] = None) -> int: +def df_count(row: Series, column_label_list: Optional[Sequence[str]] = None) -> int: """ Count the number of occurrences of a non-NaN value in the specified columns of a data frame diff --git a/mathematical/linear_regression.py b/mathematical/linear_regression.py index 619a9cf..ca10a85 100644 --- a/mathematical/linear_regression.py +++ b/mathematical/linear_regression.py @@ -50,18 +50,20 @@ # # stdlib -from typing import Tuple +from typing import Optional, Sequence, Tuple, Union # 3rd party 
import numpy # type: ignore from domdf_python_tools.doctools import is_documented_by # type: ignore +ArrayLike_Float = Union[Sequence[float], numpy.ndarray] + def linear_regression_vertical( - x: numpy.ndarray, - y: numpy.ndarray = None, - a=None, - b=None, + x: ArrayLike_Float, + y: Optional[ArrayLike_Float] = None, + a: Optional[float] = None, + b: Optional[float] = None, ) -> Tuple[float, float, float, float]: """ Calculate coefficients of a linear regression y = a * x + b. @@ -81,7 +83,7 @@ def linear_regression_vertical( :return: (a, b, r, stderr), where a -- slope coefficient, b -- free term, - r -- Peason correlation coefficient, + r -- Pearson correlation coefficient, stderr -- standard deviation. :rtype: tuple """ @@ -91,7 +93,7 @@ def linear_regression_vertical( y = numpy.array(y, copy=False) else: if len(x.shape) != 2 or x.shape[-1] != 2: - raise TypeError('If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape)) + raise TypeError(f'If `y` is not given, x.shape should be (N, 2), given: {x.shape}') y = x[:, 1] x = x[:, 0] if a is not None and b is None: @@ -104,15 +106,16 @@ def linear_regression_vertical( r = numpy.corrcoef(x, y)[0, 1] stderr = (y - a * x - b).std() - return a, b, r, stderr + return a, b, r, stderr # type: ignore # TODO -@is_documented_by(linear_regression_vertical) -def linear_regression(x, y=None, a=None, b=None): - return linear_regression_vertical(x, y, a, b) +linear_regression = linear_regression_vertical -def linear_regression_perpendicular(x, y=None): +def linear_regression_perpendicular( + x: ArrayLike_Float, + y: Optional[ArrayLike_Float] = None, + ) -> Tuple[float, float, float, float]: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *perpendicular* distances between the points and the line. @@ -135,6 +138,7 @@ def linear_regression_perpendicular(x, y=None): """ x = numpy.array(x, copy=False) + if y is not None: y = numpy.array(y, copy=False) data = numpy.hstack((x.reshape((-1, 1)), y.reshape((-1, 1)))) @@ -142,6 +146,7 @@ def linear_regression_perpendicular(x, y=None): if len(x.shape) != 2 or x.shape[-1] != 2: raise TypeError('If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape)) data = x + mu = data.mean(axis=0) eigenvectors, eigenvalues, V = numpy.linalg.svd((data - mu).T, full_matrices=False) a = eigenvectors[0][1] / eigenvectors[0][0] diff --git a/mathematical/outliers.py b/mathematical/outliers.py index 6933c02..6b2de72 100644 --- a/mathematical/outliers.py +++ b/mathematical/outliers.py @@ -29,10 +29,10 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. 
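A brief usage sketch of the regression entry point refactored earlier in this patch (``linear_regression`` is now a plain alias of ``linear_regression_vertical``, and the annotations allow any sequence of floats). The data points are made up:

    from mathematical.linear_regression import linear_regression_vertical

    x = [1.0, 2.0, 3.0, 4.0, 5.0]
    y = [2.1, 3.9, 6.2, 7.8, 10.1]

    # slope, intercept, Pearson r, and the standard deviation of the residuals
    a, b, r, stderr = linear_regression_vertical(x, y)
    print(f"y = {a:.2f}*x + {b:.2f}  (r={r:.3f}, stderr={stderr:.3f})")
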
- -# stdlib # # + +# stdlib from typing import List, Sequence, Tuple # 3rd party @@ -51,7 +51,8 @@ def mad_outliers( Using the Median Absolute Deviation to Find Outliers :param dataset: - :type dataset: list + :param strip_zero: + :type strip_zero: bool :param threshold: The multiple of MAD above which values are considered to be outliers Leys et al (2013) make the following recommendations: 1 In univariate statistics, the Median Absolute Deviation is the most robust @@ -68,7 +69,7 @@ def mad_outliers( See https://dipot.ulb.ac.be/dspace/bitstream/2013/139499/1/Leys_MAD_final-libre.pdf :type threshold: int - :return: #TODO + :return: """ dataset = utils.strip_none_bool_string(dataset) @@ -93,11 +94,13 @@ def mad_outliers( return outliers, data_exc_outliers -def two_stdev(dataset: Sequence[float], strip_zero: bool = True): +def two_stdev(dataset: Sequence[float], strip_zero: bool = True) -> Tuple[List[float], List[float]]: """ Outliers are greater than 2x stdev from mean :param dataset: + :param strip_zero: + :type strip_zero: bool :return: # TODO """ @@ -105,12 +108,17 @@ def two_stdev(dataset: Sequence[float], strip_zero: bool = True): return stdev_outlier(dataset, strip_zero=strip_zero) -def stdev_outlier(dataset: Sequence[float], strip_zero: bool = True, rng: int = 2): +def stdev_outlier(dataset: Sequence[float], + strip_zero: bool = True, + rng: int = 2) -> Tuple[List[float], List[float]]: """ Outliers are greater than rng*stdev from mean :param dataset: + :param strip_zero: + :type strip_zero: bool :param rng: + :type rng: :return: 'TODO """ @@ -138,13 +146,15 @@ def stdev_outlier(dataset: Sequence[float], strip_zero: bool = True, rng: int = return outliers, data_exc_outliers -def quartile_outliers(dataset: Sequence[float], strip_zero: bool = True): +def quartile_outliers(dataset: Sequence[float], strip_zero: bool = True) -> Tuple[List[float], List[float]]: """ outliers are more than 3x inter-quartile range from upper or lower quartile - :param dataset: # + :param dataset: + :param strip_zero: + :type strip_zero: bool - :return: #TODO + :return: """ dataset = utils.strip_none_bool_string(dataset) @@ -176,16 +186,22 @@ def quartile_outliers(dataset: Sequence[float], strip_zero: bool = True): return outliers, data_exc_outliers -def spss_outliers(dataset: Sequence[float], strip_zero: bool = True, mode: str = "all"): +def spss_outliers( + dataset: Sequence[float], + strip_zero: bool = True, + mode: str = "all", + ): # TODO: -> Tuple[List[float], List[float], List[float]] """ - Based on IBM SPSS method for detecting outliers + Based on IBM SPSS method for detecting outliers. 
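For reference, a minimal calling sketch for the outlier helpers annotated in this hunk. The data are made up; each helper returns two lists, the flagged outliers followed by the remaining data, matching the ``Tuple[List[float], List[float]]`` annotations above:

    from mathematical.outliers import mad_outliers, quartile_outliers

    data = [1.0, 1.2, 0.9, 1.1, 1.3, 9.5]

    # 9.5 sits many MADs (and several IQRs) away from the rest of the data
    print(mad_outliers(data, threshold=3))
    print(quartile_outliers(data))
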
+ Outliers more than 1.5*IQR from Q1 or Q3 + "Extreme values" more than 3*IQR from Q1 or Q3 :param dataset: :param mode: str - :return: # TODO + :return: """ if len(dataset) < 2: @@ -199,6 +215,7 @@ def spss_outliers(dataset: Sequence[float], strip_zero: bool = True, mode: str = for val in dataset: if val in ['', 0.0, 0]: dataset.remove(val) + if len(dataset) == 0: return float('nan') elif dataset == [None]: diff --git a/mathematical/stats.py b/mathematical/stats.py index a4a0a61..6a577cb 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -58,7 +58,6 @@ def mean_none(dataset: Sequence[Union[float, bool, None]]) -> float: Calculate the mean, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate mean from - :type dataset: list :return: mean :rtype float @@ -75,8 +74,8 @@ def std_none(dataset: Sequence[Union[float, bool, None]], ddof: int = 1) -> floa Calculate the standard deviation, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate mean from - :type dataset: list - :param ddof: Means Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. By default ddof is 1. + :param ddof: Means Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. By default ddof is 1. :type ddof: int :return: standard deviation :rtype float @@ -89,12 +88,11 @@ def std_none(dataset: Sequence[Union[float, bool, None]], ddof: int = 1) -> floa return float(numpy.nanstd(dataset, ddof=ddof)) -def median_none(dataset: Sequence[Union[float, bool, None]]): +def median_none(dataset: Sequence[Union[float, bool, None]]) -> float: """ Calculate the median, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate median from - :type dataset: list :return: standard deviation :rtype float @@ -103,7 +101,7 @@ def median_none(dataset: Sequence[Union[float, bool, None]]): dataset = utils.strip_none_bool_string(dataset) dataset = utils.remove_zero(dataset) - return numpy.nanmedian(dataset) + return float(numpy.nanmedian(dataset)) def iqr_none(dataset: Sequence[Union[float, bool, None]]) -> float: @@ -111,8 +109,6 @@ def iqr_none(dataset: Sequence[Union[float, bool, None]]) -> float: Calculate the interquartile range, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate iqr from - :type dataset: list - :return: interquartile range :rtype float """ @@ -130,8 +126,6 @@ def percentile_none(dataset: Sequence[Union[float, bool, None]], percentage: flo Calculate the given percentile, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate percentile from - :type dataset: list - :param percentage: :type percentage: float @@ -157,9 +151,7 @@ def pooled_sd(sample1: Sequence[float], sample2: Sequence[float], weighted: bool Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm :param sample1: datapoints for first sample - :type sample1: list :param sample2: datapoints for second sample - :type sample2: list :param weighted: True for weighted pooled SD :return: Pooled Standard Deviation @@ -181,7 +173,7 @@ def d_cohen( sample1: Sequence[float], sample2: Sequence[float], which: int = 1, - tail=1, + tail: int = 1, pooled: bool = False, ) -> float: """ @@ -190,9 +182,7 @@ def d_cohen( Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Edition). 
Hillsdale, NJ: Lawrence Erlbaum Associates :param sample1: datapoints for first sample - :type sample1: list :param sample2: datapoints for second sample - :type sample2: list :param which: Use the standard deviation of the first sample (1) or the second sample (2) :type which: int :param tail: @@ -226,8 +216,8 @@ def g_hedge(sample1: Sequence[float], sample2: Sequence[float]) -> float: Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm :param sample1: datapoints for first sample - :type sample1: list :param sample2: datapoints for second sample + :return: """ @@ -244,12 +234,12 @@ def g_durlak_bias(g: float, n: float) -> float: n = n1+n2 :param g: - :type g: + :type g: float :param n: - :type n: + :type n: float :return: - :rtype: + :rtype: float """ Durlak = ((n - 3) / (n - 2.25)) * numpy.sqrt((n - 2) / n) @@ -265,7 +255,7 @@ def interpret_d(d_or_g: float) -> str: :type d_or_g: :return: - :rtype: + :rtype: str """ if 0.0 <= d_or_g < 0.2: @@ -317,14 +307,14 @@ def median_absolute_deviation( center: Callable = numpy.median, scale: float = 1.4826, nan_policy: str = 'propagate' - ): #TODO + ) -> numpy.ndarray: """ Compute the median absolute deviation of the data along the given axis. The median absolute deviation (MAD, [1]_) computes the median over the absolute deviations from the median. It is a measure of dispersion similar to the standard deviation, but is more robust to outliers [2]_. The MAD of an empty array is ``numpy.nan``. - .. versionadded:: 1.3.0 + Parameters ---------- x : array_like @@ -405,7 +395,7 @@ def absolute_deviation( axis: int = 0, center: Callable = numpy.median, nan_policy: str = 'propagate', - ): #TODO + ) -> numpy.ndarray: """ Compute the absolute deviations from the median of the data along the given axis. @@ -511,6 +501,7 @@ def absolute_deviation_from_median( will calculate the MAD around the mean - it will not calculate the *mean* absolute deviation. """ + ad = absolute_deviation(x, axis=axis, center=center, nan_policy=nan_policy) if axis is None: @@ -518,13 +509,23 @@ def absolute_deviation_from_median( else: mad = numpy.median(ad, axis=axis) - ad_from_median = ad / mad - - return ad_from_median + return ad / mad def within1min(value1: float, value2: float) -> bool: - if value1 not in [0, None, ''] and value2 not in [0, None, '']: + """ + Returns whether ``value2`` is within one minute of ``value1``. 
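An illustrative check of the helper documented here, assuming its arguments are plain numbers of minutes, as the one-unit window in the implementation that follows suggests:

    from mathematical.stats import within1min

    print(within1min(60.0, 60.4))  # True: the two values differ by less than one
    print(within1min(60.0, 61.5))  # False: more than one apart
    print(within1min(60.0, 0))     # False: falsy inputs are rejected outright
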
+ + :param value1: A time + :type value1: + :param value2: another time + :type value2: + + :return: + :rtype: + """ + + if value1 and value2: return (float(value1) - 1) < (float(value2)) < (float(value1) + 1) else: return False diff --git a/mathematical/utils.py b/mathematical/utils.py index d494c5f..dd506b6 100644 --- a/mathematical/utils.py +++ b/mathematical/utils.py @@ -92,16 +92,15 @@ def intdiv(p: float, q: float) -> int: """ Integer divsions which rounds toward zero - Examples - -------- + **Examples** >>> intdiv(3, 2) 1 >>> intdiv(-3, 2) -1 >>> -3 // 2 -2 - """ + r = p // q if r < 0 and q * r != p: r += 1 @@ -110,8 +109,7 @@ def intdiv(p: float, q: float) -> int: def roman(num: float) -> str: """ - Examples - -------- + **Examples** >>> roman(4) 'IV' >>> roman(17) @@ -165,7 +163,7 @@ def remove_zero(inputlist: Sequence[Union[float, bool, None]]) -> List[float]: return list(inputlist[numpy.nonzero(inputlist)]) -def isint(num: float) -> bool: # Only works with floating point numbers +def isint(num: float) -> bool: # Only works with floating-point numbers """ Checks whether a float is an integer value @@ -178,7 +176,7 @@ def isint(num: float) -> bool: # Only works with floating point numbers return num == int(num) -def RepresentsInt(s: Any) -> bool: +def represents_int(s: Any) -> bool: """ Checks whether a value can be converted to int @@ -193,6 +191,9 @@ def RepresentsInt(s: Any) -> bool: return False +RepresentsInt = represents_int + + def rounders(val_to_round: Union[str, float, Decimal], round_format: str) -> Decimal: """ Round a value to the specified number format, e.g. "0.000" for three decimal places @@ -377,9 +378,11 @@ def modInverse(a: int, m: int) -> Optional[float]: # Calculation using the Extended Euclidean Algorithm u1, u2, u3 = 1, 0, a v1, v2, v3 = 0, 1, m + while v3 != 0: q = u3 // v3 # // forces integer division in Python 3 v1, v2, v3, u1, u2, u3 = (u1 - q * v1), (u2 - q * v2), (u3 - q * v3), v1, v2, v3 + return u1 % m From f4613228142a223223eb4d2cac8c02b711ec84ce Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 8 Jun 2020 12:35:27 +0100 Subject: [PATCH 11/11] Added type annotations --- mathematical/outliers.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mathematical/outliers.py b/mathematical/outliers.py index 6b2de72..eeefb74 100644 --- a/mathematical/outliers.py +++ b/mathematical/outliers.py @@ -108,9 +108,11 @@ def two_stdev(dataset: Sequence[float], strip_zero: bool = True) -> Tuple[List[f return stdev_outlier(dataset, strip_zero=strip_zero) -def stdev_outlier(dataset: Sequence[float], - strip_zero: bool = True, - rng: int = 2) -> Tuple[List[float], List[float]]: +def stdev_outlier( + dataset: Sequence[float], + strip_zero: bool = True, + rng: int = 2, + ) -> Tuple[List[float], List[float]]: """ Outliers are greater than rng*stdev from mean