diff --git a/mathematical/__init__.py b/mathematical/__init__.py index a095050..bf46fba 100644 --- a/mathematical/__init__.py +++ b/mathematical/__init__.py @@ -32,4 +32,5 @@ __version__ = "0.1.11" __email__ = "dominic@davis-foster.co.uk" +# this package from . import data_frames, outliers, stats, utils diff --git a/mathematical/data_frames.py b/mathematical/data_frames.py index 87e61bd..0c0f005 100644 --- a/mathematical/data_frames.py +++ b/mathematical/data_frames.py @@ -27,13 +27,20 @@ # # +# stdlib +from typing import List, Optional, Sequence + +# 3rd party +from pandas import Series # type: ignore + # Outlier Modes + MAD = 1 QUARTILES = 2 STDEV2 = 3 -def df_mean(row, column_label_list=None): +def df_mean(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float: """ Calculate the mean of each row for the specified columns of a data frame @@ -45,21 +52,20 @@ def df_mean(row, column_label_list=None): :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate mean for - :type column_label_list: list :return: Mean :rtype: float """ - from numpy import nanmean + from numpy import nanmean # type: ignore if column_label_list is None: column_label_list = list(row.index) - return nanmean(row[column_label_list]) + return float(nanmean(row[column_label_list])) -def df_median(row, column_label_list=None): +def df_median(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float: """ Calculate the median of each row for the specified columns of a data frame @@ -71,7 +77,6 @@ def df_median(row, column_label_list=None): :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate median for - :type column_label_list: list :return: Median :rtype: float @@ -82,10 +87,10 @@ def df_median(row, column_label_list=None): if column_label_list is None: column_label_list = list(row.index) - return nanmedian(row[column_label_list]) + return float(nanmedian(row[column_label_list])) -def df_stdev(row, column_label_list=None): +def df_stdev(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float: """ Calculate the standard deviation of each row for the specified columns of a data frame @@ -97,7 +102,6 @@ def df_stdev(row, column_label_list=None): :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate standard deviation for - :type column_label_list: list :return: Standard deviation :rtype: float @@ -108,10 +112,10 @@ def df_stdev(row, column_label_list=None): if column_label_list is None: column_label_list = list(row.index) - return nanstd(row[column_label_list]) + return float(nanstd(row[column_label_list])) -def df_log_stdev(row, column_label_list=None): +def df_log_stdev(row: Series, column_label_list: Optional[Sequence[str]] = None) -> float: """ Calculate the standard deviation of the log10 values in each row for the specified columns of a data frame @@ -123,7 +127,6 @@ def df_log_stdev(row, column_label_list=None): :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate standard deviation for - :type column_label_list: list :return: Standard deviation :rtype: float @@ -135,10 +138,10 @@ def df_log_stdev(row, column_label_list=None): if column_label_list is None: column_label_list = list(row.index) - return nanstd([log10(x) if x > 0.0 else nan for x in row[column_label_list]]) + return float(nanstd([log10(x) if x > 0.0 else nan for x in row[column_label_list]])) -def df_percentage(row, column_label, total): +def df_percentage(row: Series, column_label: str, total: float) -> float: """ Returns the value of the specified column as a percentage of the given total The total is usually the sum of the specified column @@ -153,7 +156,7 @@ def df_percentage(row, column_label, total): :param column_label: column label to calculate percentage for :type column_label: str :param total: total value - :type column_label: str + :type total: float :return: Percentage * 100 :rtype: float @@ -162,7 +165,7 @@ def df_percentage(row, column_label, total): return (row[column_label] / float(total)) * 100.0 -def df_log(row, column_label_list, base=10): +def df_log(row: Series, column_label_list: Sequence[str], base: float = 10) -> float: """ Calculate the logarithm of the values in each row for the specified columns of a data frame @@ -174,7 +177,6 @@ def df_log(row, column_label_list, base=10): :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate log for - :type column_label_list: list :param base: logarithmic base :type base: float @@ -190,7 +192,7 @@ def df_log(row, column_label_list, base=10): return 0 -def df_data_points(row, column_label_list): +def df_data_points(row: Series, column_label_list: Sequence[str]) -> List: """ Compile the values for the specified columns in each row into a list @@ -202,7 +204,6 @@ def df_data_points(row, column_label_list): :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to calculate standard deviation for - :type column_label_list: list :return: data points :rtype: list @@ -211,7 +212,7 @@ def df_data_points(row, column_label_list): return [row[column_label] for column_label in column_label_list] -def df_outliers(row, column_label_list=None, outlier_mode=MAD): +def df_outliers(row: Series, column_label_list: Sequence[str] = None, outlier_mode: int = MAD) -> Series: """ Identify outliers in each row @@ -223,7 +224,6 @@ def df_outliers(row, column_label_list=None, outlier_mode=MAD): :param row: row of the data frame :type row: pandas.core.series.Series :param column_label_list: list of column labels to determine outliers for - :type column_label_list: list :param outlier_mode: outlier detection method to use :type outlier_mode: int @@ -246,14 +246,14 @@ def df_outliers(row, column_label_list=None, outlier_mode=MAD): elif outlier_mode == QUARTILES: x = outliers.quartile_outliers(data) elif outlier_mode == STDEV2: - x = outliers.stdev_outlier(data, 2) # outlier classed as more than 2 stdev away from mean + x = outliers.stdev_outlier(data, rng=2) # outlier classed as more than 2 stdev away from mean else: - return None + raise ValueError("Unknown outlier mode.") - return pd.Series(list(x)) + return Series(list(x)) -def df_count(row, column_label_list=None): +def df_count(row: Series, column_label_list: Optional[Sequence[str]] = None) -> int: """ Count the number of occurrences of a non-NaN value in the specified columns of a data frame diff --git a/mathematical/linear_regression.py b/mathematical/linear_regression.py index 6966cc2..ca10a85 100644 --- a/mathematical/linear_regression.py +++ b/mathematical/linear_regression.py @@ -49,12 +49,22 @@ # | DOI: `10.1021/acs.jproteome.8b00717 `_ # +# stdlib +from typing import Optional, Sequence, Tuple, Union + # 3rd party -import numpy -from domdf_python_tools.doctools import is_documented_by +import numpy # type: ignore +from domdf_python_tools.doctools import is_documented_by # type: ignore + +ArrayLike_Float = Union[Sequence[float], numpy.ndarray] -def linear_regression_vertical(x, y=None, a=None, b=None): +def linear_regression_vertical( + x: ArrayLike_Float, + y: Optional[ArrayLike_Float] = None, + a: Optional[float] = None, + b: Optional[float] = None, + ) -> Tuple[float, float, float, float]: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *vertical* distances between the points and the line. @@ -73,7 +83,7 @@ def linear_regression_vertical(x, y=None, a=None, b=None): :return: (a, b, r, stderr), where a -- slope coefficient, b -- free term, - r -- Peason correlation coefficient, + r -- Pearson correlation coefficient, stderr -- standard deviation. :rtype: tuple """ @@ -83,7 +93,7 @@ def linear_regression_vertical(x, y=None, a=None, b=None): y = numpy.array(y, copy=False) else: if len(x.shape) != 2 or x.shape[-1] != 2: - raise TypeError('If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape)) + raise TypeError(f'If `y` is not given, x.shape should be (N, 2), given: {x.shape}') y = x[:, 1] x = x[:, 0] if a is not None and b is None: @@ -96,15 +106,16 @@ def linear_regression_vertical(x, y=None, a=None, b=None): r = numpy.corrcoef(x, y)[0, 1] stderr = (y - a * x - b).std() - return a, b, r, stderr + return a, b, r, stderr # type: ignore # TODO -@is_documented_by(linear_regression_vertical) -def linear_regression(x, y=None, a=None, b=None): - return linear_regression_vertical(x, y, a, b) +linear_regression = linear_regression_vertical -def linear_regression_perpendicular(x, y=None): +def linear_regression_perpendicular( + x: ArrayLike_Float, + y: Optional[ArrayLike_Float] = None, + ) -> Tuple[float, float, float, float]: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *perpendicular* distances between the points and the line. @@ -127,6 +138,7 @@ def linear_regression_perpendicular(x, y=None): """ x = numpy.array(x, copy=False) + if y is not None: y = numpy.array(y, copy=False) data = numpy.hstack((x.reshape((-1, 1)), y.reshape((-1, 1)))) @@ -134,6 +146,7 @@ def linear_regression_perpendicular(x, y=None): if len(x.shape) != 2 or x.shape[-1] != 2: raise TypeError('If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape)) data = x + mu = data.mean(axis=0) eigenvectors, eigenvalues, V = numpy.linalg.svd((data - mu).T, full_matrices=False) a = eigenvectors[0][1] / eigenvectors[0][0] diff --git a/mathematical/outliers.py b/mathematical/outliers.py index d7f7b27..eeefb74 100644 --- a/mathematical/outliers.py +++ b/mathematical/outliers.py @@ -32,21 +32,27 @@ # # -import numpy -from . import utils -from . import stats +# stdlib +from typing import List, Sequence, Tuple + +# 3rd party +import numpy # type: ignore + +# this package +from . import stats, utils def mad_outliers( - dataset, - strip_zero=True, - threshold=3, - ): + dataset: Sequence[float], + strip_zero: bool = True, + threshold: int = 3, + ) -> Tuple[List[float], List[float]]: """ Using the Median Absolute Deviation to Find Outliers :param dataset: - :type dataset: list + :param strip_zero: + :type strip_zero: bool :param threshold: The multiple of MAD above which values are considered to be outliers Leys et al (2013) make the following recommendations: 1 In univariate statistics, the Median Absolute Deviation is the most robust @@ -88,26 +94,35 @@ def mad_outliers( return outliers, data_exc_outliers -def two_stdev(dataset, strip_zero=True): +def two_stdev(dataset: Sequence[float], strip_zero: bool = True) -> Tuple[List[float], List[float]]: """ Outliers are greater than 2x stdev from mean :param dataset: + :param strip_zero: + :type strip_zero: bool - :return: + :return: # TODO """ return stdev_outlier(dataset, strip_zero=strip_zero) -def stdev_outlier(dataset, strip_zero=True, rng=int(2)): +def stdev_outlier( + dataset: Sequence[float], + strip_zero: bool = True, + rng: int = 2, + ) -> Tuple[List[float], List[float]]: """ Outliers are greater than rng*stdev from mean :param dataset: + :param strip_zero: + :type strip_zero: bool :param rng: + :type rng: - :return: + :return: 'TODO """ dataset = utils.strip_none_bool_string(dataset) @@ -133,11 +148,13 @@ def stdev_outlier(dataset, strip_zero=True, rng=int(2)): return outliers, data_exc_outliers -def quartile_outliers(dataset, strip_zero=True): +def quartile_outliers(dataset: Sequence[float], strip_zero: bool = True) -> Tuple[List[float], List[float]]: """ outliers are more than 3x inter-quartile range from upper or lower quartile :param dataset: + :param strip_zero: + :type strip_zero: bool :return: """ @@ -171,14 +188,20 @@ def quartile_outliers(dataset, strip_zero=True): return outliers, data_exc_outliers -def spss_outliers(dataset, strip_zero=True, mode="all"): +def spss_outliers( + dataset: Sequence[float], + strip_zero: bool = True, + mode: str = "all", + ): # TODO: -> Tuple[List[float], List[float], List[float]] """ - Based on IBM SPSS method for detecting outliers + Based on IBM SPSS method for detecting outliers. + Outliers more than 1.5*IQR from Q1 or Q3 + "Extreme values" more than 3*IQR from Q1 or Q3 :param dataset: - :param mode: + :param mode: str :return: """ @@ -194,6 +217,7 @@ def spss_outliers(dataset, strip_zero=True, mode="all"): for val in dataset: if val in ['', 0.0, 0]: dataset.remove(val) + if len(dataset) == 0: return float('nan') elif dataset == [None]: diff --git a/mathematical/stats.py b/mathematical/stats.py index 29bcc5a..6a577cb 100644 --- a/mathematical/stats.py +++ b/mathematical/stats.py @@ -43,21 +43,21 @@ # # stdlib -import warnings +import warnings # type: ignore +from typing import Callable, List, Optional, Sequence, Union # 3rd party -import numpy +import numpy # type: ignore # this package from . import utils -def mean_none(dataset): +def mean_none(dataset: Sequence[Union[float, bool, None]]) -> float: """ Calculate the mean, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate mean from - :type dataset: list :return: mean :rtype float @@ -66,16 +66,16 @@ def mean_none(dataset): dataset = utils.strip_none_bool_string(dataset) dataset = utils.remove_zero(dataset) - return numpy.nanmean(dataset) + return float(numpy.nanmean(dataset)) -def std_none(dataset, ddof=1): +def std_none(dataset: Sequence[Union[float, bool, None]], ddof: int = 1) -> float: """ Calculate the standard deviation, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate mean from - :type dataset: list - :param ddof: Means Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. By default ddof is 1. + :param ddof: Means Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. By default ddof is 1. :type ddof: int :return: standard deviation :rtype float @@ -85,15 +85,14 @@ def std_none(dataset, ddof=1): dataset = utils.remove_zero(dataset) print(dataset) - return numpy.nanstd(dataset, ddof=ddof) + return float(numpy.nanstd(dataset, ddof=ddof)) -def median_none(dataset): +def median_none(dataset: Sequence[Union[float, bool, None]]) -> float: """ Calculate the median, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate median from - :type dataset: list :return: standard deviation :rtype float @@ -102,16 +101,14 @@ def median_none(dataset): dataset = utils.strip_none_bool_string(dataset) dataset = utils.remove_zero(dataset) - return numpy.nanmedian(dataset) + return float(numpy.nanmedian(dataset)) -def iqr_none(dataset): +def iqr_none(dataset: Sequence[Union[float, bool, None]]) -> float: """ Calculate the interquartile range, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate iqr from - :type dataset: list - :return: interquartile range :rtype float """ @@ -120,17 +117,15 @@ def iqr_none(dataset): q3 = percentile_none(dataset, 75) iq = q3 - q1 - return iq + return float(iq) -def percentile_none(dataset, percentage): +def percentile_none(dataset: Sequence[Union[float, bool, None]], percentage: float) -> float: """ Calculate the given percentile, excluding NaN, strings, boolean values, and zeros :param dataset: list to calculate percentile from - :type dataset: list - :param percentage: :type percentage: float @@ -146,19 +141,17 @@ def percentile_none(dataset, percentage): if len(dataset) < 2: raise ValueError("Dataset too small") - return numpy.percentile(dataset, percentage) + return float(numpy.percentile(dataset, percentage)) -def pooled_sd(sample1, sample2, weighted=False): +def pooled_sd(sample1: Sequence[float], sample2: Sequence[float], weighted: bool = False) -> float: """ Pooled Standard Deviation Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm :param sample1: datapoints for first sample - :type sample1: list :param sample2: datapoints for second sample - :type sample2: list :param weighted: True for weighted pooled SD :return: Pooled Standard Deviation @@ -176,18 +169,22 @@ def pooled_sd(sample1, sample2, weighted=False): return numpy.sqrt(((sd1**2) + (sd2**2)) / 2) -def d_cohen(sample1, sample2, sd=1, tail=1, pooled=False): +def d_cohen( + sample1: Sequence[float], + sample2: Sequence[float], + which: int = 1, + tail: int = 1, + pooled: bool = False, + ) -> float: """ Cohen's d-Statistic Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Edition). Hillsdale, NJ: Lawrence Erlbaum Associates :param sample1: datapoints for first sample - :type sample1: list :param sample2: datapoints for second sample - :type sample2: list - :param sd: Use the standard deviation of the first sample (1) or the second sample (2) - :type sd: int + :param which: Use the standard deviation of the first sample (1) or the second sample (2) + :type which: int :param tail: :param pooled: @@ -198,29 +195,29 @@ def d_cohen(sample1, sample2, sd=1, tail=1, pooled=False): mean1 = numpy.mean(sample1) mean2 = numpy.mean(sample2) - if sd == 1: - sd = numpy.std(sample1) + if which == 1: + stdev = numpy.std(sample1) else: - sd = numpy.std(sample2) + stdev = numpy.std(sample2) if pooled: - sd = pooled_sd(sample1, sample2) + stdev = pooled_sd(sample1, sample2) if tail == 2: - return numpy.abs(mean1 - mean2) / sd + return numpy.abs(mean1 - mean2) / stdev - return (mean1 - mean2) / sd + return (mean1 - mean2) / stdev -def g_hedge(sample1, sample2): +def g_hedge(sample1: Sequence[float], sample2: Sequence[float]) -> float: """ Hedge's g-Statistic Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm :param sample1: datapoints for first sample - :type sample1: list :param sample2: datapoints for second sample + :return: """ @@ -229,7 +226,7 @@ def g_hedge(sample1, sample2): return (mean1 - mean2) / pooled_sd(sample1, sample2, True) -def g_durlak_bias(g, n): +def g_durlak_bias(g: float, n: float) -> float: """ Application of Durlak's bias correction to the Hedge's g statistic. Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm @@ -237,19 +234,19 @@ def g_durlak_bias(g, n): n = n1+n2 :param g: - :type g: + :type g: float :param n: - :type n: + :type n: float :return: - :rtype: + :rtype: float """ Durlak = ((n - 3) / (n - 2.25)) * numpy.sqrt((n - 2) / n) return g * Durlak -def interpret_d(d_or_g): +def interpret_d(d_or_g: float) -> str: """ Interpret Cohen's d or Hedge's g values using Table 1 from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3444174/ @@ -258,12 +255,10 @@ def interpret_d(d_or_g): :type d_or_g: :return: - :rtype: + :rtype: str """ - if d_or_g < 0: - return f"{interpret_d(numpy.abs(d_or_g)).split(' ')[0]} Adverse Effect" - elif 0.0 <= d_or_g < 0.2: + if 0.0 <= d_or_g < 0.2: return "No Effect" elif 0.2 <= d_or_g < 0.5: return "Small Effect" @@ -271,9 +266,11 @@ def interpret_d(d_or_g): return "Intermediate Effect" elif 0.8 <= d_or_g: return "Large Effect" + else: # d_or_g < 0 + return f"{interpret_d(numpy.abs(d_or_g)).split(' ')[0]} Adverse Effect" -def _contains_nan(a, nan_policy='propagate'): +def _contains_nan(a, nan_policy: str = 'propagate'): policies = ['propagate', 'raise', 'omit'] if nan_policy not in policies: raise ValueError("nan_policy must be one of {%s}" % ', '.join(f"'{s}'" for s in policies)) @@ -304,14 +301,20 @@ def _contains_nan(a, nan_policy='propagate'): return contains_nan, nan_policy -def median_absolute_deviation(x, axis=0, center=numpy.median, scale=1.4826, nan_policy='propagate'): +def median_absolute_deviation( + x, + axis: int = 0, + center: Callable = numpy.median, + scale: float = 1.4826, + nan_policy: str = 'propagate' + ) -> numpy.ndarray: """ Compute the median absolute deviation of the data along the given axis. The median absolute deviation (MAD, [1]_) computes the median over the absolute deviations from the median. It is a measure of dispersion similar to the standard deviation, but is more robust to outliers [2]_. The MAD of an empty array is ``numpy.nan``. - .. versionadded:: 1.3.0 + Parameters ---------- x : array_like @@ -387,7 +390,12 @@ def median_absolute_deviation(x, axis=0, center=numpy.median, scale=1.4826, nan_ return scale * mad -def absolute_deviation(x, axis=0, center=numpy.median, nan_policy='propagate'): +def absolute_deviation( + x, + axis: int = 0, + center: Callable = numpy.median, + nan_policy: str = 'propagate', + ) -> numpy.ndarray: """ Compute the absolute deviations from the median of the data along the given axis. @@ -447,7 +455,13 @@ def absolute_deviation(x, axis=0, center=numpy.median, nan_policy='propagate'): return ad -def absolute_deviation_from_median(x, axis=0, center=numpy.median, scale=1.4826, nan_policy='propagate'): +def absolute_deviation_from_median( + x, + axis: int = 0, + center: Callable = numpy.median, + scale: float = 1.4826, + nan_policy: str = 'propagate', + ) -> numpy.ndarray: """ Compute the absolute deviation from the median of each point in the data along the given axis, given in terms of the MAD. @@ -487,6 +501,7 @@ def absolute_deviation_from_median(x, axis=0, center=numpy.median, scale=1.4826, will calculate the MAD around the mean - it will not calculate the *mean* absolute deviation. """ + ad = absolute_deviation(x, axis=axis, center=center, nan_policy=nan_policy) if axis is None: @@ -494,13 +509,23 @@ def absolute_deviation_from_median(x, axis=0, center=numpy.median, scale=1.4826, else: mad = numpy.median(ad, axis=axis) - ad_from_median = ad / mad + return ad / mad + + +def within1min(value1: float, value2: float) -> bool: + """ + Returns whether ``value2`` is within one minute of ``value1``. - return ad_from_median + :param value1: A time + :type value1: + :param value2: another time + :type value2: + :return: + :rtype: + """ -def within1min(value1, value2): - if value1 not in [0, None, ''] and value2 not in [0, None, '']: + if value1 and value2: return (float(value1) - 1) < (float(value2)) < (float(value1) + 1) else: return False diff --git a/mathematical/utils.py b/mathematical/utils.py index 6fd0c1d..dd506b6 100644 --- a/mathematical/utils.py +++ b/mathematical/utils.py @@ -78,37 +78,38 @@ # # stdlib +import decimal import math +from decimal import Decimal from operator import eq, ge, gt, le, lt, ne +from typing import Any, List, Optional, Sequence, Union # 3rd party -import numpy +import numpy # type: ignore -def intdiv(p, q): +def intdiv(p: float, q: float) -> int: """ Integer divsions which rounds toward zero - Examples - -------- + **Examples** >>> intdiv(3, 2) 1 >>> intdiv(-3, 2) -1 >>> -3 // 2 -2 - """ + r = p // q if r < 0 and q * r != p: r += 1 - return r + return int(r) -def roman(num): +def roman(num: float) -> str: """ - Examples - -------- + **Examples** >>> roman(4) 'IV' >>> roman(17) @@ -119,13 +120,13 @@ def roman(num): values = 1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1 result = '' for t, v in zip(tokens, values): - cnt = num // v + cnt = int(num // v) result += t * cnt num -= v * cnt return result -def magnitude(x): +def magnitude(x: float) -> int: """ Determine the magnitude of the given value @@ -146,7 +147,7 @@ def magnitude(x): # return int(math.floor(math.log10(abs(num)))) -def remove_zero(inputlist): +def remove_zero(inputlist: Sequence[Union[float, bool, None]]) -> List[float]: """ Remove zero values from the given list Also removes False and None @@ -162,7 +163,7 @@ def remove_zero(inputlist): return list(inputlist[numpy.nonzero(inputlist)]) -def isint(num): # Only works with floating point numbers +def isint(num: float) -> bool: # Only works with floating-point numbers """ Checks whether a float is an integer value @@ -175,7 +176,7 @@ def isint(num): # Only works with floating point numbers return num == int(num) -def RepresentsInt(s): +def represents_int(s: Any) -> bool: """ Checks whether a value can be converted to int @@ -190,7 +191,10 @@ def RepresentsInt(s): return False -def rounders(val_to_round, round_format): +RepresentsInt = represents_int + + +def rounders(val_to_round: Union[str, float, Decimal], round_format: str) -> Decimal: """ Round a value to the specified number format, e.g. "0.000" for three decimal places @@ -206,7 +210,7 @@ def rounders(val_to_round, round_format): return Decimal(Decimal(val_to_round).quantize(Decimal(str(round_format)), rounding=ROUND_HALF_UP)) -def strip_strings(ls): +def strip_strings(ls: Sequence[Any]) -> List: """ Remove strings from a list @@ -220,7 +224,7 @@ def strip_strings(ls): return [x for x in ls if not isinstance(x, str)] -def strip_booleans(ls): +def strip_booleans(ls: Sequence[Any]) -> List: """ Remove booleans from a list @@ -234,7 +238,7 @@ def strip_booleans(ls): return [x for x in ls if not isinstance(x, bool)] -def strip_nonetype(ls): +def strip_nonetype(ls: Sequence[Any]) -> List: """ Remove None from a list @@ -248,7 +252,7 @@ def strip_nonetype(ls): return [x for x in ls if x is not None] -def strip_none_bool_string(ls): +def strip_none_bool_string(ls: Sequence[Any]) -> List: """ Remove None, Boolean and strings from a list @@ -264,7 +268,7 @@ def strip_none_bool_string(ls): return ls -def gcd(a, b): +def gcd(a: int, b: int) -> int: """ Returns the GCD (HCF) of a and b using Euclid's Algorithm @@ -280,7 +284,7 @@ def gcd(a, b): return math.gcd(a, b) -def gcd_array(array): +def gcd_array(array) -> float: """ Returns the GCD for an array of numbers using Euclid's Algorithm @@ -289,7 +293,7 @@ def gcd_array(array): :param array: :type array: :return: - :rtype: + :rtype: float """ a = array[0] @@ -302,13 +306,13 @@ def gcd_array(array): return x -def gcd2(numbers): +def gcd2(numbers: Sequence[int]) -> int: """ Returns the GCD (HCF) of a list of numbers using Euclid's Algorithm :param numbers: - :return: + :return:float """ c = numbers[0] @@ -317,12 +321,12 @@ def gcd2(numbers): return c -def lcm(numbers): +def lcm(numbers: Sequence[int]) -> float: """ Returns the LCM of a list of numbers using Euclid's Algorithm :param numbers: - :return: + :return: float """ product = numbers[0] @@ -336,30 +340,30 @@ def lcm(numbers): return product -def hcf(a, b): +def hcf(a: int, b: int) -> int: """ :param a: :param b: - :return: + :return:int """ - gcd(a, b) + return gcd(a, b) -def hcf2(numbers): +def hcf2(numbers: Sequence[int]) -> int: """ :param numbers: - :return: + :return:int """ - gcd2(numbers) + return gcd2(numbers) -def modInverse(a, m): +def modInverse(a: int, m: int) -> Optional[float]: """ Returns the modular inverse of a % m, which is the number x such that a*x % m = 1 :param a: @@ -374,9 +378,11 @@ def modInverse(a, m): # Calculation using the Extended Euclidean Algorithm u1, u2, u3 = 1, 0, a v1, v2, v3 = 0, 1, m + while v3 != 0: q = u3 // v3 # // forces integer division in Python 3 v1, v2, v3, u1, u2, u3 = (u1 - q * v1), (u2 - q * v2), (u3 - q * v3), v1, v2, v3 + return u1 % m @@ -385,40 +391,20 @@ def modInverse(a, m): _precalc_fact = numpy.log([math.factorial(n) for n in range(20)]) -def log_factorial(x): - x = numpy.array(x) +def log_factorial(x: float) -> float: + arr = numpy.array(x) pf = _precalc_fact - m = (x >= pf.size) - out = numpy.empty(x.shape) - out[~m] = pf[x[~m].astype(int)] - x = x[m] - out[m] = x * numpy.log(x) - x + 0.5 * numpy.log(2 * numpy.pi * x) - return out + m: bool = (arr >= pf.size) + out = numpy.empty(arr.shape) + out[~m] = pf[arr[~m].astype(int)] + arr = arr[m] + out[m] = arr * numpy.log(arr) - arr + 0.5 * numpy.log(2 * numpy.pi * arr) + return float(out) -def _log_pi_r(d, k, p=0.5): +def _log_pi_r(d: float, k: float, p: float = 0.5) -> float: return k * math.log(p) + log_factorial(k + d) - log_factorial(k) - log_factorial(d) -def _log_pi(d, k, p=0.5): +def _log_pi(d: float, k: float, p: float = 0.5) -> float: return _log_pi_r(d, k, p) + (d + 1) * math.log(1 - p) - - -def _expectation(d, T, p=0.5): - if T is None: - return d + 1 - T = numpy.array(T, dtype=int) - m = numpy.arange(T.max() + 1, dtype=int) - pi = numpy.exp(_log_pi(d, m, p)) - return ((m * pi).cumsum() / pi.cumsum())[T] - - -def _confidence_value(conf, d, T, p=0.5): - if T is not None: - T = numpy.array(T, dtype=int) - m = numpy.arange(T.max() + 1, dtype=int) - else: - m = numpy.arange(max(50 * d, 10000)) - log_pi = _log_pi(d, m, p) - pics = numpy.exp(log_pi).cumsum() - return numpy.searchsorted(pics, conf * (pics[T] if T is not None else 1)) diff --git a/tests/test_data_frames.py b/tests/test_data_frames.py index 419edd2..a9a8249 100644 --- a/tests/test_data_frames.py +++ b/tests/test_data_frames.py @@ -7,20 +7,16 @@ """ +# stdlib import copy -import pandas -import pytest +# 3rd party +import pandas # type: ignore +import pytest # type: ignore +# this package from mathematical.data_frames import ( - df_count, - df_data_points, - df_log, - df_log_stdev, - df_mean, - df_median, - df_percentage, - df_stdev, + df_count, df_data_points, df_log, df_log_stdev, df_mean, df_median, df_percentage, df_stdev ) diff --git a/tests/test_linear_regression.py b/tests/test_linear_regression.py index e67031d..c0afc82 100644 --- a/tests/test_linear_regression.py +++ b/tests/test_linear_regression.py @@ -50,8 +50,8 @@ from itertools import count # 3rd party -import numpy -import pytest +import numpy # type: ignore +import pytest # type: ignore # this package from mathematical import linear_regression diff --git a/tests/test_stats.py b/tests/test_stats.py index 9bae0fc..d6b8fa6 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -6,7 +6,10 @@ Test functions in stats.py """ -import numpy +# 3rd party +import numpy # type: ignore + +# this package from mathematical import stats data = [1, 2, 3, 4, 5, 0, "abc", False, None, numpy.nan] @@ -43,7 +46,7 @@ def test_iqr_none(): def test_mad(): # Based on example from scipy.median_absolute_deviation docstring - import scipy.stats + import scipy.stats # type: ignore x = scipy.stats.norm.rvs(size=100, scale=1, random_state=123456) assert isinstance(stats.median_absolute_deviation(x), float) assert stats.median_absolute_deviation(x) == 1.2280762773108278 diff --git a/tests/test_utils.py b/tests/test_utils.py index da657c4..8081da3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -50,9 +50,13 @@ # | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # +# stdlib import decimal -import numpy +# 3rd party +import numpy # type: ignore + +# this package from mathematical import utils data = [1, 2, 3, 4, 5, 0, "abc", False, None, numpy.nan] diff --git a/tox.ini b/tox.ini index 4179b34..0045943 100644 --- a/tox.ini +++ b/tox.ini @@ -66,7 +66,7 @@ commands = flake8 mathematical tests [testenv:yapf] -basepython = python3.7 +basepython = python3.8 changedir={toxinidir} skip_install = true ignore_errors=true @@ -75,7 +75,7 @@ commands = yapf -i --recursive mathematical tests [testenv:isort] -basepython = python3.6 +basepython = python3.8 skip_install = true ignore_errors=true changedir={toxinidir}