In [895]:
import numpy as np
from typing import *
import pandas as pd

In [978]:
ArrayLike = Union[np.ndarray, Sequence[float], Sequence[Sequence[float]], pd.DataFrame, pd.Series]

def _2D_numeric(data: ArrayLike, name: str = 'Data') -> np.ndarray:
    # TODO: add type hints and docstrings, change the name Data, make the isinstance(ArrayLike thing)

    array = np.asarray(data)
    if array.ndim != 2:
        raise ValueError(f'{name} must be a 2D array; got {array.ndim}D instead.')
    if array.size == 0:
        raise ValueError(f'{name} must be non-empty')
    if not np.issubdtype(array.dtype, np.number):
        try:
            array = array.astype(float, copy = False)
        except (TypeError, ValueError) as e:
            raise TypeError(f'All entries in {name} must be numeric') from e
    else:
        array = array.astype(float, copy = False)
    return array

def _1D_numeric(data_vector: Optional[ArrayLike], name: str = 'Data') -> Optional[np.ndarray]:
    
    # TODO: add type hints and docstrings, change the name Data

    if data_vector is None:
        return None
    vector = np.asarray(data_vector)
    if vector.ndim != 1:
        raise ValueError(f'{name} must be a 1D array; got {vector.ndim}D instead')
    if vector.size == 0:
        raise ValueError(f'{name} must be non-empty')
    return vector

def _shape_match(data_array: np.ndarray, data_vector: Optional[np.ndarray]) -> None:

    # TODO: add type hints and docstrings

    if not isinstance(data_array, np.ndarray):
        raise ValueError(f'Data array must be an array, got {type(data_array).__name__}')
    
    if (data_vector is not None) and (not isinstance(data_vector, np.ndarray)):
        raise ValueError(f'Data vector must be an array, got {type(data_vector).__name__}')

    if data_vector is not None:
        if data_array.shape[0] != data_vector.shape[0]:
            raise ValueError(f'Both arrays must have the same first dimension;'
                             f'got data_array.shape[0] = {data_array.shape[0]} '
                             f'and data_vector.shape[0] = {data_vector.shape[0]} instead')

In [979]:
df1 = np.zeros((3,2))
df2 = np.array([1,0, 3])

df_test = pd.DataFrame({
    'A': [1.0, 5.0, 9.0, np.nan],       # numeric with a NaN
    'B': [2.0, np.nan, 2.0, 4.0],       # numeric with repeated value (mode exists)
    'C': [np.nan, 7.0, 11.0, 15.0],     # numeric with one NaN
    'D': [4.0, 4.0, np.nan, 4.0],       # numeric with mode being 4
})

In [980]:
_shape_match(df1, df2)

In [898]:
from scipy import stats

def missing_data(data_array: np.array, strategy: str):

    # TODO: type hints, docstring, explanation of strategies, add workaround for all unique values in mode

    array = _2D_numeric(data_array)

    possible_strategies = {'drop', 'mean', 'median', 'mode'}
    if strategy not in possible_strategies:
        raise ValueError(f"Strategy must be one of {possible_strategies}, got '{strategy}'")
    
    if strategy == 'drop':
        cleaned_array = array[~np.any(np.isnan(array), axis = 1)]
    else:
        cleaned_array = array.copy()
        for column in range(array.shape[1]):
            empty_elements = np.isnan(array[:, column])
            if strategy == 'mean':
                column_mean = np.nanmean(array[:, column])
                cleaned_array[empty_elements, column] = column_mean
            if strategy == 'median':
                column_median = np.nanmedian(array[:, column])
                cleaned_array[empty_elements, column] = column_median
            if strategy == 'mode':
                column_elements = array[:, column][~np.isnan(array[:, column])]
                column_mode = stats.mode(column_elements).mode
                cleaned_array[empty_elements, column] = column_mode
            
    return(cleaned_array)

In [899]:
df_duplicates = pd.DataFrame({
    'A': [1, 2, 1, 4, 1],
    'B': [5, 6, 5, 8, 5],
    'C': [9, 10, 9, 12, 9]
})

In [900]:

def duplicate_identify(data_array: ArrayLike, drop: bool = False):

    # TODO: type hints, docstring, explanation of strategies, option to print duplicate rows/indicate (?)

    array = _2D_numeric(data_array)
    if not isinstance(drop, bool):
            raise TypeError(f"Drop parameter must be True or False, got {type(drop).__name__}")

    indices = []
    for row in range(array.shape[0]):
        for comparison_row in range(row + 1, array.shape[0]):
            if np.array_equal(array[row], array[comparison_row]) and row != comparison_row:
                indices.append(comparison_row)
    indices = list(set(indices))

    if drop == True:
        cleaned_array = np.delete(array, indices, axis = 0)
    elif drop == False:
        cleaned_array = array.copy()

    return cleaned_array

In [901]:
indices = duplicate_identify(df_duplicates, False)
print(indices)

[[ 1.  5.  9.]
 [ 2.  6. 10.]
 [ 1.  5.  9.]
 [ 4.  8. 12.]
 [ 1.  5.  9.]]


In [1009]:
X = np.array([[10, 20, 30], [12, 22, 29], [11, 21, 31], [100, 20, 30], [13, 200, 28]])


In [None]:
def outlier_identify(data_array: ArrayLike, method: str, *, drop: bool = False, threshold: float = 3):

    # TODO: type hints, docstring, explanation of strategies, option to print outliers/indicate (?)

    array = _2D_numeric(data_array)

    if not isinstance(drop, bool):
        raise TypeError(f"Drop parameter must be True or False, got {type(drop).__name__}")
    if not isinstance(threshold, (float, int)) or isinstance(threshold, bool):
        raise TypeError(f"threshold must be a float or integer, got {type(threshold).__name__}")
    possible_methods = {'IQR', 'zscore'}
    if method not in possible_methods:
        raise ValueError(f"Method of outlier detection must be one of {possible_methods}, got '{method}'")
    
    if method == 'IQR':
        outlier_indices = set()
        for column in range(array.shape[1]):
            q3 = np.percentile(array[:, column], 75)
            q1 = np.percentile(array[:, column], 25)
            iqr = q3 - q1
            indices = (np.where((array[:, column] > q3 + 1.5 * iqr) | (array[:, column] < q1 - 1.5 * iqr))[0]).tolist()
            outlier_indices.update(indices)

    if method == 'zscore':
        outlier_indices = set()
        z_scores = np.zeros_like(array, dtype=float)
        z_scores = z_score_standardize(array, False,1)
        print(z_scores)
        outlier_rows = np.any(np.abs(z_scores) > threshold, axis=1)
        outlier_indices.update(np.where(outlier_rows)[0])


    if drop:
        cleaned_array = np.delete(array, list(outlier_indices), axis = 0)
    else:
        cleaned_array = array.copy()

    return cleaned_array


In [1012]:
test2 = outlier_identify(X, 'zscore', drop = True, threshold = 2)
print(test2)

[[-0.54215661 -0.51043295  0.39223227]
 [-0.48568196 -0.48254044 -0.58834841]
 [-0.51391929 -0.49648669  1.37281295]
 [ 1.99920249 -0.51043295  0.39223227]
 [-0.45744464  1.99989302 -1.56892908]]
[[ 10.  20.  30.]
 [ 12.  22.  29.]
 [ 11.  21.  31.]
 [100.  20.  30.]
 [ 13. 200.  28.]]


In [905]:
X = np.array([
    [10, 200, 5],
    [12, 210, 6],
    [11, 205, 5],
    [13, 215, 6]
])

In [906]:
def z_score_standardize(data_array: ArrayLike, return_params: bool = False, ddof: int = 0)  -> Union[np.ndarray, Tuple[np.ndarray, dict]]:
    
    # TODO: add type hints/docstrings/examples

    array = _2D_numeric(data_array)

    if not isinstance(ddof, int):
        raise TypeError(f"Delta degrees of freedom parameter must be an integer, got {type(ddof).__name__}")
    
    if not isinstance(return_params, bool):
        raise TypeError(f"return_params must be a boolean, got {type(return_params).__name__}")

    columnwise_mean = array.mean(axis = 0)
    scale = array.std(axis = 0, ddof = ddof)
    scale[scale == 0.0] = 1.0
    standardized_array = (array - columnwise_mean) / scale

    if return_params:
        return standardized_array, {'mean': columnwise_mean, 'scale': scale}

    return standardized_array

In [907]:
standard = z_score_standardize(X, True, 0)
standard

(array([[-1.34164079, -1.34164079, -1.        ],
        [ 0.4472136 ,  0.4472136 ,  1.        ],
        [-0.4472136 , -0.4472136 , -1.        ],
        [ 1.34164079,  1.34164079,  1.        ]]),
 {'mean': array([ 11.5, 207.5,   5.5]),
  'scale': array([1.11803399, 5.59016994, 0.5       ])})

In [908]:
X = np.array([
    [1, 10],
    [2, 20],
    [3, 30]
])

In [909]:
def min_max_standardize(data_array: ArrayLike, *, feature_range: Tuple[float, float] = (0.0, 1.0), return_params: bool = False)  -> Union[np.ndarray, Tuple[np.ndarray, dict]]:

    # TODO: add type hints/docstrings/examples

    array = _2D_numeric(data_array)
    
    if not (isinstance(feature_range, Tuple) and len(feature_range) == 2 and all(isinstance(element, (int, float)) for element in feature_range)):
        raise TypeError(f"Feature range must be a tuple of length 2 (min, max) with float or integer elements")
    
    if not isinstance(return_params, bool):
        raise TypeError(f"return_params must be a boolean, got {type(return_params).__name__}")
    
    feature_min, feature_max = feature_range[0], feature_range[1]
    if feature_min >= feature_max:
        raise ValueError(f"Minimum of feature range must be less than maximum")
    
    column_maximums = array.max(axis = 0)
    column_minimums = array.min(axis = 0)
    scale = column_maximums - column_minimums
    scale[scale == 0.0] = 1.0
    standardized_array = feature_min + ((array - column_minimums) / scale) * (feature_max - feature_min)

    if return_params:
        return standardized_array, {'minimum': column_minimums, 'maximum': column_maximums, 'scale': scale, 'feature_range': feature_range}
    return standardized_array

In [910]:
standard = min_max_standardize(X, return_params = True)
standard

(array([[0. , 0. ],
        [0.5, 0.5],
        [1. , 1. ]]),
 {'minimum': array([ 1., 10.]),
  'maximum': array([ 3., 30.]),
  'scale': array([ 2., 20.]),
  'feature_range': (0.0, 1.0)})

In [911]:
X = np.array([
    [ 2, -5,  3],
    [ 4,  0, -6],
    [-2,  5,  0]
])

In [912]:
def max_abs_standardize(data_array: ArrayLike, return_params: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, dict]]:
    
    # TODO: add type hints/docstrings/examples

    array = _2D_numeric(data_array)

    if not isinstance(return_params, bool):
        raise TypeError(f"return_params must be a boolean, got {type(return_params).__name__}")
    
    absolute_value_array = abs(array)
    column_maximums = absolute_value_array.max(axis = 0)
    column_maximums[column_maximums == 0.0] = 1.0
    standardized_array = array / column_maximums

    if return_params:
        return standardized_array, {'scale': column_maximums}
    return standardized_array

In [913]:
standard = max_abs_standardize(X)
standard

array([[ 0.5, -1. ,  0.5],
       [ 1. ,  0. , -1. ],
       [-0.5,  1. ,  0. ]])

In [914]:
X = np.array([
    [3, 4, 0],
    [1, 2, 2],
    [0, 0, 5],
    [0,0,0],
])


In [915]:
def l2_standardize(data_array: ArrayLike, epsilon: float = 1e-15) -> np.ndarray:

    # TODO: add type hints/docstrings/examples

    array = _2D_numeric(data_array)

    if not isinstance(epsilon, (float, int)):
        raise TypeError(f"Floor value must be a float, got {type(epsilon).__name__}")
    
    if epsilon <= 0:
        raise ValueError("Floor value must be greater than zero")
    
    l2_norms = np.linalg.norm(array, axis = 1)
    l2_norms[l2_norms <= epsilon] = epsilon
    l2_norms = l2_norms[:, None]

    standardized_array = array / l2_norms

    return standardized_array

In [916]:
l2_standardize(X, 1)

array([[0.6       , 0.8       , 0.        ],
       [0.33333333, 0.66666667, 0.66666667],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.        ]])

In [921]:
y = np.array(['cat', 'cat', 'cat',
              'dog', 'dog',
              'mouse', 'cat'])


In [None]:
def _bounded_count(length: int, proportion: float) -> int:
    count = int(round(proportion * length))
    if length <= 1:
        if count < length:
            return 0
        else:
            return 1
        
    return min(max(1, count), length - 1)

In [None]:
def _stratified_indices(data: np.ndarray, 
                        test_size: float, 
                        rng: np.random.Generator, 
                        *, 
                        validation: bool = False,
                        val_size: Optional[float] = None) -> Union[Tuple[np.array, np.array],Tuple[np.array, np.array, np.array]]:

    # TODO: docstrings/type hints, examples

    if not isinstance(test_size, float):
        raise TypeError(f"Test proportion must be a float, got {type(test_size).__name__}")
    if not isinstance(validation, bool):
        raise TypeError(f"Validation parameter must be a boolean, got {type(validation).__name__}")
    
    if not (0.0 < test_size < 1.0):
        raise ValueError(f"Test proportion must be between 0 and 1, got {test_size}")
    
    data = _1D_numeric(data)

    if not validation:
        classes, label_index = np.unique(data, return_inverse = True)
        testing_indices = []
        training_indices = []
        for class_number in range(len(classes)):
            class_index = np.flatnonzero(label_index == class_number)
            rng.shuffle(class_index)

            if len(classes) > 1:
                test_number = max(1, int(round(test_size * len(class_index))))
            else:
                test_number = int(round(test_size * len(class_index)))

            if len(class_index) > 1:
                test_number = min(test_number, len(class_index) - 1)

            testing_indices.append(class_index[0:test_number])
            training_indices.append(class_index[test_number:])
        
        training_indices = np.concatenate(training_indices)
        testing_indices = np.concatenate(testing_indices)

        return testing_indices, training_indices

    elif validation:
        if not isinstance(val_size, float):
            raise TypeError(f"Validation set proportion must be a float, got {type(val_size).__name__}")
        if not (0.0 < val_size < 1.0):
            raise ValueError(f"Validation set proportion must be between 0 and 1, got {val_size}")
        if val_size + test_size >= 1.0:
            raise ValueError("Combined validation and test set proportions must be less than 1.")
        
        val_size_remaining = val_size / (1.0 - test_size)

        classes, label_index = np.unique(data, return_inverse = True)
        testing_indices = []
        training_indices = []
        val_indices = []

        for class_number in range(len(classes)):
            class_index = np.flatnonzero(label_index == class_number)
            rng.shuffle(class_index)

            if len(classes) > 1:
                test_number = max(1, int(round(test_size * len(class_index))))
            else:
                test_number = int(round(test_size * len(class_index)))

            if len(class_index) <= 1:
                if test_number < len(class_index):
                    test_number = 0
                else:
                    test_number = 1
            else:
                 test_number = min(test_number, len(class_index) - 1)

            testing_indices_initial = class_index[0:test_number]
            remaining_indices = class_index[test_number:]

            val_number = int(round(val_size_remaining * len(remaining_indices)))
            if len(remaining_indices) <= 1:
                if val_number < len(remaining_indices):
                    val_number = 0
                else:
                    val_number = 1
            else:
                val_number = min(val_number, len(remaining_indices) - 1)
            
            val_indices_initial = remaining_indices[0:val_number]
            training_indices_initial = remaining_indices[val_number:]

            testing_indices.append(testing_indices_initial)
            training_indices.append(training_indices_initial)
            val_indices.append(val_indices_initial)

        training_indices = np.concatenate(training_indices)
        testing_indices = np.concatenate(testing_indices)
        val_indices = np.concatenate(val_indices)
        
        return testing_indices, training_indices, val_indices

In [942]:
test, train, validate = _stratified_indices(y, 0.3, np.random.default_rng(), validation=True, val_size=0.2)
print(train)
print(test)
print(validate)

[6 0 4]
[2 3 5]
[1]


In [945]:
X = np.array([
    [1.0, 10.0],   # class 'A'
    [1.1, 11.0],
    [0.9, 9.5],
    [1.2, 10.5],
    [5.0, 50.0],   # class 'B'
    [5.1, 49.5],
    [4.9, 51.0],
    [9.0, 90.0],   # class 'C'
    [9.1, 91.0]
])

y = np.array(['A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'C'])

In [None]:
def _random_number(random_state: Optional[int]) -> np.random.Generator:
    
    # TODO: type hints/docstrings

    if random_state is None:
        return np.random.default_rng()
    else:
        if not isinstance(random_state, int):
            raise TypeError(f"Random state must be an integer")
        return np.random.default_rng(int(random_state))

In [None]:
def train_test(data_array: ArrayLike, 
               data_vector: Optional[ArrayLike] = None, 
               test_size: float = 0.3,
               validation: bool = False, 
               val_size: Optional[float] = 0.1,  
               shuffle: bool = True, 
               stratify: Optional[ArrayLike] = None, 
               random_state: Optional[int] = None
               ) -> Union[
                        Union
                            [Tuple[np.ndarray, np.ndarray], 
                             Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]], 
                        Union[
                            Tuple[np.ndarray, np.ndarray, np.ndarray],
                            Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray],
                        ]]:
    
    array = _2D_numeric(data_array,'data array')

    if not isinstance(test_size, float):
        raise TypeError(f"Test proportion must be a float, got {type(test_size).__name__}")
    if not isinstance(validation, bool):
        raise TypeError(f"Validation parameter must be a boolean, got {type(validation).__name__}")
    if not isinstance(shuffle, bool):
        raise TypeError(f"Shuffle parameter must be a boolean, got {type(shuffle).__name__}")
    
    if not (0.0 < test_size < 1.0):
        raise ValueError(f"Test proportion must be between 0 and 1, got {test_size}")


    rng = _random_number(random_state)

    if not validation:
        if not stratify is None:
            stratify_array = _1D_numeric(stratify, 'stratify')
            if len(stratify_array) != array.shape[0]:
                raise ValueError('Stratify must have the same length as the data array')
            testing_indices, training_indices = _stratified_indices(stratify_array, test_size, rng)
        else:
            indices = np.arange(array.shape[0])
            if shuffle:
                rng.shuffle(indices)
            test_number = _bounded_count(len(indices), test_size)
            testing_indices = indices[0:test_number]
            training_indices = indices[test_number:]

        training_array = array[training_indices]
        testing_array = array[testing_indices]

        if data_vector is None:
            return training_array, testing_array
        else:
            vector = _1D_numeric(data_vector, 'label')
            _shape_match(array, vector)
            training_data_vector = vector[training_indices]
            testing_data_vector = vector[testing_indices]
            return training_array, testing_array, training_data_vector, testing_data_vector

    elif validation:
        if not isinstance(val_size, float):
            raise TypeError(f"Validation set proportion must be a float, got {type(val_size).__name__}")
        if not (0.0 < val_size < 1.0):
            raise ValueError(f"Validation set proportion must be between 0 and 1, got {val_size}")
        if val_size + test_size >= 1.0:
            raise ValueError("Combined validation and test set proportions must be less than 1.")
                
        if not stratify is None:
            stratify_array = _1D_numeric(stratify, 'stratify')
            if len(stratify_array) != array.shape[0]:
                raise ValueError('Stratify must have the same length as the data array')
            testing_indices, training_indices, val_indices = _stratified_indices(stratify_array, test_size, rng, validation = True, val_size = val_size)
        else:
            val_prop_remaining = val_size / (1.0 - test_size)
            indices = np.arange(array.shape[0])
            if shuffle:
                rng.shuffle(indices)

            test_number = _bounded_count(len(indices), test_size)
            testing_indices = indices[0:test_number]
            remaining_indices = indices[test_number:]

            val_number = _bounded_count(len(remaining_indices), val_prop_remaining)
            val_indices = remaining_indices[0:val_number]
            training_indices = remaining_indices[val_number:]

        training_array = array[training_indices]
        testing_array = array[testing_indices]
        val_array = array[val_indices]

        if data_vector is None:
            return training_array, testing_array, val_array
        else:
            vector = _1D_numeric(data_vector, 'label')
            _shape_match(array, vector)
            training_data_vector = vector[training_indices]
            testing_data_vector = vector[testing_indices]
            val_data_vector = vector[val_indices]
            return training_array, testing_array, val_array, training_data_vector, testing_data_vector, val_data_vector
        

In [977]:
train_test(X, y, test_size = 0.3, validation = True, val_size = 0.2, stratify = y)

(array([[ 0.9,  9.5],
        [ 1.2, 10.5],
        [ 5.1, 49.5],
        [ 9.1, 91. ]]),
 array([[ 1.1, 11. ],
        [ 5. , 50. ],
        [ 9. , 90. ]]),
 array([[ 1. , 10. ],
        [ 4.9, 51. ]]),
 array(['A', 'A', 'B', 'C'], dtype='<U1'),
 array(['A', 'B', 'C'], dtype='<U1'),
 array(['A', 'B'], dtype='<U1'))