# Sub function to engineer statistical features for each test column

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis, norm, kstest, zscore, shapiro
from sklearn.neighbors import LocalOutlierFactor
import warnings

In [7]:
def get_test_features(test): 
    values = test.dropna()
    if values.empty:
        return pd.DataFrame() # return empty features if no values 
    # statistical properties
    mean = values.mean()
    median = values.median()
    std_dev = values.std()
    iqr = values.quantile(0.75) - values.quantile(0.25)
    unique_values = np.unique(values)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        try:
            if len(unique_values) <= 2:
                skewness = 0
                kurt = 0
            else:
                skewness = skew(values)
                kurt = kurtosis(values)
        except RuntimeWarning:
            skewness = np.nan
            kurt = np.nan
    min_value = values.min()
    max_value = values.max()
    range = max_value - min_value
    p95 = values.quantile(0.95)
    p5 = values.quantile(0.05)
    p99 = values.quantile(0.99)
    p1 = values.quantile(0.01)
    upper_tail_diff = p99 - p95
    lower_tail_diff = p5 - p1
    tail_length_95 = max_value - p95
    tail_length_99 = max_value - p99
    tail_length_05 = p5 - min_value 
    tail_length_01 = p1 - min_value 
    #percentile_ratio = p95 / p5 if p5 != 0 else np.nan
    tail_weight_ratio = np.sum((values > (median + 1.5*iqr)) | (values < (median - 1.5*iqr))) / len(values)
    excess_kurtosis = kurt - 3
    p99 = values.quantile(0.99)
    p1 = values.quantile(0.01)

    # outlier detection
    zscores = zscore(values)
    outliers_zscore = zscores.abs() > 3 
    count_outliers_zscore = outliers_zscore.sum() # using zscore
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1   
    outliers_iqr = ((values < (Q1 - 1.5 * IQR)) | values > (Q3 + 1.5 * IQR)) # using iqr
    count_outliers_iqr = outliers_iqr.sum()
    # if len(values) > 1:
    #     lof = LocalOutlierFactor(n_neighbors=min(20, len(values) - 1)) # using local density of points
    #     lof_scores = lof.fit_predict(values.values.reshape(-1, 1))
    #     count_outliers_lof = np.sum(lof_scores == -1)
    # else:
    #     count_outliers_lof = 0
    # goodness of fit parameters
    warnings.simplefilter("ignore", category=RuntimeWarning)
    fitted_mean, fitted_std_dev = norm.fit(values)
    ks_stat_norm, ks_p_value_norm = kstest(values, 'norm', args=(fitted_mean, fitted_std_dev))
    warnings.filterwarnings("ignore", message="scipy.stats.shapiro: For N > 5000")
    if len(values) >= 3 and np.ptp(values) > 0:
        shapiro_stat, shapiro_p_value = shapiro(values)
    else:
        shapiro_stat, shapiro_p_value = np.nan, np.nan #float('nan'), float('nan')

    feature_vector = {
            'Mean': mean,
            'Median': median,
            'Std_Dev': std_dev,
            'IQR': iqr,
            'Skewness': skewness,
            'Kurtosis': kurt,
            'Min': min_value,
            'Max': max_value,
            'Range': range,
            'Upper_Tail': upper_tail_diff,
            'Lower_Tail': lower_tail_diff,
            'Extreme_Tail_95': tail_length_95,
            'Extreme_Tail_99': tail_length_99,
            'Extreme_Tail_05': tail_length_05,
            'Extreme_Tail_01': tail_length_01,
            #'Percentile_Ratio_95_5': percentile_ratio,
            'Tail_Weight_Ratio': tail_weight_ratio,
            'Excess_Kurtosis': excess_kurtosis,
            'P99': p99,
            'P1': p1,
            'Outliers_Zscore': count_outliers_zscore,
            'Outliers_IQR': count_outliers_iqr,
            #'Outliers_LOF': count_outliers_lof,
            'KS_Stat_norm': ks_stat_norm,
            'KS_P_value_norm': ks_p_value_norm,
            'Shapiro_Stat': shapiro_stat,
            'Shapiro_P_value': shapiro_p_value
    }

    features = pd.DataFrame([feature_vector.values()], columns=feature_vector.keys())
    return features


# Overall function to aggregate chosen test columns into feature dataframe
input consists of columns which are filtered to only include proper parameter/tests with numerical values for classifying distribution  
output consists of a dataframe of features, where each row represent the features corresponding to each test

In [8]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# input consists of columns which are filtered to only include proper parameter/tests with numerical values for classifying distribution
def get_features_set(df):
    feature_list = []
    scaler = StandardScaler()
    for test_name in df.columns:
        test_data = df[test_name].values.reshape(-1, 1)
        test_transformed = scaler.fit_transform(test_data) 
        test_transformed = test_transformed.flatten()
        features = get_test_features(pd.Series(test_transformed))
        # features['Test'] = test_name 
        feature_list.append(features)

    result = pd.concat(feature_list, axis=0).reset_index(drop=True)
    return result

# Script for testing the functions

In [4]:
YK_data = pd.read_csv('../../data/YK_training_ULT.csv')

In [9]:
column = YK_data.iloc[:,112]
feature = get_test_features(column)
print(feature)

columns = YK_data.iloc[:,112:130]
features = get_features_set(columns)
print(features)

           Mean        Median       Std_Dev           IQR  Skewness  Kurtosis  \
0  1.460703e-08  1.500000e-08  1.656903e-08  1.900000e-08  0.438307 -0.140995   

            Min           Max         Range    Upper_Tail  ...  \
0 -2.500000e-08  9.500000e-08  1.200000e-07  7.000000e-09  ...   

   Tail_Weight_Ratio  Excess_Kurtosis           P99            P1  \
0           0.105283        -3.140995  5.300000e-08 -1.500000e-08   

   Outliers_Zscore  Outliers_IQR  KS_Stat_norm  KS_P_value_norm  Shapiro_Stat  \
0               51         63034      0.090484              0.0      0.969565   

   Shapiro_P_value  
0     3.162291e-74  

[1 rows x 25 columns]


  x = np.asarray((x - loc)/scale, dtype=dtyp)


            Mean    Median   Std_Dev       IQR  Skewness   Kurtosis  \
0   1.569634e-16  0.023717  1.000008  1.146726  0.438307  -0.140995   
1  -4.883305e-17 -0.049132  1.000008  1.319530  0.298119  -0.146116   
2  -1.668071e-16  0.056881  1.000007  1.375858 -0.249061   0.099544   
3  -1.150394e-17 -0.043833  1.000007  1.272665  0.362439  -0.178241   
4   5.341114e-18 -0.280603  1.000007  1.553068  0.430507  -0.836325   
5  -4.149634e-17  0.223481  1.000007  1.599715 -0.140195  -0.908983   
6   3.779865e-17 -0.154912  1.000007  1.149639  0.820983  -0.017176   
7   1.766676e-16 -0.004041  1.000007  1.165563  0.481646   0.161564   
8  -2.465129e-17 -0.005513  1.000007  1.094891  0.500373  -0.091061   
9  -8.134927e-17  0.015065  1.000007  1.262150  0.269995  -0.298510   
10  0.000000e+00  0.000000  0.000000  0.000000  0.000000   0.000000   
11 -9.364588e-16 -0.014600  1.000008  0.000000  0.000000   0.000000   
12  6.199787e-15  0.060719  1.000008  1.373722 -0.209605   0.280254   
13 -8.