# Comparison of different normalization methods/features and impact on model

For this I will perform two different tests:
    - which of the normalisation techniques maximises the KL-Divergence/pearson coeff between
    different labeled samples? (possible similarity metric for k-medoid)
    - visual inspection of normalised data;

In [21]:
from src.data.utils import normalize_col

import pandas as pd
import glob


def make_labeled(input_filepath, norm_type="integral"):
    file_list = glob.glob(input_filepath + '/*')
    for f in file_list:
        df = pd.read_csv(f, delim_whitespace=True, names=['wl', 'ri'])
        wl = df.wl
        data = df.drop(columns=['wl'])

        for col in data:
            df[f'{norm_type}_{col}'] = normalize_col(wl, data[col], norm_type=norm_type)

    return df

In [22]:
data_integral = make_labeled("../data/raw/labeled", "integral")
data_zscore = make_labeled("../data/raw/labeled", "zscore")

data_integral.head(), data_zscore.head()

(         wl        ri  integral_ri
 0  100.0000   0.00000     0.000000
 1  130.3772   0.00000     0.000000
 2  130.8593  29.94379     0.000018
 3  131.3414  40.09552     0.000025
 4  131.8235  32.20599     0.000020,
          wl        ri  zscore_ri
 0  100.0000   0.00000  -0.450377
 1  130.3772   0.00000  -0.450377
 2  130.8593  29.94379  -0.440640
 3  131.3414  40.09552  -0.437339
 4  131.8235  32.20599  -0.439904)

In [23]:
from scipy.stats import entropy


def stat_features(df, col_name):
    # First-order stats
    var = df[col_name].std()
    mean = df[col_name].mean()
    skew = df[col_name].skew()
    kurt = df[col_name].kurtosis()

    # Second-order stats
    signal_entropy = entropy(df[col_name])

    return {
        "var": var,
        "mean": mean,
        "skew": skew,
        "kurt": kurt,
        "entropy": signal_entropy
    }

In [24]:
integral_feat = stat_features(data_integral, "integral_ri")
zscore_feat = stat_features(data_zscore, "zscore_ri")

integral_feat, zscore_feat

({'var': 0.001897976706764272,
  'mean': 0.0008546294407266169,
  'skew': 4.364274107468595,
  'kurt': 24.073100892495738,
  'entropy': 6.544725837429657},
 {'var': 1.000206079342734,
  'mean': -4.684253717413104e-17,
  'skew': 4.364274107468595,
  'kurt': 24.07310089249574,
  'entropy': -inf})

note that entropy is not relevant for zscore since values might be -inf.
Also var ~ 1 since that's the point of the zscore norm