# Curve test analysis
## Imports

In [None]:
import ast
import json
import multiprocessing
from copy import deepcopy
from time import sleep

import pandas as pd
import numpy as np

from pathlib import Path

import matplotlib.pyplot as plt
from IPython.display import display, Markdown, Latex

## Setup

In [None]:
# Folder that holds the JSON result files read by the cells below (relative path).
data_folder = Path('../data')   # single place to edit when the data is moved (migrations)

## Processing functions
### Loading

In [None]:
def parse_results_file(file_path):
    """Flatten a nested JSON results file into parallel index/record lists.

    The file is expected to hold ``{curve_name: {params: results}}`` where
    ``params`` is the string form of a Python dict literal and ``results`` is
    a dict.

    Args:
        file_path: Location of the JSON file (anything ``open`` accepts).

    Returns:
        ``(df_index, df_data)``: ``df_index[i]`` is the curve name of the i-th
        record and ``df_data[i]`` is its results dict merged with the parsed
        parameters (parameters win when a key occurs in both).
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as rf:
        data = json.load(rf)

    df_index, df_data = [], []
    for curve_name, curve_data in data.items():
        for params, results in curve_data.items():
            # literal_eval only accepts Python literals, so the parameter
            # string is parsed without being executed.
            df_data.append({**deepcopy(results), **ast.literal_eval(params)})
            df_index.append(curve_name)
    return df_index, df_data

def plain_numerical2df(df_index, df_data, drop_cols=()):
    """Build an integer DataFrame from parsed result records.

    Args:
        df_index: Row labels (curve names), one per record.
        df_data: List of flat record dicts; the keys of the first record
            define the columns.
        drop_cols: Keys to leave out of the frame.

    Returns:
        DataFrame whose kept columns are cast to ``int`` (missing values
        become 0), plus ``name`` (copy of the index) and ``sim`` (1 when the
        name contains ``'sim'``, otherwise 0).
    """
    dropped = set(drop_cols)
    # Keep the record's own key order: building the columns as a set makes the
    # column order vary between runs, and newer pandas releases refuse a set
    # for ``columns``.
    first_record = df_data[0] if df_data else {}
    columns = [key for key in first_record if key not in dropped]
    df_ = pd.DataFrame(df_data, index=df_index, columns=columns).fillna(0).astype(int)
    df_['name'] = df_.index
    df_['sim'] = df_['name'].str.contains('sim').astype(int)
    return df_

### Statistical understanding

In [None]:
from scipy.stats import ks_2samp

def plot_df(df_, drop_cols=()):  # args cannot be mutable -- [] would cause problems
    """Plot a histogram and a density estimate per column, split by ``sim``.

    For every plotted column one figure is shown: the histogram of counts for
    each ``sim`` group, and on a twin y axis the density estimate of each
    group, so groups of different sizes can be compared in one picture.

    Args:
        df_: Frame with ``name`` and ``sim`` columns plus the numeric columns
            to plot.
        drop_cols: Additional columns to skip.
    """
    cols = df_.columns.drop(['name', 'sim'] + list(drop_cols))
    for col in cols:
        # The x limits must come from the frame being plotted, not from a
        # notebook-global frame that may hold different rows.
        col_max = df_[col].max()
        grouped = df_.groupby('sim')[col]
        ax = grouped.plot.hist(bins=100, logx=False, logy=False, figsize=(14, 6),
                               density=False, alpha=0.42, legend=True, xlim=(0, col_max))
        # Densities go on a second y axis that shares x with the histogram axes.
        grouped.plot.density(figsize=(14, 6), alpha=1.0, legend=False, logy=False,
                             logx=False, xlim=(1, col_max), ax=ax[0].twinx())
        plt.title(col, fontsize='xx-large')
        ax[0].legend(title='sim')
        plt.show()

def kl_divergence(orig_p, orig_q, epsilon=1e-5):
    """Sum ``p * log(p / (q + epsilon))`` over the histogram bins of two samples.

    Both samples are binned with ``get_bins``; bins where ``p`` is zero
    contribute 0 to the sum.

    Args:
        orig_p: First sample (the reference distribution).
        orig_q: Second sample.
        epsilon: Small constant added to ``q`` so that empty ``q`` bins do not
            divide by zero.

    Returns:
        The summed divergence as a numpy scalar.
    """
    p, q = get_bins(orig_p), get_bins(orig_q)
    # np.where evaluates both branches, so empty p bins would compute log(0)
    # and 0 * -inf before being replaced by 0; silence those spurious warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = np.where(p != 0, p * np.log(p / (q + epsilon)), 0)
    return np.sum(terms)

def get_bins(ser, bins=50, value_range=(0, 5)):
    """Return the density-normalised histogram values of ``ser``.

    Args:
        ser: 1-D array-like of numeric values.
        bins: Number of equal-width bins (default 50).
        value_range: ``(lower, upper)`` span covered by the bins; values
            outside it are ignored by ``np.histogram`` (default ``(0, 5)``).

    Returns:
        Array of length ``bins`` with the density of each bin; the bin edges
        are discarded.
    """
    densities, _edges = np.histogram(ser, bins=bins, range=value_range, density=True)
    return densities

def per_group(drop_cols=()):
    """Create a function that compares the ``sim == 0`` and ``sim == 1`` rows of a frame.

    Args:
        drop_cols: Columns to skip in addition to ``name`` and ``sim``.

    Returns:
        A callable taking a DataFrame (for example one ``groupby`` group) and
        returning a Series indexed by ``(col, stat)`` that holds, for every
        remaining column, the two-sample KS statistic (``ks_stat``) and the
        ``kl_divergence`` value (``kl_stat``).
    """
    def per_group_inner(df_):
        non_sim_rows = df_.sim == 0
        sim_rows = df_.sim == 1
        stats = {}
        for feature in df_.columns.drop(['name', 'sim'] + list(drop_cols)):
            non_sim_values = df_.loc[non_sim_rows, feature]
            sim_values = df_.loc[sim_rows, feature]
            # Only the first element of the ks_2samp result is kept
            stats[(feature, 'ks_stat')] = ks_2samp(non_sim_values, sim_values)[0]
            stats[(feature, 'kl_stat')] = kl_divergence(non_sim_values, sim_values)
        stat_index = pd.MultiIndex.from_tuples(stats.keys(), names=['col', 'stat'])
        return pd.Series(stats, index=stat_index)

    return per_group_inner

### Machine learning understanding
...it's so simple to code that you have to give it a try!
I would test it again once there are many test results per curve => the random forest / KMeans could then find something interesting (i.e. they would produce reasonable results that we can investigate).

In [None]:
import matplotlib.pyplot as plt  # doctest: +SKIP
from sklearn.datasets import make_classification
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn import svm
import sklearn
from sklearn import ensemble

def eval_classifier(classifier, X, y, ax):
    """Fit ``classifier`` on a train split and draw its test-split confusion matrix.

    Args:
        classifier: Unfitted estimator exposing ``fit``.
        X: Feature table.
        y: Target labels.
        ax: Matplotlib axes the confusion matrix is drawn on.
    """
    # The fixed seed gives every evaluated classifier the same split
    split = train_test_split(X, y, random_state=0)
    X_train, X_test, y_train, y_test = split
    classifier.fit(X_train, y_train)
    # NOTE(review): plot_confusion_matrix is no longer available in newer
    # scikit-learn releases (ConfusionMatrixDisplay.from_estimator replaces
    # it) -- confirm against the installed version.
    plot_confusion_matrix(classifier, X_test, y_test, ax=ax)  # doctest: +SKIP
    
def eval_classifiers(df_, drop_cols=()):
    """Fit several baseline classifiers predicting ``sim`` and plot their confusion matrices.

    The rows of ``df_`` are shuffled with a fixed seed and every classifier is
    handed to ``eval_classifier``, which draws into its own subplot. All
    seeds are fixed so that repeated runs produce the same figure.

    Args:
        df_: Frame with ``name`` and ``sim`` columns plus numeric feature columns.
        drop_cols: Additional columns to exclude from the features.
    """
    # Local import: a bare ``import sklearn`` does not guarantee that the
    # ``sklearn.neighbors`` submodule has been loaded.
    from sklearn.neighbors import KNeighborsClassifier

    # Shuffle the frame that was passed in, not a notebook-global ``df``
    shuffled = df_.sample(frac=1, random_state=0)
    X = shuffled.drop(columns=['sim', 'name'] + list(drop_cols))
    y = shuffled.sim

    classifiers = [   # The hyperparams could be tuned/autotuned
        SVC(random_state=0, degree=3),
        SGDClassifier(loss="hinge", penalty="l2", max_iter=100, random_state=0),
        KNeighborsClassifier(n_neighbors=2),
        ensemble.RandomForestClassifier(n_estimators=10, random_state=0),
    ]

    # Two plots per row; the number of rows follows the number of classifiers
    ncols = 2
    nrows = -(-len(classifiers) // ncols)  # ceiling division
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 5 * nrows), squeeze=False)

    for clf, ax in zip(classifiers, axes.flatten()):
        eval_classifier(clf, X, y, ax=ax)
        ax.title.set_text(type(clf).__name__)
        ax.title.set_fontsize('xx-large')

    plt.tight_layout()
    plt.show()

## A05
> It would be great to have a real description here

### Load data

In [None]:
# Show which JSON result files exist in the data folder
print(list(data_folder.glob('*.json')))   # list all files -> could be used to auto parse all files

# Columns to skip in the plots / metrics / ML cells below
drop_cols = ('l', )  # comma needed for tuples
# drop_cols is not passed here, so 'l' stays in the frame and can be used for filtering/grouping
df = plain_numerical2df(*parse_results_file(data_folder / 'a05_filtered.json'))  # * unpacks the (index, records) tuple
df

### Numerical comparison

In [None]:
# Summary statistics over all rows
display(df.describe())
# Summary statistics per 'sim' group, transposed so that the groups become columns
display(df.groupby('sim').describe().T)

### Visual comparison

In [None]:
# Plot only the rows with l == 7.
# The 'l' column is skipped (via drop_cols) as it contains only 1 value in such
# a subset and causes a LinAlg error when calculating the density.
plot_df(df.loc[df.l == 7], drop_cols)

In [None]:
# Same plots for the rows with l == 5 ('l' is skipped again via drop_cols)
plot_df(df.loc[df.l == 5], drop_cols)

### Metric comparison

In [None]:
# KS / KL statistics between the two 'sim' groups, computed separately for each value of 'l'
df.groupby('l').apply(per_group(drop_cols))  # any groupby producing non-zero groups is supported 

### ML comparison

In [None]:
# Confusion matrices of the baseline classifiers predicting 'sim' ('l' is excluded from the features)
eval_classifiers(df, drop_cols)

The ML classification is not good for any of the algorithms.

## A25
> It would be great to have a real description here
### Load data

In [None]:
# 'trace_factorization' is kept out of the integer frame and added back as its own column below
drop_cols = ('trace_factorization', )  # comma needed for tuples
df_index, df_data = parse_results_file(data_folder / 'a25.json')
df = plain_numerical2df(df_index, df_data, drop_cols)  # index and records are kept: the raw records are reused below
# Store, per record, only the first column of its 'trace_factorization' array.
# NOTE(review): presumably the dropped column only ever holds ones ("ignore ones") -- confirm against the data
df['trace_factorization'] = pd.Series([np.array(x['trace_factorization'])[:,0]   # ignore ones
                                       for x in df_data], index=df_index)
df

### Generate reasonable features

In [None]:
def max_min_ratio(x):
    """Return the largest value of ``x`` divided by its smallest value."""
    largest = np.max(x)
    smallest = np.min(x)
    return largest / smallest

# Aggregations applied to every row's trace_factorization array
feature_fns = np.min, np.max, np.mean, np.median, max_min_ratio

# One new column per aggregation, on a log10 scale; the column name embeds the function's __qualname__
for fn in feature_fns:
    df[f'log10({fn.__qualname__})'] = np.log10(df.trace_factorization.apply(fn).astype(float))

### Statistics

In [None]:
# Summary statistics: overall, then per 'sim' group (transposed so the groups become columns)
display(Markdown('### Numerical comparison'))
display(df.describe())
display(df.groupby('sim').describe().T)

# Histogram + density per column; the array-valued 'trace_factorization' column is skipped via drop_cols
display(Markdown('### Visual comparison'))
plot_df(df, drop_cols)   # subset it or something

# All rows are put into one group (constant True key) so the call goes through groupby as well
display(Markdown('### Metric comparison'))
display(df.groupby(pd.Series(True, index=df.index)).apply(per_group(drop_cols)))   # consistency with groupby
# display(per_group(['trace_factorization'])(df).to_frame().T)   # those 2 lines are equivalent

# Confusion matrices of the baseline classifiers predicting 'sim'
display(Markdown('### ML comparison'))
eval_classifiers(df, drop_cols)

Those are some reasonable results for basic ML classifiers with un-tuned hyperparameters.

## Playground