In [20]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis, cdist

import plotly.express as px
import pandas as pd

from aiirw import AI_IRW

from tqdm import tqdm

In [2]:
# Load the three latent-representation tables in a fixed order: the training
# set first, then the "in" and "out" evaluation sets
# (presumably in-/out-of-distribution samples — confirm with the data owner).
df, df_in, df_out = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_latent.csv',
        'data/in_latent.csv',
        'data/out_latent.csv',
    )
)

In [3]:
def MahaDist(X, distrib):
    """Mahalanobis distance from every row of ``X`` to the centroid of ``distrib``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to score.
    distrib : array-like of shape (n_reference, n_features)
        Reference sample; its column-wise mean is the centroid and the inverse
        of its covariance matrix defines the metric.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1)
        Distance of each row of ``X`` to the centroid of ``distrib``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix of ``distrib`` is singular.
    """
    distrib = np.asarray(distrib)
    # Infer the feature width from the data instead of hard-coding 768, so the
    # function works for any embedding size.
    m = np.mean(distrib, axis=0).reshape(1, -1)
    # np.cov returns a 0-d array for a single feature; atleast_2d keeps it invertible.
    VI = np.linalg.inv(np.atleast_2d(np.cov(distrib.T)))
    return cdist(XA=X, XB=m, metric='mahalanobis', VI=VI)

In [4]:
def process_latent_df(df, feature_dim=768, group_size=12):
    """Split a latent table into pooled features, softmax scores and predictions.

    The frame is transposed first, so each *column* of ``df`` is one sample.
    After transposition the last entry of a sample is taken as its softmax
    score, the second-to-last as its prediction, and everything before that as
    a flat feature vector of length ``feature_dim * group_size``. That vector
    is reshaped to ``(feature_dim, group_size)`` and averaged over the last
    axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one sample per column and
        ``feature_dim * group_size + 2`` rows.
    feature_dim : int, default 768
        Size of the pooled feature vector returned per sample
        (presumably the encoder hidden size — confirm with the export script).
    group_size : int, default 12
        Number of consecutive values averaged into each output feature
        (presumably the number of layers — confirm with the export script).

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, feature_dim)
    softmax_scores : numpy.ndarray of shape (n_samples,)
    preds : numpy.ndarray of shape (n_samples,)

    Raises
    ------
    ValueError
        If the number of feature values per sample is not
        ``feature_dim * group_size``.
    """
    X = np.array(df.T)
    softmax_scores = X[:, -1]
    preds = X[:, -2]
    X = X[:, :-2]

    expected_width = feature_dim * group_size
    if X.shape[1] != expected_width:
        raise ValueError(
            f"expected {expected_width} feature values per sample "
            f"({feature_dim} x {group_size}), got {X.shape[1]}"
        )

    X = X.reshape((X.shape[0], feature_dim, group_size))
    X = np.mean(X, axis=2)
    return X, softmax_scores, preds

In [5]:
# Unpack each latent table into (pooled features, softmax scores, predictions).
# The three calls are independent; the reference (training) set is listed first.
X, softmax, preds = process_latent_df(df)
X_in, softmax_in, preds_in = process_latent_df(df_in)
X_out, softmax_out, preds_out = process_latent_df(df_out)

In [6]:
# Class-conditional Mahalanobis distance: each evaluation sample is scored
# against the training samples that share its predicted label (0, then 1).
# The result is ordered label-0 rows first, then label-1 rows.
maha_in = np.concatenate([
    MahaDist(X_in[preds_in == label], X[preds == label])
    for label in (0, 1)
])
maha_out = np.concatenate([
    MahaDist(X_out[preds_out == label], X[preds == label])
    for label in (0, 1)
])

In [7]:
# Class-conditional AI-IRW score, computed per predicted label (0, then 1) so
# the row order matches the Mahalanobis scores.
# NOTE(review): no random seed is set in the visible cells; if AI_IRW samples
# its n_dirs directions randomly, results may vary between runs — confirm.
IRW_in = np.concatenate([
    AI_IRW(X=X[preds == label], AI=True, X_test=X_in[preds_in == label], n_dirs=1000)
    for label in (0, 1)
])
IRW_out = np.concatenate([
    AI_IRW(X=X[preds == label], AI=True, X_test=X_out[preds_out == label], n_dirs=1000)
    for label in (0, 1)
])

In [14]:
# One row per evaluation sample: Mahalanobis distance, origin label, and the
# negated AI-IRW score ("in" rows first, then "out" rows).
sim_df = pd.DataFrame(
    np.concatenate([maha_in, maha_out]),
    columns=['Maha'],
).assign(
    origin=['in'] * len(maha_in) + ['out'] * len(maha_out),
    # Sign is flipped so that the stored IRW value is the negative of the raw score.
    IRW=-np.concatenate([IRW_in, IRW_out]),
)

In [15]:
# One histogram per score column, coloured by origin; the column name doubles
# as the figure title.
for score_col in ('Maha', 'IRW'):
    fig = px.histogram(
        sim_df,
        x=score_col,
        color='origin',
        template='none',
        title=score_col,
    )
    fig.show()

### Métriques de performance du classifieur

In [22]:
def error_threshold(in_distances, out_distances, threshold):
    """Fraction of samples misclassified by a single distance threshold.

    A sample from ``in_distances`` counts as an error when its value is
    strictly greater than ``threshold``; a sample from ``out_distances`` counts
    as an error when its value is less than or equal to ``threshold``.

    Parameters
    ----------
    in_distances : sequence of float
        Scores of the "in" samples.
    out_distances : sequence of float
        Scores of the "out" samples.
    threshold : float
        Decision boundary.

    Returns
    -------
    float
        ``errors / (len(in_distances) + len(out_distances))``.

    Raises
    ------
    ZeroDivisionError
        If both inputs are empty.
    """
    # Vectorized counting: this function is called once per candidate threshold,
    # so avoiding two throwaway Python lists per call matters.
    in_errors = int(np.count_nonzero(np.asarray(in_distances) > threshold))
    out_errors = int(np.count_nonzero(np.asarray(out_distances) <= threshold))
    return (in_errors + out_errors) / (len(in_distances) + len(out_distances))

In [26]:
def minimize_error(in_distances, out_distances, threshold_range):
    """Find the threshold in ``threshold_range`` with the lowest error rate.

    Every candidate is evaluated with ``error_threshold`` while a tqdm progress
    bar tracks the sweep. On ties, the first candidate reaching the minimum
    wins (``np.argmin`` semantics).

    Parameters
    ----------
    in_distances, out_distances : sequence of float
        Scores forwarded unchanged to ``error_threshold``.
    threshold_range : indexable sequence of float
        Candidate thresholds; must support integer indexing.

    Returns
    -------
    tuple
        ``(minimum error rate, threshold achieving it)``.
    """
    errors = []
    for candidate in tqdm(threshold_range):
        errors.append(error_threshold(in_distances, out_distances, candidate))
    best_idx = np.argmin(errors)
    return np.min(errors), threshold_range[best_idx]

In [31]:
# Sweep 10,000 evenly spaced candidate thresholds per score and keep the best
# (error rate, threshold) pair. The sweep bounds are hand-picked for each score.
score_maha = minimize_error(
    sim_df.loc[sim_df['origin'] == 'in', 'Maha'],
    sim_df.loc[sim_df['origin'] == 'out', 'Maha'],
    np.linspace(0, 100, 10000),
)

score_IRW = minimize_error(
    sim_df.loc[sim_df['origin'] == 'in', 'IRW'],
    sim_df.loc[sim_df['origin'] == 'out', 'IRW'],
    np.linspace(-.3, -.05, 10000),
)

100%|██████████| 10000/10000 [00:04<00:00, 2028.01it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1962.97it/s]


In [32]:
# Best Mahalanobis result as (minimum error rate, threshold achieving it).
score_maha

(0.08993816750983699, 36.71367136713672)

In [33]:
# Best IRW result as (minimum error rate, threshold achieving it).
score_IRW

(0.09021922428330523, -0.19843984398439843)