In [17]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import itertools
import pickle

In [18]:
df = pd.read_csv("../data/run-over-dataset.csv")
print(df.shape)

columns_to_drop = ['VERBALE', 'DATA', 'Tot Testa', 'Tot Torace', 'Tot Addome', 'Tot Scheletro',
                    'Totale', 'Tot Volta cranica', 'Tot Base cranica', 
                    'Tot Neuroc.', 'Tot Splancnoc.', 'Tot Testa',
                    'Tot Tratto toracico', 'Tot Tratto lombare', 'Tot Rachide',
                    ' Totale coste', 'Sterno in toto', 'Tot Bacino', 'I costa dx', 'II costa dx',
                    'III costa dx', 'IV costa dx', 'V costa dx', 'VI costa dx', 'VII costa dx', 
                    'VIII costa dx', 'IX costa dx', 'X costa dx', 'XI costa dx', 'XII costa dx',
                    'I costa sx', 'II costa sx', 'III costa sx', 'IV costa sx', 'V costa sx', 
                    'VI costa sx', 'VII costa sx', 'VIII costa sx', 'IX costa sx', 
                    'X costa sx', 'XI costa sx', 'XII costa sx']

X = df.drop(columns=columns_to_drop)
print(X.shape)

X['ALTEZZA'] = [int(float(h.replace(',', '.'))*100) for h in X['ALTEZZA']]
X['PESO'] = [int(float(str(h).replace(',', '.'))) for h in X['PESO']]
X['BMI'] = [float(str(h).replace(',', '.')) for h in X['BMI']]

num_unique_values = X.nunique()
constant_columns = num_unique_values[num_unique_values == 1].index.tolist()

X = X.drop(columns=constant_columns)
X = X.T.drop_duplicates().T
print(X.shape)

(130, 367)
(130, 326)
(130, 274)


In [19]:
def reduce_cols(X, components):
    total_variance = 0
    columns_variance = {}
    n_cols = X.shape[1]

    for col in X.columns:
        columns_variance[col] = X[col].var()
        total_variance += X[col].var()

    cols_to_drop = []
    for idx, (col, var) in enumerate(sorted(columns_variance.items(), key=lambda item: item[1])):
        if (n_cols - (idx + 1) >= components):
            cols_to_drop.append(col) 
        else:
            break

    return X.drop(columns=cols_to_drop)

new_X = reduce_cols(X, 250)
new_X.shape

(130, 250)

In [20]:

total_variance = 0
columns_variance = {}
n_cols = X.shape[1]

for col in X.columns:
    columns_variance[col] = X[col].var()
    total_variance += X[col].var()

current_variance = total_variance
for idx, (col, var) in enumerate(sorted(columns_variance.items(), key=lambda item: item[1])):
    current_variance -= var 
    if (current_variance / total_variance <= 0.95):
        print(n_cols - (idx + 1))
        break

16


16 delle colonne totali del data set rapprensentano il 95% della varianza totale.

In [21]:
def nested_cv(X, components, mod_selection_score=accuracy_score):
    n_neighborss = [3, 5, 7, 10, 15, 20, 25, 30]
    contaminations = np.linspace(0.01, 0.5, 50)
    metrics = ['euclidean', 'minkowski', 'manhattan', 'cosine']
    
    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    best_overall_accuracy = 0
    best_overall_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo')

    X = reduce_cols(X, components)
    X = X.values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

            best_score = 0
            for trainval_idx, valid_idx in inner_cv.split(X_train, y_train):
                X_trainval, X_valid = X_train[trainval_idx], X_train[valid_idx]
                y_trainval, y_valid = y_train[trainval_idx], y_train[valid_idx]

                idxs_neg = np.where(y_trainval == 1)[0]
                X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
                y_valid = np.append(y_valid, y_trainval[idxs_neg])

                X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
                y_trainval = np.delete(y_trainval, idxs_neg)

                for params in itertools.product(n_neighborss, contaminations, metrics):
                    clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)

                    clf.fit(X_trainval)

                    pred_values = clf.predict(X_valid)
                    true_values = [1 if y == 0 else -1 for y in y_valid]

                    score = 0
                    if (mod_selection_score == precision_score):
                        score = mod_selection_score(true_values, pred_values, zero_division=0.0)
                    else:
                        score = mod_selection_score(true_values, pred_values)
                        
                    if score > best_score:
                        best_score = score
                        best_params['n_neighbors'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)

            idxs_neg = np.where(y_train == 1)[0]
            X_train = np.delete(X_train, idxs_neg, axis=0)

            clf.fit(X_train)

            pred_values = clf.predict(X_test)
            true_values = [1 if y == 0 else -1 for y in y_test]

            if accuracy_score(true_values, pred_values) > best_overall_accuracy:
                best_overall_accuracy = accuracy_score(true_values, pred_values)
                best_overall_params['n_neighbors'] = best_params['n_neighbors']
                best_overall_params['contamination'] = best_params['contamination']
                best_overall_params['metric'] = best_params['metric']


            accuracy_scores.append(accuracy_score(true_values, pred_values))
            precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
            recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'LocalOutlierFactor',
            'best n_neighbors': best_overall_params['n_neighbors'],
            'best contamination': best_overall_params['contamination'],
            'best metric': best_overall_params['metric'],
            'score used for model selection': mod_selection_score.__name__,
            'method used for model selection': 'nested cv',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'best overall accuracy': best_overall_accuracy * 100,
            'components': components}

In [22]:
results = nested_cv(X, 250, accuracy_score)
scores_df = pd.DataFrame(results, index=[0])
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,best overall accuracy,components
0,LocalOutlierFactor,7,0.4,euclidean,accuracy_score,nested cv,61.615385,8.158562,69.883444,11.017541,53.0,13.100086,76.923077,250


In [23]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df  

In [24]:
scores_df = add_record(scores_df, nested_cv(X, 250, f1_score))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,best overall accuracy,components
0,LocalOutlierFactor,7,0.4,euclidean,accuracy_score,nested cv,61.615385,8.158562,69.883444,11.017541,53.0,13.100086,76.923077,250
1,LocalOutlierFactor,25,0.48,cosine,f1_score,nested cv,63.230769,10.180617,68.408477,11.583186,63.142857,16.742955,80.769231,250


In [25]:
scores_df = add_record(scores_df, nested_cv(X, 250, precision_score))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,best overall accuracy,components
0,LocalOutlierFactor,7,0.4,euclidean,accuracy_score,nested cv,61.615385,8.158562,69.883444,11.017541,53.0,13.100086,76.923077,250
1,LocalOutlierFactor,25,0.48,cosine,f1_score,nested cv,63.230769,10.180617,68.408477,11.583186,63.142857,16.742955,80.769231,250
2,LocalOutlierFactor,20,0.5,cosine,precision_score,nested cv,63.538462,7.546307,73.380902,10.814779,52.714286,14.97549,76.923077,250


In [26]:
scores_df = add_record(scores_df, nested_cv(X, 250, recall_score))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,best overall accuracy,components
0,LocalOutlierFactor,7,0.4,euclidean,accuracy_score,nested cv,61.615385,8.158562,69.883444,11.017541,53.0,13.100086,76.923077,250
1,LocalOutlierFactor,25,0.48,cosine,f1_score,nested cv,63.230769,10.180617,68.408477,11.583186,63.142857,16.742955,80.769231,250
2,LocalOutlierFactor,20,0.5,cosine,precision_score,nested cv,63.538462,7.546307,73.380902,10.814779,52.714286,14.97549,76.923077,250
3,LocalOutlierFactor,3,0.01,euclidean,recall_score,nested cv,55.769231,3.865337,55.084306,2.227862,97.714286,3.896623,65.384615,250


In [27]:
scores_df = add_record(scores_df, nested_cv(X, 200, accuracy_score))
scores_df = add_record(scores_df, nested_cv(X, 200, f1_score))
scores_df = add_record(scores_df, nested_cv(X, 200, precision_score))
scores_df = add_record(scores_df, nested_cv(X, 200, recall_score))
scores_df = add_record(scores_df, nested_cv(X, 150, accuracy_score))
scores_df = add_record(scores_df, nested_cv(X, 150, f1_score))
scores_df = add_record(scores_df, nested_cv(X, 150, precision_score))
scores_df = add_record(scores_df, nested_cv(X, 150, recall_score))
scores_df = add_record(scores_df, nested_cv(X, 100, accuracy_score))
scores_df = add_record(scores_df, nested_cv(X, 100, f1_score))
scores_df = add_record(scores_df, nested_cv(X, 100, precision_score))
scores_df = add_record(scores_df, nested_cv(X, 100, recall_score))
scores_df = add_record(scores_df, nested_cv(X, 50, accuracy_score))
scores_df = add_record(scores_df, nested_cv(X, 50, f1_score))
scores_df = add_record(scores_df, nested_cv(X, 50, precision_score))
scores_df = add_record(scores_df, nested_cv(X, 50, recall_score))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,best overall accuracy,components
0,LocalOutlierFactor,7,0.4,euclidean,accuracy_score,nested cv,61.615385,8.158562,69.883444,11.017541,53.0,13.100086,76.923077,250
1,LocalOutlierFactor,25,0.48,cosine,f1_score,nested cv,63.230769,10.180617,68.408477,11.583186,63.142857,16.742955,80.769231,250
2,LocalOutlierFactor,20,0.5,cosine,precision_score,nested cv,63.538462,7.546307,73.380902,10.814779,52.714286,14.97549,76.923077,250
3,LocalOutlierFactor,3,0.01,euclidean,recall_score,nested cv,55.769231,3.865337,55.084306,2.227862,97.714286,3.896623,65.384615,250
4,LocalOutlierFactor,7,0.46,manhattan,accuracy_score,nested cv,61.692308,7.729143,69.221975,10.685435,55.142857,15.388838,76.923077,200
5,LocalOutlierFactor,25,0.33,euclidean,f1_score,nested cv,63.0,11.914141,68.426691,12.097514,61.0,18.692463,88.461538,200
6,LocalOutlierFactor,15,0.44,euclidean,precision_score,nested cv,61.615385,8.719495,69.414228,15.143985,52.857143,16.903085,76.923077,200
7,LocalOutlierFactor,3,0.01,euclidean,recall_score,nested cv,55.307692,3.7598,54.801578,2.114106,97.714286,4.15024,65.384615,200
8,LocalOutlierFactor,10,0.46,euclidean,accuracy_score,nested cv,64.230769,8.978937,73.946071,12.158582,53.571429,14.586127,84.615385,150
9,LocalOutlierFactor,20,0.38,cosine,f1_score,nested cv,64.461538,10.478493,71.631499,13.08298,59.428571,16.063649,84.615385,150


In [28]:
file_path = 'lof_exp2_df.pickle'

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)