In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.neighbors import LocalOutlierFactor 
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler
import itertools
import pickle

In [2]:
df = pd.read_csv("../data/run-over-dataset.csv")

columns_to_mantain = ['SESSO', 'ANNI', 'ALTEZZA', 'PESO', 'BMI', 'Mezzo', 'Tot Testa', 'Tot Torace', 'Tot Addome', 
                    'Tot Scheletro', 'Tot Volta cranica', 'Tot Base cranica', 
                    'Tot Neuroc.', 'Tot Splancnoc.', 'Tot Testa',
                    'Tot Tratto toracico', 'Tot Tratto lombare', 'Tot Rachide',
                    ' Totale coste', 'Sterno in toto', 'Tot Bacino', 'I costa dx', 'II costa dx',
                    'III costa dx', 'IV costa dx', 'V costa dx', 'VI costa dx', 'VII costa dx', 
                    'VIII costa dx', 'IX costa dx', 'X costa dx', 'XI costa dx', 'XII costa dx',
                    'I costa sx', 'II costa sx', 'III costa sx', 'IV costa sx', 'V costa sx', 
                    'VI costa sx', 'VII costa sx', 'VIII costa sx', 'IX costa sx', 
                    'X costa sx', 'XI costa sx', 'XII costa sx', 'Omero dx', 'Radio dx', 'Ulna dx',
                    'Omero sx', 'Radio sx', 'Ulna sx', 'carpo dx', 'I metacarpo dx', 'II metacarpo dx', 'III metacarpo dx',
                    'IV  metacarpo dx', 'V metacarpo dx', 'I raggio dx', 'II raggio dx', 'III raggio dx',
                    'IV raggio dx', 'V raggio dx', 'carpo sx', 'I metacarpo sx', 'II metacarpo sx', 'III metacarpo sx',
                    'IV metacarpo sx', 'V metacarpo sx', 'I raggio sx', 'II raggio sx', 'III raggio sx', 'IV raggio sx',
                    'V raggio sx', 'Spalla dx', 'Spalla sx', 'Gomito dx', 'Gomito sx', 'Polso dx', 'Polso sx', 
                    'femore dx', 'tibia dx', 'fibula dx', 'rotula dx', 'femore sx', 'tibia sx', 'fibula sx', 
                    'rotula sx', 'tarso dx', 'I metatarso dx', 'II metatarso dx', 'III metatarso dx', 'IV metatarso dx',
                    'V metatarso dx', 'I raggio dx', 'II raggio dx', 'III raggio dx', 'IV raggio dx', 'V raggio dx', 
                    'tarso sx', 'I metatarso sx', 'II metatarso sx', 'III metatarso sx', 'IV metatarso sx', 
                    'V metatarso sx', 'I raggio sx', 'II raggio sx', 'III raggio sx', 'IV raggio sx', 'V raggio sx',
                    'Art. coxo-femorale dx', 'Art. coxo-femorale sx', 'Rotula o Ginocchio dx', 'Rotula o Ginocchio sx',
                    'Caviglia dx', 'Caviglia sx']

X = df.loc[:, columns_to_mantain]

X['ALTEZZA'] = [int(float(h.replace(',', '.'))*100) for h in X['ALTEZZA']]
X['PESO'] = [int(float(str(h).replace(',', '.'))) for h in X['PESO']]
X['BMI'] = [float(str(h).replace(',', '.')) for h in X['BMI']]

print(X.shape)

num_unique_values = X.nunique()
constant_columns = num_unique_values[num_unique_values == 1].index.tolist()

X = X.drop(columns=constant_columns)
X = X.T.drop_duplicates().T
print(X.shape)

(130, 115)
(130, 74)


In [3]:
def nested_cv_accuracy(X):
    n_neighborss = [3, 5, 7, 10, 15, 20, 25, 30]
    contaminations = np.linspace(0.01, 0.5, 50)
    metrics = ['euclidean', 'minkowski', 'manhattan', 'cosine']
    
    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

            best_accuracy = 0
            for trainval_idx, valid_idx in inner_cv.split(X_train, y_train):
                X_trainval, X_valid = X_train[trainval_idx], X_train[valid_idx]
                y_trainval, y_valid = y_train[trainval_idx], y_train[valid_idx]

                idxs_neg = np.where(y_trainval == 1)[0]
                X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
                y_valid = np.append(y_valid, y_trainval[idxs_neg])

                X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
                y_trainval = np.delete(y_trainval, idxs_neg)

                for params in itertools.product(n_neighborss, contaminations, metrics):
                    clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)

                    clf.fit(X_trainval)

                    pred_values = clf.predict(X_valid)
                    true_values = [1 if y == 0 else -1 for y in y_valid]

                    accuracy = accuracy_score(true_values, pred_values)
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_params['n_neighbors'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)

            idxs_neg = np.where(y_train == 1)[0]
            X_train = np.delete(X_train, idxs_neg, axis=0)

            clf.fit(X_train)

            pred_values = clf.predict(X_test)
            true_values = [1 if y == 0 else -1 for y in y_test]

            accuracy_scores.append(accuracy_score(true_values, pred_values))
            precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
            recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'LocalOutlierFactor',
            'best n_neighbors': best_params['n_neighbors'],
            'best contamination': best_params['contamination'],
            'best metric': best_params['metric'],
            'method used for model selection': 'accuracy on nested cv',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'tot columns, standard scaled'}

In [4]:
results = nested_cv_accuracy(X)
scores_df = pd.DataFrame(results, index=[0])
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,LocalOutlierFactor,30,0.45,cosine,accuracy on nested cv,59.0,9.899196,64.079883,11.34871,55.571429,14.658702,"tot columns, standard scaled"


In [5]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df  

In [6]:
def nested_cv_f1(X):
    n_neighborss = [3, 5, 7, 10, 15, 20, 25, 30]
    contaminations = np.linspace(0.01, 0.5, 50)
    metrics = ['euclidean', 'minkowski', 'manhattan', 'cosine']
    
    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

            best_f1 = 0
            for trainval_idx, valid_idx in inner_cv.split(X_train, y_train):
                X_trainval, X_valid = X_train[trainval_idx], X_train[valid_idx]
                y_trainval, y_valid = y_train[trainval_idx], y_train[valid_idx]

                idxs_neg = np.where(y_trainval == 1)[0]
                X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
                y_valid = np.append(y_valid, y_trainval[idxs_neg])

                X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
                y_trainval = np.delete(y_trainval, idxs_neg)

                for params in itertools.product(n_neighborss, contaminations, metrics):
                    clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)

                    clf.fit(X_trainval)

                    pred_values = clf.predict(X_valid)
                    true_values = [1 if y == 0 else -1 for y in y_valid]

                    f1 = f1_score(true_values, pred_values)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_params['n_neighbors'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)

            idxs_neg = np.where(y_train == 1)[0]
            X_train = np.delete(X_train, idxs_neg, axis=0)

            clf.fit(X_train)

            pred_values = clf.predict(X_test)
            true_values = [1 if y == 0 else -1 for y in y_test]

            accuracy_scores.append(accuracy_score(true_values, pred_values))
            precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
            recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'LocalOutlierFactor',
            'best n_neighbors': best_params['n_neighbors'],
            'best contamination': best_params['contamination'],
            'best metric': best_params['metric'],
            'method used for model selection': 'f1 on nested cv',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'tot columns, standard scaled'}

In [7]:
scores_df = add_record(scores_df, nested_cv_f1(X))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,LocalOutlierFactor,30,0.45,cosine,accuracy on nested cv,59.0,9.899196,64.079883,11.34871,55.571429,14.658702,"tot columns, standard scaled"
1,LocalOutlierFactor,25,0.48,cosine,f1 on nested cv,64.538462,8.813988,66.971464,9.141962,69.428571,15.8462,"tot columns, standard scaled"


In [8]:
def outer_cv_inner_holdout_accuracy(X):
    n_neighborss = [3, 5, 7, 10, 15, 20, 25, 30]
    contaminations = np.linspace(0.01, 0.5, 50)
    metrics = ['euclidean', 'minkowski', 'manhattan', 'cosine']
    
    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_trainval, X_valid, y_trainval, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)
            best_accuracy = 0

            idxs_neg = np.where(y_trainval == 1)[0]
            X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
            y_valid = np.append(y_valid, y_trainval[idxs_neg])

            X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
            y_trainval = np.delete(y_trainval, idxs_neg)

            for params in itertools.product(n_neighborss, contaminations, metrics):
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)

                clf.fit(X_trainval)

                pred_values = clf.predict(X_valid)
                true_values = [1 if y == 0 else -1 for y in y_valid]

                accuracy = accuracy_score(true_values, pred_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)

            idxs_neg = np.where(y_train == 1)[0]
            X_train = np.delete(X_train, idxs_neg, axis=0)

            clf.fit(X_train)

            pred_values = clf.predict(X_test)
            true_values = [1 if y == 0 else -1 for y in y_test]

            accuracy_scores.append(accuracy_score(true_values, pred_values))
            precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
            recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'LocalOutlierFactor',
            'best n_neighbors': best_params['n_neighbors'],
            'best contamination': best_params['contamination'],
            'best metric': best_params['metric'],
            'method used for model selection': 'accuracy on outer cv inner holdout',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'tot columns, standard scaled'}

In [9]:
scores_df = add_record(scores_df, outer_cv_inner_holdout_accuracy(X))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,LocalOutlierFactor,30,0.45,cosine,accuracy on nested cv,59.0,9.899196,64.079883,11.34871,55.571429,14.658702,"tot columns, standard scaled"
1,LocalOutlierFactor,25,0.48,cosine,f1 on nested cv,64.538462,8.813988,66.971464,9.141962,69.428571,15.8462,"tot columns, standard scaled"
2,LocalOutlierFactor,7,0.5,euclidean,accuracy on outer cv inner holdout,57.307692,10.095696,61.183518,13.594559,53.142857,18.414723,"tot columns, standard scaled"


In [10]:
def outer_cv_inner_holdout_f1(X):
    n_neighborss = [3, 5, 7, 10, 15, 20, 25, 30]
    contaminations = np.linspace(0.01, 0.5, 50)
    metrics = ['euclidean', 'minkowski', 'manhattan', 'cosine']
    
    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_trainval, X_valid, y_trainval, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)
            best_f1 = 0

            idxs_neg = np.where(y_trainval == 1)[0]
            X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
            y_valid = np.append(y_valid, y_trainval[idxs_neg])

            X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
            y_trainval = np.delete(y_trainval, idxs_neg)

            for params in itertools.product(n_neighborss, contaminations, metrics):
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)

                clf.fit(X_trainval)

                pred_values = clf.predict(X_valid)
                true_values = [1 if y == 0 else -1 for y in y_valid]

                f1 = f1_score(true_values, pred_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)

            idxs_neg = np.where(y_train == 1)[0]
            X_train = np.delete(X_train, idxs_neg, axis=0)

            clf.fit(X_train)

            pred_values = clf.predict(X_test)
            true_values = [1 if y == 0 else -1 for y in y_test]

            accuracy_scores.append(accuracy_score(true_values, pred_values))
            precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
            recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'LocalOutlierFactor',
            'best n_neighbors': best_params['n_neighbors'],
            'best contamination': best_params['contamination'],
            'best metric': best_params['metric'],
            'method used for model selection': 'f1 on outer cv inner holdout',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'tot columns, standard scaled'}

In [11]:
scores_df = add_record(scores_df, outer_cv_inner_holdout_f1(X))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,LocalOutlierFactor,30,0.45,cosine,accuracy on nested cv,59.0,9.899196,64.079883,11.34871,55.571429,14.658702,"tot columns, standard scaled"
1,LocalOutlierFactor,25,0.48,cosine,f1 on nested cv,64.538462,8.813988,66.971464,9.141962,69.428571,15.8462,"tot columns, standard scaled"
2,LocalOutlierFactor,7,0.5,euclidean,accuracy on outer cv inner holdout,57.307692,10.095696,61.183518,13.594559,53.142857,18.414723,"tot columns, standard scaled"
3,LocalOutlierFactor,30,0.36,cosine,f1 on outer cv inner holdout,62.692308,10.124959,65.556688,10.182327,67.0,15.511023,"tot columns, standard scaled"


In [12]:
def outer_holdout_inner_cv_accuracy(X):
    n_neighborss = [3, 5, 7, 10, 15, 20, 25, 30]
    contaminations = np.linspace(0.01, 0.5, 50)
    metrics = ['euclidean', 'minkowski', 'manhattan', 'cosine']
    
    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        best_accuracy = 0
        for trainval_idx, valid_idx in cv.split(X_train, y_train):
            X_trainval, X_valid = X_train[trainval_idx], X_train[valid_idx]
            y_trainval, y_valid = y_train[trainval_idx], y_train[valid_idx]

            idxs_neg = np.where(y_trainval == 1)[0]
            X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
            y_valid = np.append(y_valid, y_trainval[idxs_neg])

            X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
            y_trainval = np.delete(y_trainval, idxs_neg)

            for params in itertools.product(n_neighborss, contaminations, metrics):
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)

                clf.fit(X_trainval)

                pred_values = clf.predict(X_valid)
                true_values = [1 if y == 0 else -1 for y in y_valid]

                accuracy = accuracy_score(true_values, pred_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)

        idxs_neg = np.where(y_train == 1)[0]
        X_train = np.delete(X_train, idxs_neg, axis=0)

        clf.fit(X_train)

        pred_values = clf.predict(X_test)
        true_values = [1 if y == 0 else -1 for y in y_test]

        accuracy_scores.append(accuracy_score(true_values, pred_values))
        precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
        recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'LocalOutlierFactor',
            'best n_neighbors': best_params['n_neighbors'],
            'best contamination': best_params['contamination'],
            'best metric': best_params['metric'],
            'method used for model selection': 'accuracy on outer holdout inner cv',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'tot columns, standard scaled'}

In [13]:
scores_df = add_record(scores_df, outer_holdout_inner_cv_accuracy(X))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,LocalOutlierFactor,30,0.45,cosine,accuracy on nested cv,59.0,9.899196,64.079883,11.34871,55.571429,14.658702,"tot columns, standard scaled"
1,LocalOutlierFactor,25,0.48,cosine,f1 on nested cv,64.538462,8.813988,66.971464,9.141962,69.428571,15.8462,"tot columns, standard scaled"
2,LocalOutlierFactor,7,0.5,euclidean,accuracy on outer cv inner holdout,57.307692,10.095696,61.183518,13.594559,53.142857,18.414723,"tot columns, standard scaled"
3,LocalOutlierFactor,30,0.36,cosine,f1 on outer cv inner holdout,62.692308,10.124959,65.556688,10.182327,67.0,15.511023,"tot columns, standard scaled"
4,LocalOutlierFactor,15,0.49,cosine,accuracy on outer holdout inner cv,55.0,11.153846,63.275641,16.617987,48.350356,15.681914,"tot columns, standard scaled"


In [14]:
def outer_holdout_inner_cv_f1(X):
    n_neighborss = [3, 5, 7, 10, 15, 20, 25, 30]
    contaminations = np.linspace(0.01, 0.5, 50)
    metrics = ['euclidean', 'minkowski', 'manhattan', 'cosine']
    
    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        best_f1 = 0

        for trainval_idx, valid_idx in cv.split(X_train, y_train):
            X_trainval, X_valid = X_train[trainval_idx], X_train[valid_idx]
            y_trainval, y_valid = y_train[trainval_idx], y_train[valid_idx]

            idxs_neg = np.where(y_trainval == 1)[0]
            X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
            y_valid = np.append(y_valid, y_trainval[idxs_neg])

            X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
            y_trainval = np.delete(y_trainval, idxs_neg)

            for params in itertools.product(n_neighborss, contaminations, metrics):
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)

                clf.fit(X_trainval)

                pred_values = clf.predict(X_valid)
                true_values = [1 if y == 0 else -1 for y in y_valid]

                f1 = f1_score(true_values, pred_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)

        idxs_neg = np.where(y_train == 1)[0]
        X_train = np.delete(X_train, idxs_neg, axis=0)

        clf.fit(X_train)

        pred_values = clf.predict(X_test)
        true_values = [1 if y == 0 else -1 for y in y_test]

        accuracy_scores.append(accuracy_score(true_values, pred_values))
        precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
        recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'LocalOutlierFactor',
            'best n_neighbors': best_params['n_neighbors'],
            'best contamination': best_params['contamination'],
            'best metric': best_params['metric'],
            'method used for model selection': 'f1 on outer holdout inner cv',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'tot columns, standard scaled'}

In [15]:
scores_df = add_record(scores_df, outer_holdout_inner_cv_f1(X))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,LocalOutlierFactor,30,0.45,cosine,accuracy on nested cv,59.0,9.899196,64.079883,11.34871,55.571429,14.658702,"tot columns, standard scaled"
1,LocalOutlierFactor,25,0.48,cosine,f1 on nested cv,64.538462,8.813988,66.971464,9.141962,69.428571,15.8462,"tot columns, standard scaled"
2,LocalOutlierFactor,7,0.5,euclidean,accuracy on outer cv inner holdout,57.307692,10.095696,61.183518,13.594559,53.142857,18.414723,"tot columns, standard scaled"
3,LocalOutlierFactor,30,0.36,cosine,f1 on outer cv inner holdout,62.692308,10.124959,65.556688,10.182327,67.0,15.511023,"tot columns, standard scaled"
4,LocalOutlierFactor,15,0.49,cosine,accuracy on outer holdout inner cv,55.0,11.153846,63.275641,16.617987,48.350356,15.681914,"tot columns, standard scaled"
5,LocalOutlierFactor,20,0.26,manhattan,f1 on outer holdout inner cv,56.538462,12.645602,63.076146,11.48824,53.048967,16.85817,"tot columns, standard scaled"


In [16]:
def double_holdout_accuracy(X):
    n_neighborss = [3, 5, 7, 10, 15, 20, 25, 30]
    contaminations = np.linspace(0.01, 0.5, 50)
    metrics = ['euclidean', 'minkowski', 'manhattan', 'cosine']
    
    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        best_accuracy = 0

        X_trainval, X_valid, y_trainval, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)

        idxs_neg = np.where(y_trainval == 1)[0]
        X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
        y_valid = np.append(y_valid, y_trainval[idxs_neg])

        X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
        y_trainval = np.delete(y_trainval, idxs_neg)

        for params in itertools.product(n_neighborss, contaminations, metrics):
            clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)

            clf.fit(X_trainval)

            pred_values = clf.predict(X_valid)
            true_values = [1 if y == 0 else -1 for y in y_valid]

            accuracy = accuracy_score(true_values, pred_values)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params['n_neighbors'] = params[0]
                best_params['contamination'] = params[1]
                best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)

        idxs_neg = np.where(y_train == 1)[0]
        X_train = np.delete(X_train, idxs_neg, axis=0)

        clf.fit(X_train)

        pred_values = clf.predict(X_test)
        true_values = [1 if y == 0 else -1 for y in y_test]

        accuracy_scores.append(accuracy_score(true_values, pred_values))
        precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
        recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'LocalOutlierFactor',
            'best n_neighbors': best_params['n_neighbors'],
            'best contamination': best_params['contamination'],
            'best metric': best_params['metric'],
            'method used for model selection': 'accuracy on double holdout',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'tot columns, standard scaled'}

In [17]:
scores_df = add_record(scores_df, double_holdout_accuracy(X))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,LocalOutlierFactor,30,0.45,cosine,accuracy on nested cv,59.0,9.899196,64.079883,11.34871,55.571429,14.658702,"tot columns, standard scaled"
1,LocalOutlierFactor,25,0.48,cosine,f1 on nested cv,64.538462,8.813988,66.971464,9.141962,69.428571,15.8462,"tot columns, standard scaled"
2,LocalOutlierFactor,7,0.5,euclidean,accuracy on outer cv inner holdout,57.307692,10.095696,61.183518,13.594559,53.142857,18.414723,"tot columns, standard scaled"
3,LocalOutlierFactor,30,0.36,cosine,f1 on outer cv inner holdout,62.692308,10.124959,65.556688,10.182327,67.0,15.511023,"tot columns, standard scaled"
4,LocalOutlierFactor,15,0.49,cosine,accuracy on outer holdout inner cv,55.0,11.153846,63.275641,16.617987,48.350356,15.681914,"tot columns, standard scaled"
5,LocalOutlierFactor,20,0.26,manhattan,f1 on outer holdout inner cv,56.538462,12.645602,63.076146,11.48824,53.048967,16.85817,"tot columns, standard scaled"
6,LocalOutlierFactor,30,0.5,cosine,accuracy on double holdout,56.153846,7.919715,64.4059,14.331823,44.188187,11.630585,"tot columns, standard scaled"


In [18]:
def double_holdout_f1(X):
    n_neighborss = [3, 5, 7, 10, 15, 20, 25, 30]
    contaminations = np.linspace(0.01, 0.5, 50)
    metrics = ['euclidean', 'minkowski', 'manhattan', 'cosine']
    
    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        best_f1 = 0

        X_trainval, X_valid, y_trainval, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)

        idxs_neg = np.where(y_trainval == 1)[0]
        X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
        y_valid = np.append(y_valid, y_trainval[idxs_neg])

        X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
        y_trainval = np.delete(y_trainval, idxs_neg)

        for params in itertools.product(n_neighborss, contaminations, metrics):
            clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)

            clf.fit(X_trainval)

            pred_values = clf.predict(X_valid)
            true_values = [1 if y == 0 else -1 for y in y_valid]

            f1 = f1_score(true_values, pred_values)
            if f1 > best_f1:
                best_f1 = f1
                best_params['n_neighbors'] = params[0]
                best_params['contamination'] = params[1]
                best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)

        idxs_neg = np.where(y_train == 1)[0]
        X_train = np.delete(X_train, idxs_neg, axis=0)

        clf.fit(X_train)

        pred_values = clf.predict(X_test)
        true_values = [1 if y == 0 else -1 for y in y_test]

        accuracy_scores.append(accuracy_score(true_values, pred_values))
        precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
        recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'LocalOutlierFactor',
            'best n_neighbors': best_params['n_neighbors'],
            'best contamination': best_params['contamination'],
            'best metric': best_params['metric'],
            'method used for model selection': 'f1 on double holdout',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'tot columns, standard scaled'}

In [19]:
scores_df = add_record(scores_df, double_holdout_f1(X))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,LocalOutlierFactor,30,0.45,cosine,accuracy on nested cv,59.0,9.899196,64.079883,11.34871,55.571429,14.658702,"tot columns, standard scaled"
1,LocalOutlierFactor,25,0.48,cosine,f1 on nested cv,64.538462,8.813988,66.971464,9.141962,69.428571,15.8462,"tot columns, standard scaled"
2,LocalOutlierFactor,7,0.5,euclidean,accuracy on outer cv inner holdout,57.307692,10.095696,61.183518,13.594559,53.142857,18.414723,"tot columns, standard scaled"
3,LocalOutlierFactor,30,0.36,cosine,f1 on outer cv inner holdout,62.692308,10.124959,65.556688,10.182327,67.0,15.511023,"tot columns, standard scaled"
4,LocalOutlierFactor,15,0.49,cosine,accuracy on outer holdout inner cv,55.0,11.153846,63.275641,16.617987,48.350356,15.681914,"tot columns, standard scaled"
5,LocalOutlierFactor,20,0.26,manhattan,f1 on outer holdout inner cv,56.538462,12.645602,63.076146,11.48824,53.048967,16.85817,"tot columns, standard scaled"
6,LocalOutlierFactor,30,0.5,cosine,accuracy on double holdout,56.153846,7.919715,64.4059,14.331823,44.188187,11.630585,"tot columns, standard scaled"
7,LocalOutlierFactor,30,0.2,cosine,f1 on double holdout,58.846154,9.109784,59.857843,11.910135,61.257802,23.693186,"tot columns, standard scaled"


In [20]:
file_path = "lof_exp4_df.pickle"

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)