In [5]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from collections import defaultdict
from sklearn.decomposition import PCA
import itertools

In [2]:
df = pd.read_csv("../data/run-over-dataset.csv")
cols_to_drop = ['VERBALE', 'DATA', 'BMI']
X = df.drop(columns=cols_to_drop)
X['ALTEZZA'] = [int(float(h.replace(',', '.'))*100) for h in X['ALTEZZA']]
X['PESO'] = [int(float(str(h).replace(',', '.'))) for h in X['PESO']]
X.head()

Unnamed: 0,SESSO,ANNI,PESO,ALTEZZA,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,Testa:Cervelletto,Testa:Tronco encefalico,...,II raggio sx.1,III raggio sx.1,IV raggio sx.1,V raggio sx.1,Art. coxo-femorale dx,Art. coxo-femorale sx,Rotula o Ginocchio dx,Rotula o Ginocchio sx,Caviglia dx,Caviglia sx
0,0,81,84,175,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,69,69,162,1,4,4,4,4,4,...,0,0,0,0,0,0,0,0,0,0
2,1,71,67,155,1,2,0,1,1,2,...,0,0,0,0,0,0,0,0,0,0
3,1,54,60,159,1,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,78,69,167,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
num_unique_values = X.nunique()
constant_columns = num_unique_values[num_unique_values == 1].index.tolist()
X = X.drop(columns=constant_columns)
X = X.T.drop_duplicates().T
X.shape

(130, 312)

In [4]:
mezzo_1 = X[X['Mezzo'] == 1].drop(columns='Mezzo').values
mezzo_0 = X[X['Mezzo'] == 0].drop(columns='Mezzo').values

In [10]:
def nested_cv_accuracy(X, Y, pca_components=0.9):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        
        best_accuracy = 0
        for trainval_idx, test_idx in outer_cv.split(X):
            inner_cv = KFold(n_splits=7, shuffle=True, random_state=seed)
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

                params_list = list(itertools.product(n_neighborss, contaminations, metrics))
                for params in params_list:
                    clf = LocalOutlierFactor(n_jobs=params[0], contamination=params[1], metric=params[2], novelty=True)
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    accuracy = accuracy_score(true_values, predicted_values)
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_params['n_neighbors'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'accuracy score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results

In [11]:
results = nested_cv_accuracy(mezzo_0, mezzo_1, 0.95)
results

{'algorythm': 'LocalOutlierFactor',
 'best n_neighbors': 5,
 'best contamination': 0.5,
 'best metric': 'euclidean',
 'score used for model selection': 'accuracy score',
 'method used': 'nested cv',
 'accuracy score mean': 68.34328358208957,
 'accuracy score std': 2.730119618235256,
 'precision score mean': 13.383475224067498,
 'precision score std': 5.497602201878275,
 'recall score mean': 39.142857142857146,
 'recall score std': 18.911177992426182,
 'f1 score mean': 19.876606844643298,
 'f1 score std': 8.471238517774978,
 'data set': 'no dup, no const, pca 0.95'}

In [12]:
scores_df = pd.DataFrame(results, index=[0])
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,euclidean,accuracy score,nested cv,68.343284,2.73012,13.383475,5.497602,39.142857,18.911178,19.876607,8.471239,"no dup, no const, pca 0.95"


In [13]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df  

In [14]:
def nested_cv_f1(X, Y, pca_components=0.9):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        
        best_f1 = 0
        for trainval_idx, test_idx in outer_cv.split(X):
            inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

                params_list = list(itertools.product(n_neighborss, contaminations, metrics))
                for params in params_list:
                    clf = LocalOutlierFactor(n_jobs=params[0], contamination=params[1], metric=params[2], novelty=True)
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    f1 = f1_score(true_values, predicted_values)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_params['n_neighbors'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'f1 score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results

In [15]:
scores_df = add_record(scores_df, nested_cv_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,euclidean,accuracy score,nested cv,68.343284,2.73012,13.383475,5.497602,39.142857,18.911178,19.876607,8.471239,"no dup, no const, pca 0.95"
1,LocalOutlierFactor,5,0.5,euclidean,f1 score,nested cv,67.776119,3.994288,13.615344,5.291189,41.285714,20.887942,20.34296,8.344743,"no dup, no const, pca 0.95"


In [18]:
def outer_cv_inner_holdout_accuracy(X, Y, pca_components=0.9):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        cv = KFold(n_splits=10, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            best_accuracy = 0

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)
            params_list = list(itertools.product(n_neighborss, contaminations, metrics))

            for params in params_list:
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
                clf.fit(X_train)

                X_predictions = clf.predict(X_valid)
                Y_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(X_predictions, 1), np.full_like(Y_predictions, -1)))
                predicted_values = np.concatenate((X_predictions, Y_predictions))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)
            clf.fit(X_trainval)

            X_predictions = clf.predict(X_test)
            Y_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(X_predictions, 1), np.full_like(Y_predictions, -1)))
            predicted_values = np.concatenate((X_predictions, Y_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}
    
    return results

In [19]:
scores_df = add_record(scores_df, outer_cv_inner_holdout_accuracy(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,euclidean,accuracy score,nested cv,68.343284,2.73012,13.383475,5.497602,39.142857,18.911178,19.876607,8.471239,"no dup, no const, pca 0.95"
1,LocalOutlierFactor,5,0.5,euclidean,f1 score,nested cv,67.776119,3.994288,13.615344,5.291189,41.285714,20.887942,20.34296,8.344743,"no dup, no const, pca 0.95"
2,LocalOutlierFactor,5,0.5,euclidean,accuracy score,outer cv inner holdout,68.970149,2.842059,12.98716,5.63841,36.428571,19.099631,19.072595,8.683841,"no dup, no const, pca 0.95"


In [20]:
def outer_cv_inner_holdout_f1(X, Y, pca_components=0.9):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        cv = KFold(n_splits=10, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            best_f1 = 0

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)
            params_list = list(itertools.product(n_neighborss, contaminations, metrics))

            for params in params_list:
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
                clf.fit(X_train)

                X_predictions = clf.predict(X_valid)
                Y_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(X_predictions, 1), np.full_like(Y_predictions, -1)))
                predicted_values = np.concatenate((X_predictions, Y_predictions))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)
            clf.fit(X_trainval)

            X_predictions = clf.predict(X_test)
            Y_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(X_predictions, 1), np.full_like(Y_predictions, -1)))
            predicted_values = np.concatenate((X_predictions, Y_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'f1 score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}
    
    return results

In [21]:
scores_df = add_record(scores_df, outer_cv_inner_holdout_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,euclidean,accuracy score,nested cv,68.343284,2.73012,13.383475,5.497602,39.142857,18.911178,19.876607,8.471239,"no dup, no const, pca 0.95"
1,LocalOutlierFactor,5,0.5,euclidean,f1 score,nested cv,67.776119,3.994288,13.615344,5.291189,41.285714,20.887942,20.34296,8.344743,"no dup, no const, pca 0.95"
2,LocalOutlierFactor,5,0.5,euclidean,accuracy score,outer cv inner holdout,68.970149,2.842059,12.98716,5.63841,36.428571,19.099631,19.072595,8.683841,"no dup, no const, pca 0.95"
3,LocalOutlierFactor,5,0.3,manhattan,f1 score,outer cv inner holdout,61.044776,8.533607,15.651919,3.748587,63.285714,23.057603,24.815575,6.27101,"no dup, no const, pca 0.95"


In [22]:
def outer_holdout_inner_cv_accuracy(X, Y, pca_components=0.9):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        best_accuracy = 0
        
        for train_idx, valid_idx in cv.split(X_trainval):
            X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

            params_list = list(itertools.product(n_neighborss, contaminations, metrics))
            for params in params_list:
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}
    
    return results

In [23]:
scores_df = add_record(scores_df, outer_holdout_inner_cv_accuracy(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,euclidean,accuracy score,nested cv,68.343284,2.73012,13.383475,5.497602,39.142857,18.911178,19.876607,8.471239,"no dup, no const, pca 0.95"
1,LocalOutlierFactor,5,0.5,euclidean,f1 score,nested cv,67.776119,3.994288,13.615344,5.291189,41.285714,20.887942,20.34296,8.344743,"no dup, no const, pca 0.95"
2,LocalOutlierFactor,5,0.5,euclidean,accuracy score,outer cv inner holdout,68.970149,2.842059,12.98716,5.63841,36.428571,19.099631,19.072595,8.683841,"no dup, no const, pca 0.95"
3,LocalOutlierFactor,5,0.3,manhattan,f1 score,outer cv inner holdout,61.044776,8.533607,15.651919,3.748587,63.285714,23.057603,24.815575,6.27101,"no dup, no const, pca 0.95"
4,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer holdout inner cv,67.162162,2.772741,26.942262,4.933954,45.0,15.665509,33.3868,7.913892,"no dup, no const, pca 0.95"


In [24]:
def outer_holdout_inner_cv_f1(X, Y, pca_components=0.9):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        best_f1 = 0
        
        for train_idx, valid_idx in cv.split(X_trainval):
            X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

            params_list = list(itertools.product(n_neighborss, contaminations, metrics))
            for params in params_list:
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'f1 score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}
    
    return results

In [25]:
scores_df = add_record(scores_df, outer_holdout_inner_cv_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,euclidean,accuracy score,nested cv,68.343284,2.73012,13.383475,5.497602,39.142857,18.911178,19.876607,8.471239,"no dup, no const, pca 0.95"
1,LocalOutlierFactor,5,0.5,euclidean,f1 score,nested cv,67.776119,3.994288,13.615344,5.291189,41.285714,20.887942,20.34296,8.344743,"no dup, no const, pca 0.95"
2,LocalOutlierFactor,5,0.5,euclidean,accuracy score,outer cv inner holdout,68.970149,2.842059,12.98716,5.63841,36.428571,19.099631,19.072595,8.683841,"no dup, no const, pca 0.95"
3,LocalOutlierFactor,5,0.3,manhattan,f1 score,outer cv inner holdout,61.044776,8.533607,15.651919,3.748587,63.285714,23.057603,24.815575,6.27101,"no dup, no const, pca 0.95"
4,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer holdout inner cv,67.162162,2.772741,26.942262,4.933954,45.0,15.665509,33.3868,7.913892,"no dup, no const, pca 0.95"
5,LocalOutlierFactor,5,0.5,manhattan,f1 score,outer holdout inner cv,64.324324,3.879108,20.681405,4.69148,32.857143,14.356965,24.949138,7.05509,"no dup, no const, pca 0.95"


In [26]:
def double_holdout_accuracy(X, Y, pca_components=0.9):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        best_accuracy = 0
        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)

        for params in list(itertools.product(n_neighborss, contaminations, metrics)):
            clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params['n_neighbors'] = params[0]
                best_params['contamination'] = params[1]
                best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'accuracy score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}
    
    return results

In [27]:
scores_df = add_record(scores_df, double_holdout_accuracy(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,euclidean,accuracy score,nested cv,68.343284,2.73012,13.383475,5.497602,39.142857,18.911178,19.876607,8.471239,"no dup, no const, pca 0.95"
1,LocalOutlierFactor,5,0.5,euclidean,f1 score,nested cv,67.776119,3.994288,13.615344,5.291189,41.285714,20.887942,20.34296,8.344743,"no dup, no const, pca 0.95"
2,LocalOutlierFactor,5,0.5,euclidean,accuracy score,outer cv inner holdout,68.970149,2.842059,12.98716,5.63841,36.428571,19.099631,19.072595,8.683841,"no dup, no const, pca 0.95"
3,LocalOutlierFactor,5,0.3,manhattan,f1 score,outer cv inner holdout,61.044776,8.533607,15.651919,3.748587,63.285714,23.057603,24.815575,6.27101,"no dup, no const, pca 0.95"
4,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer holdout inner cv,67.162162,2.772741,26.942262,4.933954,45.0,15.665509,33.3868,7.913892,"no dup, no const, pca 0.95"
5,LocalOutlierFactor,5,0.5,manhattan,f1 score,outer holdout inner cv,64.324324,3.879108,20.681405,4.69148,32.857143,14.356965,24.949138,7.05509,"no dup, no const, pca 0.95"
6,LocalOutlierFactor,5,0.5,euclidean,accuracy score,double holdout,66.621622,3.024734,26.175926,5.346236,43.571429,15.135443,32.453918,7.995537,"no dup, no const, pca 0.95"


In [28]:
def double_holdout_f1(X, Y, pca_components=0.9):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        best_f1 = 0
        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)

        for params in list(itertools.product(n_neighborss, contaminations, metrics)):
            clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            f1 = f1_score(true_values, predicted_values)
            if f1 > best_f1:
                best_f1 = f1
                best_params['n_neighbors'] = params[0]
                best_params['contamination'] = params[1]
                best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'f1 score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}
    
    return results

In [29]:
scores_df = add_record(scores_df, double_holdout_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,euclidean,accuracy score,nested cv,68.343284,2.73012,13.383475,5.497602,39.142857,18.911178,19.876607,8.471239,"no dup, no const, pca 0.95"
1,LocalOutlierFactor,5,0.5,euclidean,f1 score,nested cv,67.776119,3.994288,13.615344,5.291189,41.285714,20.887942,20.34296,8.344743,"no dup, no const, pca 0.95"
2,LocalOutlierFactor,5,0.5,euclidean,accuracy score,outer cv inner holdout,68.970149,2.842059,12.98716,5.63841,36.428571,19.099631,19.072595,8.683841,"no dup, no const, pca 0.95"
3,LocalOutlierFactor,5,0.3,manhattan,f1 score,outer cv inner holdout,61.044776,8.533607,15.651919,3.748587,63.285714,23.057603,24.815575,6.27101,"no dup, no const, pca 0.95"
4,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer holdout inner cv,67.162162,2.772741,26.942262,4.933954,45.0,15.665509,33.3868,7.913892,"no dup, no const, pca 0.95"
5,LocalOutlierFactor,5,0.5,manhattan,f1 score,outer holdout inner cv,64.324324,3.879108,20.681405,4.69148,32.857143,14.356965,24.949138,7.05509,"no dup, no const, pca 0.95"
6,LocalOutlierFactor,5,0.5,euclidean,accuracy score,double holdout,66.621622,3.024734,26.175926,5.346236,43.571429,15.135443,32.453918,7.995537,"no dup, no const, pca 0.95"
7,LocalOutlierFactor,5,0.1,manhattan,f1 score,double holdout,56.081081,9.44497,26.038907,3.540066,70.714286,20.812575,37.361865,5.659579,"no dup, no const, pca 0.95"


In [30]:
import pickle

file_path = "lof_exp2_df.pickle"

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)