In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from collections import defaultdict
import itertools

In [4]:
df = pd.read_csv("../data/run-over-dataset.csv")
cols_to_drop = ['VERBALE', 'DATA', 'BMI']
X = df.drop(columns=cols_to_drop)
X['ALTEZZA'] = [int(float(h.replace(',', '.'))*100) for h in X['ALTEZZA']]
X['PESO'] = [int(float(str(h).replace(',', '.'))) for h in X['PESO']]
X.head()

Unnamed: 0,SESSO,ANNI,PESO,ALTEZZA,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,Testa:Cervelletto,Testa:Tronco encefalico,...,II raggio sx.1,III raggio sx.1,IV raggio sx.1,V raggio sx.1,Art. coxo-femorale dx,Art. coxo-femorale sx,Rotula o Ginocchio dx,Rotula o Ginocchio sx,Caviglia dx,Caviglia sx
0,0,81,84,175,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,69,69,162,1,4,4,4,4,4,...,0,0,0,0,0,0,0,0,0,0
2,1,71,67,155,1,2,0,1,1,2,...,0,0,0,0,0,0,0,0,0,0
3,1,54,60,159,1,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,78,69,167,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
def low_std_cols(X, beta):
    columns_low_std = []
    for column in X.columns:
        if X[column].std() <= beta:
            columns_low_std.append(column)

    return columns_low_std, len(columns_low_std)

def zero_columns(X):
    zero_cols = []
    cols, n = low_std_cols(X, 0)

    for col in cols:
        if sum(X[col]) == 0:
            zero_cols.append(col)
    
    return zero_cols, (len(zero_cols) == n)

cols_to_drop, ok = zero_columns(X)
X = X.drop(columns=cols_to_drop)
X = X.T.drop_duplicates().T
X.shape

(130, 312)

In [6]:
mezzo_1 = X[X['Mezzo'] == 1].drop(columns='Mezzo').values
mezzo_0 = X[X['Mezzo'] == 0].drop(columns='Mezzo').values
classes = X['Mezzo'].values

In [14]:
def nested_cv_accuracy(X, Y):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        
        best_accuracy = 0
        for trainval_idx, test_idx in outer_cv.split(X):
            inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

                params_list = list(itertools.product(n_neighborss, contaminations, metrics))
                for params in params_list:
                    clf = LocalOutlierFactor(n_jobs=params[0], contamination=params[1], metric=params[2], novelty=True)
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    accuracy = accuracy_score(true_values, predicted_values)
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_params['n_neighbors'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'accuracy score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}

    return results

In [15]:
results = nested_cv_accuracy(mezzo_0, mezzo_1)
results

{'algorythm': 'LocalOutlierFactor',
 'best n_neighbors': 5,
 'best contamination': 0.5,
 'best metric': 'manhattan',
 'score used for model selection': 'accuracy score',
 'method used': 'nested cv',
 'accuracy score mean': 73.52238805970148,
 'accuracy score std': 3.677946292225033,
 'precision score mean': 18.803037103075802,
 'precision score std': 6.721384660336612,
 'recall score mean': 47.42857142857143,
 'recall score std': 19.25977091011739,
 'f1 score mean': 26.804977814429325,
 'f1 score std': 9.81540768879125}

In [16]:
results['data set'] = 'no 0 columns, no equal columns'
scores_df = pd.DataFrame(results, index=[0])
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,manhattan,accuracy score,nested cv,73.522388,3.677946,18.803037,6.721385,47.428571,19.259771,26.804978,9.815408,"no 0 columns, no equal columns"


In [17]:
def nested_cv_f1(X, Y):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        
        best_f1 = 0
        for trainval_idx, test_idx in outer_cv.split(X):
            inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

                params_list = list(itertools.product(n_neighborss, contaminations, metrics))
                for params in params_list:
                    clf = LocalOutlierFactor(n_jobs=params[0], contamination=params[1], metric=params[2], novelty=True)
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    f1 = f1_score(true_values, predicted_values)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_params['n_neighbors'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'f1 score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}

    return results

In [18]:
results = nested_cv_f1(mezzo_0, mezzo_1)
results['data set'] = 'no 0 columns, no equal columns'

In [19]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df   

scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,manhattan,accuracy score,nested cv,73.522388,3.677946,18.803037,6.721385,47.428571,19.259771,26.804978,9.815408,"no 0 columns, no equal columns"
1,LocalOutlierFactor,5,0.3,manhattan,f1 score,nested cv,67.761194,6.704799,17.596069,5.555557,56.857143,22.404446,26.479408,8.335304,"no 0 columns, no equal columns"


In [20]:
import pickle

file_path = "lof_exp1_df.pickle"

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)

In [1]:
import pickle

file_path = "lof_exp1_df.pickle"

with open(file_path, "rb") as file:
    scores_df = pickle.load(file)


In [2]:
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,manhattan,accuracy score,nested cv,73.522388,3.677946,18.803037,6.721385,47.428571,19.259771,26.804978,9.815408,"no 0 columns, no equal columns"
1,LocalOutlierFactor,5,0.3,manhattan,f1 score,nested cv,67.761194,6.704799,17.596069,5.555557,56.857143,22.404446,26.479408,8.335304,"no 0 columns, no equal columns"


In [9]:
def outer_cv_inner_holdout_accuracy(X, Y):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        cv = KFold(n_splits=10, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            best_accuracy = 0

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)
            params_list = list(itertools.product(n_neighborss, contaminations, metrics))

            for params in params_list:
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
                clf.fit(X_train)

                X_predictions = clf.predict(X_valid)
                Y_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(X_predictions, 1), np.full_like(Y_predictions, -1)))
                predicted_values = np.concatenate((X_predictions, Y_predictions))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)
            clf.fit(X_trainval)

            X_predictions = clf.predict(X_test)
            Y_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(X_predictions, 1), np.full_like(Y_predictions, -1)))
            predicted_values = np.concatenate((X_predictions, Y_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}
    
    return results

In [10]:
results = outer_cv_inner_holdout_accuracy(mezzo_0, mezzo_1)

In [11]:
results['data set'] = 'no 0 columns, no equal columns'

In [12]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df 

In [13]:
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,manhattan,accuracy score,nested cv,73.522388,3.677946,18.803037,6.721385,47.428571,19.259771,26.804978,9.815408,"no 0 columns, no equal columns"
1,LocalOutlierFactor,5,0.3,manhattan,f1 score,nested cv,67.761194,6.704799,17.596069,5.555557,56.857143,22.404446,26.479408,8.335304,"no 0 columns, no equal columns"
2,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer cv inner holdout,76.716418,3.674189,20.999816,7.947706,45.857143,19.825257,28.663682,11.21941,"no 0 columns, no equal columns"


In [14]:
def outer_cv_inner_holdout_f1(X, Y):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        cv = KFold(n_splits=10, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            best_f1 = 0

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)
            params_list = list(itertools.product(n_neighborss, contaminations, metrics))

            for params in params_list:
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
                clf.fit(X_train)

                X_predictions = clf.predict(X_valid)
                Y_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(X_predictions, 1), np.full_like(Y_predictions, -1)))
                predicted_values = np.concatenate((X_predictions, Y_predictions))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

            clf = LocalOutlierFactor(**best_params, novelty=True)
            clf.fit(X_trainval)

            X_predictions = clf.predict(X_test)
            Y_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(X_predictions, 1), np.full_like(Y_predictions, -1)))
            predicted_values = np.concatenate((X_predictions, Y_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'f1 score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}
    
    return results

In [15]:
results = outer_cv_inner_holdout_f1(mezzo_0, mezzo_1)
results['data set'] = 'no 0 columns, no equal columns'

In [16]:
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,manhattan,accuracy score,nested cv,73.522388,3.677946,18.803037,6.721385,47.428571,19.259771,26.804978,9.815408,"no 0 columns, no equal columns"
1,LocalOutlierFactor,5,0.3,manhattan,f1 score,nested cv,67.761194,6.704799,17.596069,5.555557,56.857143,22.404446,26.479408,8.335304,"no 0 columns, no equal columns"
2,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer cv inner holdout,76.716418,3.674189,20.999816,7.947706,45.857143,19.825257,28.663682,11.21941,"no 0 columns, no equal columns"
3,LocalOutlierFactor,10,0.3,manhattan,f1 score,outer cv inner holdout,68.567164,8.261218,20.285015,4.891918,66.0,20.28169,30.501707,6.907672,"no 0 columns, no equal columns"


In [18]:
def outer_holdout_inner_cv_accuracy(X, Y):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        best_accuracy = 0
        
        for train_idx, valid_idx in cv.split(X_trainval):
            X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

            params_list = list(itertools.product(n_neighborss, contaminations, metrics))
            for params in params_list:
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}
    
    return results

In [19]:
results = outer_holdout_inner_cv_accuracy(mezzo_0, mezzo_1)
results['data set'] = 'no 0 columns, no equal columns'

In [20]:
scores_df = add_record(scores_df, results)

In [21]:
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,manhattan,accuracy score,nested cv,73.522388,3.677946,18.803037,6.721385,47.428571,19.259771,26.804978,9.815408,"no 0 columns, no equal columns"
1,LocalOutlierFactor,5,0.3,manhattan,f1 score,nested cv,67.761194,6.704799,17.596069,5.555557,56.857143,22.404446,26.479408,8.335304,"no 0 columns, no equal columns"
2,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer cv inner holdout,76.716418,3.674189,20.999816,7.947706,45.857143,19.825257,28.663682,11.21941,"no 0 columns, no equal columns"
3,LocalOutlierFactor,10,0.3,manhattan,f1 score,outer cv inner holdout,68.567164,8.261218,20.285015,4.891918,66.0,20.28169,30.501707,6.907672,"no 0 columns, no equal columns"
4,LocalOutlierFactor,10,0.5,manhattan,accuracy score,outer holdout inner cv,73.918919,2.706079,32.919096,8.184228,42.142857,19.285714,36.598147,12.289942,"no 0 columns, no equal columns"


In [22]:
def outer_holdout_inner_cv_f1(X, Y):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        best_f1 = 0
        
        for train_idx, valid_idx in cv.split(X_trainval):
            X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

            params_list = list(itertools.product(n_neighborss, contaminations, metrics))
            for params in params_list:
                clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_neighbors'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'f1 score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}
    
    return results

In [23]:
results = outer_holdout_inner_cv_f1(mezzo_0, mezzo_1)
results['data set'] = 'no 0 columns, no equal columns'

In [24]:
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,manhattan,accuracy score,nested cv,73.522388,3.677946,18.803037,6.721385,47.428571,19.259771,26.804978,9.815408,"no 0 columns, no equal columns"
1,LocalOutlierFactor,5,0.3,manhattan,f1 score,nested cv,67.761194,6.704799,17.596069,5.555557,56.857143,22.404446,26.479408,8.335304,"no 0 columns, no equal columns"
2,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer cv inner holdout,76.716418,3.674189,20.999816,7.947706,45.857143,19.825257,28.663682,11.21941,"no 0 columns, no equal columns"
3,LocalOutlierFactor,10,0.3,manhattan,f1 score,outer cv inner holdout,68.567164,8.261218,20.285015,4.891918,66.0,20.28169,30.501707,6.907672,"no 0 columns, no equal columns"
4,LocalOutlierFactor,10,0.5,manhattan,accuracy score,outer holdout inner cv,73.918919,2.706079,32.919096,8.184228,42.142857,19.285714,36.598147,12.289942,"no 0 columns, no equal columns"
5,LocalOutlierFactor,10,0.5,manhattan,f1 score,outer holdout inner cv,77.837838,2.847474,42.746413,6.21664,54.285714,13.627703,47.511396,9.069557,"no 0 columns, no equal columns"


In [25]:
def double_holdout_accuracy(X, Y):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        best_accuracy = 0
        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)

        for params in list(itertools.product(n_neighborss, contaminations, metrics)):
            clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params['n_neighbors'] = params[0]
                best_params['contamination'] = params[1]
                best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'accuracy score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}
    
    return results

In [29]:
results = double_holdout_accuracy(mezzo_0, mezzo_1)
results['data set'] = 'no 0 columns, no equal columns'
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,manhattan,accuracy score,nested cv,73.522388,3.677946,18.803037,6.721385,47.428571,19.259771,26.804978,9.815408,"no 0 columns, no equal columns"
1,LocalOutlierFactor,5,0.3,manhattan,f1 score,nested cv,67.761194,6.704799,17.596069,5.555557,56.857143,22.404446,26.479408,8.335304,"no 0 columns, no equal columns"
2,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer cv inner holdout,76.716418,3.674189,20.999816,7.947706,45.857143,19.825257,28.663682,11.21941,"no 0 columns, no equal columns"
3,LocalOutlierFactor,10,0.3,manhattan,f1 score,outer cv inner holdout,68.567164,8.261218,20.285015,4.891918,66.0,20.28169,30.501707,6.907672,"no 0 columns, no equal columns"
4,LocalOutlierFactor,10,0.5,manhattan,accuracy score,outer holdout inner cv,73.918919,2.706079,32.919096,8.184228,42.142857,19.285714,36.598147,12.289942,"no 0 columns, no equal columns"
5,LocalOutlierFactor,10,0.5,manhattan,f1 score,outer holdout inner cv,77.837838,2.847474,42.746413,6.21664,54.285714,13.627703,47.511396,9.069557,"no 0 columns, no equal columns"
6,LocalOutlierFactor,10,0.5,manhattan,accuracy score,double holdout,74.324324,2.417371,33.929562,9.318618,45.0,18.911718,38.309272,12.969994,"no 0 columns, no equal columns"


In [27]:
def double_holdout_f1(X, Y):
    n_neighborss = [5, 10, 15]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    metrics = ['euclidean', 'minkowski', 'manhattan']

    best_params = {'n_neighbors': 0, 'contamination': 0, 'metric': ''}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        best_f1 = 0
        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)

        for params in list(itertools.product(n_neighborss, contaminations, metrics)):
            clf = LocalOutlierFactor(n_neighbors=params[0], contamination=params[1], metric=params[2], novelty=True)
            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            f1 = f1_score(true_values, predicted_values)
            if f1 > best_f1:
                best_f1 = f1
                best_params['n_neighbors'] = params[0]
                best_params['contamination'] = params[1]
                best_params['metric'] = params[2]

        clf = LocalOutlierFactor(**best_params, novelty=True)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    results = {'algorythm': 'LocalOutlierFactor',
               'best n_neighbors': best_params['n_neighbors'],
               'best contamination': best_params['contamination'],
               'best metric': best_params['metric'],
               'score used for model selection': 'f1 score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}
    
    return results

In [30]:
results = double_holdout_f1(mezzo_0, mezzo_1)
results['data set'] = 'no 0 columns, no equal columns'
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_neighbors,best contamination,best metric,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,LocalOutlierFactor,5,0.5,manhattan,accuracy score,nested cv,73.522388,3.677946,18.803037,6.721385,47.428571,19.259771,26.804978,9.815408,"no 0 columns, no equal columns"
1,LocalOutlierFactor,5,0.3,manhattan,f1 score,nested cv,67.761194,6.704799,17.596069,5.555557,56.857143,22.404446,26.479408,8.335304,"no 0 columns, no equal columns"
2,LocalOutlierFactor,5,0.5,manhattan,accuracy score,outer cv inner holdout,76.716418,3.674189,20.999816,7.947706,45.857143,19.825257,28.663682,11.21941,"no 0 columns, no equal columns"
3,LocalOutlierFactor,10,0.3,manhattan,f1 score,outer cv inner holdout,68.567164,8.261218,20.285015,4.891918,66.0,20.28169,30.501707,6.907672,"no 0 columns, no equal columns"
4,LocalOutlierFactor,10,0.5,manhattan,accuracy score,outer holdout inner cv,73.918919,2.706079,32.919096,8.184228,42.142857,19.285714,36.598147,12.289942,"no 0 columns, no equal columns"
5,LocalOutlierFactor,10,0.5,manhattan,f1 score,outer holdout inner cv,77.837838,2.847474,42.746413,6.21664,54.285714,13.627703,47.511396,9.069557,"no 0 columns, no equal columns"
6,LocalOutlierFactor,10,0.5,manhattan,accuracy score,double holdout,74.324324,2.417371,33.929562,9.318618,45.0,18.911718,38.309272,12.969994,"no 0 columns, no equal columns"
7,LocalOutlierFactor,15,0.3,euclidean,f1 score,double holdout,71.756757,4.540621,36.614174,5.839411,67.857143,17.857143,47.050068,8.657139,"no 0 columns, no equal columns"


In [31]:
file_path = "lof_exp1_df.pickle"

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)