In [5]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.decomposition import PCA
import itertools
import pickle

In [2]:
df = pd.read_csv("../data/run-over-dataset.csv")
cols_to_drop = ['VERBALE', 'DATA', 'BMI']
X = df.drop(columns=cols_to_drop)
X['ALTEZZA'] = [int(float(h.replace(',', '.'))*100) for h in X['ALTEZZA']]
X['PESO'] = [int(float(str(h).replace(',', '.'))) for h in X['PESO']]
X.head()

Unnamed: 0,SESSO,ANNI,PESO,ALTEZZA,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,Testa:Cervelletto,Testa:Tronco encefalico,...,II raggio sx.1,III raggio sx.1,IV raggio sx.1,V raggio sx.1,Art. coxo-femorale dx,Art. coxo-femorale sx,Rotula o Ginocchio dx,Rotula o Ginocchio sx,Caviglia dx,Caviglia sx
0,0,81,84,175,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,69,69,162,1,4,4,4,4,4,...,0,0,0,0,0,0,0,0,0,0
2,1,71,67,155,1,2,0,1,1,2,...,0,0,0,0,0,0,0,0,0,0
3,1,54,60,159,1,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,78,69,167,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
num_unique_values = X.nunique()
constant_columns = num_unique_values[num_unique_values == 1].index.tolist()
X = X.drop(columns=constant_columns)
X = X.T.drop_duplicates().T
X.shape

(130, 312)

In [4]:
mezzo_1 = X[X['Mezzo'] == 1].drop(columns='Mezzo').values
mezzo_0 = X[X['Mezzo'] == 0].drop(columns='Mezzo').values

In [6]:
def nested_cv_accuracy(X, Y, pca_components=0.9):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in outer_cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]
            
            best_accuracy = 0
            inner_cv = KFold(n_splits=3, shuffle=True, random_state=seed)
            
            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

                for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
                    clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    accuracy = accuracy_score(true_values, predicted_values)
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_params['n_estimators'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'accuracy score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results

In [7]:
results = nested_cv_accuracy(mezzo_0, mezzo_1, 0.95)
results

{'algorythm': 'IsolationForest',
 'best n_estimators': 150,
 'best contamination': 0.5,
 'best max_features': 0.2,
 'score used for model selection': 'accuracy score',
 'method used': 'nested cv',
 'accuracy score mean': 65.51351351351352,
 'accuracy score std': 3.824491448931617,
 'precision score mean': 20.543154434145148,
 'precision score std': 6.1600677565395,
 'recall score mean': 29.142857142857142,
 'recall score std': 11.050902631893551,
 'f1 score mean': 23.893782211697367,
 'f1 score std': 7.729260049443611,
 'data set': 'no dup, no const, pca 0.95'}

In [8]:
scores_df = pd.DataFrame(results, index=[0])
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.2,accuracy score,nested cv,65.513514,3.824491,20.543154,6.160068,29.142857,11.050903,23.893782,7.72926,"no dup, no const, pca 0.95"


In [9]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df  

In [10]:
def nested_cv_f1(X, Y, pca_components=0.9):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in outer_cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]
            
            best_f1 = 0
            inner_cv = KFold(n_splits=3, shuffle=True, random_state=seed)
            
            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

                for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
                    clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    f1 = f1_score(true_values, predicted_values)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_params['n_estimators'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'f1 score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results

In [11]:
scores_df = add_record(scores_df, nested_cv_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.2,accuracy score,nested cv,65.513514,3.824491,20.543154,6.160068,29.142857,11.050903,23.893782,7.72926,"no dup, no const, pca 0.95"
1,IsolationForest,50,0.3,0.5,f1 score,nested cv,50.540541,9.624051,23.853895,3.118673,70.857143,14.972083,35.146971,3.280418,"no dup, no const, pca 0.95"


In [12]:
def outer_cv_inner_holdout_accuracy(X, Y, pca_components=0.9):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            best_accuracy = 0

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)
            
            for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
                clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_prediction = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_estimators'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_prediction = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results

In [13]:
scores_df = add_record(scores_df, outer_cv_inner_holdout_accuracy(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.2,accuracy score,nested cv,65.513514,3.824491,20.543154,6.160068,29.142857,11.050903,23.893782,7.72926,"no dup, no const, pca 0.95"
1,IsolationForest,50,0.3,0.5,f1 score,nested cv,50.540541,9.624051,23.853895,3.118673,70.857143,14.972083,35.146971,3.280418,"no dup, no const, pca 0.95"
2,IsolationForest,50,0.5,0.2,accuracy score,outer cv inner holdout,65.135135,4.789201,22.571847,7.486996,34.857143,13.897585,27.020017,9.172669,"no dup, no const, pca 0.95"


In [14]:
def outer_cv_inner_holdout_f1(X, Y, pca_components=0.9):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            best_f1 = 0

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)
            
            for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
                clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_prediction = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_estimators'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_prediction = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'f1 score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results

In [15]:
scores_df = add_record(scores_df, outer_cv_inner_holdout_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.2,accuracy score,nested cv,65.513514,3.824491,20.543154,6.160068,29.142857,11.050903,23.893782,7.72926,"no dup, no const, pca 0.95"
1,IsolationForest,50,0.3,0.5,f1 score,nested cv,50.540541,9.624051,23.853895,3.118673,70.857143,14.972083,35.146971,3.280418,"no dup, no const, pca 0.95"
2,IsolationForest,50,0.5,0.2,accuracy score,outer cv inner holdout,65.135135,4.789201,22.571847,7.486996,34.857143,13.897585,27.020017,9.172669,"no dup, no const, pca 0.95"
3,IsolationForest,100,0.3,1.0,f1 score,outer cv inner holdout,49.945946,12.673803,22.46108,4.815357,66.857143,24.064539,32.805536,7.042558,"no dup, no const, pca 0.95"


In [16]:
def outer_holdout_inner_cv_accuracy(X, Y, pca_components=0.9):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        best_accuracy = 0
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)
        
        for train_idx, valid_idx in cv.split(X_trainval):
            X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

            for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
                clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_prediction = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_estimators'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['max_features'] = params[2]

        clf = IsolationForest(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_prediction = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results       

In [17]:
scores_df = add_record(scores_df, outer_holdout_inner_cv_accuracy(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.2,accuracy score,nested cv,65.513514,3.824491,20.543154,6.160068,29.142857,11.050903,23.893782,7.72926,"no dup, no const, pca 0.95"
1,IsolationForest,50,0.3,0.5,f1 score,nested cv,50.540541,9.624051,23.853895,3.118673,70.857143,14.972083,35.146971,3.280418,"no dup, no const, pca 0.95"
2,IsolationForest,50,0.5,0.2,accuracy score,outer cv inner holdout,65.135135,4.789201,22.571847,7.486996,34.857143,13.897585,27.020017,9.172669,"no dup, no const, pca 0.95"
3,IsolationForest,100,0.3,1.0,f1 score,outer cv inner holdout,49.945946,12.673803,22.46108,4.815357,66.857143,24.064539,32.805536,7.042558,"no dup, no const, pca 0.95"
4,IsolationForest,150,0.5,0.2,accuracy score,outer holdout inner cv,64.594595,3.461148,16.586466,7.413145,22.857143,12.289036,19.066999,9.061964,"no dup, no const, pca 0.95"


In [20]:
def outer_holdout_inner_cv_f1(X, Y, pca_components=0.9):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        best_f1 = 0
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)
        
        for train_idx, valid_idx in cv.split(X_trainval):
            X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

            for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
                clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_prediction = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_estimators'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['max_features'] = params[2]

        clf = IsolationForest(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_prediction = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'f1 score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results       

In [21]:
scores_df = add_record(scores_df, outer_holdout_inner_cv_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.2,accuracy score,nested cv,65.513514,3.824491,20.543154,6.160068,29.142857,11.050903,23.893782,7.72926,"no dup, no const, pca 0.95"
1,IsolationForest,50,0.3,0.5,f1 score,nested cv,50.540541,9.624051,23.853895,3.118673,70.857143,14.972083,35.146971,3.280418,"no dup, no const, pca 0.95"
2,IsolationForest,50,0.5,0.2,accuracy score,outer cv inner holdout,65.135135,4.789201,22.571847,7.486996,34.857143,13.897585,27.020017,9.172669,"no dup, no const, pca 0.95"
3,IsolationForest,100,0.3,1.0,f1 score,outer cv inner holdout,49.945946,12.673803,22.46108,4.815357,66.857143,24.064539,32.805536,7.042558,"no dup, no const, pca 0.95"
4,IsolationForest,150,0.5,0.2,accuracy score,outer holdout inner cv,64.594595,3.461148,16.586466,7.413145,22.857143,12.289036,19.066999,9.061964,"no dup, no const, pca 0.95"
5,IsolationForest,200,0.3,0.2,f1 score,outer holdout inner cv,64.324324,6.486486,25.287598,6.68343,41.428571,13.093073,30.292269,5.938793,"no dup, no const, pca 0.95"


In [22]:
def double_holdout_accuracy(X, Y, pca_components=0.9):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)
    
    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)
        best_accuracy = 0

        for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
            clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params['n_estimators'], best_params['contamination'], best_params['max_features'] = params[0], params[1], params[2]
        
        clf = IsolationForest(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_prediction = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'accuracy score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results   

In [23]:
scores_df = add_record(scores_df, double_holdout_accuracy(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.2,accuracy score,nested cv,65.513514,3.824491,20.543154,6.160068,29.142857,11.050903,23.893782,7.72926,"no dup, no const, pca 0.95"
1,IsolationForest,50,0.3,0.5,f1 score,nested cv,50.540541,9.624051,23.853895,3.118673,70.857143,14.972083,35.146971,3.280418,"no dup, no const, pca 0.95"
2,IsolationForest,50,0.5,0.2,accuracy score,outer cv inner holdout,65.135135,4.789201,22.571847,7.486996,34.857143,13.897585,27.020017,9.172669,"no dup, no const, pca 0.95"
3,IsolationForest,100,0.3,1.0,f1 score,outer cv inner holdout,49.945946,12.673803,22.46108,4.815357,66.857143,24.064539,32.805536,7.042558,"no dup, no const, pca 0.95"
4,IsolationForest,150,0.5,0.2,accuracy score,outer holdout inner cv,64.594595,3.461148,16.586466,7.413145,22.857143,12.289036,19.066999,9.061964,"no dup, no const, pca 0.95"
5,IsolationForest,200,0.3,0.2,f1 score,outer holdout inner cv,64.324324,6.486486,25.287598,6.68343,41.428571,13.093073,30.292269,5.938793,"no dup, no const, pca 0.95"
6,IsolationForest,150,0.5,0.2,accuracy score,double holdout,63.783784,4.045035,20.949123,8.397468,34.285714,14.568627,25.9232,10.549803,"no dup, no const, pca 0.95"


In [24]:
def double_holdout_f1(X, Y, pca_components=0.9):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    pca = PCA(n_components=pca_components)
    
    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed)

        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed)
        best_f1 = 0

        for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
            clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            f1 = f1_score(true_values, predicted_values)
            if f1 > best_f1:
                best_f1 = f1
                best_params['n_estimators'], best_params['contamination'], best_params['max_features'] = params[0], params[1], params[2]
        
        clf = IsolationForest(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_prediction = clf.predict(Y)

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_prediction, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_prediction))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = precision_score(true_values, predicted_values)
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'f1 score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results   

In [25]:
scores_df = add_record(scores_df, double_holdout_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.2,accuracy score,nested cv,65.513514,3.824491,20.543154,6.160068,29.142857,11.050903,23.893782,7.72926,"no dup, no const, pca 0.95"
1,IsolationForest,50,0.3,0.5,f1 score,nested cv,50.540541,9.624051,23.853895,3.118673,70.857143,14.972083,35.146971,3.280418,"no dup, no const, pca 0.95"
2,IsolationForest,50,0.5,0.2,accuracy score,outer cv inner holdout,65.135135,4.789201,22.571847,7.486996,34.857143,13.897585,27.020017,9.172669,"no dup, no const, pca 0.95"
3,IsolationForest,100,0.3,1.0,f1 score,outer cv inner holdout,49.945946,12.673803,22.46108,4.815357,66.857143,24.064539,32.805536,7.042558,"no dup, no const, pca 0.95"
4,IsolationForest,150,0.5,0.2,accuracy score,outer holdout inner cv,64.594595,3.461148,16.586466,7.413145,22.857143,12.289036,19.066999,9.061964,"no dup, no const, pca 0.95"
5,IsolationForest,200,0.3,0.2,f1 score,outer holdout inner cv,64.324324,6.486486,25.287598,6.68343,41.428571,13.093073,30.292269,5.938793,"no dup, no const, pca 0.95"
6,IsolationForest,150,0.5,0.2,accuracy score,double holdout,63.783784,4.045035,20.949123,8.397468,34.285714,14.568627,25.9232,10.549803,"no dup, no const, pca 0.95"
7,IsolationForest,100,0.1,0.2,f1 score,double holdout,42.972973,4.864865,23.130029,1.105552,87.142857,13.850514,36.439018,2.431405,"no dup, no const, pca 0.95"


In [26]:
file_path = "if_exp2_df.pickle"

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)