In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from collections import defaultdict
import itertools
import pickle

In [6]:
df = pd.read_csv("../data/run-over-dataset.csv")

cols_to_drop = ['VERBALE', 'DATA']
X = df.drop(columns=cols_to_drop)

X['ALTEZZA'] = [int(float(h.replace(',', '.'))*100) for h in X['ALTEZZA']]
X['PESO'] = [int(float(str(h).replace(',', '.'))) for h in X['PESO']]
X['BMI'] = [float(str(h).replace(',', '.')) for h in X['BMI']]

num_unique_values = X.nunique()
constant_columns = num_unique_values[num_unique_values == 1].index.tolist()

X = X.drop(columns=constant_columns)
X = X.T.drop_duplicates().T
X.head

<bound method NDFrame.head of      SESSO  ANNI  PESO  ALTEZZA        BMI  Mezzo  Testa:Neurocranio  \
0      0.0  81.0  84.0    175.0  27.428571    0.0                1.0   
1      1.0  69.0  69.0    162.0  26.291724    1.0                4.0   
2      1.0  71.0  67.0    155.0  27.887617    1.0                2.0   
3      1.0  54.0  60.0    159.0  23.733238    1.0                4.0   
4      1.0  78.0  69.0    167.0  24.740937    1.0                2.0   
..     ...   ...   ...      ...        ...    ...                ...   
125    1.0  82.0  80.0    162.0  30.559366    0.0                1.0   
126    1.0  70.0  75.0    157.0  30.670615    0.0                2.0   
127    0.0  45.0  45.0    177.0  14.363689    1.0                0.0   
128    0.0  54.0  89.0    168.0  31.746032    0.0                0.0   
129    1.0  86.0  50.0    155.0  20.811655    0.0                1.0   

     Testa:Splancnocranio  Testa:Telencefalo  Testa:Cervelletto  ...  \
0                     0.0        

In [5]:
def nested_cv_accuracy(X):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

            best_accuracy = 0
            for trainval_idx, valid_idx in inner_cv.split(X_train, y_train):
                X_trainval, X_valid = X_train[trainval_idx], X_train[valid_idx]
                y_trainval, y_valid = y_train[trainval_idx], y_train[valid_idx]

                idxs_neg = np.where(y_trainval == 1)[0]
                X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
                y_valid = np.append(y_valid, y_trainval[idxs_neg])

                X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
                y_trainval = np.delete(y_trainval, idxs_neg)

                for params in itertools.product(n_estimatorss, contaminations, max_featuress):
                    clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

                    clf.fit(X_trainval)

                    pred_values = clf.predict(X_valid)
                    true_values = [1 if y == 0 else -1 for y in y_valid]

                    accuracy = accuracy_score(true_values, pred_values)
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_params['n_estimators'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)

            idxs_neg = np.where(y_train == 1)[0]
            X_train = np.delete(X_train, idxs_neg, axis=0)

            clf.fit(X_train)

            pred_values = clf.predict(X_test)
            true_values = [1 if y == 0 else -1 for y in y_test]

            accuracy_scores.append(accuracy_score(true_values, pred_values))
            precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
            recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'IsolationForest',
            'best n_estimators': best_params['n_estimators'],
            'best contamination': best_params['contamination'],
            'best max_features': best_params['max_features'],
            'method used for model selection': 'accuracy on nested cv',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'no const, no equals'}

In [6]:
results = nested_cv_accuracy(X)
results

{'algorythm': 'IsolationForest',
 'best n_estimators': 200,
 'best contamination': 0.5,
 'best max_features': 1.0,
 'method used for model selection': 'accuracy on nested cv',
 'accuracy mean': 65.6923076923077,
 'accuracy std': 9.161276880590403,
 'precision mean': 79.17525807525806,
 'precision  std': 12.253175929813887,
 'recall mean': 50.0,
 'recall std': 15.518257844571737,
 'data set': 'no const, no equals'}

In [7]:
scores_df = pd.DataFrame(results, index=[0])
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,IsolationForest,200,0.5,1.0,accuracy on nested cv,65.692308,9.161277,79.175258,12.253176,50.0,15.518258,"no const, no equals"


In [8]:
def nested_cv_f1(X):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

            best_f1 = 0
            for trainval_idx, valid_idx in inner_cv.split(X_train, y_train):
                X_trainval, X_valid = X_train[trainval_idx], X_train[valid_idx]
                y_trainval, y_valid = y_train[trainval_idx], y_train[valid_idx]

                idxs_neg = np.where(y_trainval == 1)[0]
                X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
                y_valid = np.append(y_valid, y_trainval[idxs_neg])

                X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
                y_trainval = np.delete(y_trainval, idxs_neg)

                for params in itertools.product(n_estimatorss, contaminations, max_featuress):
                    clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

                    clf.fit(X_trainval)

                    pred_values = clf.predict(X_valid)
                    true_values = [1 if y == 0 else -1 for y in y_valid]

                    f1 = f1_score(true_values, pred_values)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_params['n_estimators'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)

            idxs_neg = np.where(y_train == 1)[0]
            X_train = np.delete(X_train, idxs_neg, axis=0)

            clf.fit(X_train)

            pred_values = clf.predict(X_test)
            true_values = [1 if y == 0 else -1 for y in y_test]

            accuracy_scores.append(accuracy_score(true_values, pred_values))
            precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
            recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'IsolationForest',
            'best n_estimators': best_params['n_estimators'],
            'best contamination': best_params['contamination'],
            'best max_features': best_params['max_features'],
            'method used for model selection': 'f1 on nested cv',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'no const, no equals'}

In [9]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df  

In [10]:
results = nested_cv_f1(X)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,IsolationForest,200,0.5,1.0,accuracy on nested cv,65.692308,9.161277,79.175258,12.253176,50.0,15.518258,"no const, no equals"
1,IsolationForest,200,0.3,1.0,f1 on nested cv,66.846154,7.211513,76.819757,10.570936,58.0,16.881339,"no const, no equals"


In [11]:
file_path = "if_exp1_df.pickle"

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)

In [2]:
file_path = "if_exp1_df.pickle"

with open(file_path, "rb") as file:
    scores_df = pickle.load(file)

In [3]:
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,IsolationForest,200,0.5,1.0,accuracy on nested cv,65.692308,9.161277,79.175258,12.253176,50.0,15.518258,"no const, no equals"
1,IsolationForest,200,0.3,1.0,f1 on nested cv,66.846154,7.211513,76.819757,10.570936,58.0,16.881339,"no const, no equals"


In [4]:
def outer_cv_inner_holdout_accuracy(X):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_trainval, X_valid, y_trainval, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)
            best_accuracy = 0

            idxs_neg = np.where(y_trainval == 1)[0]
            X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
            y_valid = np.append(y_valid, y_trainval[idxs_neg])

            X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
            y_trainval = np.delete(y_trainval, idxs_neg)

            for params in itertools.product(n_estimatorss, contaminations, max_featuress):
                clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

                clf.fit(X_trainval)

                pred_values = clf.predict(X_valid)
                true_values = [1 if y == 0 else -1 for y in y_valid]

                accuracy = accuracy_score(true_values, pred_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_estimators'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)

            idxs_neg = np.where(y_train == 1)[0]
            X_train = np.delete(X_train, idxs_neg, axis=0)

            clf.fit(X_train)

            pred_values = clf.predict(X_test)
            true_values = [1 if y == 0 else -1 for y in y_test]

            accuracy_scores.append(accuracy_score(true_values, pred_values))
            precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
            recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'IsolationForest',
            'best n_estimators': best_params['n_estimators'],
            'best contamination': best_params['contamination'],
            'best max_features': best_params['max_features'],
            'method used for model selection': 'accuracy on outer cv inner holdout',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'no const, no equals'}

In [7]:
results = outer_cv_inner_holdout_accuracy(X)
results

{'algorythm': 'IsolationForest',
 'best n_estimators': 150,
 'best contamination': 0.5,
 'best max_features': 0.5,
 'method used for model selection': 'accuracy on outer cv inner holdout',
 'accuracy mean': 66.0,
 'accuracy std': 6.852633061944643,
 'precision mean': 79.60381285381287,
 'precision  std': 9.54288691939038,
 'recall mean': 50.71428571428571,
 'recall std': 13.1125426790613,
 'data set': 'no const, no equals'}

In [10]:
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,IsolationForest,200,0.5,1.0,accuracy on nested cv,65.692308,9.161277,79.175258,12.253176,50.0,15.518258,"no const, no equals"
1,IsolationForest,200,0.3,1.0,f1 on nested cv,66.846154,7.211513,76.819757,10.570936,58.0,16.881339,"no const, no equals"
2,IsolationForest,150,0.5,0.5,accuracy on outer cv inner holdout,66.0,6.852633,79.603813,9.542887,50.714286,13.112543,"no const, no equals"


In [11]:
def outer_cv_inner_holdout_f1(X):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_trainval, X_valid, y_trainval, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)
            best_f1 = 0

            idxs_neg = np.where(y_trainval == 1)[0]
            X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
            y_valid = np.append(y_valid, y_trainval[idxs_neg])

            X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
            y_trainval = np.delete(y_trainval, idxs_neg)

            for params in itertools.product(n_estimatorss, contaminations, max_featuress):
                clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

                clf.fit(X_trainval)

                pred_values = clf.predict(X_valid)
                true_values = [1 if y == 0 else -1 for y in y_valid]

                f1 = f1_score(true_values, pred_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_estimators'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)

            idxs_neg = np.where(y_train == 1)[0]
            X_train = np.delete(X_train, idxs_neg, axis=0)

            clf.fit(X_train)

            pred_values = clf.predict(X_test)
            true_values = [1 if y == 0 else -1 for y in y_test]

            accuracy_scores.append(accuracy_score(true_values, pred_values))
            precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
            recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'IsolationForest',
            'best n_estimators': best_params['n_estimators'],
            'best contamination': best_params['contamination'],
            'best max_features': best_params['max_features'],
            'method used for model selection': 'f1 on outer cv inner holdout',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'no const, no equals'}

In [12]:
results = outer_cv_inner_holdout_f1(X)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,IsolationForest,200,0.5,1.0,accuracy on nested cv,65.692308,9.161277,79.175258,12.253176,50.0,15.518258,"no const, no equals"
1,IsolationForest,200,0.3,1.0,f1 on nested cv,66.846154,7.211513,76.819757,10.570936,58.0,16.881339,"no const, no equals"
2,IsolationForest,150,0.5,0.5,accuracy on outer cv inner holdout,66.0,6.852633,79.603813,9.542887,50.714286,13.112543,"no const, no equals"
3,IsolationForest,50,0.3,0.5,f1 on outer cv inner holdout,65.769231,10.858149,71.321477,11.873186,62.285714,24.118754,"no const, no equals"


In [26]:
def outer_holdout_inner_cv_accuracy(X):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}

    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        best_accuracy = 0
        for trainval_idx, valid_idx in cv.split(X_train, y_train):
            X_trainval, X_valid = X_train[trainval_idx], X_train[valid_idx]
            y_trainval, y_valid = y_train[trainval_idx], y_train[valid_idx]

            idxs_neg = np.where(y_trainval == 1)[0]
            X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
            y_valid = np.append(y_valid, y_trainval[idxs_neg])

            X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
            y_trainval = np.delete(y_trainval, idxs_neg)

            for params in itertools.product(n_estimatorss, contaminations, max_featuress):
                clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

                clf.fit(X_trainval)

                pred_values = clf.predict(X_valid)
                true_values = [1 if y == 0 else -1 for y in y_valid]

                accuracy = accuracy_score(true_values, pred_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['n_estimators'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['max_features'] = params[2]

        clf = IsolationForest(**best_params)

        idxs_neg = np.where(y_train == 1)[0]
        X_train = np.delete(X_train, idxs_neg, axis=0)

        clf.fit(X_train)

        pred_values = clf.predict(X_test)
        true_values = [1 if y == 0 else -1 for y in y_test]

        accuracy_scores.append(accuracy_score(true_values, pred_values))
        precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
        recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'IsolationForest',
            'best n_estimators': best_params['n_estimators'],
            'best contamination': best_params['contamination'],
            'best max_features': best_params['max_features'],
            'method used for model selection': 'accuracy on outer holdout inner cv',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'no const, no equals'}

In [27]:
results = outer_holdout_inner_cv_accuracy(X)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,IsolationForest,200,0.5,1.0,accuracy on nested cv,65.692308,9.161277,79.175258,12.253176,50.0,15.518258,"no const, no equals"
1,IsolationForest,200,0.3,1.0,f1 on nested cv,66.846154,7.211513,76.819757,10.570936,58.0,16.881339,"no const, no equals"
2,IsolationForest,150,0.5,0.5,accuracy on outer cv inner holdout,66.0,6.852633,79.603813,9.542887,50.714286,13.112543,"no const, no equals"
3,IsolationForest,50,0.3,0.5,f1 on outer cv inner holdout,65.769231,10.858149,71.321477,11.873186,62.285714,24.118754,"no const, no equals"
4,IsolationForest,50,0.5,0.2,accuracy on outer holdout inner cv,66.153846,6.389711,74.196165,11.800068,49.883372,15.596901,"no const, no equals"


In [28]:
def outer_holdout_inner_cv_f1(X):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}

    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        best_f1 = 0
        for trainval_idx, valid_idx in cv.split(X_train, y_train):
            X_trainval, X_valid = X_train[trainval_idx], X_train[valid_idx]
            y_trainval, y_valid = y_train[trainval_idx], y_train[valid_idx]

            idxs_neg = np.where(y_trainval == 1)[0]
            X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
            y_valid = np.append(y_valid, y_trainval[idxs_neg])

            X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
            y_trainval = np.delete(y_trainval, idxs_neg)

            for params in itertools.product(n_estimatorss, contaminations, max_featuress):
                clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

                clf.fit(X_trainval)

                pred_values = clf.predict(X_valid)
                true_values = [1 if y == 0 else -1 for y in y_valid]

                f1 = f1_score(true_values, pred_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['n_estimators'] = params[0]
                    best_params['contamination'] = params[1]
                    best_params['max_features'] = params[2]

        clf = IsolationForest(**best_params)

        idxs_neg = np.where(y_train == 1)[0]
        X_train = np.delete(X_train, idxs_neg, axis=0)

        clf.fit(X_train)

        pred_values = clf.predict(X_test)
        true_values = [1 if y == 0 else -1 for y in y_test]

        accuracy_scores.append(accuracy_score(true_values, pred_values))
        precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
        recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'IsolationForest',
            'best n_estimators': best_params['n_estimators'],
            'best contamination': best_params['contamination'],
            'best max_features': best_params['max_features'],
            'method used for model selection': 'f1 on outer holdout inner cv',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'no const, no equals'}

In [29]:
results = outer_holdout_inner_cv_f1(X)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,IsolationForest,200,0.5,1.0,accuracy on nested cv,65.692308,9.161277,79.175258,12.253176,50.0,15.518258,"no const, no equals"
1,IsolationForest,200,0.3,1.0,f1 on nested cv,66.846154,7.211513,76.819757,10.570936,58.0,16.881339,"no const, no equals"
2,IsolationForest,150,0.5,0.5,accuracy on outer cv inner holdout,66.0,6.852633,79.603813,9.542887,50.714286,13.112543,"no const, no equals"
3,IsolationForest,50,0.3,0.5,f1 on outer cv inner holdout,65.769231,10.858149,71.321477,11.873186,62.285714,24.118754,"no const, no equals"
4,IsolationForest,50,0.5,0.2,accuracy on outer holdout inner cv,66.153846,6.389711,74.196165,11.800068,49.883372,15.596901,"no const, no equals"
5,IsolationForest,100,0.5,0.5,f1 on outer holdout inner cv,64.230769,7.307692,77.372711,12.980495,50.702714,14.381863,"no const, no equals"


In [30]:
def double_holdout_accuracy(X):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}

    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        best_accuracy = 0

        X_trainval, X_valid, y_trainval, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)

        idxs_neg = np.where(y_trainval == 1)[0]
        X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
        y_valid = np.append(y_valid, y_trainval[idxs_neg])

        X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
        y_trainval = np.delete(y_trainval, idxs_neg)

        for params in itertools.product(n_estimatorss, contaminations, max_featuress):
            clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

            clf.fit(X_trainval)

            pred_values = clf.predict(X_valid)
            true_values = [1 if y == 0 else -1 for y in y_valid]

            accuracy = accuracy_score(true_values, pred_values)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params['n_estimators'] = params[0]
                best_params['contamination'] = params[1]
                best_params['max_features'] = params[2]

        clf = IsolationForest(**best_params)

        idxs_neg = np.where(y_train == 1)[0]
        X_train = np.delete(X_train, idxs_neg, axis=0)

        clf.fit(X_train)

        pred_values = clf.predict(X_test)
        true_values = [1 if y == 0 else -1 for y in y_test]

        accuracy_scores.append(accuracy_score(true_values, pred_values))
        precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
        recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'IsolationForest',
            'best n_estimators': best_params['n_estimators'],
            'best contamination': best_params['contamination'],
            'best max_features': best_params['max_features'],
            'method used for model selection': 'accuracy on double holdout',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'no const, no equals'}

In [31]:
results = double_holdout_accuracy(X)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,IsolationForest,200,0.5,1.0,accuracy on nested cv,65.692308,9.161277,79.175258,12.253176,50.0,15.518258,"no const, no equals"
1,IsolationForest,200,0.3,1.0,f1 on nested cv,66.846154,7.211513,76.819757,10.570936,58.0,16.881339,"no const, no equals"
2,IsolationForest,150,0.5,0.5,accuracy on outer cv inner holdout,66.0,6.852633,79.603813,9.542887,50.714286,13.112543,"no const, no equals"
3,IsolationForest,50,0.3,0.5,f1 on outer cv inner holdout,65.769231,10.858149,71.321477,11.873186,62.285714,24.118754,"no const, no equals"
4,IsolationForest,50,0.5,0.2,accuracy on outer holdout inner cv,66.153846,6.389711,74.196165,11.800068,49.883372,15.596901,"no const, no equals"
5,IsolationForest,100,0.5,0.5,f1 on outer holdout inner cv,64.230769,7.307692,77.372711,12.980495,50.702714,14.381863,"no const, no equals"
6,IsolationForest,50,0.5,1.0,accuracy on double holdout,63.076923,8.804249,75.392857,15.458603,45.105199,14.179488,"no const, no equals"


In [32]:
def double_holdout_f1(X):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}

    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    y = X['Mezzo'].values
    X = X.drop(columns='Mezzo').values

    random_seeds = np.random.randint(2343, 3485327, size=10)
    for seed in random_seeds:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        best_f1 = 0

        X_trainval, X_valid, y_trainval, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)

        idxs_neg = np.where(y_trainval == 1)[0]
        X_valid = np.append(X_valid, X_trainval[idxs_neg], axis=0)
        y_valid = np.append(y_valid, y_trainval[idxs_neg])

        X_trainval = np.delete(X_trainval, idxs_neg, axis=0)
        y_trainval = np.delete(y_trainval, idxs_neg)

        for params in itertools.product(n_estimatorss, contaminations, max_featuress):
            clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])

            clf.fit(X_trainval)

            pred_values = clf.predict(X_valid)
            true_values = [1 if y == 0 else -1 for y in y_valid]

            f1 = f1_score(true_values, pred_values)
            if f1 > best_f1:
                best_f1 = f1
                best_params['n_estimators'] = params[0]
                best_params['contamination'] = params[1]
                best_params['max_features'] = params[2]

        clf = IsolationForest(**best_params)

        idxs_neg = np.where(y_train == 1)[0]
        X_train = np.delete(X_train, idxs_neg, axis=0)

        clf.fit(X_train)

        pred_values = clf.predict(X_test)
        true_values = [1 if y == 0 else -1 for y in y_test]

        accuracy_scores.append(accuracy_score(true_values, pred_values))
        precision_scores.append(precision_score(true_values, pred_values, zero_division=0.0))
        recall_scores.append(recall_score(true_values, pred_values))

    return {'algorythm': 'IsolationForest',
            'best n_estimators': best_params['n_estimators'],
            'best contamination': best_params['contamination'],
            'best max_features': best_params['max_features'],
            'method used for model selection': 'f1 on double holdout',
            'accuracy mean': np.mean(accuracy_scores) * 100,
            'accuracy std': np.std(accuracy_scores) * 100,
            'precision mean': np.mean(precision_scores) * 100,
            'precision  std': np.std(precision_scores) * 100,
            'recall mean': np.mean(recall_scores) * 100,
            'recall std': np.std(recall_scores) * 100,
            'data set': 'no const, no equals'}

In [33]:
results = double_holdout_f1(X)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,method used for model selection,accuracy mean,accuracy std,precision mean,precision std,recall mean,recall std,data set
0,IsolationForest,200,0.5,1.0,accuracy on nested cv,65.692308,9.161277,79.175258,12.253176,50.0,15.518258,"no const, no equals"
1,IsolationForest,200,0.3,1.0,f1 on nested cv,66.846154,7.211513,76.819757,10.570936,58.0,16.881339,"no const, no equals"
2,IsolationForest,150,0.5,0.5,accuracy on outer cv inner holdout,66.0,6.852633,79.603813,9.542887,50.714286,13.112543,"no const, no equals"
3,IsolationForest,50,0.3,0.5,f1 on outer cv inner holdout,65.769231,10.858149,71.321477,11.873186,62.285714,24.118754,"no const, no equals"
4,IsolationForest,50,0.5,0.2,accuracy on outer holdout inner cv,66.153846,6.389711,74.196165,11.800068,49.883372,15.596901,"no const, no equals"
5,IsolationForest,100,0.5,0.5,f1 on outer holdout inner cv,64.230769,7.307692,77.372711,12.980495,50.702714,14.381863,"no const, no equals"
6,IsolationForest,50,0.5,1.0,accuracy on double holdout,63.076923,8.804249,75.392857,15.458603,45.105199,14.179488,"no const, no equals"
7,IsolationForest,150,0.3,1.0,f1 on double holdout,63.846154,9.760444,75.217879,8.689068,54.766484,13.967951,"no const, no equals"


In [34]:
file_path = "if_exp1_df.pickle"

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)