In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from collections import defaultdict
import itertools

In [2]:
df = pd.read_csv("../data/run-over-dataset.csv")
cols_to_drop = ['VERBALE', 'DATA', 'BMI']
X = df.drop(columns=cols_to_drop)
X['ALTEZZA'] = [int(float(h.replace(',', '.'))*100) for h in X['ALTEZZA']]
X['PESO'] = [int(float(str(h).replace(',', '.'))) for h in X['PESO']]
X.head()

Unnamed: 0,SESSO,ANNI,PESO,ALTEZZA,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,Testa:Cervelletto,Testa:Tronco encefalico,...,II raggio sx.1,III raggio sx.1,IV raggio sx.1,V raggio sx.1,Art. coxo-femorale dx,Art. coxo-femorale sx,Rotula o Ginocchio dx,Rotula o Ginocchio sx,Caviglia dx,Caviglia sx
0,0,81,84,175,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,69,69,162,1,4,4,4,4,4,...,0,0,0,0,0,0,0,0,0,0
2,1,71,67,155,1,2,0,1,1,2,...,0,0,0,0,0,0,0,0,0,0
3,1,54,60,159,1,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,78,69,167,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
def low_std_cols(X, beta):
    columns_low_std = []
    for column in X.columns:
        if X[column].std() <= beta:
            columns_low_std.append(column)

    return columns_low_std, len(columns_low_std)

def zero_columns(X):
    zero_cols = []
    cols, n = low_std_cols(X, 0)

    for col in cols:
        if sum(X[col]) == 0:
            zero_cols.append(col)
    
    return zero_cols, (len(zero_cols) == n)

cols_to_drop, ok = zero_columns(X)
X = X.drop(columns=cols_to_drop)
X = X.T.drop_duplicates().T
X.shape

(130, 312)

In [4]:
mezzo_1 = X[X['Mezzo'] == 1].drop(columns='Mezzo').values
mezzo_0 = X[X['Mezzo'] == 0].drop(columns='Mezzo').values
classes = X['Mezzo'].values

In [7]:
def nested_cv_accuracy(X, Y):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in outer_cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]
            
            best_accuracy = 0
            inner_cv = KFold(n_splits=3, shuffle=True, random_state=seed)
            
            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

                for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
                    clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    accuracy = accuracy_score(true_values, predicted_values)
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_params['n_estimators'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'accuracy score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}

    return results

In [8]:
results = nested_cv_accuracy(mezzo_0, mezzo_1)
results

{'algorythm': 'IsolationForest',
 'best n_estimators': 150,
 'best contamination': 0.5,
 'best max_features': 0.5,
 'score used for model selection': 'accuracy score',
 'method used': 'nested cv',
 'accuracy score mean': 77.24324324324324,
 'accuracy score std': 2.61143185495809,
 'precision score mean': 41.69940085489957,
 'precision score std': 5.3962032916370095,
 'recall score mean': 50.0,
 'recall score std': 10.101525445522109,
 'f1 score mean': 45.10944048368909,
 'f1 score std': 6.456481244630773}

In [9]:
results['data set'] = 'no 0 columns, no equal columns'
scores_df = pd.DataFrame(results, index=[0])
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.5,accuracy score,nested cv,77.243243,2.611432,41.699401,5.396203,50.0,10.101525,45.10944,6.456481,"no 0 columns, no equal columns"


In [10]:
def nested_cv_f1(X, Y):
    n_estimatorss = [50, 100, 150, 200]
    contaminations = [0.01, 0.05, 0.1, 0.3, 0.5]
    max_featuress = [0.2, 0.5, 1.0]

    best_params = {'n_estimators': 0, 'contamination': 0, 'max_features': 0}
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    random_seeds = np.random.randint(20, 100000, size=5)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for trainval_idx, test_idx in outer_cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]
            
            best_f1 = 0
            inner_cv = KFold(n_splits=3, shuffle=True, random_state=seed)
            
            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train, X_valid = X_trainval[train_idx], X_trainval[valid_idx]

                for params in list(itertools.product(n_estimatorss, contaminations, max_featuress)):
                    clf = IsolationForest(n_estimators=params[0], contamination=params[1], max_features=params[2])
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    f1 = f1_score(true_values, predicted_values)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_params['n_estimators'] = params[0]
                        best_params['contamination'] = params[1]
                        best_params['max_features'] = params[2]

            clf = IsolationForest(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = precision_score(true_values, predicted_values)
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'IsolationForest',
               'best n_estimators': best_params['n_estimators'],
               'best contamination': best_params['contamination'],
               'best max_features': best_params['max_features'],
               'score used for model selection': 'f1 score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}

    return results

In [11]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df  

In [13]:
results = nested_cv_f1(mezzo_0, mezzo_1)
results['data set'] = 'no 0 columns, no equal columns'
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.5,accuracy score,nested cv,77.243243,2.611432,41.699401,5.396203,50.0,10.101525,45.10944,6.456481,"no 0 columns, no equal columns"
1,IsolationForest,150,0.3,0.5,f1 score,nested cv,73.621622,5.74755,38.369669,7.57561,58.571429,15.649216,45.379131,8.397582,"no 0 columns, no equal columns"


In [14]:
import pickle

file_path = "if_exp1_df.pickle"

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)

In [15]:
import pickle

file_path = "if_exp1_df.pickle"

with open(file_path, "rb") as file:
    scores_df = pickle.load(file)

In [16]:
scores_df

Unnamed: 0,algorythm,best n_estimators,best contamination,best max_features,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,IsolationForest,150,0.5,0.5,accuracy score,nested cv,77.243243,2.611432,41.699401,5.396203,50.0,10.101525,45.10944,6.456481,"no 0 columns, no equal columns"
1,IsolationForest,150,0.3,0.5,f1 score,nested cv,73.621622,5.74755,38.369669,7.57561,58.571429,15.649216,45.379131,8.397582,"no 0 columns, no equal columns"
