In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import svm 
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from collections import defaultdict
import itertools

In [2]:
df = pd.read_csv("../data/run-over-dataset.csv")
df.head()

Unnamed: 0,VERBALE,DATA,SESSO,ANNI,PESO,ALTEZZA,BMI,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,...,II raggio sx.1,III raggio sx.1,IV raggio sx.1,V raggio sx.1,Art. coxo-femorale dx,Art. coxo-femorale sx,Rotula o Ginocchio dx,Rotula o Ginocchio sx,Caviglia dx,Caviglia sx
0,85567,10/29/1999,0,81,84,175,274285714285714,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,85829,1/14/2000,1,69,69,162,26291723822588,1,4,4,...,0,0,0,0,0,0,0,0,0,0
2,85977,3/10/2000,1,71,67,155,278876170655567,1,2,0,...,0,0,0,0,0,0,0,0,0,0
3,86220,6/14/2000,1,54,60,159,237332384003797,1,4,0,...,0,0,0,0,0,0,0,0,0,0
4,86247,6/22/2000,1,78,69,167,247409372871024,1,2,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
cols_to_drop = ['VERBALE', 'DATA', 'BMI']
X = df.drop(columns=cols_to_drop)
X['ALTEZZA'] = [int(float(h.replace(',', '.'))*100) for h in X['ALTEZZA']]
X['PESO'] = [int(float(str(h).replace(',', '.'))) for h in X['PESO']]
X.head()

Unnamed: 0,SESSO,ANNI,PESO,ALTEZZA,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,Testa:Cervelletto,Testa:Tronco encefalico,...,II raggio sx.1,III raggio sx.1,IV raggio sx.1,V raggio sx.1,Art. coxo-femorale dx,Art. coxo-femorale sx,Rotula o Ginocchio dx,Rotula o Ginocchio sx,Caviglia dx,Caviglia sx
0,0,81,84,175,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,69,69,162,1,4,4,4,4,4,...,0,0,0,0,0,0,0,0,0,0
2,1,71,67,155,1,2,0,1,1,2,...,0,0,0,0,0,0,0,0,0,0
3,1,54,60,159,1,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,78,69,167,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X.shape

(130, 364)

In [5]:
def low_std_cols(X, beta):
    columns_low_std = []
    for column in X.columns:
        if X[column].std() <= beta:
            columns_low_std.append(column)

    return columns_low_std, len(columns_low_std)

In [6]:
def zero_columns(X):
    zero_cols = []
    cols, n = low_std_cols(X, 0)

    for col in cols:
        if sum(X[col]) == 0:
            zero_cols.append(col)
    
    return zero_cols, (len(zero_cols) == n)

In [7]:
cols_to_drop, ok = zero_columns(X)
X = X.drop(columns=cols_to_drop)
X.shape

(130, 324)

Eliminate 40 colonne composte da soli 0 e per cui ininfluenti per la classificazione.

In [8]:
def equal_columns(X):
    cols = set()

    for column1 in X.columns:
        for column2 in X.columns:
            equals = X[column1] == X[column2]
            if (equals.all() and column1 != column2):
                cols.add((column1, column2))

    return cols

cols = equal_columns(X)
cols

{('Gomito sx', 'Polso sx'),
 ('Linea angolo-scapolare dx.10', 'Linea angolo-scapolare dx.9'),
 ('Linea angolo-scapolare dx.9', 'Linea angolo-scapolare dx.10'),
 ('Linea ascellare anteriore dx.10', 'Linea ascellare anteriore dx.11'),
 ('Linea ascellare anteriore dx.11', 'Linea ascellare anteriore dx.10'),
 ('Linea ascellare anteriore dx.8', 'Linea ascellare anteriore dx.9'),
 ('Linea ascellare anteriore dx.9', 'Linea ascellare anteriore dx.8'),
 ('Linea ascellare media dx.10', 'Linea ascellare media dx.11'),
 ('Linea ascellare media dx.11', 'Linea ascellare media dx.10'),
 ('Linea ascellare media sx.10', 'Linea ascellare media sx.9'),
 ('Linea ascellare media sx.9', 'Linea ascellare media sx.10'),
 ('Linea ascellare posteriore dx.2', 'Linea ascellare posteriore dx.3'),
 ('Linea ascellare posteriore dx.3', 'Linea ascellare posteriore dx.2'),
 ('Linea emiclaveare dx.10', 'Linea emiclaveare dx.11'),
 ('Linea emiclaveare dx.11', 'Linea emiclaveare dx.10'),
 ('Linea parasternale dx.10', 'Lin

In [9]:
X = X.T.drop_duplicates().T
X.shape

(130, 312)

Eliminate 12 colonne duplicate.

In [10]:
mezzo_1 = X[X['Mezzo'] == 1].drop(columns='Mezzo').values
mezzo_0 = X[X['Mezzo'] == 0].drop(columns='Mezzo').values
classes = X['Mezzo'].values

In [11]:
def nested_cv_svm_accuracy(X, Y):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
        best_accuracy = 0

        for trainval_idx, test_idx in outer_cv.split(X):
            X_trainval = X[trainval_idx]
            X_test = X[test_idx]

            inner_cv = KFold(n_splits=3, shuffle=True, random_state=seed)
            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train = X_trainval[train_idx]
                X_valid = X_trainval[valid_idx]

                params_list = list(itertools.product(kernels, gammas, nus))
                for params in params_list:
                    clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    accuracy = accuracy_score(true_values, predicted_values)
                    if accuracy> best_accuracy:
                        best_accuracy = accuracy
                        best_params['kernel'] = params[0]
                        best_params['gamma'] = params[1]
                        best_params['nu'] = params[2]
                    
            clf = svm.OneClassSVM(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)
            
            tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
            fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = tp / (tp + fp) if (tp + fp) != 0 else 0
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'accuracy score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq'}

    return results

In [12]:
results = nested_cv_svm_accuracy(mezzo_0, mezzo_1)

In [13]:
results

{'algorythm': 'OneClassSVM',
 'best kernel': 'rbf',
 'best gamma': 0.003593813663804626,
 'best nu': 0.01,
 'score used for model selection': 'accuracy score',
 'method used': 'nested cv',
 'accuracy score mean': 79.02702702702705,
 'accuracy score std': 2.062571246306911,
 'precision score mean': 18.53253968253968,
 'precision score std': 18.23165387308145,
 'recall score mean': 8.142857142857142,
 'recall score std': 9.2593030989136,
 'f1 score mean': 11.15588245977262,
 'f1 score std': 12.063343118072059,
 'data set': 'no dup, no eq'}

In [14]:
scores_df = pd.DataFrame(results, index=[0])
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.003594,0.01,accuracy score,nested cv,79.027027,2.062571,18.53254,18.231654,8.142857,9.259303,11.155882,12.063343,"no dup, no eq"


In [15]:
def nested_cv_svm_f1(X, Y):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
        best_f1 = 0

        for trainval_idx, test_idx in outer_cv.split(X):
            X_trainval = X[trainval_idx]
            X_test = X[test_idx]

            inner_cv = KFold(n_splits=3, shuffle=True, random_state=seed)
            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train = X_trainval[train_idx]
                X_valid = X_trainval[valid_idx]

                params_list = list(itertools.product(kernels, gammas, nus))
                for params in params_list:
                    clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    f1 = f1_score(true_values, predicted_values)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_params['kernel'] = params[0]
                        best_params['gamma'] = params[1]
                        best_params['nu'] = params[2]
                    
            clf = svm.OneClassSVM(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)
            
            tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
            fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = tp / (tp + fp) if (tp + fp) != 0 else 0
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'f1 score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq'}

    return results

In [16]:
results = nested_cv_svm_f1(mezzo_0, mezzo_1)

In [17]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df  

In [18]:
scores_df = add_record(scores_df, results)

In [19]:
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.003594,0.01,accuracy score,nested cv,79.027027,2.062571,18.53254,18.231654,8.142857,9.259303,11.155882,12.063343,"no dup, no eq"
1,OneClassSVM,rbf,0.001,0.01,f1 score,nested cv,65.702703,2.210441,27.982083,4.669921,53.571429,14.160163,36.632178,7.301241,"no dup, no eq"


In [23]:
def outer_cv_inner_holdout_accuracy(X, Y):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        best_accuracy = 0
        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed, shuffle=True)
            
            params_list = list(itertools.product(kernels, gammas, nus))
            for params in params_list:
                clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['kernel'] = params[0]
                    best_params['gamma'] = params[1]
                    best_params['nu'] = params[2]
            
            clf = svm.OneClassSVM(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)
            
            tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
            fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = tp / (tp + fp) if (tp + fp) != 0 else 0
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq'}
        
    return results

In [21]:
results = outer_cv_inner_holdout_accuracy(mezzo_0, mezzo_1)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.003594,0.01,accuracy score,nested cv,79.027027,2.062571,18.53254,18.231654,8.142857,9.259303,11.155882,12.063343,"no dup, no eq"
1,OneClassSVM,rbf,0.001,0.01,f1 score,nested cv,65.702703,2.210441,27.982083,4.669921,53.571429,14.160163,36.632178,7.301241,"no dup, no eq"
2,OneClassSVM,rbf,0.012915,0.01,accuracy score,outer cv inner holdout,80.756757,1.066113,2.090476,7.643308,0.857143,3.392669,1.204177,4.643457,"no dup, no eq"


In [24]:
def outer_cv_inner_holdout_f1(X, Y):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        best_f1 = 0
        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed, shuffle=True)
            
            params_list = list(itertools.product(kernels, gammas, nus))
            for params in params_list:
                clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['kernel'] = params[0]
                    best_params['gamma'] = params[1]
                    best_params['nu'] = params[2]
            
            clf = svm.OneClassSVM(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)
            
            tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
            fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = tp / (tp + fp) if (tp + fp) != 0 else 0
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'f1 score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq'}
        
    return results

In [25]:
results = outer_cv_inner_holdout_f1(mezzo_0, mezzo_1)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.003594,0.01,accuracy score,nested cv,79.027027,2.062571,18.53254,18.231654,8.142857,9.259303,11.155882,12.063343,"no dup, no eq"
1,OneClassSVM,rbf,0.001,0.01,f1 score,nested cv,65.702703,2.210441,27.982083,4.669921,53.571429,14.160163,36.632178,7.301241,"no dup, no eq"
2,OneClassSVM,rbf,0.012915,0.01,accuracy score,outer cv inner holdout,80.756757,1.066113,2.090476,7.643308,0.857143,3.392669,1.204177,4.643457,"no dup, no eq"
3,OneClassSVM,rbf,0.001,0.5,f1 score,outer cv inner holdout,65.513514,7.04903,27.420288,5.172216,50.285714,15.905332,35.106908,7.545007,"no dup, no eq"


In [26]:
def outer_holdout_inner_cv_accuracy(X, Y):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed, shuffle=True)

        best_accuracy = 0
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, valid_idx in cv.split(X_trainval):
            X_train = X_trainval[train_idx]
            X_valid = X_trainval[valid_idx]

            params_list = list(itertools.product(kernels, gammas, nus))
            for params in params_list:
                clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['kernel'] = params[0]
                    best_params['gamma'] = params[1]
                    best_params['nu'] = params[2]
                
        clf = svm.OneClassSVM(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)
            
        tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
        fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq'}
    
    return results
        

In [27]:
results = outer_holdout_inner_cv_accuracy(mezzo_0, mezzo_1)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.003594,0.01,accuracy score,nested cv,79.027027,2.062571,18.53254,18.231654,8.142857,9.259303,11.155882,12.063343,"no dup, no eq"
1,OneClassSVM,rbf,0.001,0.01,f1 score,nested cv,65.702703,2.210441,27.982083,4.669921,53.571429,14.160163,36.632178,7.301241,"no dup, no eq"
2,OneClassSVM,rbf,0.012915,0.01,accuracy score,outer cv inner holdout,80.756757,1.066113,2.090476,7.643308,0.857143,3.392669,1.204177,4.643457,"no dup, no eq"
3,OneClassSVM,rbf,0.001,0.5,f1 score,outer cv inner holdout,65.513514,7.04903,27.420288,5.172216,50.285714,15.905332,35.106908,7.545007,"no dup, no eq"
4,OneClassSVM,rbf,0.003594,0.05,accuracy score,outer holdout inner cv,80.405405,1.384723,3.333333,10.0,1.428571,4.285714,2.0,6.0,"no dup, no eq"


In [28]:
def outer_holdout_inner_cv_f1(X, Y):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed, shuffle=True)

        best_f1 = 0
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, valid_idx in cv.split(X_trainval):
            X_train = X_trainval[train_idx]
            X_valid = X_trainval[valid_idx]

            params_list = list(itertools.product(kernels, gammas, nus))
            for params in params_list:
                clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['kernel'] = params[0]
                    best_params['gamma'] = params[1]
                    best_params['nu'] = params[2]
                
        clf = svm.OneClassSVM(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)
            
        tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
        fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'f1 score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq'}
        
    return results
        

In [29]:
results = outer_holdout_inner_cv_f1(mezzo_0, mezzo_1)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.003594,0.01,accuracy score,nested cv,79.027027,2.062571,18.53254,18.231654,8.142857,9.259303,11.155882,12.063343,"no dup, no eq"
1,OneClassSVM,rbf,0.001,0.01,f1 score,nested cv,65.702703,2.210441,27.982083,4.669921,53.571429,14.160163,36.632178,7.301241,"no dup, no eq"
2,OneClassSVM,rbf,0.012915,0.01,accuracy score,outer cv inner holdout,80.756757,1.066113,2.090476,7.643308,0.857143,3.392669,1.204177,4.643457,"no dup, no eq"
3,OneClassSVM,rbf,0.001,0.5,f1 score,outer cv inner holdout,65.513514,7.04903,27.420288,5.172216,50.285714,15.905332,35.106908,7.545007,"no dup, no eq"
4,OneClassSVM,rbf,0.003594,0.05,accuracy score,outer holdout inner cv,80.405405,1.384723,3.333333,10.0,1.428571,4.285714,2.0,6.0,"no dup, no eq"
5,OneClassSVM,rbf,0.001,0.5,f1 score,outer holdout inner cv,67.432432,2.216381,26.866097,5.087898,44.285714,14.914724,33.162737,8.084593,"no dup, no eq"


In [30]:
def double_holdout_accuracy(X, Y):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed, shuffle=True)

        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed, shuffle=True)
        best_accuracy = 0

        params_list = list(itertools.product(kernels, gammas, nus))
        for params in params_list:
            clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params['kernel'] = params[0]
                best_params['gamma'] = params[1]
                best_params['nu'] = params[2]
        
        clf = svm.OneClassSVM(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)
        
        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
        fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])
        
        accuracy = accuracy_score(true_values, predicted_values)
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'accuracy score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq'}
        
    return results

In [31]:
results = double_holdout_accuracy(mezzo_0, mezzo_1)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.003594,0.01,accuracy score,nested cv,79.027027,2.062571,18.53254,18.231654,8.142857,9.259303,11.155882,12.063343,"no dup, no eq"
1,OneClassSVM,rbf,0.001,0.01,f1 score,nested cv,65.702703,2.210441,27.982083,4.669921,53.571429,14.160163,36.632178,7.301241,"no dup, no eq"
2,OneClassSVM,rbf,0.012915,0.01,accuracy score,outer cv inner holdout,80.756757,1.066113,2.090476,7.643308,0.857143,3.392669,1.204177,4.643457,"no dup, no eq"
3,OneClassSVM,rbf,0.001,0.5,f1 score,outer cv inner holdout,65.513514,7.04903,27.420288,5.172216,50.285714,15.905332,35.106908,7.545007,"no dup, no eq"
4,OneClassSVM,rbf,0.003594,0.05,accuracy score,outer holdout inner cv,80.405405,1.384723,3.333333,10.0,1.428571,4.285714,2.0,6.0,"no dup, no eq"
5,OneClassSVM,rbf,0.001,0.5,f1 score,outer holdout inner cv,67.432432,2.216381,26.866097,5.087898,44.285714,14.914724,33.162737,8.084593,"no dup, no eq"
6,OneClassSVM,rbf,0.003594,0.01,accuracy score,double holdout,80.945946,0.405405,3.333333,10.0,0.714286,2.142857,1.176471,3.529412,"no dup, no eq"


In [32]:
def double_holdout_f1(X, Y):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed, shuffle=True)

        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed, shuffle=True)
        best_f1 = 0

        params_list = list(itertools.product(kernels, gammas, nus))
        for params in params_list:
            clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            f1 = f1_score(true_values, predicted_values)
            if f1 > best_f1:
                best_f1 = f1
                best_params['kernel'] = params[0]
                best_params['gamma'] = params[1]
                best_params['nu'] = params[2]
        
        clf = svm.OneClassSVM(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)
        
        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
        fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])
        
        accuracy = accuracy_score(true_values, predicted_values)
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'f1 score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq'}
        
    return results

In [33]:
results = double_holdout_f1(mezzo_0, mezzo_1)
scores_df = add_record(scores_df, results)
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.003594,0.01,accuracy score,nested cv,79.027027,2.062571,18.53254,18.231654,8.142857,9.259303,11.155882,12.063343,"no dup, no eq"
1,OneClassSVM,rbf,0.001,0.01,f1 score,nested cv,65.702703,2.210441,27.982083,4.669921,53.571429,14.160163,36.632178,7.301241,"no dup, no eq"
2,OneClassSVM,rbf,0.012915,0.01,accuracy score,outer cv inner holdout,80.756757,1.066113,2.090476,7.643308,0.857143,3.392669,1.204177,4.643457,"no dup, no eq"
3,OneClassSVM,rbf,0.001,0.5,f1 score,outer cv inner holdout,65.513514,7.04903,27.420288,5.172216,50.285714,15.905332,35.106908,7.545007,"no dup, no eq"
4,OneClassSVM,rbf,0.003594,0.05,accuracy score,outer holdout inner cv,80.405405,1.384723,3.333333,10.0,1.428571,4.285714,2.0,6.0,"no dup, no eq"
5,OneClassSVM,rbf,0.001,0.5,f1 score,outer holdout inner cv,67.432432,2.216381,26.866097,5.087898,44.285714,14.914724,33.162737,8.084593,"no dup, no eq"
6,OneClassSVM,rbf,0.003594,0.01,accuracy score,double holdout,80.945946,0.405405,3.333333,10.0,0.714286,2.142857,1.176471,3.529412,"no dup, no eq"
7,OneClassSVM,rbf,0.003594,0.01,f1 score,double holdout,67.432432,3.495275,26.403133,4.163302,44.285714,19.112983,32.079169,9.751404,"no dup, no eq"


In [34]:
import pickle

file_path = 'svm_exp1.pickle'

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)