In [5]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import svm 
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.decomposition import PCA
from collections import defaultdict
import itertools

In [2]:
df = pd.read_csv("../data/run-over-dataset.csv")
cols_to_drop = ['VERBALE', 'DATA', 'BMI']
X = df.drop(columns=cols_to_drop)
X['ALTEZZA'] = [int(float(h.replace(',', '.'))*100) for h in X['ALTEZZA']]
X['PESO'] = [int(float(str(h).replace(',', '.'))) for h in X['PESO']]
X.head()

Unnamed: 0,SESSO,ANNI,PESO,ALTEZZA,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,Testa:Cervelletto,Testa:Tronco encefalico,...,II raggio sx.1,III raggio sx.1,IV raggio sx.1,V raggio sx.1,Art. coxo-femorale dx,Art. coxo-femorale sx,Rotula o Ginocchio dx,Rotula o Ginocchio sx,Caviglia dx,Caviglia sx
0,0,81,84,175,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,69,69,162,1,4,4,4,4,4,...,0,0,0,0,0,0,0,0,0,0
2,1,71,67,155,1,2,0,1,1,2,...,0,0,0,0,0,0,0,0,0,0
3,1,54,60,159,1,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,78,69,167,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
num_unique_values = X.nunique()
constant_columns = num_unique_values[num_unique_values == 1].index.tolist()
X = X.drop(columns=constant_columns)
X = X.T.drop_duplicates().T
X.shape

(130, 312)

In [4]:
mezzo_1 = X[X['Mezzo'] == 1].drop(columns='Mezzo').values
mezzo_0 = X[X['Mezzo'] == 0].drop(columns='Mezzo').values

In [11]:
def nested_cv_accuracy(X, Y, pca_components=0.9):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    pca = PCA(n_components=pca_components)

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        best_accuracy = 0

        for trainval_idx, test_idx in outer_cv.split(X):
            X_trainval = X[trainval_idx]
            X_test = X[test_idx]

            inner_cv = KFold(n_splits=7, shuffle=True, random_state=seed)
            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train = X_trainval[train_idx]
                X_valid = X_trainval[valid_idx]

                params_list = list(itertools.product(kernels, gammas, nus))
                for params in params_list:
                    clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    accuracy = accuracy_score(true_values, predicted_values)
                    if accuracy> best_accuracy:
                        best_accuracy = accuracy
                        best_params['kernel'] = params[0]
                        best_params['gamma'] = params[1]
                        best_params['nu'] = params[2]
                    
            clf = svm.OneClassSVM(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)
            
            tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
            fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = tp / (tp + fp) if (tp + fp) != 0 else 0
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'accuracy score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100}

    return results

In [12]:
results = nested_cv_accuracy(mezzo_0, mezzo_1, 0.95)
results

{'algorythm': 'OneClassSVM',
 'best kernel': 'rbf',
 'best gamma': 0.01291549665014884,
 'best nu': 0.01,
 'score used for model selection': 'accuracy score',
 'method used': 'nested cv',
 'accuracy score mean': 87.19402985074625,
 'accuracy score std': 5.235296694261334,
 'precision score mean': 4.068693693693693,
 'precision score std': 12.11968783432332,
 'recall score mean': 1.714285714285714,
 'recall score std': 4.642307659791978,
 'f1 score mean': 2.2494949494949497,
 'f1 score std': 6.32321384771561}

In [13]:
results['data set'] = 'no dup, no const, pca 0.95'

In [14]:
scores_df = pd.DataFrame(results, index=[0])
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.012915,0.01,accuracy score,nested cv,87.19403,5.235297,4.068694,12.119688,1.714286,4.642308,2.249495,6.323214,"no dup, no const, pca 0.95"


In [15]:
def add_record(df, record):
    new_record = pd.DataFrame(record, index=[0])
    df = pd.concat([df, new_record], ignore_index=True)
    return df  

In [18]:
def nested_cv_f1(X, Y, pca_components=0.9):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    pca = PCA(n_components=pca_components)

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        best_f1 = 0

        for trainval_idx, test_idx in outer_cv.split(X):
            X_trainval = X[trainval_idx]
            X_test = X[test_idx]

            inner_cv = KFold(n_splits=7, shuffle=True, random_state=seed)
            for train_idx, valid_idx in inner_cv.split(X_trainval):
                X_train = X_trainval[train_idx]
                X_valid = X_trainval[valid_idx]

                params_list = list(itertools.product(kernels, gammas, nus))
                for params in params_list:
                    clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                    clf.fit(X_train)

                    positive_class_predictions = clf.predict(X_valid)
                    negative_class_predictions = clf.predict(Y)

                    true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                    predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                    f1 = f1_score(true_values, predicted_values)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_params['kernel'] = params[0]
                        best_params['gamma'] = params[1]
                        best_params['nu'] = params[2]
                    
            clf = svm.OneClassSVM(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)
            
            tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
            fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = tp / (tp + fp) if (tp + fp) != 0 else 0
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'f1 score',
               'method used': 'nested cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no const, pca ' + str(pca_components)}

    return results

In [19]:
scores_df = add_record(scores_df, nested_cv_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.012915,0.01,accuracy score,nested cv,87.19403,5.235297,4.068694,12.119688,1.714286,4.642308,2.249495,6.323214,"no dup, no const, pca 0.95"
1,OneClassSVM,linear,0.001,0.4,f1 score,nested cv,62.402985,17.707425,13.613902,6.533688,42.714286,20.652123,19.989932,9.081377,"no dup, no const, pca 0.95"


In [20]:
def outer_cv_inner_holdout_accuracy(X, Y, pca_components=0.9):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    pca = PCA(n_components=pca_components)
    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        best_accuracy = 0
        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed, shuffle=True)
            
            params_list = list(itertools.product(kernels, gammas, nus))
            for params in params_list:
                clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['kernel'] = params[0]
                    best_params['gamma'] = params[1]
                    best_params['nu'] = params[2]
            
            clf = svm.OneClassSVM(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)
            
            tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
            fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = tp / (tp + fp) if (tp + fp) != 0 else 0
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq, pca ' + str(pca_components)}
        
    return results

In [22]:
scores_df = add_record(scores_df, outer_cv_inner_holdout_accuracy(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.012915,0.01,accuracy score,nested cv,87.19403,5.235297,4.068694,12.119688,1.714286,4.642308,2.249495,6.323214,"no dup, no const, pca 0.95"
1,OneClassSVM,linear,0.001,0.4,f1 score,nested cv,62.402985,17.707425,13.613902,6.533688,42.714286,20.652123,19.989932,9.081377,"no dup, no const, pca 0.95"
2,OneClassSVM,rbf,0.012915,0.05,accuracy score,outer cv inner holdout,80.243243,1.007278,2.333333,9.433981,0.428571,1.696335,0.720588,2.853434,"no dup, no eq, pca 0.95"


In [23]:
def outer_cv_inner_holdout_f1(X, Y, pca_components=0.9):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        best_f1 = 0
        for trainval_idx, test_idx in cv.split(X):
            X_trainval, X_test = X[trainval_idx], X[test_idx]

            X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed, shuffle=True)
            
            params_list = list(itertools.product(kernels, gammas, nus))
            for params in params_list:
                clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['kernel'] = params[0]
                    best_params['gamma'] = params[1]
                    best_params['nu'] = params[2]
            
            clf = svm.OneClassSVM(**best_params)
            clf.fit(X_trainval)

            positive_class_predictions = clf.predict(X_test)
            negative_class_predictions = clf.predict(Y)
            
            tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
            fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            precision = tp / (tp + fp) if (tp + fp) != 0 else 0
            recall = recall_score(true_values, predicted_values)
            f1 = f1_score(true_values, predicted_values)

            accuracy_scores.append(accuracy)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'f1 score',
               'method used': 'outer cv inner holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq, pca ' + str(pca_components)}
        
    return results

In [24]:
scores_df = add_record(scores_df, outer_cv_inner_holdout_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.012915,0.01,accuracy score,nested cv,87.19403,5.235297,4.068694,12.119688,1.714286,4.642308,2.249495,6.323214,"no dup, no const, pca 0.95"
1,OneClassSVM,linear,0.001,0.4,f1 score,nested cv,62.402985,17.707425,13.613902,6.533688,42.714286,20.652123,19.989932,9.081377,"no dup, no const, pca 0.95"
2,OneClassSVM,rbf,0.012915,0.05,accuracy score,outer cv inner holdout,80.243243,1.007278,2.333333,9.433981,0.428571,1.696335,0.720588,2.853434,"no dup, no eq, pca 0.95"
3,OneClassSVM,linear,0.001,0.3,f1 score,outer cv inner holdout,57.027027,9.845408,23.323434,5.931448,53.428571,17.799335,31.86776,8.069672,"no dup, no eq, pca 0.95"


In [25]:
def outer_holdout_inner_cv_accuracy(X, Y, pca_components=0.9):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed, shuffle=True)

        best_accuracy = 0
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, valid_idx in cv.split(X_trainval):
            X_train = X_trainval[train_idx]
            X_valid = X_trainval[valid_idx]

            params_list = list(itertools.product(kernels, gammas, nus))
            for params in params_list:
                clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                accuracy = accuracy_score(true_values, predicted_values)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['kernel'] = params[0]
                    best_params['gamma'] = params[1]
                    best_params['nu'] = params[2]
                
        clf = svm.OneClassSVM(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)
            
        tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
        fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'accuracy score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq, pca ' + str(pca_components)}
    
    return results
        

In [26]:
scores_df = add_record(scores_df, outer_holdout_inner_cv_accuracy(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.012915,0.01,accuracy score,nested cv,87.19403,5.235297,4.068694,12.119688,1.714286,4.642308,2.249495,6.323214,"no dup, no const, pca 0.95"
1,OneClassSVM,linear,0.001,0.4,f1 score,nested cv,62.402985,17.707425,13.613902,6.533688,42.714286,20.652123,19.989932,9.081377,"no dup, no const, pca 0.95"
2,OneClassSVM,rbf,0.012915,0.05,accuracy score,outer cv inner holdout,80.243243,1.007278,2.333333,9.433981,0.428571,1.696335,0.720588,2.853434,"no dup, no eq, pca 0.95"
3,OneClassSVM,linear,0.001,0.3,f1 score,outer cv inner holdout,57.027027,9.845408,23.323434,5.931448,53.428571,17.799335,31.86776,8.069672,"no dup, no eq, pca 0.95"
4,OneClassSVM,rbf,0.046416,0.01,accuracy score,outer holdout inner cv,80.675676,0.865287,0.0,0.0,0.0,0.0,0.0,0.0,"no dup, no eq, pca 0.95"


In [27]:
def outer_holdout_inner_cv_f1(X, Y, pca_components=0.9):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed, shuffle=True)

        best_f1 = 0
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)

        for train_idx, valid_idx in cv.split(X_trainval):
            X_train = X_trainval[train_idx]
            X_valid = X_trainval[valid_idx]

            params_list = list(itertools.product(kernels, gammas, nus))
            for params in params_list:
                clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
                clf.fit(X_train)

                positive_class_predictions = clf.predict(X_valid)
                negative_class_predictions = clf.predict(Y)

                true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
                predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

                f1 = f1_score(true_values, predicted_values)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params['kernel'] = params[0]
                    best_params['gamma'] = params[1]
                    best_params['nu'] = params[2]
                
        clf = svm.OneClassSVM(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)
            
        tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
        fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])

        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        accuracy = accuracy_score(true_values, predicted_values)
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'f1 score',
               'method used': 'outer holdout inner cv',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq, pca ' + str(pca_components)}
    
    return results
        

In [28]:
scores_df = add_record(scores_df, outer_holdout_inner_cv_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.012915,0.01,accuracy score,nested cv,87.19403,5.235297,4.068694,12.119688,1.714286,4.642308,2.249495,6.323214,"no dup, no const, pca 0.95"
1,OneClassSVM,linear,0.001,0.4,f1 score,nested cv,62.402985,17.707425,13.613902,6.533688,42.714286,20.652123,19.989932,9.081377,"no dup, no const, pca 0.95"
2,OneClassSVM,rbf,0.012915,0.05,accuracy score,outer cv inner holdout,80.243243,1.007278,2.333333,9.433981,0.428571,1.696335,0.720588,2.853434,"no dup, no eq, pca 0.95"
3,OneClassSVM,linear,0.001,0.3,f1 score,outer cv inner holdout,57.027027,9.845408,23.323434,5.931448,53.428571,17.799335,31.86776,8.069672,"no dup, no eq, pca 0.95"
4,OneClassSVM,rbf,0.046416,0.01,accuracy score,outer holdout inner cv,80.675676,0.865287,0.0,0.0,0.0,0.0,0.0,0.0,"no dup, no eq, pca 0.95"
5,OneClassSVM,linear,0.001,0.05,f1 score,outer holdout inner cv,58.648649,11.179523,23.116273,6.235496,51.428571,21.852941,31.007294,9.797178,"no dup, no eq, pca 0.95"


In [29]:
def double_holdout_accuracy(X, Y, pca_components=0.9):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed, shuffle=True)

        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed, shuffle=True)
        best_accuracy = 0

        params_list = list(itertools.product(kernels, gammas, nus))
        for params in params_list:
            clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            accuracy = accuracy_score(true_values, predicted_values)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params['kernel'] = params[0]
                best_params['gamma'] = params[1]
                best_params['nu'] = params[2]
        
        clf = svm.OneClassSVM(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)
        
        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
        fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])
        
        accuracy = accuracy_score(true_values, predicted_values)
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'accuracy score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq, pca ' + str(pca_components)}
        
    return results

In [30]:
scores_df = add_record(scores_df, double_holdout_accuracy(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.012915,0.01,accuracy score,nested cv,87.19403,5.235297,4.068694,12.119688,1.714286,4.642308,2.249495,6.323214,"no dup, no const, pca 0.95"
1,OneClassSVM,linear,0.001,0.4,f1 score,nested cv,62.402985,17.707425,13.613902,6.533688,42.714286,20.652123,19.989932,9.081377,"no dup, no const, pca 0.95"
2,OneClassSVM,rbf,0.012915,0.05,accuracy score,outer cv inner holdout,80.243243,1.007278,2.333333,9.433981,0.428571,1.696335,0.720588,2.853434,"no dup, no eq, pca 0.95"
3,OneClassSVM,linear,0.001,0.3,f1 score,outer cv inner holdout,57.027027,9.845408,23.323434,5.931448,53.428571,17.799335,31.86776,8.069672,"no dup, no eq, pca 0.95"
4,OneClassSVM,rbf,0.046416,0.01,accuracy score,outer holdout inner cv,80.675676,0.865287,0.0,0.0,0.0,0.0,0.0,0.0,"no dup, no eq, pca 0.95"
5,OneClassSVM,linear,0.001,0.05,f1 score,outer holdout inner cv,58.648649,11.179523,23.116273,6.235496,51.428571,21.852941,31.007294,9.797178,"no dup, no eq, pca 0.95"
6,OneClassSVM,rbf,0.046416,0.01,accuracy score,double holdout,80.540541,0.896385,0.0,0.0,0.0,0.0,0.0,0.0,"no dup, no eq, pca 0.95"


In [31]:
def double_holdout_f1(X, Y, pca_components=0.9):
    kernels = ['linear', 'rbf']
    gammas = np.logspace(-3, 2, 10)
    nus = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    best_params = {'kernel': '', 'gamma': 0, 'nu': 0}

    pca = PCA(n_components=pca_components)

    X = pca.fit_transform(X)
    Y = pca.transform(Y)

    random_seeds = np.random.randint(20, 100000, size=10)
    for seed in random_seeds:
        X_trainval, X_test = train_test_split(X, test_size=0.2, random_state=seed, shuffle=True)

        X_train, X_valid = train_test_split(X_trainval, test_size=0.2, random_state=seed, shuffle=True)
        best_f1 = 0

        params_list = list(itertools.product(kernels, gammas, nus))
        for params in params_list:
            clf = svm.OneClassSVM(kernel=params[0], gamma=params[1], nu=params[2])
            clf.fit(X_train)

            positive_class_predictions = clf.predict(X_valid)
            negative_class_predictions = clf.predict(Y)

            true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
            predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

            f1 = f1_score(true_values, predicted_values)
            if f1 > best_f1:
                best_f1 = f1
                best_params['kernel'] = params[0]
                best_params['gamma'] = params[1]
                best_params['nu'] = params[2]
        
        clf = svm.OneClassSVM(**best_params)
        clf.fit(X_trainval)

        positive_class_predictions = clf.predict(X_test)
        negative_class_predictions = clf.predict(Y)
        
        true_values = np.concatenate((np.full_like(positive_class_predictions, 1), np.full_like(negative_class_predictions, -1)))
        predicted_values = np.concatenate((positive_class_predictions, negative_class_predictions))

        tp = sum([1 if pred == 1 else 0 for pred in positive_class_predictions])
        fp = sum([1 if pred == 1 else 0 for pred in negative_class_predictions])
        
        accuracy = accuracy_score(true_values, predicted_values)
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = recall_score(true_values, predicted_values)
        f1 = f1_score(true_values, predicted_values)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
    results = {'algorythm': 'OneClassSVM',
               'best kernel': best_params['kernel'],
               'best gamma': best_params['gamma'],
               'best nu': best_params['nu'],
               'score used for model selection': 'f1 score',
               'method used': 'double holdout',
               'accuracy score mean': np.mean(accuracy_scores) * 100,
               'accuracy score std': np.std(accuracy_scores) * 100,
               'precision score mean': np.mean(precision_scores) * 100,
               'precision score std': np.std(precision_scores) * 100,
               'recall score mean': np.mean(recall_scores) * 100,
               'recall score std': np.std(recall_scores) * 100,
               'f1 score mean': np.mean(f1_scores) * 100,
               'f1 score std': np.std(f1_scores) * 100,
               'data set': 'no dup, no eq, pca ' + str(pca_components)}
        
    return results

In [32]:
scores_df = add_record(scores_df, double_holdout_f1(mezzo_0, mezzo_1, 0.95))
scores_df

Unnamed: 0,algorythm,best kernel,best gamma,best nu,score used for model selection,method used,accuracy score mean,accuracy score std,precision score mean,precision score std,recall score mean,recall score std,f1 score mean,f1 score std,data set
0,OneClassSVM,rbf,0.012915,0.01,accuracy score,nested cv,87.19403,5.235297,4.068694,12.119688,1.714286,4.642308,2.249495,6.323214,"no dup, no const, pca 0.95"
1,OneClassSVM,linear,0.001,0.4,f1 score,nested cv,62.402985,17.707425,13.613902,6.533688,42.714286,20.652123,19.989932,9.081377,"no dup, no const, pca 0.95"
2,OneClassSVM,rbf,0.012915,0.05,accuracy score,outer cv inner holdout,80.243243,1.007278,2.333333,9.433981,0.428571,1.696335,0.720588,2.853434,"no dup, no eq, pca 0.95"
3,OneClassSVM,linear,0.001,0.3,f1 score,outer cv inner holdout,57.027027,9.845408,23.323434,5.931448,53.428571,17.799335,31.86776,8.069672,"no dup, no eq, pca 0.95"
4,OneClassSVM,rbf,0.046416,0.01,accuracy score,outer holdout inner cv,80.675676,0.865287,0.0,0.0,0.0,0.0,0.0,0.0,"no dup, no eq, pca 0.95"
5,OneClassSVM,linear,0.001,0.05,f1 score,outer holdout inner cv,58.648649,11.179523,23.116273,6.235496,51.428571,21.852941,31.007294,9.797178,"no dup, no eq, pca 0.95"
6,OneClassSVM,rbf,0.046416,0.01,accuracy score,double holdout,80.540541,0.896385,0.0,0.0,0.0,0.0,0.0,0.0,"no dup, no eq, pca 0.95"
7,OneClassSVM,linear,0.001,0.4,f1 score,double holdout,54.189189,12.421411,21.832558,5.874949,50.714286,20.316375,29.255047,7.638253,"no dup, no eq, pca 0.95"


In [33]:
import pickle

file_path = 'svm_exp2_df.pickle'

with open(file_path, "wb") as file:
    pickle.dump(scores_df, file)