In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()
pd.set_option("display.max_columns", 85)

# Support Vector Machine

In [28]:
df = pd.read_csv("../data/abnormal_writeout_noscale.data.csv")

# trascurare da ACC a UVM
start_drop = df.columns.get_loc("ACC")
end_drop = df.columns.get_loc("UVM")
cols = np.arange(start_drop, end_drop + 1)
df.drop(df.columns[cols], axis=1, inplace=True)

# trascurare alcune colonne
df.drop("TTT_freq", axis=1, inplace=True)
df.drop("oldest_phylostratum_factor", axis=1, inplace=True)
df.drop("Unnamed: 0", axis=1, inplace=True)

# Drop NaNs
df.dropna(inplace=True)

# Sort features
resp = df["response"].to_numpy()
occ = df["occ_total_sum"]
age = df["oldest_phylostratum"].to_numpy()
conf = df.drop(labels=["response", "occ_total_sum", "oldest_phylostratum"], axis=1).to_numpy()

X = np.c_[occ, age, conf]
Y = resp

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("Feature matrix shape:", X_train.shape, X_test.shape)

Feature matrix shape: (14536, 84) (3634, 84)


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA

conf_cols = np.arange(2, X.shape[1]) # Columns of confounder variables (highly colinear)

class ConfounderPCA(BaseEstimator, TransformerMixin):
    ''' Custom PCA transformer for this dataset''' 
    
    def __init__(self, explained_variance=.95, apply_PCA=True):
        self.explained_variance = explained_variance
        self.apply_PCA = apply_PCA
        if self.apply_PCA: 
            self.pca = PCA(n_components = self.explained_variance )
        
    def fit(self, X, y = None):
        if self.apply_PCA:
            self.pca.fit(X[:, conf_cols])
        return self
    
    def transform(self, X, y = None):
        if self.apply_PCA:
            X_conf_pca = self.pca.transform(X[:, conf_cols])
            return np.c_[X[:, :2], X_conf_pca]
        else:
            return X

In [6]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # result is sum of the areas under each curve
    return auc(thresholds, precisions[:-1]) + auc(thresholds, recalls[:-1])

# [Support Vector Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)


> Support Vector Machine algorithms are **not scale invariant**, so it is highly recommended to scale your data.

Best hyperparamters:
- `C=1.0` 
- `class_weight=balanced`
- `shrinking=False`

In [14]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Define base model
svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', ConfounderPCA()), 
    ('svc', SVC(kernel='rbf', random_state=0, probability=True))
])

# Define metrics to be calculated during the grid search, along with their names
my_metrics = {
    "ROC_AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
}

# Define the parameter grid
param_grid = {
    "svc__C": np.logspace(0, 2, 5),
    "svc__shrinking": [False, True],
    "svc__class_weight": [None, "balanced"],
    
}

# Define the grid search object
grid_search = GridSearchCV(
    estimator=svm_clf,
    scoring=my_metrics,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    error_score="raise",
    refit="ROC_AUC",  # Final fit 
)

# Search
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


GridSearchCV(cv=3, error_score='raise',
             estimator=Pipeline(steps=[('pca', ConfounderPCA()),
                                       ('svc',
                                        SVC(probability=True,
                                            random_state=0))]),
             n_jobs=-1,
             param_grid={'svc__C': array([  1.        ,   3.16227766,  10.        ,  31.6227766 ,
       100.        ]),
                         'svc__class_weight': [None, 'balanced'],
                         'svc__shrinking': [False, True]},
             refit='ROC_AUC',
             scoring={'AUPRC': make_scorer(auprc, needs_proba=True),
                      'Precision': 'precision', 'ROC_AUC': 'roc_auc',
                      'Recall': 'recall', 'f1-score': 'f1'},
             verbose=1)

In [18]:
gs_df = pd.DataFrame(grid_search.cv_results_)
gs_df.to_csv("./results/svm_gs_results.csv")

In [None]:
grid_search.bes

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from scipy.stats import sem

my_metrics = {
    "ROC_AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
}

model = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', ConfounderPCA()), 
    ('svc', SVC(kernel='rbf', random_state=0, probability=True))
])


scores_df = pd.DataFrame()

print("SVM: (mean and standard error)")
for score_name in my_metrics:
    cvs = cross_val_score(model, X_train, y_train, scoring=my_metrics[score_name], cv=3)
    scores_df[score_name] = cvs
    print("   " + score_name + f" : {cvs.mean():.4f} +/- {sem(cvs):.4f} ")

In [None]:
scores_df

In [None]:
X.flags

In [88]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from scipy.stats import sem

my_metrics = {
    "ROC_AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
}

model = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', ConfounderPCA()), 
    ('svc', SVC(kernel='rbf', random_state=0, probability=True))
])

scores = cross_validate(model, X_train, y_train, cv=10, scoring=my_metrics)

SVM: (mean and standard error)


In [89]:
scores_df = pd.DataFrame(scores) 

Unnamed: 0,fit_time,score_time,test_ROC_AUC,test_Precision,test_Recall,test_f1-score,test_AUPRC
0,67.068063,22.856539,0.60204,0.526316,0.009116,0.017921,0.555102
1,58.656096,19.969765,0.595409,0.777778,0.012762,0.025112,0.679037
2,51.851468,22.44243,0.608009,0.625,0.013674,0.026762,0.584116


In [None]:
scores_df.mean()

In [None]:
scores_df.sem()

In [None]:
scores_df.to_csv("./results/svm_gs_results.csv")