In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set_theme()
pd.set_option("display.max_columns", 85)

In [5]:
from sklearn.model_selection import train_test_split

# Read Data
df = pd.read_csv("../data/abnormal_writeout.data.csv")

# trascurare da ACC a UVM
start_drop = df.columns.get_loc("ACC")
end_drop = df.columns.get_loc("UVM")
cols = np.arange(start_drop, end_drop + 1)
df.drop(df.columns[cols], axis=1, inplace=True)

# trascurare alcune colonne
# df.drop("TTT_freq", axis=1, inplace=True)
df.drop("oldest_phylostratum_factor", axis=1, inplace=True)
df.drop("Unnamed: 0", axis=1, inplace=True)

# Drop NaNs
df.dropna(inplace=True)

# Sort features
resp = df["response"].to_numpy()
occ = df["occ_total_sum"]
age = df["oldest_phylostratum"].to_numpy()
conf = df.drop(labels=["response", "occ_total_sum", "oldest_phylostratum"], axis=1).to_numpy()

X = np.c_[occ, age, conf]
Y = resp

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("Feature matrix shape (train/test):", X_train.shape, "/", X_test.shape)

Feature matrix shape (train/test): (14536, 84) / (3634, 84)


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA

conf_cols = np.arange(2, X.shape[1])  # Columns of confounder variables (highly colinear)


class ConfounderPCA(BaseEstimator, TransformerMixin):
    """ Custom PCA transformer for this dataset"""

    def __init__(self, explained_variance=0.95):
        self.explained_variance = explained_variance
        self.pca = PCA(n_components=self.explained_variance)

    def fit(self, X, y=None):
        self.pca.fit(X[:, conf_cols])
        return self

    def transform(self, X, y=None):
        X_conf_pca = self.pca.transform(X[:, conf_cols])
        return np.c_[X[:, :2], X_conf_pca]

In [7]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    """ Remember to use make_scorer(auprc, needs_proba=True,) """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # result is sum of the areas under each curve
    return auc(thresholds, precisions[:-1]) + auc(thresholds, recalls[:-1])

In [62]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB


# Logistic Regression
lr_clf = Pipeline([
    ("pca", ConfounderPCA()), 
    ("lr_clf", LogisticRegression(max_iter=500))
])

blr_clf = Pipeline([
    ("pca", ConfounderPCA()), 
    ("blr_clf", LogisticRegression(max_iter=500, class_weight="balanced"))
])


# Random Forest
rf_clf = Pipeline([
    ("pca", ConfounderPCA()), 
    ("rf_clf", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])


brf_clf = Pipeline([
    ("pca", ConfounderPCA()), 
    ("brf_clf", BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

# Support Vector Machine
svm_clf = Pipeline([
    # ('scaler', StandardScaler()),
    ('pca', ConfounderPCA()), 
    ('svc', SVC(kernel='rbf', random_state=0, probability=True))
])

# Gaussian Naive Bayes
gnb_clf = Pipeline([
    # ('scaler', StandardScaler()),
    ('pca', ConfounderPCA()), 
    ('gnb', GaussianNB())
])


models = {
    "Logistic Regression": lr_clf,                             
    "Balanced Logistic Regression": blr_clf,
    "Random Forest": rf_clf,
    "Balanced Random Forest": blr_clf,
    "SVM with Gaussian Kernel": svm_clf,
    "Gaussian Naive Bayes": gnb_clf,
}

In [63]:
from sklearn.model_selection import cross_val_score, cross_validate
from scipy.stats import sem
from sklearn.model_selection import KFold

my_metrics = {
    "ROC-AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
}

lr_df = pd.DataFrame()
blr_df = pd.DataFrame()
rf_df = pd.DataFrame()
brf_df = pd.DataFrame()
svc_df = pd.DataFrame()
gnb_df = pd.DataFrame()

frames = [lr_df, blr_df, rf_df, brf_df, svc_df, gnb_df]

assert len(frames) == len(models)

cv = KFold(n_splits=5, shuffle=True, random_state=0) # Shuffle data before splitting into folds

for i, model_name in enumerate(models):
    print(model_name, "...")
    frames[i] = pd.DataFrame(cross_validate(models[model_name], X_train, y_train, cv=cv, scoring=my_metrics))

Logistic Regression ...
Done.
Balanced Logistic Regression ...
Done.
Random Forest ...
Done.
Balanced Random Forest ...
Done.
SVM with Gaussian Kernel ...
Done.
Gaussian Naive Bayes ...
Done.


In [74]:
data = frames[5]
data

Unnamed: 0,fit_time,score_time,test_ROC-AUC,test_Precision,test_Recall,test_f1-score,test_AUPRC
0,0.088308,0.019941,0.643971,0.349693,0.341829,0.345716,0.756753
1,0.089028,0.018103,0.659697,0.338766,0.443598,0.384158,0.811227
2,0.091588,0.01837,0.618363,0.290774,0.537291,0.377338,0.820792
3,0.080215,0.016268,0.65444,0.363796,0.314112,0.337134,0.756021
4,0.083159,0.015191,0.645425,0.345185,0.357362,0.351168,0.751658


In [88]:
data = frames[5]
stats_df = pd.DataFrame()
stats_df["mean"] = data.mean()
stats_df["std"] = data.std()
stats_df["sem"] = data.sem()
stats_df

Unnamed: 0,mean,std,sem
fit_time,0.08646,0.004642,0.002076
score_time,0.017575,0.001864,0.000834
test_ROC-AUC,0.644379,0.015922,0.00712
test_Precision,0.337643,0.027767,0.012418
test_Recall,0.398838,0.091271,0.040817
test_f1-score,0.359103,0.020525,0.009179
test_AUPRC,0.77929,0.033746,0.015092
