In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()
pd.set_option("display.max_columns", 85)

# Gaussian Naive Bayes Classifier

In [10]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("../data/abnormal_writeout.data.csv")

# trascurare da ACC a UVM
start_drop = df.columns.get_loc("ACC")
end_drop = df.columns.get_loc("UVM")
cols = np.arange(start_drop, end_drop + 1)
df.drop(df.columns[cols], axis=1, inplace=True)

# trascurare alcune colonne
# df.drop("TTT_freq", axis=1, inplace=True)
df.drop("oldest_phylostratum_factor", axis=1, inplace=True)
df.drop("Unnamed: 0", axis=1, inplace=True)

# Drop NaNs
df.dropna(inplace=True)

# Sort features
resp = df["response"].to_numpy()
occ = df["occ_total_sum"]
age = df["oldest_phylostratum"].to_numpy()
conf = df.drop(labels=["response", "occ_total_sum", "oldest_phylostratum"], axis=1).to_numpy()

# Features and targets
X = np.c_[occ, age, conf]
Y = resp

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("Feature matrix shape:", X_train.shape, X_test.shape)

Feature matrix shape: (14536, 84) (3634, 84)


In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA

conf_cols = np.arange(2, X.shape[1]) # Columns of confounder variables (highly colinear)

class ConfounderPCA(BaseEstimator, TransformerMixin):
    ''' Custom PCA transformer for this dataset''' 
    
    def __init__(self, explained_variance=.95):
        self.explained_variance = explained_variance
        self.pca = PCA(n_components = self.explained_variance )
        
    def fit(self, X, y = None):
        self.pca.fit(X[:, conf_cols])
        return self
    
    def transform(self, X, y = None):
        X_conf_pca = self.pca.transform(X[:, conf_cols])
        return np.c_[X[:, :2], X_conf_pca]
    

In [12]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # result is sum of the areas under each curve
    return auc(thresholds, precisions[:-1]) + auc(thresholds, recalls[:-1])

# Create my custom scorer using a wrapper object
auprc_score = make_scorer(auprc, needs_proba=True,)

# [Gaussian Naive Bayes](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Define metrics to be calculated during the grid search, along with their names
my_metrics = {
    "ROC_AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
}

# Define the parameter grid
param_grid = {
#!!! No params?   
}

# Define the model
gnb_clf = Pipeline([
    # ('scaler', StandardScaler()),
    # ('pca', ConfounderPCA()), 
    ('gnb', GaussianNB())
])

# Define the grid search object
grid_search = GridSearchCV(
    estimator=gnb_clf,
    scoring=my_metrics,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    error_score="raise",
    refit="ROC_AUC",  # Final fit 
)

# Search
grid_search.fit(X_train, y_train)

In [19]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from scipy.stats import sem

my_metrics = {
    "ROC_AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
}

gnb_clf = Pipeline([
    # ('scaler', StandardScaler()),
    # ('pca', ConfounderPCA()), 
    ('gnb', GaussianNB())
])

scores = cross_validate(gnb_clf, X_train, y_train, cv=10, scoring=my_metrics)

GNB: (mean and standard error)


In [None]:
scores_df = pd.DataFrame(scores)
scores_df

Unnamed: 0,fit_time,score_time,test_ROC_AUC,test_Precision,test_Recall,test_f1-score,test_AUPRC
0,0.286045,0.038826,0.660212,0.356725,0.370821,0.363636,0.792193
1,0.324772,0.016397,0.656016,0.364706,0.3769,0.370703,0.743363
2,0.301001,0.052455,0.645944,0.326705,0.349544,0.337739,0.770094
3,0.4915,0.022072,0.646557,0.302231,0.452888,0.36253,0.799767
4,0.515567,0.015895,0.644763,0.365535,0.425532,0.393258,0.821847
5,0.262574,0.06278,0.661361,0.350725,0.366667,0.358519,0.773899
6,0.243272,0.018656,0.626426,0.317416,0.343465,0.329927,0.706749
7,0.219354,0.027735,0.648636,0.346591,0.370821,0.358297,0.786759
8,0.204661,0.018743,0.630012,0.32312,0.352584,0.337209,0.73707
9,0.299329,0.015688,0.63644,0.33049,0.471125,0.388471,0.804382


In [24]:
scores_df.mean()

fit_time          0.314808
score_time        0.028925
test_ROC_AUC      0.645637
test_Precision    0.338424
test_Recall       0.388034
test_f1-score     0.360029
test_AUPRC        0.773612
dtype: float64

In [None]:
scores_df.sem()