In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option("display.max_columns", 85)
sns.set_theme(context="paper", font_scale=1.5, style="ticks", rc={"axes.grid": True})

# Area Under the Precision Recall Curve

Does optimizing a model with respect to the "area under precision-recall curve" lead to different results?
- Does it lead to significantly different hyperparameters?
- Does it lead to significantly different model performance?
- Does anything change if the model is nonlinear (e.g. Gaussian Naive Bayes)?

Approach:
1. Using a grid-search cross-validation procedure, train a model on pr-auc. 
2. Evaluate using several performance metrics: precision, recall, f1-score, roc-auc, pr-auc...

### Data

In [2]:
# Read Data from CSV (NEW DATA, NOT SCALED)
df = pd.read_csv("../data/abnormal_writeout_noscale.data.csv", index_col=0)

# trascurare da ACC a UVM
start_drop = df.columns.get_loc("ACC")
end_drop = df.columns.get_loc("UVM")
cols = np.arange(start_drop, end_drop + 1)
df.drop(df.columns[cols], axis=1, inplace=True)

# trascurare alcune colonne
df.drop("TTT_freq", axis=1, inplace=True)
df.drop("oldest_phylostratum_factor", axis=1, inplace=True)
df.drop("gc_cds", axis=1, inplace=True) # ! New

# Drop NaNs
df.dropna(inplace=True)

# Sort features
resp = df["response"]
occ = df["occ_total_sum"]
age = df["oldest_phylostratum"]
conf = df.drop(labels=["response", "occ_total_sum", "oldest_phylostratum"], axis=1)

# Collect Features and Labels
features_df = pd.DataFrame()
features_df["occ_total_sum"] = occ
features_df["oldest_phylostratum"] = age
features_df = pd.concat([features_df, conf], axis=1)

X = features_df.to_numpy()
y = df["response"].to_numpy()

### Custom PCA

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Columns of confounder variables (highly colinear)
conf_index = 2
conf_cols = np.arange(2, X.shape[1])  

class ConfounderPCA(BaseEstimator, TransformerMixin):
    """ 
    Custom PCA transformer for this dataset.
    Applies PCA only to the many collinear confounder 
    variables.
    """
    def __init__(self, confcols, n_components=None, apply_PCA=True):
        self.n_components = n_components
        self.apply_PCA = apply_PCA
        self.confcols = confcols
        if self.apply_PCA:
            self.pca = PCA(n_components=self.n_components)

    def fit(self, X, y=None):
        if self.apply_PCA:
            self.pca.fit(X[:, self.confcols])
        return self

    def transform(self, X, y=None):
        if self.apply_PCA:
            X_conf_pca = self.pca.transform(X[:, self.confcols])
            return np.c_[X[:, :2], X_conf_pca]
        else:
            return X

### Custom Scoring: Area Under Precision Recall Curve

In [4]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    """ Remember to use make_scorer(auprc, needs_proba=True,) """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # results is area under x=Recall and y=Precision curve. 
    return auc(recalls, precisions)

### Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (14536, 82) (14536,)
Testing set shape: (3634, 82) (3634,)


# Logistic Regression

### Define LR and its Parameter Space

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Define a parameter space to search
param_grid = {
    "lr__C": np.logspace(-3, 4, 7),
    "lr__class_weight": [None, "balanced"],
    "pca__apply_PCA": [True, False],
    "pca__n_components": [0.99, None],
}

# Define the model to be tuned
lr_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", ConfounderPCA(confcols=np.arange(2, X.shape[1]))),
    ("lr", LogisticRegression(max_iter=2000,)),
])

### Optimize LR

In [17]:
from sklearn.model_selection import GridSearchCV

# Define the grid search object
gscv = GridSearchCV(
    estimator=lr_clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring=make_scorer(auprc, needs_proba=True), 
)

# Search
gscv_result = gscv.fit(X_train, y_train)

Fitting 3 folds for each of 56 candidates, totalling 168 fits


### Score LR

In [18]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

# Get best model
lr_A = gscv_result.best_estimator_
lr_A.fit(X_train, y_train)

# Parameters
print("Best Params:")
print(gscv_result.best_params_)

# Predict
y_pred_proba = lr_A.predict_proba(X_test)[:, 1]
y_pred = lr_A.predict(X_test)

# Score
lr_data = []
lr_data.append(roc_auc_score(y_test, y_pred_proba))
lr_data.append(auprc(y_test, y_pred_proba))
lr_data.append(precision_score(y_test, y_pred))
lr_data.append(recall_score(y_test, y_pred))
lr_data.append(f1_score(y_test, y_pred))

Best Params:
{'lr__C': 3.1622776601683795, 'lr__class_weight': 'balanced', 'pca__apply_PCA': False, 'pca__n_components': 0.99}


# GNB Classifier

### Define the GNB Classifier and its Parameter Space

In [19]:
from sklearn.naive_bayes import GaussianNB

# Define a parameter space to search
param_grid = {
    "pca__apply_PCA": [True, False],
    "pca__n_components": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, None],
}

# Define the model to be tuned
gnb_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", ConfounderPCA(confcols=np.arange(2, X.shape[1]))),
    ("gnb", GaussianNB()),
])

### Optimize the GNB Classifier 

In [20]:
# Define the grid search object
gscv = GridSearchCV(
    estimator=gnb_clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring=make_scorer(auprc, needs_proba=True), 
)

# Search
gscv_result = gscv.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


### Score the GNB Classifier

In [21]:
# Get best model
model = gscv_result.best_estimator_
model.fit(X_train, y_train)

# Parameters
print("Best Params:")
print(gscv_result.best_params_)

# Predict
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Score
gnb_data = []
gnb_data.append(roc_auc_score(y_test, y_pred_proba))
gnb_data.append(auprc(y_test, y_pred_proba))
gnb_data.append(precision_score(y_test, y_pred))
gnb_data.append(recall_score(y_test, y_pred))
gnb_data.append(f1_score(y_test, y_pred))

Best Params:
{'pca__apply_PCA': True, 'pca__n_components': 0.1}


In [12]:
# Define the grid search object
gscv = GridSearchCV(
    estimator=gnb_clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='roc_auc', 
)

# Search
gscv_result = gscv.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [13]:
# Get best model
model = gscv_result.best_estimator_
model.fit(X_train, y_train)

# Parameters
print("Best Params:")
print(gscv_result.best_params_)

# Predict
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Score
gnb_roc_data = []
gnb_roc_data.append(roc_auc_score(y_test, y_pred_proba))
gnb_roc_data.append(auprc(y_test, y_pred_proba))
gnb_roc_data.append(precision_score(y_test, y_pred))
gnb_roc_data.append(recall_score(y_test, y_pred))
gnb_roc_data.append(f1_score(y_test, y_pred))

Best Params:
{'pca__apply_PCA': False, 'pca__n_components': 0.1}


# Random Forest

### Define the RF and its Parameter Space

In [22]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Define a parameter space to search
param_grid = {
    "pca__apply_PCA": [True, False],
    "pca__n_components": [0.1, 0.5, 0.9, 0.95, 0.99, None],
    "rf__n_estimators" : [100, 300, 500, 1000],
}

# Define the model to be tuned
rf_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", ConfounderPCA(confcols=np.arange(2, X.shape[1]))),
    ("rf", BalancedRandomForestClassifier()),
])

### Optimize the RF

In [23]:
# Define the grid search object
gscv = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring=make_scorer(auprc, needs_proba=True), 
)

# Search
gscv_result = gscv.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


KeyboardInterrupt: 

In [None]:
# Get best model
model = gscv_result.best_estimator_
model.fit(X_train, y_train)

# Parameters
print("Best Params:")
print(gscv_result.best_params_)

# Predict
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Score
rf_data = []
rf_data.append(roc_auc_score(y_test, y_pred_proba))
rf_data.append(auprc(y_test, y_pred_proba))
rf_data.append(precision_score(y_test, y_pred))
rf_data.append(recall_score(y_test, y_pred))
rf_data.append(f1_score(y_test, y_pred))

# Results

In [27]:
lr_data
gnb_data
# rf_data

pd.DataFrame([lr_data, gnb_data,], 
             index=["Logistic Regression", "Gaussian Naive Bayes", ],
             columns=["roc_auc", "pr_auc", "precision", "recall", "f1"])

Unnamed: 0,roc_auc,pr_auc,precision,recall,f1
Logistic Regression,0.659026,0.376416,0.311286,0.608541,0.411883
Gaussian Naive Bayes,0.623224,0.347424,0.463519,0.128114,0.200743
