In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option("display.max_columns", 85)

# Logistic Regression

In [3]:
df = pd.read_csv("../data/abnormal_writeout_noscale.data.csv") # NEW

# trascurare da ACC a UVM
start_drop = df.columns.get_loc("ACC")
end_drop = df.columns.get_loc("UVM")
cols = np.arange(start_drop, end_drop + 1)
df.drop(df.columns[cols], axis=1, inplace=True)

# trascurare alcune colonne
df.drop("TTT_freq", axis=1, inplace=True)
df.drop("oldest_phylostratum_factor", axis=1, inplace=True)
df.drop("Unnamed: 0", axis=1, inplace=True)

# Drop NaNs
df.dropna(inplace=True)

# Sort features
resp = df["response"].to_numpy()
occ = df["occ_total_sum"]
age = df["oldest_phylostratum"].to_numpy()
conf = df.drop(labels=["response", "occ_total_sum", "oldest_phylostratum"], axis=1).to_numpy()

X = np.c_[occ, age, conf]
Y = resp

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("Feature matrix shape:", X_train.shape, X_test.shape)

Feature matrix shape: (14536, 83) (3634, 83)


In [4]:
df["response"].sum()

4134

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA

conf_cols = np.arange(2, X.shape[1]) # Columns of confounder variables (highly colinear)

class ConfounderPCA(BaseEstimator, TransformerMixin):
    ''' Custom PCA transformer for this dataset''' 
    
    def __init__(self, explained_variance=.95, apply_PCA=True):
        self.explained_variance = explained_variance
        self.apply_PCA = apply_PCA
        if self.apply_PCA: 
            self.pca = PCA(n_components = self.explained_variance )
        
    def fit(self, X, y = None):
        if self.apply_PCA:
            self.pca.fit(X[:, conf_cols])
        return self
    
    def transform(self, X, y = None):
        if self.apply_PCA:
            X_conf_pca = self.pca.transform(X[:, conf_cols])
            return np.c_[X[:, :2], X_conf_pca]
        else:
            return X

In [28]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    """ Remember to use make_scorer(auprc, needs_proba=True,) """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # result is sum of the areas under each curve
    return auc(thresholds, precisions[:-1]) + auc(thresholds, recalls[:-1])

# [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Base Model
lr_clf = Pipeline([
    ('scaler', StandardScaler()),
    # ("pca", ConfounderPCA()), 
    ("lr", LogisticRegression(max_iter=500,))
])

# Define metrics to be calculated during the grid search, along with their names
my_metrics = {
    "ROC_AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
}

# Define the parameter grid
param_grid = {
    # "pca__apply_PCA": [False, True],
    "lr__penalty": ['none', 'l2',],
    "lr__C": np.logspace(0, 3, 10),
    "lr__class_weight": [None, "balanced"],
    # "lr__l1_ratio": [0.2, 0.5, 0.8],
}

# Define the grid search object
grid_search = GridSearchCV(
    estimator=lr_clf,
    scoring=my_metrics,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    # error_score="raise", 
    refit="ROC_AUC",  # Final fit
)

# Search
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', ConfounderPCA()),
                                       ('lr',
                                        LogisticRegression(max_iter=500))]),
             n_jobs=-1, param_grid={'lr__class_weight': [None, 'balanced']},
             refit='ROC_AUC',
             scoring={'AUPRC': make_scorer(auprc, needs_proba=True),
                      'Precision': 'precision', 'ROC_AUC': 'roc_auc',
                      'Recall': 'recall', 'f1-score': 'f1'},
             verbose=1)

In [33]:
gs_df = pd.DataFrame(grid_search.cv_results_)
gs_df.to_csv("./results/lr_gs_results.csv")

In [34]:
grid_search.best_params_

{'lr__class_weight': 'balanced'}

In [35]:
from sklearn.model_selection import cross_val_score

model = grid_search.best_estimator_

my_metrics = {
    "ROC-AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
}

scores = cross_validate(model, X_train, y_train, cv=5, scoring=my_metrics)

Logistic Regression:
ROC-AUC : 0.6615 +/- 0.0130
Precision : 0.3131 +/- 0.0109
Recall : 0.6293 +/- 0.0246
f1-score : 0.4181 +/- 0.0148
AUPRC : 0.7527 +/- 0.0669


In [28]:
lr_df = pd.DataFrame(scores)

Unnamed: 0,ROC-AUC,Precision,Recall,f1-score,AUPRC
0,0.656956,0.311649,0.62614,0.416162,0.74249
1,0.684001,0.327744,0.653495,0.436548,0.685202
2,0.658143,0.305638,0.62614,0.410768,0.749999
3,0.676501,0.327327,0.662614,0.438191,0.748008
4,0.667126,0.321981,0.632219,0.426667,0.919775
5,0.669651,0.318927,0.648485,0.427572,0.767172
6,0.643171,0.300752,0.607903,0.402414,0.701263
7,0.658325,0.304075,0.589666,0.401241,0.72712
8,0.651535,0.314417,0.6231,0.417941,0.773146
9,0.654793,0.305723,0.617021,0.408862,0.729886


In [None]:
lr_df.mean()

In [None]:
lr_df.sem()

In [36]:
lr_df.to_csv("./results/lr_scores.csv", index=None)