In [48]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [49]:
from sklearn.decomposition import PCA


def fetch_data(drop_some=True):
    df = pd.read_csv("../data/abnormal_writeout.data.csv")
    if drop_some:
        # trascurare da ACC a UVM
        start_drop = df.columns.get_loc("ACC")
        end_drop = df.columns.get_loc("UVM")
        cols = np.arange(start_drop, end_drop + 1)
        df.drop(df.columns[cols], axis=1, inplace=True)
        # trascurare old_phylo_factor e la prima colonna
        # df.drop("TTT_freq", axis=1, inplace=True)
        df.drop("oldest_phylostratum_factor", axis=1, inplace=True)
        df.drop("Unnamed: 0", axis=1, inplace=True)

        # Drop NaNs
        df.dropna(inplace=True)
    return df


def separate_data(df):
    resp = df["response"].to_numpy()
    occ = df["occ_total_sum"]
    age = df["oldest_phylostratum"].to_numpy()
    conf = df.drop(labels=["response", "occ_total_sum", "oldest_phylostratum"], axis=1).to_numpy()
    return occ, age, conf, resp


def get_PCA(X, expl_var=0.95, plot=False):
    # Fit a PCA
    pca_test = PCA()
    pca_test.fit(X)
    cumsum = np.cumsum(pca_test.explained_variance_ratio_)
    d = np.argmax(cumsum >= expl_var) + 1
    # Apply PCA with d components
    pca_apply = PCA(n_components=d)
    X_PCA = pca_apply.fit_transform(X)
    print("Using {} principal components.".format(d))
    print(f"Reduced features by {(784-d)/784*100:.2f}%.")
    if plot == True:
        sns.heatmap(pd.DataFrame(X_PCA).corr())
        plt.show()
    return X_PCA

# Valutazione dell'area sotto la curva PR per modelli logistici e RF bilanciati e non

### Data

* Incoming datafile has been previously standardized.
* Columns from "ACC" to "UVM" are subsequently dropped.
* Column "oldest_phylostratum_factor" is dropped.
* PCA is applied to the confounding variables (all except occ_total_sum and olders_phylostratum) and PCs are kept only up to 95% explained variance.
* An 80:20 train-test split is applied. 

In [50]:
from sklearn.model_selection import train_test_split

df = fetch_data()
X_ots, X_age, X_conf, Y = separate_data(df)
X_conf_pca = get_PCA(X_conf)

# All features, confounders with PCA
X = np.c_[X_ots, X_age, X_conf_pca]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("Feature matrix shape:", X_train.shape, X_test.shape)
print("Label matrix shape:", y_train.shape, y_test.shape)

Using 47 principal components.
Reduced features by 94.01%.
Feature matrix shape: (14536, 49) (3634, 49)
Label matrix shape: (14536,) (3634,)


## Define a Custom PRC-AUC Metric

The following cell uses an sklearn wrapper api called `make_scorer` to create a cv-compatible score from a user-defined callable function.  


In [51]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # result is sum of the areas under each curve
    return auc(thresholds, precisions[:-1]) + auc(thresholds, recalls[:-1])


# Create my custom scorer using a wrapper object
auprc_score = make_scorer(auprc, needs_proba=True,)

## K-Fold Cross-Validation 

The following cell iterates through a selection of models and computes 10-fold cross-validation for each of the following metrics:
* ROC-AUC
* Precision
* Recall
* f1
* AUPRC

In [54]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Balanced Logistic Regression": LogisticRegression(max_iter=500, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "Balanced Random Forest": BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
}

my_metrics = {
    "ROC-AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
}


lr_df = pd.DataFrame()
blr_df = pd.DataFrame()
rf_df = pd.DataFrame()
brf_df = pd.DataFrame()

frames = [lr_df, blr_df, rf_df, brf_df]
for i, model_name in enumerate(models):
    print(model_name)
    frame = frames[i]
    for score_name in my_metrics:
        cvs = cross_val_score(models[model_name], X_train, y_train, scoring=my_metrics[score_name], cv=10)
        frame[score_name] = cvs
        print(score_name + f" : {cvs.mean():.4f} +/- {cvs.std():.4f}")
    print()

Logistic Regression
ROC-AUC : 0.6610 +/- 0.0125
Precision : 0.5651 +/- 0.1113
Recall : 0.0535 +/- 0.0128
f1-score : 0.0976 +/- 0.0226
AUPRC : 0.6930 +/- 0.0555

Balanced Logistic Regression
ROC-AUC : 0.6622 +/- 0.0124
Precision : 0.3131 +/- 0.0096
Recall : 0.6275 +/- 0.0190
f1-score : 0.4177 +/- 0.0122
AUPRC : 0.7578 +/- 0.0594

Random Forest
ROC-AUC : 0.6463 +/- 0.0077
Precision : 0.6194 +/- 0.1089
Recall : 0.0340 +/- 0.0113
f1-score : 0.0643 +/- 0.0204
AUPRC : 0.4796 +/- 0.0419

Balanced Random Forest
ROC-AUC : 0.6692 +/- 0.0113
Precision : 0.3264 +/- 0.0074
Recall : 0.6001 +/- 0.0215
f1-score : 0.4227 +/- 0.0108
AUPRC : 0.5378 +/- 0.0749



In [57]:
lr_df

Unnamed: 0,ROC-AUC,Precision,Recall,f1-score,AUPRC
0,0.655552,0.606061,0.06079,0.110497,0.655936
1,0.684893,0.53125,0.051672,0.094183,0.653916
2,0.657705,0.75,0.054711,0.101983,0.675669
3,0.677822,0.4375,0.042553,0.077562,0.627465
4,0.663989,0.717949,0.085106,0.152174,0.829294
5,0.669964,0.576923,0.045455,0.08427,0.65481
6,0.642768,0.483871,0.045593,0.083333,0.683869
7,0.656565,0.645161,0.06079,0.111111,0.741363
8,0.64948,0.515152,0.051672,0.093923,0.686393
9,0.650897,0.387097,0.036474,0.066667,0.721159


In [70]:
print(lr_df.mean())
print()
print(lr_df.sem())

ROC-AUC      0.660963
Precision    0.565096
Recall       0.053482
f1-score     0.097570
AUPRC        0.692987
dtype: float64

ROC-AUC      0.004177
Precision    0.037116
Recall       0.004282
f1-score     0.007536
AUPRC        0.018496
dtype: float64


In [71]:
print(blr_df.mean())
print()
print(blr_df.sem())

ROC-AUC      0.662204
Precision    0.313075
Recall       0.627467
f1-score     0.417696
AUPRC        0.757813
dtype: float64

ROC-AUC      0.004133
Precision    0.003200
Recall       0.006325
f1-score     0.004083
AUPRC        0.019808
dtype: float64


In [72]:
print(rf_df.mean())
print()
print(rf_df.sem())

ROC-AUC      0.646292
Precision    0.619411
Recall       0.034037
f1-score     0.064258
AUPRC        0.479602
dtype: float64

ROC-AUC      0.002552
Precision    0.036289
Recall       0.003761
f1-score     0.006806
AUPRC        0.013982
dtype: float64


In [73]:
print(brf_df.mean())
print()
print(brf_df.sem())

ROC-AUC      0.669226
Precision    0.326356
Recall       0.600121
f1-score     0.422738
AUPRC        0.537811
dtype: float64

ROC-AUC      0.003758
Precision    0.002458
Recall       0.007171
f1-score     0.003601
AUPRC        0.024952
dtype: float64


In [59]:
from scipy.stats import ttest_ind

A = lr_df.AUPRC.to_numpy()
B = blr_df.AUPRC.to_numpy()

print(ttest_ind(A, B))

Ttest_indResult(statistic=-2.392015207883834, pvalue=0.027879820142883077)


In [78]:
lr_df.to_csv("./data/lr_scores_df", index=None)
blr_df.to_csv("./data/blr_scores_df", index=None)
rf_df.to_csv("./data/rf_scores_df", index=None)
brf_df.to_csv("./data/brf_scores_df", index=None)