In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option("display.max_columns", 85)

In [3]:
def fetch_data(drop_some=True):
    """Load the dataset from ``../data/abnormal_writeout.data.csv``.

    Parameters
    ----------
    drop_some : bool, default True
        When True, remove every column positioned from "ACC" through "UVM"
        (both included), then the columns "TTT_freq",
        "oldest_phylostratum_factor" and "Unnamed: 0", and finally every row
        that still contains a NaN. When False, the frame is returned exactly
        as read (no columns and no NaN rows are removed).

    Returns
    -------
    pandas.DataFrame
        The (optionally trimmed and NaN-free) data; the original row index
        labels are kept.
    """
    frame = pd.read_csv("../data/abnormal_writeout.data.csv")
    if not drop_some:
        return frame

    # Ignore the contiguous run of columns from "ACC" to "UVM" (by position).
    first = frame.columns.get_loc("ACC")
    last = frame.columns.get_loc("UVM")
    trimmed = frame.drop(columns=frame.columns[first:last + 1])

    # Ignore the TTT frequency, the phylostratum factor and the first
    # (unnamed) column.
    trimmed = trimmed.drop(
        columns=["TTT_freq", "oldest_phylostratum_factor", "Unnamed: 0"]
    )

    # Drop rows with missing values.
    return trimmed.dropna()


def separate_data(df):
    """Split a data frame into occurrence, age, remaining columns and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "response", "occ_total_sum" and
        "oldest_phylostratum".

    Returns
    -------
    tuple
        ``(occ, age, conf, resp)`` where ``occ`` is the "occ_total_sum"
        column as a pandas Series, ``age`` is "oldest_phylostratum" as a
        NumPy array, ``conf`` is a 2-D NumPy array of every other column
        (in their original order), and ``resp`` is "response" as a NumPy
        array.
    """
    target_col = "response"
    occ_col = "occ_total_sum"
    age_col = "oldest_phylostratum"

    resp = df[target_col].to_numpy()
    occ = df[occ_col]  # intentionally left as a Series, unlike the others
    age = df[age_col].to_numpy()

    # Everything that is neither the target nor one of the two named features.
    remaining = df.drop(labels=[target_col, occ_col, age_col], axis=1)
    return occ, age, remaining.to_numpy(), resp

# RF on the original standardized variables, with default parameters

## Data

* Incoming datafile has been previously standardized.
* Columns from "ACC" to "UVM" are subsequently dropped.
* Column "oldest_phylostratum_factor" is dropped.
* **Column "TTT_freq" is dropped**, because it is collinear with the other trinucleotide frequencies.
* **PCA is NOT applied.**
* An 80:20 train-test split is applied. 

In [8]:
from sklearn.model_selection import train_test_split

# Data: load, clean, and split into the pieces returned by separate_data().
df = fetch_data()  # Get
# Safeguard only: fetch_data() already drops NaN rows when called with its
# default drop_some=True. Assigning (instead of inplace=True) keeps the cell
# idempotent on re-run.
df = df.dropna()  # Clean
X_occ, X_age, X_conf, Y = separate_data(df)  # Separate

# All features and most confounders, stacked column-wise in the order
# occ_total_sum, oldest_phylostratum, then every remaining column.
X = np.c_[X_occ, X_age, X_conf]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Report the shapes of the splits created above. The previous code printed
# lowercase x_train / x_test, which this notebook never defines, so it
# raised NameError on a fresh kernel.
print("Feature matrix shape:", X_train.shape, X_test.shape)

Feature matrix shape: (14536, 83) (3634, 83)


In [9]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    """Custom score: area under precision(threshold) plus area under recall(threshold).

    Both curves come from ``precision_recall_curve(y_true, y_scores)`` and are
    integrated (via ``auc``) against the returned decision thresholds; the
    two areas are then summed.

    NOTE(review): despite the name, this is not the area under the
    precision-vs-recall curve (that would be ``auc(recalls, precisions)``).
    It is kept as is so the values stay comparable with scores saved
    elsewhere using the same definition — confirm this is intended.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, forwarded to ``precision_recall_curve``.
    y_scores : array-like
        Predicted scores, forwarded to ``precision_recall_curve``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        Sum of the two areas.
    """
    prec, rec, thr = precision_recall_curve(y_true, y_scores)
    # precision/recall have one more entry than thresholds; drop the final
    # point so each curve lines up with the threshold axis.
    area_under_precision = auc(thr, prec[:-1])
    area_under_recall = auc(thr, rec[:-1])
    return area_under_precision + area_under_recall

## Cross-Validation

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate


# Display name -> scorer. The first four are built-in scikit-learn scorer
# names; "AUPRC" wraps the notebook's custom `auprc` function and is fed the
# predicted probabilities rather than hard class predictions.
# NOTE(review): recent scikit-learn releases deprecate `needs_proba` in favour
# of `response_method="predict_proba"` — confirm against the installed version.
my_metrics = {
    "ROC-AUC": "roc_auc",
    "Precision": "precision",
    "Recall": "recall",
    "f1-score": "f1",
    "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom score, see auprc()
}

model = RandomForestClassifier(n_estimators=200, random_state=0, n_jobs=-1)

# Score every metric in a single 10-fold pass. Calling cross_val_score once
# per metric refits the same forest on the same folds for each metric
# (5 x 10 fits); cross_validate fits each fold once and applies all scorers.
cv_results = cross_validate(model, X_train, y_train, scoring=my_metrics, cv=10)

# One row per fold, one column per metric (cross_validate prefixes test
# scores with "test_").
scores_df = pd.DataFrame(
    {score_name: cv_results["test_" + score_name] for score_name in my_metrics}
)

print("RF w/o PCA:")
for score_name in my_metrics:
    # Use the raw NumPy array so .std() stays the population std (ddof=0),
    # as in the recorded output; a pandas Series would use ddof=1.
    cvs = cv_results["test_" + score_name]
    print(score_name + f" : {cvs.mean():.4f} +/- {cvs.std():.4f}")

RF w/o PCA:
ROC-AUC : 0.6691 +/- 0.0135
Precision : 0.6202 +/- 0.1339
Recall : 0.0307 +/- 0.0086
f1-score : 0.0583 +/- 0.0160
AUPRC : 0.5002 +/- 0.0600


In [21]:
# Per-fold cross-validation scores: one row per fold, one column per metric.
scores_df

Unnamed: 0,ROC-AUC,Precision,Recall,f1-score,AUPRC
0,0.66251,0.529412,0.027356,0.052023,0.506235
1,0.676758,0.722222,0.039514,0.074928,0.486243
2,0.664112,0.833333,0.030395,0.058651,0.589238
3,0.646717,0.533333,0.024316,0.046512,0.459107
4,0.673722,0.526316,0.030395,0.057471,0.525462
5,0.683635,0.846154,0.033333,0.06414,0.570656
6,0.669988,0.5,0.027356,0.051873,0.432852
7,0.673436,0.444444,0.012158,0.023669,0.38532
8,0.691437,0.6,0.045593,0.084746,0.552648
9,0.648321,0.666667,0.036474,0.069164,0.494389


In [22]:
# Summary statistics of each metric across the folds.
# Note: describe() reports the sample std (ddof=1).
scores_df.describe()

Unnamed: 0,ROC-AUC,Precision,Recall,f1-score,AUPRC
count,10.0,10.0,10.0,10.0,10.0
mean,0.669064,0.620188,0.030689,0.058318,0.500215
std,0.014185,0.141184,0.00911,0.016825,0.063217
min,0.646717,0.444444,0.012158,0.023669,0.38532
25%,0.662911,0.52709,0.027356,0.051911,0.465891
50%,0.671712,0.566667,0.030395,0.058061,0.500312
75%,0.675999,0.708333,0.035689,0.067908,0.545852
max,0.691437,0.846154,0.045593,0.084746,0.589238


The following cell loads the grid-search scores saved by another notebook and runs an independent two-sample t-test on two important scores (ROC-AUC and AUPRC).

In [29]:
from scipy.stats import ttest_ind

# Scores saved by another notebook, compared below as the "pca RF".
# The model recorded for that file was:
#   "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
rf_df = pd.read_csv("./data/rf_scores_df")
# brf_df = pd.read_csv("./data/brf_scores_df")

# Same report for each metric: header line, then the two-sample t-test
# between this notebook's per-fold scores (A) and the loaded ones (B).
for position, metric in enumerate(["ROC-AUC", "AUPRC"]):
    if position > 0:
        print()  # blank line separating the reports
    print("T-test b/w non-pca and pca RF (" + metric + "):")
    A = scores_df[metric].to_numpy()
    B = rf_df[metric].to_numpy()
    print(ttest_ind(A, B))

T-test b/w non-pca and pca RF (ROC-AUC):
Ttest_indResult(statistic=4.412224185918338, pvalue=0.00033625012711190896)

T-test b/w non-pca and pca RF (AUPRC):
Ttest_indResult(statistic=0.8449343978978487, pvalue=0.4092382673140642)


In [30]:
# Mean and standard error of the mean of each metric for this notebook's
# model, separated by one blank line.
print(scores_df.mean(), scores_df.sem(), sep="\n\n")

ROC-AUC      0.669064
Precision    0.620188
Recall       0.030689
f1-score     0.058318
AUPRC        0.500215
dtype: float64

ROC-AUC      0.004486
Precision    0.044646
Recall       0.002881
f1-score     0.005321
AUPRC        0.019991
dtype: float64


In [31]:
# Mean and standard error of the mean of each metric for the scores loaded
# into rf_df, separated by one blank line.
print(rf_df.mean(), rf_df.sem(), sep="\n\n")

ROC-AUC      0.646292
Precision    0.619411
Recall       0.034037
f1-score     0.064258
AUPRC        0.479602
dtype: float64

ROC-AUC      0.002552
Precision    0.036289
Recall       0.003761
f1-score     0.006806
AUPRC        0.013982
dtype: float64
