In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Raise pandas' column display limit so wide frames are shown without truncation.
pd.set_option("display.max_columns", 85)

# Random Forest

In [2]:
# Load the raw (unscaled) dataset; the first CSV column becomes the index.
df = pd.read_csv("../data/abnormal_writeout_noscale.data.csv", index_col=0)

# Discard the contiguous run of columns from "ACC" through "UVM" (inclusive).
start_drop, end_drop = (df.columns.get_loc(label) for label in ("ACC", "UVM"))
cols = np.arange(start_drop, end_drop + 1)

# Also discard two individual columns, then remove every row that still has a NaN.
df = (
    df.drop(columns=df.columns[cols])
    .drop(columns=["TTT_freq", "oldest_phylostratum_factor"])
    .dropna()
)

# Separate the label, the two leading features and the remaining (confounder) columns.
resp = df["response"]
occ = df["occ_total_sum"]
age = df["oldest_phylostratum"]
conf = df.drop(columns=["response", "occ_total_sum", "oldest_phylostratum"])

# Feature table: occ_total_sum and oldest_phylostratum first, confounders after.
features_df = pd.concat([occ, age, conf], axis=1)

# Plain NumPy views used by the modelling cells.
X = features_df.to_numpy()
y = resp.to_numpy()

features_df.head()

Unnamed: 0,occ_total_sum,oldest_phylostratum,cds_length,gc_cds,dnase_gene,dnase_cds,H3k4me1_gene,H3k4me3_gene,H3k27ac_gene,H3k4me1_cds,H3k4me3_cds,H3k27ac_cds,lamin_gene,repli_gene,nsome_gene,nsome_cds,transcription_gene,repeat_gene,repeat_cds,recomb_gene,AAA_freq,AAC_freq,AAG_freq,AAT_freq,ACA_freq,ACC_freq,ACG_freq,ACT_freq,AGA_freq,AGC_freq,AGG_freq,AGT_freq,ATA_freq,ATC_freq,ATG_freq,ATT_freq,CAA_freq,CAC_freq,CAG_freq,CAT_freq,CCA_freq,CCC_freq,CCG_freq,CCT_freq,CGA_freq,CGC_freq,CGG_freq,CGT_freq,CTA_freq,CTC_freq,CTG_freq,CTT_freq,GAA_freq,GAC_freq,GAG_freq,GAT_freq,GCA_freq,GCC_freq,GCG_freq,GCT_freq,GGA_freq,GGC_freq,GGG_freq,GGT_freq,GTA_freq,GTC_freq,GTG_freq,GTT_freq,TAA_freq,TAC_freq,TAG_freq,TAT_freq,TCA_freq,TCC_freq,TCG_freq,TCT_freq,TGA_freq,TGC_freq,TGG_freq,TGT_freq,TTA_freq,TTC_freq,TTG_freq
1,33,12.0,1488,0.657258,0.61223,0.758065,0.561429,1.0,0.216855,0.66129,1.0,0.198925,0.0,0.041809,0.809254,0.706453,6.798234,0.040516,0.0,0.0,0.004755,0.008152,0.007473,0.002717,0.011549,0.026495,0.01087,0.008152,0.01019,0.028533,0.019701,0.009511,0.000679,0.006114,0.01087,0.002038,0.009511,0.019022,0.028533,0.007473,0.027174,0.03125,0.025136,0.029891,0.015625,0.027174,0.019701,0.009511,0.007473,0.017663,0.044837,0.013587,0.008832,0.021739,0.03125,0.008152,0.016984,0.033967,0.027853,0.034647,0.023777,0.030571,0.029212,0.013587,0.000679,0.012908,0.027174,0.003397,0.0,0.008152,0.0,0.001359,0.008832,0.021739,0.009511,0.01019,0.02038,0.027174,0.029212,0.01087,0.000679,0.013587,0.005435
10,28,1.0,873,0.42268,0.086769,0.195876,0.657839,0.0,0.0,0.0,0.0,0.0,1.0,-0.007148,0.828752,1.097018,0.061963,0.002809,0.0,2.04335,0.025258,0.019518,0.021814,0.02411,0.025258,0.01837,0.003444,0.012629,0.035591,0.009185,0.016073,0.006889,0.016073,0.017222,0.010333,0.033295,0.019518,0.011481,0.020666,0.022962,0.017222,0.008037,0.002296,0.021814,0.003444,0.001148,0.004592,0.002296,0.008037,0.019518,0.022962,0.019518,0.033295,0.013777,0.019518,0.011481,0.014925,0.006889,0.0,0.012629,0.01837,0.011481,0.017222,0.01837,0.005741,0.008037,0.012629,0.012629,0.012629,0.014925,0.006889,0.017222,0.017222,0.016073,0.005741,0.022962,0.020666,0.012629,0.027555,0.011481,0.021814,0.017222,0.026406
100,36,1.0,1092,0.572344,0.479295,0.611722,0.851369,0.354628,0.618954,0.754579,0.03022,0.086996,0.0,0.040463,1.2496,1.354306,6.08162,0.028404,0.0,0.868383,0.018727,0.012172,0.023408,0.003745,0.01779,0.024345,0.007491,0.014981,0.024345,0.020599,0.025281,0.011236,0.003745,0.013109,0.019663,0.004682,0.01779,0.016854,0.029963,0.01779,0.034644,0.022472,0.0103,0.02809,0.005618,0.0103,0.014045,0.003745,0.015918,0.015918,0.033708,0.011236,0.014981,0.022472,0.026217,0.009363,0.015918,0.031835,0.007491,0.025281,0.02809,0.029026,0.021536,0.013109,0.008427,0.0103,0.016854,0.003745,0.006554,0.012172,0.005618,0.008427,0.014981,0.016854,0.009363,0.008427,0.014981,0.019663,0.029026,0.0103,0.004682,0.0103,0.004682
1000,126,1.0,2800,0.46,0.171524,0.280357,0.554023,0.05242,0.278492,0.270357,0.021429,0.151429,0.0,-0.022495,0.92142,1.382249,2.254471,0.01452,0.0,1.14306,0.022054,0.014823,0.022415,0.024946,0.022054,0.0141,0.006146,0.015546,0.024946,0.016992,0.012292,0.015907,0.013377,0.02133,0.026392,0.017715,0.026392,0.011931,0.027477,0.017354,0.023861,0.016992,0.006508,0.019161,0.005785,0.003977,0.007954,0.003977,0.006146,0.010846,0.025307,0.015907,0.022415,0.022777,0.016269,0.0188,0.015184,0.016992,0.0047,0.014461,0.017354,0.010484,0.010123,0.011931,0.0094,0.007231,0.020607,0.011931,0.013738,0.008315,0.006146,0.016631,0.022054,0.018077,0.0047,0.009038,0.031092,0.019523,0.019523,0.016992,0.016269,0.0141,0.015907
10000,55,1.0,1484,0.401617,0.143843,0.030997,0.400789,0.106455,0.457949,0.708221,0.030997,0.659704,0.0,-0.000387,0.960747,1.196871,1.080241,0.009545,0.0,4.217,0.039835,0.015797,0.03022,0.025412,0.024038,0.012363,0.002747,0.019918,0.048077,0.006868,0.015797,0.009615,0.020604,0.009615,0.03228,0.023352,0.019918,0.012363,0.021978,0.015797,0.01511,0.003434,0.004121,0.013049,0.005495,0.001374,0.00206,0.00206,0.013736,0.014423,0.014423,0.013736,0.034341,0.017857,0.024725,0.024725,0.016484,0.006868,0.002747,0.006181,0.022665,0.013049,0.010302,0.008242,0.009615,0.004808,0.013736,0.011676,0.018544,0.012363,0.008242,0.019231,0.01511,0.012363,0.00206,0.015797,0.024038,0.010989,0.026099,0.018544,0.014423,0.015797,0.019231


In [3]:
# Summarise the dtypes and non-null counts of the assembled feature table.
features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18170 entries, 1 to 9993
Data columns (total 83 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   occ_total_sum        18170 non-null  int64  
 1   oldest_phylostratum  18170 non-null  float64
 2   cds_length           18170 non-null  int64  
 3   gc_cds               18170 non-null  float64
 4   dnase_gene           18170 non-null  float64
 5   dnase_cds            18170 non-null  float64
 6   H3k4me1_gene         18170 non-null  float64
 7   H3k4me3_gene         18170 non-null  float64
 8   H3k27ac_gene         18170 non-null  float64
 9   H3k4me1_cds          18170 non-null  float64
 10  H3k4me3_cds          18170 non-null  float64
 11  H3k27ac_cds          18170 non-null  float64
 12  lamin_gene           18170 non-null  float64
 13  repli_gene           18170 non-null  float64
 14  nsome_gene           18170 non-null  float64
 15  nsome_cds            18170 non-null  

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of both partitions as a quick sanity check.
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (14536, 83) (14536,)
Testing set shape: (3634, 83) (3634,)


***

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA

conf_cols = np.arange(2, X.shape[1])  # Columns of confounder variables (highly colinear)


class ConfounderPCA(BaseEstimator, TransformerMixin):
    """Optionally replace the confounder columns with their principal components.

    The first two columns of the input are always passed through unchanged.
    The columns listed in the module-level ``conf_cols`` are projected with a
    PCA when ``apply_PCA`` is true; otherwise the input is returned as is.

    Parameters
    ----------
    explained_variance : float or int, default=0.95
        Forwarded verbatim to ``PCA(n_components=...)``. A float strictly
        between 0 and 1 is the fraction of variance to retain; an integer is
        the number of components to keep.
    apply_PCA : bool, default=True
        Whether to apply the PCA at all.

    Attributes
    ----------
    pca : sklearn.decomposition.PCA
        The fitted PCA. Only present after ``fit`` has been called with
        ``apply_PCA=True``.
    """

    def __init__(self, explained_variance=0.95, apply_PCA=True):
        # Only store hyper-parameters here. The inner PCA is built in ``fit`` so
        # that values assigned later through ``set_params`` (as GridSearchCV does
        # on cloned estimators) are actually used.
        self.explained_variance = explained_variance
        self.apply_PCA = apply_PCA

    def fit(self, X, y=None):
        """Fit the PCA on the confounder columns of ``X`` (no-op when disabled).

        Returns
        -------
        self
        """
        if self.apply_PCA:
            # Build a fresh PCA from the *current* hyper-parameters on every fit.
            self.pca = PCA(n_components=self.explained_variance)
            self.pca.fit(X[:, conf_cols])
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the confounder columns replaced by PCA scores.

        When ``apply_PCA`` is false, ``X`` is returned unchanged.
        """
        if not self.apply_PCA:
            return X
        X_conf_pca = self.pca.transform(X[:, conf_cols])
        # Keep the two leading columns and append the projected confounders.
        return np.c_[X[:, :2], X_conf_pca]

In [8]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    """Area under the precision-recall curve (trapezoidal rule).

    Remember to wrap it as ``make_scorer(auprc, needs_proba=True)`` so that it
    receives continuous scores rather than hard labels (recent scikit-learn
    releases spell this ``response_method="predict_proba"``).

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels.
    y_scores : array-like
        Scores for the positive class, e.g. predicted probabilities.
    **kwargs
        Accepted for call compatibility; unused.

    Returns
    -------
    float
        Area under the curve of precision plotted against recall.
    """
    precisions, recalls, _ = precision_recall_curve(y_true, y_scores)
    # Integrate precision over recall. The previous version summed the areas of
    # precision-vs-threshold and recall-vs-threshold, which is not the PR-curve area.
    return auc(recalls, precisions)

*** 
#### Nested CV on Logistic Regression, Returning Information on the Best Parameter Configurations

Grids noted for reference: `np.logspace(-5, 2, 8)` and `np.logspace(-4, 2, 8)`.

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils._testing import ignore_warnings

# Hyper-parameter grid explored by the inner cross-validation.
# C is the inverse regularisation strength and must be strictly positive; the
# former 0 entry could only ever yield failed fits, so it has been removed.
param_grid = {
    "lr__C": [0.5, 1, 3, 5, 10, 15, 20, 100],
    "lr__class_weight": [None, "balanced"],
    "pca__apply_PCA": [True, False],
    # NOTE(review): an integer 1 is read by sklearn's PCA as "keep one component",
    # not "keep 100% of the variance" - confirm this is the intended setting.
    "pca__explained_variance": [0.95, 1],
}

# Model to tune: standardise, optionally PCA the confounders, then logistic regression.
lr_clf = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", ConfounderPCA()),
        ("lr", LogisticRegression(max_iter=2000)),
    ]
)

# Outer loop: 10 folds, used only to estimate generalisation performance.
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)

# Per-outer-fold results
roc_results = []
auprc_results = []
prec_results = []
rec_results = []
f1_results = []
found_params = []

print("OUTER CV | INNER CV | CHOSEN PARAMS")

for i, (train_ix, test_ix) in enumerate(cv_outer.split(X)):

    # Split data for this outer fold.
    # NOTE: these names overwrite the hold-out split created by train_test_split above.
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]

    # Inner loop: 3 folds for model selection; the seed is deterministic but
    # differs for every outer fold.
    cv_inner = KFold(n_splits=3, shuffle=True, random_state=i)

    search = GridSearchCV(estimator=lr_clf, param_grid=param_grid, scoring="roc_auc", cv=cv_inner, n_jobs=-1)

    # ignore_warnings expects a tuple of warning classes, not a list.
    with ignore_warnings(category=(ConvergenceWarning, FitFailedWarning)):
        result = search.fit(X_train, y_train)

    # Best configuration, refitted on the whole outer-training set
    best_model = result.best_estimator_

    # Hard labels for the threshold-dependent metrics ...
    yhat = best_model.predict(X_test)
    # ... and positive-class probabilities for the ranking metrics, so the outer
    # ROC-AUC is comparable with the inner `scoring="roc_auc"` estimate.
    y_score = best_model.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_score)

    # Store the results of this fold
    roc_results.append(roc_auc)
    auprc_results.append(auprc(y_test, y_score))
    prec_results.append(precision_score(y_test, yhat))
    rec_results.append(recall_score(y_test, yhat))
    f1_results.append(f1_score(y_test, yhat))
    found_params.append(result.best_params_)

    # Report progress: outer score, inner (selection) score, chosen parameters
    print("roc-auc=%.3f, est=%.3f, params=%s" % (roc_auc, result.best_score_, result.best_params_))

# Summarise the estimated performance of the model over the outer folds
print("ROC-AUC: %.3f (std = %.3f)" % (np.mean(roc_results), np.std(roc_results)))
print("(other scores stored in ncv_df)")

# One row per outer fold: the metrics followed by the selected hyper-parameters.
ncv_df = pd.DataFrame(
    {
        "ROC-AUC": roc_results,
        "AUPRC": auprc_results,
        "Precision": prec_results,
        "Recall": rec_results,
        "f1-score": f1_results,
    }
)
ncv_df = pd.concat([ncv_df, pd.DataFrame(found_params)], axis=1)

OUTER CV | INNER CV | CHOSEN PARAMS
