In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option("display.max_columns", 85)

In [161]:
def fetch_data(drop_some=True):

    df = pd.read_csv("../data/abnormal_writeout.data.csv")

    if drop_some:
        # trascurare da ACC a UVM
        start_drop = df.columns.get_loc("ACC")
        end_drop = df.columns.get_loc("UVM")
        cols = np.arange(start_drop, end_drop + 1)
        df.drop(df.columns[cols], axis=1, inplace=True)

        # trascurare old_phylo_factor e la prima colonna
        df.drop("oldest_phylostratum_factor", axis=1, inplace=True)
        df.drop("Unnamed: 0", axis=1, inplace=True)

    return df


def separate_data(df):
    resp = df["response"].to_numpy()
    occ = df["occ_total_sum"]
    age = df["oldest_phylostratum"].to_numpy()
    conf = df.drop(labels=["response", "occ_total_sum", "oldest_phylostratum"], axis=1).to_numpy()
    return occ, age, conf, resp


def get_PCA(X, expl_var=0.95, plot=False):

    pca_test = PCA()
    pca_test.fit(X)
    cumsum = np.cumsum(pca_test.explained_variance_ratio_)
    d = np.argmax(cumsum >= expl_var) + 1

    # Apply PCA with d components
    pca_apply = PCA(n_components=d)
    X_PCA = pca_apply.fit_transform(X)

    print("Using {} principal components.".format(d))
    print(f"Reduced features by {(784-d)/784*100:.2f} percent.")
    if plot == True:
        sns.heatmap(pd.DataFrame(X_PCA).corr())
        plt.show()

    return X_PCA

# Random Forest

This notebook contains assessment of a random forest classifier. 

The data is obtained using 85 columns of `abnormal_writeout.data.csv`. 

Most of these 85 columns are highly colinear "confounder" variables against which we are comparing the `occ_total_sum` (total mutations of the gene across all cancers) and `oldest_phylostratum` (evolutionary age of the gene) variables. We apply PCA to the confounder variables and keep PCs up to 95% explained variance.

### Import, Clean, and apply PCA to data

* Start with first 85 features
* Apply PCA to confounding factors
* Keep PCs up to 95% explained variance 

In [63]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Data
df = fetch_data()  # Get
df.dropna(inplace=True)  # Clean
X_occ, X_age, X_conf, Y = separate_data(df)  # Separate
X_conf_pca = get_PCA(X_conf, plot=False)  # PCA

# All features, confounders with PCA
X = np.c_[X_occ, X_age, X_conf_pca]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("Feature matrix shape:", x_train.shape, x_test.shape)

Using 47 principal components.
Reduced features by 94.01 percent.
Feature matrix shape: (14536, 49) (3634, 49)


In [64]:
from sklearn.ensemble import RandomForestClassifier

ran_for = RandomForestClassifier(n_estimators=100, random_state=42)  # 100->2000 estimators, seeded
ran_for.fit(x_train, y_train)
print("Using a random forest.")

Using a random forest.


In [65]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_predict

pred = ran_for.predict(x_test)
pred_proba = ran_for.predict_proba(x_test)

# View the classification report for test data and predictions
print(classification_report(y_test, pred))
print(f"AUC {roc_auc_score(y_test, pred_proba[:, 1]):.3f}")

              precision    recall  f1-score   support

           0       0.77      0.99      0.87      2791
           1       0.60      0.03      0.06       843

    accuracy                           0.77      3634
   macro avg       0.69      0.51      0.46      3634
weighted avg       0.73      0.77      0.68      3634

AUC 63.801%


***

## Correcting Class Imbalance

> Ensemble method in which each tree of the forest will be provided a balanced bootstrap sample. This class provides all functionality of the RandomForestClassifier.

In [66]:
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
brf.fit(x_train, y_train)

BalancedRandomForestClassifier(random_state=42)

In [67]:
pred = brf.predict(x_test)
pred_proba = brf.predict_proba(x_test)

print(classification_report(y_test, pred))
print(f"AUC: {roc_auc_score(y_test, pred_proba[:, 1]):.3f}")

              precision    recall  f1-score   support

           0       0.83      0.64      0.72      2791
           1       0.33      0.58      0.42       843

    accuracy                           0.63      3634
   macro avg       0.58      0.61      0.57      3634
weighted avg       0.72      0.63      0.65      3634

AUC: 66.198%


## Radical Feature Selection


[See here](https://stackoverflow.com/questions/42562146/classification-report-with-nested-cross-validation-in-sklearn-average-individua) for a tutorial on creating a custom scoring metric to use with sklearn's `cross_validation`.

**sklearn.metrics.auc**

min_samples_split = 2, 5, 10, 20  
min_samples_leaf = 1, 2, 5, 10, 20

score: AUC, AU precision recall, precision, recall, f1

In [198]:
n_estimators = 200

***

In [212]:
df = fetch_data()
df.dropna(inplace=True)
X_occ, X_age, X_conf, Y = separate_data(df)
X_conf_pca = get_PCA(X_conf, plot=False)  # PCA

X = np.c_[X_occ, X_age, X_conf_pca]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Using 47 principal components.
Reduced features by 94.01 percent.


In [213]:
from imblearn.metrics import specificity_score
from sklearn.base import clone
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

model = RandomForestClassifier(n_estimators=n_estimators, random_state=42, n_jobs=-1)

n_splits = 10
r_scores = np.zeros(n_splits)
auc_scores = np.zeros(n_splits)
skfolds = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

i = 0
for train_index, test_index in skfolds.split(x_train, y_train):
    clone_clf = clone(model)
    X_train_folds = x_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = x_train[test_index]
    y_test_fold = y_train[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred_proba = clone_clf.predict_proba(X_test_fold)
    y_pred = clone_clf.predict(X_test_fold)
    r_scores[i] = recall_score(y_test_fold, y_pred)
    auc_scores[i] = roc_auc_score(y_test_fold, y_pred_proba[:, 1])
    i += 1

print("All PCs, Unbalanced:")
print(f"Recall: {np.mean(r_scores):.4f} +/_ {np.std(r_scores):.5f}")
print(f"AUC: {np.mean(auc_scores):.4f} +/_ {np.std(auc_scores):.5f}")

All PCs, Unbalanced:
Recall: 0.0325 +/_ 0.00908
AUC: 0.6498 +/_ 0.01184


***

**Keep most significant PCs: 3, 9, 4, 6, 1, 23, 12, 46**

In [221]:
X_conf_pca_best = X_conf_pca[:, [1, 3, 4, 6, 9, 12, 23, 46]]
X_conf_pca_best = X_conf_pca[:, [0, 2, 3, 5, 8, 11, 22, 45]]

# Sum of mutations, age, and best PCs
X = np.c_[X_occ, X_age, X_conf_pca_best]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [222]:
model = RandomForestClassifier(n_estimators=n_estimators, random_state=42, n_jobs=-1)
n_splits = 10
r_scores = np.zeros(n_splits)
auc_scores = np.zeros(n_splits)
skfolds = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

i = 0
for train_index, test_index in skfolds.split(x_train, y_train):
    clone_clf = clone(model)
    X_train_folds = x_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = x_train[test_index]
    y_test_fold = y_train[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred_proba = clone_clf.predict_proba(X_test_fold)
    y_pred = clone_clf.predict(X_test_fold)
    r_scores[i] = recall_score(y_test_fold, y_pred)
    auc_scores[i] = roc_auc_score(y_test_fold, y_pred_proba[:, 1])
    i += 1

print("Paper PCs, Unbalanced:")
print(f"Recall: {np.mean(r_scores):.4f} +/_ {np.std(r_scores):.5f}")
print(f"AUC: {np.mean(auc_scores):.4f} +/_ {np.std(auc_scores):.5f}")

Paper PCs, Unbalanced:
Recall: 0.0620 +/_ 0.00952
AUC: 0.6502 +/_ 0.01535


***

In [227]:
model = BalancedRandomForestClassifier(n_estimators=n_estimators, random_state=42, n_jobs=-1)
n_splits = 10
r_scores = np.zeros(n_splits)
auc_scores = np.zeros(n_splits)
skfolds = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

i = 0
for train_index, test_index in skfolds.split(x_train, y_train):
    clone_clf = clone(model)
    X_train_folds = x_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = x_train[test_index]
    y_test_fold = y_train[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred_proba = clone_clf.predict_proba(X_test_fold)
    y_pred = clone_clf.predict(X_test_fold)
    r_scores[i] = recall_score(y_test_fold, y_pred)
    auc_scores[i] = roc_auc_score(y_test_fold, y_pred_proba[:, 1])
    i += 1

print("Paper PCs, Balanced:")
print(f"Recall: {np.mean(r_scores):.4f} +/_ {np.std(r_scores):.5f}")
print(f"AUC: {np.mean(auc_scores):.4f} +/_ {np.std(auc_scores):.5f}")

Paper PCs, Balanced:
Recall: 0.6332 +/_ 0.02494
AUC: 0.6678 +/_ 0.00947


***

**Keep first 8 PCs**

In [230]:
X_conf_pca_best = X_conf_pca[:, :8]
# Sum of mutations, age, and best PCs
X = np.c_[X_occ, X_age, X_conf_pca_best]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [231]:
model = RandomForestClassifier(n_estimators=n_estimators, random_state=42, n_jobs=-1)
n_splits = 10
r_scores = np.zeros(n_splits)
auc_scores = np.zeros(n_splits)
skfolds = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

i = 0
for train_index, test_index in skfolds.split(x_train, y_train):
    clone_clf = clone(model)
    X_train_folds = x_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = x_train[test_index]
    y_test_fold = y_train[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred_proba = clone_clf.predict_proba(X_test_fold)
    y_pred = clone_clf.predict(X_test_fold)
    r_scores[i] = recall_score(y_test_fold, y_pred)
    auc_scores[i] = roc_auc_score(y_test_fold, y_pred_proba[:, 1])
    i += 1

print("Top PCs, Unbalanced:")
print(f"Recall: {np.mean(r_scores):.4f} +/_ {np.std(r_scores):.5f}")
print(f"AUC: {np.mean(auc_scores):.4f} +/_ {np.std(auc_scores):.5f}")

Top PCs, Unbalanced:
Recall: 0.0735 +/_ 0.01172
AUC: 0.6576 +/_ 0.01378


***

**First 10 PCs, balanced**

In [232]:
model = BalancedRandomForestClassifier(n_estimators=n_estimators, random_state=42, n_jobs=-1)
n_splits = 10
r_scores = np.zeros(n_splits)
auc_scores = np.zeros(n_splits)
skfolds = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

i = 0
for train_index, test_index in skfolds.split(x_train, y_train):
    clone_clf = clone(model)
    X_train_folds = x_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = x_train[test_index]
    y_test_fold = y_train[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred_proba = clone_clf.predict_proba(X_test_fold)
    y_pred = clone_clf.predict(X_test_fold)
    r_scores[i] = recall_score(y_test_fold, y_pred)
    auc_scores[i] = roc_auc_score(y_test_fold, y_pred_proba[:, 1])
    i += 1

print("Top PCs, Balanced:")
print(f"Recall: {np.mean(r_scores):.4f} +/_ {np.std(r_scores):.5f}")
print(f"AUC: {np.mean(auc_scores):.4f} +/_ {np.std(auc_scores):.5f}")

Top PCs, Balanced:
Recall: 0.6378 +/_ 0.02928
AUC: 0.6696 +/_ 0.01249
