In [1]:
import pandas as pd
import eli5
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from os.path import exists
import train
import joblib
from utils import main, extract_features, split_data, save_to_pickle

In [2]:
def reduce_lung(threshold=1000):
    """Build and persist a variance-filtered LUSC + LUAD data set.

    Reads ``data/LUAD.pkl`` and ``data/LUSC.pkl``, labels their rows through a
    ``Target`` column (LUSC = 1, LUAD = 2), stacks both frames, drops every
    column whose standard deviation is below ``threshold`` and passes the
    result to ``save_to_pickle`` under the name ``"Reduced_Lung"``.

    Parameters
    ----------
    threshold : float, default 1000
        Minimum per-column standard deviation a column needs in order to be
        kept. The default reproduces the previously hard-coded cut-off.

    Returns
    -------
    None
        The filtered frame is only handed to ``save_to_pickle``.
    """
    # NOTE(review): unpickling can execute arbitrary code; only read files
    # that come from a trusted source.
    luad = pd.read_pickle("data/LUAD.pkl")
    lusc = pd.read_pickle("data/LUSC.pkl")

    lusc["Target"] = 1
    luad["Target"] = 2

    comb_df = pd.concat([lusc, luad])
    del lusc, luad  # the per-cohort frames are no longer needed

    # Keep the labels aside: the label column has a very small spread and is
    # therefore removed by the filter below for any realistic threshold.
    all_target = comb_df["Target"]

    # Compute the per-column standard deviation once instead of twice.
    col_std = comb_df.std()
    low_variance_cols = col_std[col_std < threshold].index
    df3 = comb_df.drop(columns=low_variance_cols)

    # Re-attach the labels (a no-op overwrite if the column survived).
    df3["Target"] = all_target
    save_to_pickle(df3, "Reduced_Lung")

In [3]:
# Regenerate the reduced data set only when its pickle is not on disk yet.
reduced_cache_found = exists("data/Reduced_Lung.pkl")
if not reduced_cache_found:
    reduce_lung()

In [4]:
# Load the frame stored at data/Reduced_Lung.pkl (presumably the file written
# by reduce_lung via save_to_pickle — confirm the helper's target path).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("data/Reduced_Lung.pkl")

In [5]:
# Separate the inputs from the labels, then derive the train/test partitions.
X, y = extract_features(df)
train_test_parts = split_data(X, y)
X_train, X_test, y_train, y_test = train_test_parts

# Column labels of X, passed to eli5 as feature names in the cells below.
feature_names = [column for column in X.columns]

In [6]:
# Train a classifier only when its model file is not present under models/.
# Presumably each train.run_* call writes models/<dataname>_<suffix>.mdl —
# confirm in the train module.
dataname = "reduced_LUSCLUAD"

# (file suffix, name of the training routine in `train`), in execution order.
trainer_by_suffix = (
    ("LR", "run_logistic_regression"),
    ("SVM", "run_svm"),
    ("DT", "run_decision_trees"),
    ("RF", "run_random_forest"),
    ("XGB", "run_xgboost"),
)

for suffix, trainer_name in trainer_by_suffix:
    if exists(f"models/{dataname}_{suffix}.mdl"):
        continue
    # The routine is looked up only when it is actually needed.
    getattr(train, trainer_name)(X_train, X_test, y_train, y_test, dataname)

In [7]:
def _model_path(tag):
    """Return the path of the saved model file carrying the given suffix."""
    return f"models/{dataname}_{tag}.mdl"


# Load the five persisted models from disk.
# NOTE(review): joblib.load unpickles data; only load files from a trusted source.
rf_model = joblib.load(_model_path("RF"))
lr_model = joblib.load(_model_path("LR"))
dt_model = joblib.load(_model_path("DT"))
svm_model = joblib.load(_model_path("SVM"))
xgb_model = joblib.load(_model_path("XGB"))

In [8]:
# Positional index of the test sample used by the per-sample explanation cells.
i = 10
# The list indexer selects that single row without reducing its dimensionality.
X_test.iloc[[i]]

Ensembl_ID,ENSG00000218512.2,ENSG00000218520.5,ENSG00000218521.1,ENSG00000218536.1,ENSG00000218549.1,ENSG00000218561.1,ENSG00000218565.2,ENSG00000218574.1,ENSG00000218577.1,ENSG00000218582.2,...,ENSGR0000275287.3,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__no_feature,__ambiguous,__too_low_aQual,__not_aligned,__alignment_not_unique
TCGA-37-3789-01A,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.70044,...,0.0,0.0,0.0,0.0,0.0,21.878318,21.20043,0.0,0.0,24.56453


In [9]:
# Label of the sample at position i; the list indexer keeps its index entry visible.
y_test.iloc[[i]]

TCGA-37-3789-01A    1
Name: Target, dtype: int64

In [10]:
# Score the model loaded as lr_model on X_test.
y_pred = lr_model.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred)
print("LR Accuracy: ", lr_accuracy)

lr_report = classification_report(y_test, y_pred)
print("Classification report:\n", lr_report)

# Display the weights of the pipeline step named "model" (top=20 limits the listing).
lr_estimator = lr_model.named_steps["model"]
eli5.show_weights(lr_estimator, feature_names=feature_names, top=20)

LR Accuracy:  0.956140350877193
Classification report:
               precision    recall  f1-score   support

           1       0.98      0.93      0.95       110
           2       0.94      0.98      0.96       118

    accuracy                           0.96       228
   macro avg       0.96      0.96      0.96       228
weighted avg       0.96      0.96      0.96       228



Weight?,Feature
+0.038,ENSG00000229807.8
+0.029,ENSG00000263711.4
+0.029,ENSG00000248713.1
+0.025,ENSG00000224237.1
+0.025,ENSG00000241388.4
+0.025,ENSG00000229119.3
+0.025,ENSG00000272620.1
… 21299 more positive …,… 21299 more positive …
… 15315 more negative …,… 15315 more negative …
-0.026,ENSG00000273760.1


In [11]:
# Explain the prediction for the test sample at position i.
# NOTE(review): the row is handed to the inner "model" step as-is, so any
# earlier steps of lr_model are not applied to it — confirm this matches how
# the model was trained.
lr_estimator = lr_model.named_steps["model"]
lr_sample = X_test.iloc[[i]]
eli5.show_prediction(
    lr_estimator,
    lr_sample,
    feature_names=feature_names,
    show_feature_values=True,
    top=20,
)

Contribution?,Feature,Value
+0.599,ENSG00000241794.1,12.271
+0.412,ENSG00000230937.8,12.890
+0.290,ENSG00000265190.5,12.688
+0.264,ENSG00000273760.1,10.291
+0.256,ENSG00000261116.1,11.668
+0.252,ENSG00000256812.1,10.821
+0.249,ENSG00000233864.6,9.617
+0.245,ENSG00000251381.5,11.998
+0.214,ENSG00000226084.5,10.746
+0.212,ENSG00000244094.1,6.658


In [12]:
# Score the model loaded as svm_model on X_test.
y_pred = svm_model.predict(X_test)

svm_accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy: ", svm_accuracy)

svm_report = classification_report(y_test, y_pred)
print("Classification report:\n", svm_report)

# Display the weights of the pipeline step named "model" (top=20 limits the listing).
svm_estimator = svm_model.named_steps["model"]
eli5.show_weights(svm_estimator, feature_names=feature_names, top=20)

SVM Accuracy:  0.956140350877193
Classification report:
               precision    recall  f1-score   support

           1       0.98      0.93      0.95       110
           2       0.94      0.98      0.96       118

    accuracy                           0.96       228
   macro avg       0.96      0.96      0.96       228
weighted avg       0.96      0.96      0.96       228



Weight?,Feature
+3.297,<BIAS>
+0.005,ENSG00000229807.8
+0.005,ENSG00000248713.1
+0.004,ENSG00000263711.4
+0.004,ENSG00000224237.1
+0.004,ENSG00000229119.3
… 20190 more positive …,… 20190 more positive …
… 16424 more negative …,… 16424 more negative …
-0.004,ENSG00000253954.3
-0.004,ENSG00000224040.1


In [13]:
# Explain the prediction for the test sample at position i.
# NOTE(review): the row is handed to the inner "model" step as-is, so any
# earlier steps of svm_model are not applied to it — confirm this matches how
# the model was trained.
svm_estimator = svm_model.named_steps["model"]
svm_sample = X_test.iloc[[i]]
eli5.show_prediction(
    svm_estimator,
    svm_sample,
    feature_names=feature_names,
    show_feature_values=True,
    top=20,
)

Contribution?,Feature,Value
+0.082,ENSG00000241794.1,12.271
+0.059,ENSG00000230937.8,12.890
+0.043,ENSG00000265190.5,12.688
+0.039,ENSG00000273760.1,10.291
+0.037,ENSG00000251381.5,11.998
+0.037,ENSG00000261116.1,11.668
+0.036,ENSG00000256812.1,10.821
+0.036,ENSG00000226084.5,10.746
+0.032,ENSG00000233864.6,9.617
+0.027,ENSG00000244094.1,6.658


In [14]:
# Score the model loaded as rf_model on X_test.
y_pred = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)
print("RF Accuracy: ", rf_accuracy)

rf_report = classification_report(y_test, y_pred)
print("Classification report:\n", rf_report)

# Display the weights eli5 derives for the pipeline step named "model".
rf_estimator = rf_model.named_steps["model"]
eli5.show_weights(rf_estimator, feature_names=feature_names, top=20)

RF Accuracy:  0.9429824561403509
Classification report:
               precision    recall  f1-score   support

           1       0.96      0.92      0.94       110
           2       0.93      0.97      0.95       118

    accuracy                           0.94       228
   macro avg       0.94      0.94      0.94       228
weighted avg       0.94      0.94      0.94       228



Weight,Feature
0.0120  ± 0.1473,ENSG00000271134.1
0.0077  ± 0.0958,ENSG00000230943.1
0.0074  ± 0.1124,ENSG00000267284.1
0.0073  ± 0.0940,ENSG00000265933.4
0.0072  ± 0.0936,ENSG00000266729.4
0.0069  ± 0.1195,ENSG00000230937.8
0.0068  ± 0.1061,ENSG00000231648.1
0.0063  ± 0.0935,ENSG00000260581.1
0.0062  ± 0.0890,ENSG00000267325.1
0.0057  ± 0.0858,ENSG00000225548.4


In [15]:
# Score the model loaded as dt_model on X_test.
y_pred = dt_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred)
print("DT Accuracy: ", dt_accuracy)

dt_report = classification_report(y_test, y_pred)
print("Classification report:\n", dt_report)

# Display the weights eli5 derives for the pipeline step named "model".
dt_estimator = dt_model.named_steps["model"]
eli5.show_weights(dt_estimator, feature_names=feature_names, top=20)

DT Accuracy:  0.9078947368421053
Classification report:
               precision    recall  f1-score   support

           1       0.89      0.92      0.91       110
           2       0.92      0.90      0.91       118

    accuracy                           0.91       228
   macro avg       0.91      0.91      0.91       228
weighted avg       0.91      0.91      0.91       228



Weight,Feature
0.6389,ENSG00000230937.8
0.0980,ENSG00000276644.3
0.0453,ENSG00000272894.4
0.0421,ENSG00000226084.5
0.0286,ENSG00000253258.1
0.0264,ENSG00000271615.1
0.0184,ENSG00000279149.1
0.0179,ENSG00000261405.2
0.0178,ENSG00000230943.1
0.0142,ENSG00000261613.2


In [16]:
# Explain the prediction for the test sample at position i.
# NOTE(review): the row is handed to the inner "model" step as-is, so any
# earlier steps of dt_model are not applied to it — confirm this matches how
# the model was trained.
dt_estimator = dt_model.named_steps["model"]
dt_sample = X_test.iloc[[i]]
eli5.show_prediction(
    dt_estimator,
    dt_sample,
    feature_names=feature_names,
    show_feature_values=True,
    top=20,
)

Contribution?,Feature,Value
0.5,<BIAS>,1.0
0.442,ENSG00000230937.8,12.89
0.034,ENSG00000272894.4,5.17
0.018,ENSG00000230943.1,6.375
0.003,ENSG00000236360.2,6.57
0.003,ENSG00000268204.1,3.907


In [17]:
# Score the model loaded as xgb_model on X_test.
y_pred = xgb_model.predict(X_test)

xgb_accuracy = accuracy_score(y_test, y_pred)
print("XGB Accuracy: ", xgb_accuracy)

xgb_report = classification_report(y_test, y_pred)
print("Classification report:\n", xgb_report)

XGB Accuracy:  0.9473684210526315
Classification report:
               precision    recall  f1-score   support

           1       0.96      0.93      0.94       110
           2       0.93      0.97      0.95       118

    accuracy                           0.95       228
   macro avg       0.95      0.95      0.95       228
weighted avg       0.95      0.95      0.95       228

