In [1]:
import os
from os.path import exists

import eli5
import joblib
import pandas as pd
from sklearn import svm
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

import train
from utils import main

### Download and process data

In [3]:
# Make sure every cohort has a cached pickle under data/ before loading.
datasets = ["LUAD", "LUSC", "KIRP", "KIRC"]
for data in datasets:
    # The existence check runs on every iteration, so pickles written by an
    # earlier main() call are seen before main() would be invoked again.
    if exists(f"data/{data}.pkl"):
        continue
    main()


Downloading https://gdc.xenahubs.net/download/TCGA-KIRC.htseq_counts.tsv.gz

Done downloading...
.
.
.
.
Extracting
Extraction done
Converting to pickle


### Load Preprocessed data

In [4]:
# Load the two cached cohort tables (one pickle per cohort under data/).
# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load pickles that were generated locally by the download step.
LUAD = pd.read_pickle("data/LUAD.pkl")
LUSC = pd.read_pickle("data/LUSC.pkl")

In [5]:
LUAD.shape , LUSC.shape

((586, 60488), (551, 60488))

In [6]:
# Balance the classes: keep only as many LUAD rows (the first ones) as LUSC has.
LUAD = LUAD.iloc[:len(LUSC)]

In [7]:
# Class labels used by every model below: 1 = LUAD, 2 = LUSC.
# The column is appended last, which extract_features() relies on.
LUAD["Target"] = 1
LUSC["Target"] = 2

In [8]:
# Stack both labelled cohorts into one frame, then drop the per-cohort
# references so their memory can be reclaimed.
df = pd.concat((LUAD, LUSC))

del LUAD, LUSC

In [9]:
def extract_features(df):
    """Split a labelled frame into a feature matrix and a label vector.

    The label is selected by name rather than by position, so a ``Target``
    column that is not the last column can never leak into the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``Target`` column plus the feature columns.

    Returns
    -------
    X : pandas.DataFrame
        Every column of ``df`` except ``Target``, in their original order.
    Y : pandas.Series
        The ``Target`` column.

    Raises
    ------
    KeyError
        If ``df`` has no ``Target`` column.
    """
    Y = df['Target']
    X = df.drop(columns='Target')
    return X,Y

def split_data(X, Y, test_size=0.2, random_state=42):
    """Split features and labels into train and test partitions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.Series
        Labels aligned with the rows of ``X``.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.2).
    random_state : int, optional
        Seed for the shuffle so the split is reproducible (default 42).

    Returns
    -------
    tuple
        ``(Xtrain, Xtest, Ytrain, Ytest)``.
    """
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return Xtrain, Xtest, Ytrain, Ytest

In [10]:
# Separate features from labels; the combined frame is not needed afterwards.
X, Y = extract_features(df)
del df

# Remember the column names for the eli5 explanations further down.
feature_names = X.columns.tolist()

Xtrain, Xtest, Ytrain, Ytest = split_data(X, Y)
del X, Y

In [11]:
# Train each classifier only when its serialized model file is missing, so a
# notebook re-run reuses whatever already sits under models/.
# NOTE(review): assumes every train.run_* call writes
# models/{dataname}_<SUFFIX>.mdl (the paths tested here and loaded in the
# next cell) — confirm against train.py.
dataname = "LUSCLUAD"
if not exists(f"models/{dataname}_LR.mdl"):
    train.run_logistic_regression(Xtrain, Xtest, Ytrain, Ytest, dataname)
if not exists(f"models/{dataname}_SVM.mdl"):
    train.run_svm(Xtrain, Xtest, Ytrain, Ytest, dataname)
if not exists(f"models/{dataname}_DT.mdl"):
    train.run_decision_trees(Xtrain, Xtest, Ytrain, Ytest, dataname)
if not exists(f"models/{dataname}_RF.mdl"):
    train.run_random_forest(Xtrain, Xtest, Ytrain, Ytest, dataname)

### Loading the models

In [12]:
# Load the four serialized models for the current `dataname`.
# NOTE(review): joblib.load unpickles the file and can therefore run arbitrary
# code; only load model files that were produced locally.
rf_model = joblib.load(f"models/{dataname}_RF.mdl")
lr_model = joblib.load(f"models/{dataname}_LR.mdl")
dt_model = joblib.load(f"models/{dataname}_DT.mdl")
svm_model = joblib.load(f"models/{dataname}_SVM.mdl")

### Comparing results

In [13]:
# Positional index (not an index label) of the one test sample that the
# per-prediction explanations further down inspect.
i = 11
Xtest.iloc[[i]]

Ensembl_ID,ENSG00000000003.13,ENSG00000000005.5,ENSG00000000419.11,ENSG00000000457.12,ENSG00000000460.15,ENSG00000000938.11,ENSG00000000971.14,ENSG00000001036.12,ENSG00000001084.9,ENSG00000001167.13,...,ENSGR0000275287.3,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__no_feature,__ambiguous,__too_low_aQual,__not_aligned,__alignment_not_unique
TCGA-44-5645-11A,10.343186,2.0,10.118941,9.712527,7.022368,11.812177,11.815383,11.486332,9.667112,10.228819,...,0.0,0.0,0.0,0.0,0.0,21.235978,20.967202,0.0,0.0,23.483737


In [14]:
Ytest.iloc[[i]]

TCGA-44-5645-11A    1
Name: Target, dtype: int64

##### SVM prediction accuracy

In [15]:
# Score the SVM pipeline on the held-out split, then display the weights
# eli5 extracts from its final "model" step.
y_pred = svm_model.predict(Xtest)
svm_accuracy = accuracy_score(Ytest, y_pred)
svm_report = classification_report(Ytest, y_pred)

print("SVM Accuracy: ", svm_accuracy)
print("Classification report:\n", svm_report)

eli5.show_weights(svm_model.named_steps["model"], feature_names=feature_names)

SVM Accuracy:  0.9502262443438914
Classification report:
               precision    recall  f1-score   support

           1       0.96      0.95      0.95       118
           2       0.94      0.95      0.95       103

    accuracy                           0.95       221
   macro avg       0.95      0.95      0.95       221
weighted avg       0.95      0.95      0.95       221



Weight?,Feature
+0.005,ENSG00000134757.4
+0.004,ENSG00000178363.4
+0.004,ENSG00000205420.9
+0.004,ENSG00000186081.10
+0.004,ENSG00000197641.10
+0.003,ENSG00000134762.15
+0.003,ENSG00000185479.5
+0.003,ENSG00000186847.5
+0.003,ENSG00000169474.4
+0.003,ENSG00000251039.2


##### Logistic Regression Accuracy

In [16]:
# Score the logistic-regression pipeline on the held-out split, then display
# the weights eli5 extracts from its final "model" step.
y_pred = lr_model.predict(Xtest)
lr_accuracy = accuracy_score(Ytest, y_pred)
lr_report = classification_report(Ytest, y_pred)

print("LR Accuracy: ", lr_accuracy)
print("Classification report:\n", lr_report)

eli5.show_weights(lr_model.named_steps["model"], feature_names=feature_names)

LR Accuracy:  0.9592760180995475
Classification report:
               precision    recall  f1-score   support

           1       0.97      0.96      0.96       118
           2       0.95      0.96      0.96       103

    accuracy                           0.96       221
   macro avg       0.96      0.96      0.96       221
weighted avg       0.96      0.96      0.96       221



Weight?,Feature
+0.048,ENSG00000134757.4
+0.040,ENSG00000205420.9
+0.039,ENSG00000178363.4
+0.037,ENSG00000186081.10
+0.033,ENSG00000197641.10
+0.032,ENSG00000185479.5
+0.032,ENSG00000134762.15
+0.032,ENSG00000186847.5
+0.026,ENSG00000169469.8
+0.025,ENSG00000143556.7


##### Random Forest Accuracy

In [17]:
# Score the random-forest pipeline on the held-out split, then display the
# feature importances eli5 extracts from its final "model" step.
y_pred = rf_model.predict(Xtest)
rf_accuracy = accuracy_score(Ytest, y_pred)
rf_report = classification_report(Ytest, y_pred)

print("RF Accuracy: ", rf_accuracy)
print("Classification report:\n", rf_report)

eli5.show_weights(rf_model.named_steps["model"], feature_names=feature_names)

RF Accuracy:  0.9411764705882353
Classification report:
               precision    recall  f1-score   support

           1       0.95      0.94      0.94       118
           2       0.93      0.94      0.94       103

    accuracy                           0.94       221
   macro avg       0.94      0.94      0.94       221
weighted avg       0.94      0.94      0.94       221



Weight,Feature
0.0136  ± 0.1657,ENSG00000180739.13
0.0132  ± 0.1748,ENSG00000260581.1
0.0104  ± 0.1381,ENSG00000069849.9
0.0096  ± 0.1345,ENSG00000186081.10
0.0093  ± 0.1432,ENSG00000094796.4
0.0086  ± 0.1205,ENSG00000169474.4
0.0072  ± 0.1177,ENSG00000112378.11
0.0072  ± 0.1150,ENSG00000251381.5
0.0068  ± 0.1253,ENSG00000271134.1
0.0067  ± 0.1120,ENSG00000224984.1


##### Decision Trees Accuracy

In [18]:
# Score the decision-tree pipeline on the held-out split, then display the
# feature importances eli5 extracts from its final "model" step.
y_pred = dt_model.predict(Xtest)
dt_accuracy = accuracy_score(Ytest, y_pred)
dt_report = classification_report(Ytest, y_pred)

print("DT Accuracy: ", dt_accuracy)
print("Classification report:\n", dt_report)

eli5.show_weights(dt_model.named_steps["model"], feature_names=feature_names)

DT Accuracy:  0.8914027149321267
Classification report:
               precision    recall  f1-score   support

           1       0.89      0.92      0.90       118
           2       0.90      0.86      0.88       103

    accuracy                           0.89       221
   macro avg       0.89      0.89      0.89       221
weighted avg       0.89      0.89      0.89       221



Weight,Feature
0.7021,ENSG00000180739.13
0.0751,ENSG00000154227.12
0.0380,ENSG00000236801.1
0.0357,ENSG00000073754.5
0.0328,ENSG00000241794.1
0.0256,ENSG00000243974.1
0.0241,ENSG00000185479.5
0.0178,ENSG00000272465.1
0.0139,ENSG00000231645.2
0.0097,ENSG00000178078.10


#### Decision Tree

In [None]:
# Explain the decision tree's prediction for test sample i.
# NOTE(review): the raw Xtest row is passed straight to the final "model"
# step; if the pipeline has earlier transform steps (e.g. scaling) they are
# bypassed here — confirm against train.py.
eli5.show_prediction(dt_model.named_steps["model"], 
                     Xtest.iloc[[i]],
                     feature_names=feature_names, show_feature_values=True)

#### Random forest

In [None]:
# Explain the random forest's prediction for test sample i.
# NOTE(review): only the final "model" step sees the row, so any earlier
# pipeline transforms are skipped — confirm this matches how it was trained.
eli5.show_prediction(rf_model.named_steps["model"],
                     Xtest.iloc[[i]],
                     feature_names=feature_names, show_feature_values=True)

#### Logistic Regression

In [None]:
# Explain the logistic regression's prediction for test sample i.
# NOTE(review): only the final "model" step sees the row, so any earlier
# pipeline transforms are skipped — confirm this matches how it was trained.
eli5.show_prediction(lr_model.named_steps["model"],
                     Xtest.iloc[[i]],
                     feature_names=feature_names, show_feature_values=True)

#### SVM

In [None]:
# Explain the SVM's prediction for test sample i.
# NOTE(review): only the final "model" step sees the row, so any earlier
# pipeline transforms are skipped — confirm this matches how it was trained.
eli5.show_prediction(svm_model.named_steps["model"],
                     Xtest.iloc[[i]],
                     feature_names=feature_names, show_feature_values=True)

# What is "BIAS"?

```
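Here the explanation for a single prediction is calculated by following the decision path in the tree and adding up the contribution of each feature from each node crossed into the overall predicted probability. <BIAS> is the starting point of that sum: the score at the root of the tree, before any feature has been consulted. So basically, BIAS plus all the feature contributions combined gives the predicted probability.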
```

## Running on KIRC and KIRP

In [None]:
# Load the two cached kidney cohort tables (one pickle per cohort under data/).
# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load pickles that were generated locally by the download step.
KIRC = pd.read_pickle("data/KIRC.pkl")
KIRP = pd.read_pickle("data/KIRP.pkl")

In [None]:
KIRC.shape, KIRP.shape

In [None]:
# Balance the classes: keep only as many KIRC rows (the first ones) as KIRP has.
KIRC = KIRC.iloc[:len(KIRP)]

In [None]:
# Label the cohorts (1 = KIRC, 2 = KIRP), stack them into one frame, and
# drop the per-cohort references so their memory can be reclaimed.
KIRC["Target"], KIRP["Target"] = 1, 2
df = pd.concat((KIRC, KIRP))
del KIRC, KIRP

In [None]:
# Separate features from labels and keep the column names for eli5.
X, Y = extract_features(df)
feature_names = X.columns.tolist()

Xtrain, Xtest, Ytrain, Ytest = split_data(X, Y)

In [None]:
del df

In [None]:
# Train (or reuse) the KIRC-vs-KIRP models and rebind the *_model names to them.
# The trainers live in the `train` module and take the dataset name as their
# last argument, mirroring the LUSC/LUAD training cell; the bare run_* names
# are not defined in this notebook.
# NOTE(review): assumes every train.run_* call writes
# models/{dataname}_<SUFFIX>.mdl — confirm against train.py.
dataname = "KIRCKIRP"
if not exists(f"models/{dataname}_LR.mdl"):
    train.run_logistic_regression(Xtrain, Xtest, Ytrain, Ytest, dataname)
if not exists(f"models/{dataname}_SVM.mdl"):
    train.run_svm(Xtrain, Xtest, Ytrain, Ytest, dataname)
if not exists(f"models/{dataname}_DT.mdl"):
    train.run_decision_trees(Xtrain, Xtest, Ytrain, Ytest, dataname)
if not exists(f"models/{dataname}_RF.mdl"):
    train.run_random_forest(Xtrain, Xtest, Ytrain, Ytest, dataname)

# Without this reload the explanation cells below would still use the
# LUSC/LUAD models loaded earlier in the notebook.
rf_model = joblib.load(f"models/{dataname}_RF.mdl")
lr_model = joblib.load(f"models/{dataname}_LR.mdl")
dt_model = joblib.load(f"models/{dataname}_DT.mdl")
svm_model = joblib.load(f"models/{dataname}_SVM.mdl")

In [None]:
# Positional index (not an index label) of the one KIRC/KIRP test sample
# that the per-prediction explanations below inspect.
i = 12
Xtest.iloc[[i]]

In [None]:
Ytest.iloc[[i]]

#### Decision Tree

In [None]:
# Explain the decision tree's prediction for KIRC/KIRP test sample i.
# NOTE(review): dt_model must refer to the KIRC/KIRP pipeline when this runs;
# verify it was rebound after the LUSC/LUAD section.
eli5.show_prediction(dt_model.named_steps["model"], 
                     Xtest.iloc[[i]],
                     feature_names=feature_names, show_feature_values=True)

#### Random forest

In [None]:
# Explain the random forest's prediction for KIRC/KIRP test sample i.
# NOTE(review): rf_model must refer to the KIRC/KIRP pipeline when this runs;
# verify it was rebound after the LUSC/LUAD section.
eli5.show_prediction(rf_model.named_steps["model"],
                     Xtest.iloc[[i]],
                     feature_names=feature_names, show_feature_values=True)

#### Logistic Regression

In [None]:
# Explain the logistic regression's prediction for KIRC/KIRP test sample i.
# NOTE(review): lr_model must refer to the KIRC/KIRP pipeline when this runs;
# verify it was rebound after the LUSC/LUAD section.
eli5.show_prediction(lr_model.named_steps["model"],
                     Xtest.iloc[[i]],
                     feature_names=feature_names, show_feature_values=True)

#### SVM

In [None]:
# Explain the SVM's prediction for KIRC/KIRP test sample i.
# NOTE(review): svm_model must refer to the KIRC/KIRP pipeline when this runs;
# verify it was rebound after the LUSC/LUAD section.
eli5.show_prediction(svm_model.named_steps["model"],
                     Xtest.iloc[[i]],
                     feature_names=feature_names, show_feature_values=True)

In [None]:
# Persist the logistic-regression pipeline under model/.
# The directory is created before the dump that needs it, and exist_ok keeps
# the cell re-runnable (a plain `mkdir` fails once the directory exists).
model_name = "KIRCKIRP_lin_reg.mdl"
os.makedirs("model", exist_ok=True)
joblib.dump(lr_model, f"./model/{model_name}")

In [None]:
# Read back the file written by the joblib.dump cell above to check the round trip.
# NOTE(review): joblib.load unpickles the file; only load locally produced models.
lr = joblib.load("model/KIRCKIRP_lin_reg.mdl")

In [None]:
# Explain test sample i with the reloaded model; the output should match the
# explanation produced from lr_model before it was saved.
eli5.show_prediction(lr.named_steps["model"],
                     Xtest.iloc[[i]],
                     feature_names=feature_names, show_feature_values=True)