In [41]:
import pandas as pd
import eli5
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from os.path import exists
import train
import joblib
from utils import main

In [2]:
# Load the LUAD table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
luad = pd.read_pickle("data/LUAD.pkl")

In [3]:
# Standard deviation of every column of `luad`, sorted in ascending order.
x = luad.std().sort_values()

In [4]:
# Average of the per-column standard deviations of `luad` (a single scalar).
x.mean()

1249.2147676484396

In [5]:
# Load the LUSC table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
lusc = pd.read_pickle("data/LUSC.pkl")

In [6]:
# Standard deviation of every column of `lusc`, sorted in ascending order.
x_lusc = lusc.std().sort_values()

In [7]:
# Average of the per-column standard deviations of `lusc` (a single scalar).
x_lusc.mean()

1288.2749978414865

In [8]:
# Tag every row with its cohort before the two frames are merged:
# LUSC rows get Target == 1, LUAD rows get Target == 2.
# Both frames are modified in place.
lusc["Target"] = 1
luad["Target"] = 2

In [9]:
# Stack the two labelled cohorts row-wise (LUSC first, then LUAD).
comb_df = pd.concat(objs=[lusc, luad], axis=0)

# The per-cohort frames are no longer needed; drop the names so only
# `comb_df` keeps a reference to the data.
del lusc
del luad

In [10]:
# Inspect the combined frame (samples as rows, one column per feature plus Target).
comb_df

Ensembl_ID,ENSG00000000003.13,ENSG00000000005.5,ENSG00000000419.11,ENSG00000000457.12,ENSG00000000460.15,ENSG00000000938.11,ENSG00000000971.14,ENSG00000001036.12,ENSG00000001084.9,ENSG00000001167.13,...,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__no_feature,__ambiguous,__too_low_aQual,__not_aligned,__alignment_not_unique,Target
TCGA-77-A5GA-01A,11.301496,0.000000,10.479780,8.471675,8.405141,7.918863,9.216746,10.855647,14.178898,9.784635,...,0.0,0.0,0.0,0.0,21.186970,20.404991,0.0,0.0,23.442353,1
TCGA-58-8387-01A,11.177420,0.000000,10.659104,8.945444,9.118941,9.368506,9.636625,10.590587,12.624795,10.948367,...,0.0,0.0,0.0,0.0,21.149569,21.060933,0.0,0.0,25.228361,1
TCGA-22-4599-01A,9.917372,0.000000,9.453271,9.157347,9.255029,8.965784,10.211888,10.087463,13.211280,9.501837,...,0.0,0.0,0.0,0.0,20.385185,20.003936,0.0,0.0,23.318429,1
TCGA-77-7142-11A,10.049849,2.584963,10.501837,9.787903,7.321928,11.870365,13.557703,11.593858,9.891784,10.633903,...,0.0,0.0,0.0,0.0,21.645906,20.801011,0.0,0.0,23.784045,1
TCGA-NC-A5HJ-01A,10.369597,1.000000,11.093418,9.766529,8.927778,11.690871,11.118292,11.385323,10.616549,10.364135,...,0.0,0.0,0.0,0.0,21.602103,20.829376,0.0,0.0,23.894515,1
TCGA-77-A5G6-01A,12.232421,1.584963,10.832099,9.778077,9.828136,7.960002,10.632995,9.861087,11.970106,11.211280,...,0.0,0.0,0.0,0.0,21.641727,20.651274,0.0,0.0,24.575271,1
TCGA-O2-A52Q-01A,11.852920,3.321928,10.827343,8.994353,8.618386,11.666668,12.145932,12.393659,15.310151,10.894818,...,0.0,0.0,0.0,0.0,21.536214,21.217255,0.0,0.0,23.878464,1
TCGA-90-7769-01A,11.482304,0.000000,11.749031,8.957102,9.197217,7.768184,11.402479,11.863025,14.983840,10.603626,...,0.0,0.0,0.0,0.0,21.467019,21.380152,0.0,0.0,24.810098,1
TCGA-56-8504-01A,11.159871,0.000000,10.510764,9.074141,9.139551,9.243174,10.464546,11.051889,13.777974,9.511753,...,0.0,0.0,0.0,0.0,20.861803,20.317563,0.0,0.0,23.628533,1
TCGA-22-5472-11A,11.246741,2.321928,10.821774,10.397675,8.375039,12.694140,14.208920,12.422853,11.389094,11.138912,...,0.0,0.0,0.0,0.0,21.840512,21.709110,0.0,0.0,24.400081,1


In [11]:
# Keep the label column aside so it can be re-attached after the
# standard-deviation filter on the columns of comb_df.
all_target = comb_df["Target"]

In [12]:
# Inspect the label series (indexed by sample id).
all_target

TCGA-77-A5GA-01A    1
TCGA-58-8387-01A    1
TCGA-22-4599-01A    1
TCGA-77-7142-11A    1
TCGA-NC-A5HJ-01A    1
TCGA-77-A5G6-01A    1
TCGA-O2-A52Q-01A    1
TCGA-90-7769-01A    1
TCGA-56-8504-01A    1
TCGA-22-5472-11A    1
TCGA-77-A5GF-01A    1
TCGA-34-7107-11A    1
TCGA-66-2800-01A    1
TCGA-85-7697-01A    1
TCGA-85-8049-01A    1
TCGA-18-4086-01A    1
TCGA-63-A5MJ-01A    1
TCGA-98-8020-01A    1
TCGA-34-8454-11A    1
TCGA-90-6837-01A    1
TCGA-66-2785-01A    1
TCGA-68-8250-01A    1
TCGA-56-A5DR-01A    1
TCGA-33-4589-01A    1
TCGA-NK-A5D1-01A    1
TCGA-39-5022-01A    1
TCGA-L3-A4E7-01A    1
TCGA-39-5037-01A    1
TCGA-43-6143-11A    1
TCGA-63-A5MU-01A    1
                   ..
TCGA-49-4488-01A    2
TCGA-78-7148-01A    2
TCGA-55-7283-01A    2
TCGA-44-3917-01A    2
TCGA-L9-A7SV-01A    2
TCGA-55-8208-01A    2
TCGA-MN-A4N4-01A    2
TCGA-55-6712-01A    2
TCGA-97-8172-01A    2
TCGA-44-2661-01A    2
TCGA-55-A491-01A    2
TCGA-MP-A4SY-01A    2
TCGA-05-4395-01A    2
TCGA-44-7667-01A    2
TCGA-44-A4

In [14]:
# Variance filter: drop every column whose standard deviation across all
# samples is below `threshold`.
threshold = 1000

# Compute the per-column standard deviation once and reuse it; evaluating
# comb_df.std() twice doubles the cost on a very wide frame.
col_std = comb_df.std()

# Columns whose std is NaN compare False here and are therefore kept.
# The "Target" column only holds 1 and 2, so it falls below the threshold
# and is dropped as well; it is restored from `all_target` afterwards.
df3 = comb_df.drop(col_std.index[col_std < threshold].values, axis=1)

In [15]:
# Attach the saved labels to the filtered frame as its final column.
# Assigning a Series makes pandas align it on the row (sample) index.
df3["Target"] = all_target

In [16]:
def extract_features(df):
    """Split a labelled frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``'Target'`` column plus any number of feature
        columns.

    Returns
    -------
    X : pandas.DataFrame
        Every column of ``df`` except ``'Target'``, in the original order.
    y : pandas.Series
        The ``'Target'`` column.

    Raises
    ------
    KeyError
        If ``df`` has no ``'Target'`` column.
    """
    y = df['Target']
    # Remove the label by name rather than assuming it is the last column:
    # a positional slice would leak the label into X (and silently drop a
    # real feature) whenever 'Target' sits anywhere else.
    X = df.drop('Target', axis=1)
    return X,y

def split_data(X, y, test_size=0.2, random_state=42):
    """Create a stratified train/test split of features and labels.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels for the rows of ``X``; also used for stratification.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # stratify=y is passed so the split is stratified on the labels.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    return X_train, X_test, y_train, y_test

In [19]:
# Separate the filtered frame into features and labels.
X, y = extract_features(df3)

# Column names of the feature matrix, kept for the eli5 explanations.
feature_names = X.columns.tolist()

# Hold out a test set for evaluating the trained model.
X_train, X_test, y_train, y_test = split_data(X, y)

In [24]:
# Train the logistic-regression model only when no saved model file exists,
# so re-running the notebook does not repeat the training.
dataname = "LUSCLUAD_Dim"
lr_model_file = f"models/{dataname}_LR.mdl"

# NOTE(review): the training log reports saving to "model/..." while this
# guard checks "models/..." -- confirm which directory train.py writes to.
model_is_cached = exists(lr_model_file)
if not model_is_cached:
    train.run_logistic_regression(
        X_train,
        X_test,
        y_train,
        y_test,
        dataname,
    )

Starting Logistic Regression
{'model__C': 1.3}
0.9526952695269527
Done training, model saved to model/LUSCLUAD_Dim_LR.mdl


In [27]:
# Load the saved logistic-regression model for this dataset name.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files from a trusted source.
lr_model = joblib.load(f"models/{dataname}_LR.mdl")

In [38]:
# Position (not label) of the test sample inspected in the cells that follow.
i = 10
# The list inside .iloc keeps the result a one-row DataFrame instead of a Series.
X_test.iloc[[i]]

Ensembl_ID,ENSG00000218512.2,ENSG00000218520.5,ENSG00000218521.1,ENSG00000218536.1,ENSG00000218549.1,ENSG00000218561.1,ENSG00000218565.2,ENSG00000218574.1,ENSG00000218577.1,ENSG00000218582.2,...,ENSGR0000275287.3,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__no_feature,__ambiguous,__too_low_aQual,__not_aligned,__alignment_not_unique
TCGA-37-3789-01A,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.70044,...,0.0,0.0,0.0,0.0,0.0,21.878318,21.20043,0.0,0.0,24.56453


In [39]:
# Label of the test sample at position i, returned as a one-element Series.
y_test.iloc[[i]]

TCGA-37-3789-01A    1
Name: Target, dtype: int64

In [42]:
# Predict on the held-out set with the loaded model.
y_pred = lr_model.predict(X_test)

# Overall accuracy on the test set.
test_accuracy = accuracy_score(y_test, y_pred)
print("LR Accuracy: ", test_accuracy)

# Per-class precision / recall / F1 summary.
report_text = classification_report(y_test, y_pred)
print("Classification report:\n", report_text)

# Show the 20 largest weights of the pipeline step named "model",
# labelled with the feature column names.
final_estimator = lr_model.named_steps["model"]
eli5.show_weights(final_estimator, feature_names=feature_names, top=20)

LR Accuracy:  0.956140350877193
Classification report:
               precision    recall  f1-score   support

           1       0.98      0.93      0.95       110
           2       0.94      0.98      0.96       118

    accuracy                           0.96       228
   macro avg       0.96      0.96      0.96       228
weighted avg       0.96      0.96      0.96       228



Weight?,Feature
+0.038,ENSG00000229807.8
+0.029,ENSG00000263711.4
+0.029,ENSG00000248713.1
+0.025,ENSG00000224237.1
+0.025,ENSG00000241388.4
+0.025,ENSG00000229119.3
+0.025,ENSG00000272620.1
… 21299 more positive …,… 21299 more positive …
… 15315 more negative …,… 15315 more negative …
-0.026,ENSG00000273760.1


In [43]:
# Explain the prediction for the single test sample at position i.
# NOTE(review): only the "model" step is pulled out of the pipeline and it is
# given the raw row, so any earlier pipeline steps (e.g. scaling) are not
# applied here -- confirm this matches how the model was trained.
final_estimator = lr_model.named_steps["model"]
sample_row = X_test.iloc[[i]]
eli5.show_prediction(
    final_estimator,
    sample_row,
    feature_names=feature_names,
    show_feature_values=True,
    top=20,
)

Contribution?,Feature,Value
+0.599,ENSG00000241794.1,12.271
+0.412,ENSG00000230937.8,12.890
+0.290,ENSG00000265190.5,12.688
+0.264,ENSG00000273760.1,10.291
+0.256,ENSG00000261116.1,11.668
+0.252,ENSG00000256812.1,10.821
+0.249,ENSG00000233864.6,9.617
+0.245,ENSG00000251381.5,11.998
+0.214,ENSG00000226084.5,10.746
+0.212,ENSG00000244094.1,6.658
