In [53]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [2]:
mut_raw = pd.read_csv('../data/TCGA.LUAD.mutations.txt', delimiter='\t')
exp_raw = pd.read_csv('../data/TCGA.LUAD.expression.txt', delimiter='\t')
meta_raw = pd.read_csv('../data/TCGA.LUAD.metadata.txt', delimiter='\t')
clinical = pd.read_csv('../data/clinical.txt', delimiter='\t')
exposure = pd.read_csv('../data/exposure.txt', delimiter='\t')

### Preprocess

In [37]:
metadata_final

Unnamed: 0,patient_id,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,OS,OS.time
0,TCGA-05-4244,70.0,0,1.0,True,0,0.0
1,TCGA-05-4249,67.0,0,0.0,True,0,1523.0
2,TCGA-05-4250,79.0,1,1.0,True,1,121.0
3,TCGA-05-4382,68.0,0,0.0,True,0,607.0
4,TCGA-05-4384,66.0,0,1.0,True,0,426.0
...,...,...,...,...,...,...,...
507,TCGA-NJ-A55O,56.0,1,0.0,True,0,13.0
508,TCGA-NJ-A55R,67.0,0,0.0,False,0,603.0
509,TCGA-NJ-A7XG,49.0,0,1.0,False,0,617.0
510,TCGA-O1-A52J,74.0,1,0.0,True,1,1798.0


In [109]:
# Add expression data to meta_file
y = pd.read_csv('labels.csv')
expression = pd.read_csv('../Project 4/Gene_expression.csv')
meta_file = y.merge(expression)
meta_file = meta_file.drop(['sample_type'],axis = 1)
meta_file.shape
meta_file.columns = meta_file.columns.str.replace(r'\|.*', '')

# Add metadata to meta_file
metadata_final = pd.read_csv('metadata_final_no_os.csv')
# changing the gender to binary male=0, female=1
metadata_final['gender'] = metadata_final['gender'].replace({'MALE': 0, 'FEMALE': 1})
# changing the tumor stage to binary
metadata_final['tumor_stage'] = metadata_final['tumor_stage'].replace({'Early': 0, 'Late': 1})
# changing the smoker status to binary
metadata_final['is_smoker'] = metadata_final['is_smoker'].replace({bool(False): 0, bool(True): 1})
meta_file = meta_file.merge(metadata_final, how='left', on='patient_id')

meta_file = meta_file.drop(['patient_id'],axis = 1)

In [110]:
meta_file

Unnamed: 0,label,?|100133144,?|100134869,?|10357,?|10431,?|155060,?|26823,?|340602,?|388795,?|390284,...,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker
0,0,0.115255,0.097686,-1.940808,-0.033268,1.057022,1.176377,-0.577514,1.530721,0.136966,...,-0.233740,-1.752495,0.980841,-0.505237,-0.279620,-0.447237,70.0,0.0,1.0,1.0
1,1,-0.233267,0.219007,-0.289184,0.202607,-0.069052,1.877450,-0.577514,0.029955,0.027188,...,0.422735,-0.120232,-0.112752,-0.890109,0.314648,0.096801,67.0,0.0,0.0,1.0
2,0,0.248854,-0.384813,-0.533564,-0.173685,0.583903,0.236868,1.884460,0.123771,-0.210492,...,-0.275621,0.608923,0.369427,1.273709,0.374660,-0.019820,,,,
3,0,-1.015130,-1.252003,-0.898805,-1.181021,0.757503,0.484314,-0.577514,-0.013923,0.260217,...,0.782634,-0.680088,-0.150073,-0.171071,1.255729,0.236221,66.0,0.0,1.0,1.0
4,1,-0.816478,0.346711,-1.143054,1.242887,-1.570895,-0.056828,-0.577514,-0.166251,-0.193914,...,-0.065455,0.673791,-1.420624,-0.711757,-0.833433,0.356558,70.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,0,0.593944,0.996343,-0.009038,0.047639,0.668383,1.340455,-0.577514,-1.208640,-0.104800,...,0.222568,-0.210608,-0.386059,0.289729,0.667480,-0.396243,56.0,1.0,0.0,1.0
402,0,-0.069018,-0.011506,-0.817153,-0.316691,1.148248,0.675043,0.645656,2.908490,-1.628148,...,1.156270,0.476298,-0.242561,-0.339250,1.557140,-0.263652,67.0,0.0,0.0,0.0
403,0,1.749735,2.648401,-0.450322,-0.660086,1.016852,-0.892399,-0.577514,0.941106,-1.506687,...,0.543399,-0.541973,-1.441193,-1.226014,0.813458,-1.338024,49.0,0.0,1.0,0.0
404,1,0.486836,1.062102,-1.368185,0.780825,0.141787,3.015645,1.099217,0.123525,-0.071799,...,0.724190,-1.841894,-0.135198,0.430325,0.959205,-1.000029,74.0,1.0,0.0,1.0


In [111]:
prog_gene = pd.read_csv('prognosis_genes.txt')
meta_file.columns = [header.split('|')[0].strip() for header in meta_file.columns]

# Additional columns to include
additional_columns =  np.concatenate((metadata_final.columns[1:],[meta_file.columns[0]]))
# additional_columns = [meta_file.columns[0]]

# Subset the DataFrame based on the valid target column names and additional columns
valid_genes = [col for col in prog_gene['GENES'] if col in meta_file.columns]
columns_to_include = np.concatenate((valid_genes, additional_columns))

# Subset the DataFrame based on the valid target column names
prog_meta_file = meta_file[columns_to_include]
prog_meta_file = prog_meta_file.fillna(0)

In [112]:
prog_meta_file

Unnamed: 0,BAG1,CASP4,FADD,ITGA2,KRT18,KRT19,KRT7,LAMB1,BMP2,CDC6,...,RPS6KB1,TMF1,FEZ2,KIAA0020,KIAA0317,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,label
0,-0.300973,-1.609040,1.144728,1.466476,-0.045449,0.565879,-0.120513,-1.195601,0.775067,0.715580,...,2.714577,1.047157,1.207741,-1.151396,0.546405,70.0,0.0,1.0,1.0,0
1,-0.326736,0.284783,-0.829652,0.872741,-0.429994,-0.556406,-1.453648,-1.133745,-1.459483,-1.149056,...,0.700554,0.840149,1.429372,-0.572703,0.891123,67.0,0.0,0.0,1.0,1
2,-0.809002,0.666188,-0.071178,-0.373269,-0.579386,0.511680,-0.276158,0.927946,-0.569311,0.377111,...,-0.231853,-0.649095,0.706858,-0.922223,0.445354,0.0,0.0,0.0,0.0,0
3,2.153975,-0.408216,-0.112112,-1.143161,0.413813,-0.054186,0.437825,-0.290499,0.525574,-1.870488,...,-0.382774,0.286660,0.277540,-0.979793,0.910829,66.0,0.0,1.0,1.0,0
4,0.946434,-0.220682,1.436611,-0.219743,-0.393587,-0.553767,-1.028831,-2.264036,-0.594682,0.430613,...,-0.721583,-1.758530,-0.384056,-0.535291,0.846636,70.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,1.720983,-0.294207,0.186656,-0.090690,-0.574602,0.432138,-0.153641,-0.108077,0.679123,-0.771427,...,-1.533007,0.313823,-0.575478,0.216629,0.329030,56.0,1.0,0.0,1.0,0
402,0.643838,-1.274854,-0.487768,-1.514758,-0.723718,-2.239129,-0.585106,0.278553,0.358615,-0.845496,...,-2.097207,-0.732777,-1.599209,-0.367065,0.397518,67.0,0.0,0.0,0.0,0
403,0.351929,0.644262,-0.065726,-0.313786,0.209452,0.176847,0.057328,-1.890316,-0.258741,-1.481856,...,-0.004379,-0.687599,-0.616669,-1.207571,-1.565932,49.0,0.0,1.0,0.0,0
404,-1.156108,0.114911,-0.363290,0.628345,-1.067190,-0.835419,0.185254,-1.021971,0.494188,-1.134124,...,-1.358910,-1.883722,-0.607404,-1.195800,-0.097168,74.0,1.0,0.0,1.0,1


### Classify with cross-validation

In [113]:
X = prog_meta_file.iloc[:,0:-1]
y = prog_meta_file.iloc[:,-1]

X, y = shuffle(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [114]:
print(len(X_train))
print(len(X_test))

324
82


### Test on LR

In [115]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
model = LogisticRegression(C = 1,penalty='none')
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accurary:')
print(accuracy)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Accurary:
0.5
Confusion Matrix:
[[21 16]
 [25 20]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### CV with SVM

In [116]:
# Define linear SVM model evaluation method
cv = RepeatedStratifiedKFold(n_splits=5, random_state=42)
model = svm.SVC(kernel='linear')

# Evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
recall = cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1)
precision = cross_val_score(model, X, y, scoring='precision', cv=cv, n_jobs=-1)
# Summarize result
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
print('Mean Recall: %.3f (%.3f)' % (np.mean(recall), np.std(recall)))
print('Mean Precision: %.3f (%.3f)' % (np.mean(precision), np.std(precision)))

Mean Accuracy: 0.526 (0.054)
Mean Recall: 0.565 (0.089)
Mean Precision: 0.549 (0.049)


In [117]:
# Define linear SVM model evaluation method
cv = RepeatedStratifiedKFold(n_splits=5, random_state=42)
model = svm.SVC(kernel='rbf')

# Evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
recall = cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1)
precision = cross_val_score(model, X, y, scoring='precision', cv=cv, n_jobs=-1)
# Summarize result
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
print('Mean Recall: %.3f (%.3f)' % (np.mean(recall), np.std(recall)))
print('Mean Precision: %.3f (%.3f)' % (np.mean(precision), np.std(precision)))

Mean Accuracy: 0.521 (0.023)
Mean Recall: 0.953 (0.117)
Mean Precision: 0.525 (0.013)
