In [6]:
_random_seed_ = 42

In [22]:
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt

In [591]:
expression_minmax_df = pd.read_csv('../data/processed/expression_tpm_minmax.tsv', sep='\t', index_col=0)

In [592]:
expression_minmax_df = expression_minmax_df.set_index('pog_id')

In [593]:
drugs_df = pd.read_csv('../data/processed/drugs_filtered.tsv', sep='\t', index_col=0)

In [594]:
drugs_selected_df = drugs_df[['pog_id', 'drug_name', 'days_on_tx_since_biopsy', 'cancer_cohort']]

# Prepare features and labels

## Join drugs and expression tables

In [595]:
drugs_expression_df = drugs_selected_df.join(expression_minmax_df, on='pog_id', how='inner')

In [596]:
drugs_expression_df = drugs_expression_df.drop_duplicates()

Number of drug types and their names

## Select cancer type and drug
decide based on notebook 0

In [597]:
cancer_types = np.unique(drugs_expression_df['cancer_cohort'])

In [598]:
drug_names = np.unique(drugs_expression_df['drug_name'])

In [599]:
drugs_expression_sel_df = drugs_expression_df[(drugs_expression_df['cancer_cohort'] == 'BRCA') & (drugs_expression_df['drug_name'] == 'CISPLATIN')]
# drugs_expression_sel_df = drugs_expression_df[(drugs_expression_df['cancer_cohort'] == 'LYMP')]

In [600]:
drugs_expression_sel_df

Unnamed: 0,pog_id,drug_name,days_on_tx_since_biopsy,cancer_cohort,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,...,ENSG00000283690,ENSG00000283691,ENSG00000283692,ENSG00000283693,ENSG00000283694,ENSG00000283695,ENSG00000283696,ENSG00000283697,ENSG00000283698,ENSG00000283699
260,POG175,CISPLATIN,28,BRCA,0.0,0.162637,0.13626,0.056458,0.00625,0.037786,...,0.0,0.0,0.0,0.0,0.0,0.0,0.006536,0.0,0.0,0.0
470,POG804,CISPLATIN,15,BRCA,9.7e-05,0.322397,0.279334,0.430208,0.036414,0.013269,...,0.0,0.0,0.0,0.0,0.0,0.0,0.11329,0.0,0.0,0.0
2413,POG419,CISPLATIN,19,BRCA,0.000292,0.274935,0.350871,0.347083,0.022979,0.015939,...,0.0,0.0,0.0,0.0,0.0,0.0,0.139434,0.081081,0.0,0.0
79,POG056,CISPLATIN,53,BRCA,0.00034,0.147828,0.163702,0.140625,0.008142,0.107851,...,0.0,0.0,0.0,0.0,0.0,0.0,0.086057,0.243243,0.0,0.0
775,POG311,CISPLATIN,98,BRCA,0.001069,0.192151,0.112036,0.164792,0.01014,0.057766,...,0.0,0.0,0.0,0.0,0.0,0.0,0.155773,0.081081,0.0,0.0
133,POG304,CISPLATIN,114,BRCA,0.0,0.156934,0.115821,0.106667,0.008546,0.519986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.086057,0.135135,0.0,0.0
2215,POG061,CISPLATIN,183,BRCA,0.0,0.440764,0.15159,0.058125,0.002083,0.001063,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01634,0.0,0.0,0.0
2672,POG131,CISPLATIN,90,BRCA,0.0,0.132182,0.232589,0.117083,0.004825,0.03345,...,0.0,0.0,0.0,0.0,0.0,0.0,0.32244,0.081081,0.0,0.0
213,POG141,CISPLATIN,72,BRCA,0.0,0.406227,0.145723,0.225208,0.008375,0.029787,...,0.0,0.0,0.0,0.0,0.0,0.0,0.020697,0.0,0.0,0.0
2674,POG890,CISPLATIN,112,BRCA,0.000632,0.230822,0.170704,0.206875,0.037052,0.017688,...,0.0,0.0,0.0,0.0,0.0,0.0,0.103486,0.108108,0.0,0.0


In [601]:
len(np.unique(drugs_expression_sel_df['pog_id']))

43

## Set features (X) and labels (y)

In [602]:
X = drugs_expression_sel_df.loc[:, expression_minmax_df.columns]

In [603]:
y = drugs_expression_sel_df.loc[:, 'days_on_tx_since_biopsy']

## Power transform on y

In [604]:
from sklearn.preprocessing import PowerTransformer

In [605]:
power_transformer = PowerTransformer(method='box-cox', standardize=True)

In [606]:
y_trans = power_transformer.fit_transform(y.values.reshape(-1, 1))[:, 0]

## Naive feature selection: Variance threshold

In [607]:
from sklearn.feature_selection import VarianceThreshold

In [663]:
variance_selector = VarianceThreshold(threshold=0.01)

In [664]:
X_selected = variance_selector.fit_transform(X)

In [665]:
X_selected.shape

(43, 9953)

In [611]:
selected_columns = X.columns[variance_selector.get_support()]

In [612]:
X_selected_df = pd.DataFrame(data=X_selected, columns=selected_columns, index=X.index)

In [613]:
from sklearn.model_selection import train_test_split

In [614]:
X_train, X_test, y_train, y_test = train_test_split(X_selected_df, y_trans, test_size=0.25, random_state=_random_seed_)
# X_train, X_test, y_train, y_test = train_test_split(X, y_trans, test_size=0.25, random_state=_random_seed_)

# Recursive Feature Selection

In [615]:
from sklearn.feature_selection import RFE

In [616]:
from sklearn.svm import LinearSVR

In [617]:
num_features = math.floor(len(X_selected_df.columns)/20)

In [657]:
num_features = 2000

In [658]:
svr = LinearSVR(max_iter=10000)

In [659]:
selector = RFE(svr, num_features, step=0.01, verbose=0)

In [660]:
selector = selector.fit(X_train, y_train)

In [661]:
selector.score(X_test, y_test)

0.17781005673145567

In [662]:
svr.fit(X_train, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=10000,
     random_state=None, tol=0.0001, verbose=0)

In [650]:
svr.score(X_test, y_test)

0.12051862185100981

In [673]:
drugs_expression_df['drug_name'] == 'FLUOROURACIL'

0       False
1911    False
2       False
1024     True
1152     True
1324    False
1562    False
1652    False
1802    False
1934    False
2931    False
3       False
53      False
1595    False
2634     True
2942    False
4       False
260     False
5       False
871     False
2134    False
6       False
470     False
1223    False
9       False
506     False
1455    False
2341    False
13      False
638     False
        ...  
2731    False
2760    False
3167    False
3317    False
2776    False
2801    False
2806    False
2814    False
2852    False
2886    False
2903    False
2908    False
3325    False
3354    False
2914    False
3148    False
2920    False
2960    False
2994    False
3202    False
3017    False
3132    False
3155    False
3159    False
3222    False
3248    False
3299    False
3301    False
3341    False
3304    False
Name: drug_name, Length: 1494, dtype: bool

In [675]:
test = [(1,2), (3,4), (5,6)]
test2 = [1,2,3]

In [680]:
for i, j in zip(test, test2):
    a, b = i
    print(a,b)

1 2
3 4
5 6
