In [9]:
import pandas as pd
import numpy as np

In [10]:
from sklearn.svm import SVC

In [11]:
random_seed = 42

## Get data

In [92]:
drugs_df = pd.read_csv('../data/processed/drugs_filtered.tsv', sep='\t', index_col=0)

In [93]:
drugs_df = drugs_df[['pog_id', 'drug_name', 'cancer_cohort', 'zscore', 'days_on_tx_since_biopsy']]

In [94]:
# expression_df = pd.read_csv('../data/processed/expression_tpm_log_selected.tsv', sep='\t', index_col=0)
expression_df = pd.read_csv('../data/processed/expression_tpm_minmax_selected.tsv', sep='\t', index_col=0)

## Discretize drug z-scores by K bins

In [95]:
from sklearn.preprocessing import KBinsDiscretizer

In [96]:
discretizer = KBinsDiscretizer(encode='ordinal', n_bins=2, strategy='kmeans')

In [97]:
binned_zscores = discretizer.fit_transform(drugs_df['days_on_tx_since_biopsy'].values.reshape(-1, 1))[:, 0]

In [98]:
drugs_df['binned_days'] = binned_zscores

# 1. Naive approach: all cancer types, all drugs

## Get dummy variables for drug names

In [99]:
drug_dummies = pd.get_dummies(drugs_df['drug_name'])

## Join drug data, drug dummies and expression data

In [100]:
expression_df = expression_df.set_index('pog_id')

In [101]:
drugs_df = drugs_df.join(drug_dummies)

In [102]:
drugs_expression_df = drugs_df.join(expression_df, on='pog_id', how='inner', sort=True)

## Prepare features and labels for SVM

In [103]:
X_columns = np.append(expression_df.columns.values, drug_dummies.columns.values)

In [104]:
X = drugs_expression_df.loc[:, X_columns]

In [105]:
y = drugs_expression_df['binned_zscores']

KeyError: 'binned_zscores'

### Train test split

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_seed)

ValueError: Found input variables with inconsistent numbers of samples: [1496, 14]

## SVM

In [72]:
naive_svm = SVC(C=1, kernel='linear')

In [73]:
naive_svm.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [74]:
y_pred = naive_svm.predict(X_test)

## Binary classification metrics

In [75]:
from sklearn.metrics import confusion_matrix, f1_score

In [76]:
confusion_matrix(y_test, y_pred)

array([[373,  18],
       [ 91,  12]])

In [77]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [78]:
tn, fp, fn, tp

(373, 18, 91, 12)

In [79]:
f1_score(y_pred=y_pred, y_true=y_test)

0.18045112781954886

# 2. All cancer types, only GEMCITABINE or FLUOROURACIL

In [80]:
drugs_filtered_df = drugs_df[drugs_df['drug_name'] == 'CISPLATIN']

In [81]:
drugs_expression_filtered_df = drugs_filtered_df.join(expression_df, on='pog_id', how='inner', sort=True)

In [82]:
drugs_expression_filtered_df = drugs_expression_filtered_df[drugs_expression_filtered_df['cancer_cohort'] == 'LUNG']

In [83]:
X = drugs_expression_filtered_df.loc[:, expression_df.columns.values]

In [84]:
y = drugs_expression_filtered_df['binned_zscores']

In [85]:
drugs_expression_filtered_df

Unnamed: 0,pog_id,drug_name,cancer_cohort,zscore,days_on_tx_since_biopsy,binned_zscores,AFATINIB,AGS67E,ANASTROZOLE,ATEZOLIZUMAB,...,ENSG00000198561,ENSG00000198793,ENSG00000198947,ENSG00000205542,ENSG00000206503,ENSG00000213281,ENSG00000213424,ENSG00000215301,ENSG00000234745,ENSG00000245848
1232,POG051,CISPLATIN,LUNG,0.247251,130,0.0,0,0,0,0,...,0.205326,0.18173,0.178671,0.398742,0.128976,0.101495,0.000964,0.343695,0.114226,0.056097
2670,POG052,CISPLATIN,LUNG,-0.914007,5,0.0,0,0,0,0,...,0.209041,0.223032,0.041616,0.495344,0.294657,0.211433,0.000689,0.505776,0.149343,0.074102
2671,POG090,CISPLATIN,LUNG,-0.598145,39,0.0,0,0,0,0,...,0.237606,0.165452,0.07624,0.115941,0.027855,0.106223,0.0,0.276411,0.011673,0.040845
1378,POG181,CISPLATIN,LUNG,-0.291573,72,0.0,0,0,0,0,...,0.109172,0.12087,0.030407,0.08439,0.025198,0.093388,0.004958,0.121781,0.031567,0.02418
1567,POG280,CISPLATIN,LUNG,0.265831,132,0.0,0,0,0,0,...,0.231956,0.144072,0.044279,0.069158,0.08761,0.033269,0.000413,0.204497,0.049047,0.111748
2054,POG289,CISPLATIN,LUNG,-0.746786,23,0.0,0,0,0,0,...,0.250999,0.096453,0.037621,0.309687,0.398093,0.236764,0.0,0.233951,0.140646,0.036084
1774,POG318,CISPLATIN,LUNG,-0.282283,73,0.0,0,0,0,0,...,0.417801,0.240525,0.09555,0.610611,0.27177,0.131217,0.000413,0.519312,0.246444,0.147385
3030,POG402,CISPLATIN,LUNG,-0.245122,77,0.0,0,0,0,0,...,0.187843,0.169218,0.116524,0.137841,0.051524,0.114245,0.00179,0.368563,0.029467,0.021353
2613,POG420,CISPLATIN,LUNG,0.08003,112,0.0,0,0,0,0,...,0.250156,0.069363,0.067251,0.074962,0.049984,0.230516,0.050406,0.267019,0.011742,0.018525
580,POG515,CISPLATIN,LUNG,0.163641,121,0.0,0,0,0,0,...,0.272072,0.138241,0.001332,0.061241,0.015558,0.097864,0.000413,0.280247,0.034734,0.072837


## Train test split

In [42]:
from sklearn.model_selection import train_test_split

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_seed)

## SVM

In [47]:
filtered_svm = SVC(C=0.01, kernel='linear')

In [48]:
filtered_svm.fit(X_train, y_train)

ValueError: The number of classes has to be greater than one; got 1 class

In [None]:
y_pred = filtered_svm.predict(X_test)

## Binary classification metrics

In [295]:
from sklearn.metrics import confusion_matrix, f1_score

In [296]:
confusion_matrix(y_test, y_pred)

array([[29,  0],
       [ 3,  0]])

In [297]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [298]:
tn, fp, fn, tp

(29, 0, 3, 0)

In [299]:
f1_score(y_pred=y_pred, y_true=y_test)

  'precision', 'predicted', average, warn_for)


0.0

test
