In [114]:
import pandas as pd
import numpy as np

In [115]:
from sklearn.svm import SVC

In [116]:
# Seed handed to train_test_split so the train/test partitions are repeatable.
random_seed: int = 42

## Get data

In [184]:
# Drug table: tab-separated file; its first column becomes the row index.
drugs_df = pd.read_csv(
    '../data/processed/drugs_filtered.tsv',
    sep='\t',
    index_col=0,
)

In [185]:
# Narrow the drug table to the five columns of interest.
drugs_df = drugs_df[
    [
        'pog_id',
        'drug_name',
        'cancer_cohort',
        'zscore',
        'days_on_tx_since_biopsy',
    ]
]

In [186]:
# Expression table: tab-separated file; its first column becomes the row index.
# Alternative input kept for reference (file name suggests a log-scaled variant — TODO confirm):
#   expression_df = pd.read_csv('../data/processed/expression_tpm_log_selected.tsv', sep='\t', index_col=0)
expression_df = pd.read_csv(
    '../data/processed/expression_tpm_minmax_selected.tsv',
    sep='\t',
    index_col=0,
)

## Discretize drug z-scores by K bins

In [187]:
from sklearn.preprocessing import KBinsDiscretizer

In [188]:
# Two equal-width bins, returned as ordinal bin codes.
discretizer = KBinsDiscretizer(
    n_bins=2,
    encode='ordinal',
    strategy='uniform',
)

In [189]:
# The discretizer takes a 2-D input, so pass the z-scores as a one-column array
# and flatten the one-column result back to 1-D.
binned_zscores = discretizer.fit_transform(drugs_df[['zscore']].values).ravel()

In [190]:
# Attach the bin codes to the drug table as the classification label column.
drugs_df = drugs_df.assign(binned_zscores=binned_zscores)

# 1. Naive approach: all cancer types, all drugs

## Get dummy variables for drug names

In [191]:
# One indicator column per distinct drug name.
drug_dummies = pd.get_dummies(data=drugs_df['drug_name'])

## Join drug data, drug dummies and expression data

In [192]:
# Index the expression table by 'pog_id' so it can be joined against the
# 'pog_id' column of the drug table.
# Guarded so the cell can be re-run: once 'pog_id' is the index it is no longer
# a column, and an unguarded set_index would raise KeyError on the second run.
if expression_df.index.name != 'pog_id':
    expression_df = expression_df.set_index('pog_id')

In [193]:
# Index-aligned left join: append the drug indicator columns to the drug table.
drugs_df = drugs_df.join(drug_dummies, how='left')

In [194]:
# Match each drug row's 'pog_id' value against the index of expression_df;
# the inner join keeps only rows present on both sides.
drugs_expression_df = drugs_df.join(
    expression_df,
    on='pog_id',
    how='inner',
    sort=True,
)

## Prepare features and labels for SVM

In [195]:
# Feature column names: every expression column followed by every drug indicator column.
X_columns = np.concatenate(
    [expression_df.columns.values, drug_dummies.columns.values]
)

In [196]:
# Feature matrix: the selected expression and drug indicator columns.
X = drugs_expression_df[X_columns]

In [197]:
# Labels: the binned z-score of each row.
y = drugs_expression_df.loc[:, 'binned_zscores']

### Train test split

In [198]:
from sklearn.model_selection import train_test_split

In [199]:
# Hold out 33% of the rows for evaluation; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=random_seed,
)

## SVM

In [200]:
# Linear-kernel SVM with regularization parameter C=1.
# NOTE(review): class_weight is left at its default and the recorded confusion
# matrix shows almost every test row predicted as class 0 — consider
# class_weight='balanced'.
naive_svm = SVC(kernel='linear', C=1)

In [201]:
# Fit on the training partition; the returned estimator is displayed as the cell output.
naive_svm.fit(X=X_train, y=y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [202]:
# Predicted labels for the held-out rows.
y_pred = naive_svm.predict(X=X_test)

## Binary classification metrics

In [203]:
from sklearn.metrics import confusion_matrix, f1_score

In [204]:
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[456,   3],
       [ 34,   1]])

In [205]:
# Flatten the 2x2 confusion matrix into (tn, fp, fn, tp).
# labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
# y_test and y_pred; without it the matrix would be 1x1 and the four-way
# unpacking would raise ValueError.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

In [206]:
# Show the four confusion-matrix counts.
(tn, fp, fn, tp)

(456, 3, 34, 1)

In [207]:
# F1 score of the predictions against the held-out labels.
f1_score(y_test, y_pred)

0.05128205128205128

# 2. All cancer types, a single drug (GEMCITABINE or FLUOROURACIL; FLUOROURACIL is selected below)

In [262]:
# Keep only the rows for one drug.
drugs_filtered_df = drugs_df.loc[drugs_df['drug_name'].eq('FLUOROURACIL')]

In [263]:
# Match each remaining drug row's 'pog_id' value against the index of
# expression_df; the inner join keeps only rows present on both sides.
drugs_expression_filtered_df = drugs_filtered_df.join(
    expression_df,
    on='pog_id',
    how='inner',
    sort=True,
)

In [264]:
# Feature matrix: expression columns only (no drug indicator columns here).
X = drugs_expression_filtered_df[expression_df.columns.values]

In [265]:
# Labels: the binned z-score of each remaining row.
y = drugs_expression_filtered_df.loc[:, 'binned_zscores']

## Train test split

In [266]:
from sklearn.model_selection import train_test_split

In [267]:
# Hold out 33% of the single-drug rows for evaluation; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=random_seed,
)

## SVM

In [292]:
# Linear-kernel SVM with regularization parameter C=0.01.
# NOTE(review): class_weight is left at its default and the recorded confusion
# matrix shows every test row predicted as class 0 — consider
# class_weight='balanced'.
filtered_svm = SVC(kernel='linear', C=0.01)

In [293]:
# Fit on the training partition; the returned estimator is displayed as the cell output.
filtered_svm.fit(X=X_train, y=y_train)

SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [294]:
# Predicted labels for the held-out single-drug rows.
y_pred = filtered_svm.predict(X=X_test)

## Binary classification metrics

In [295]:
from sklearn.metrics import confusion_matrix, f1_score

In [296]:
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[29,  0],
       [ 3,  0]])

In [297]:
# Flatten the 2x2 confusion matrix into (tn, fp, fn, tp).
# labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
# y_test and y_pred (plausible with this small single-drug test set); without
# it the matrix would be 1x1 and the four-way unpacking would raise ValueError.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

In [298]:
# Show the four confusion-matrix counts.
(tn, fp, fn, tp)

(29, 0, 3, 0)

In [299]:
# F1 score of the predictions against the held-out labels.
# NOTE(review): the recorded run has no positive predictions, so the score is
# 0.0 and scikit-learn emits a warning about undefined precision.
f1_score(y_test, y_pred)

  'precision', 'predicted', average, warn_for)


0.0