In [63]:
_random_seed_ = 42

In [113]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [65]:
expression_minmax_df = pd.read_csv('../data/processed/expression_tpm_minmax.tsv', sep='\t', index_col=0)

In [66]:
expression_minmax_df = expression_minmax_df.set_index('pog_id')

In [67]:
drugs_df = pd.read_csv('../data/processed/drugs_filtered.tsv', sep='\t', index_col=0)

In [101]:
drugs_selected_df = drugs_df[['pog_id', 'drug_name', 'days_on_tx_since_biopsy', 'cancer_cohort']]

# Prepare features and labels

## Join drugs and expression tables

In [104]:
drugs_expression_df = drugs_selected_df.join(expression_minmax_df, on='pog_id', how='inner')

In [105]:
drugs_expression_df = drugs_expression_df.drop_duplicates()

Number of drug types and their names

## Select cancer type and drug
decide based on notebook 0

In [305]:
# drugs_expression_sel_df = drugs_expression_df[(drugs_expression_df['cancer_cohort'] == 'BRCA') & (drugs_expression_df['drug_name'] == 'GEMCITABINE')]

In [306]:
drugs_expression_sel_df = drugs_expression_df[(drugs_expression_df['drug_name'] == 'GEMCITABINE')]

## Set features (X) and labels (y)

In [307]:
X = drugs_expression_sel_df.loc[:, expression_minmax_df.columns]

In [308]:
y = drugs_expression_sel_df.loc[:, 'days_on_tx_since_biopsy']

## Naive feature selection: Variance threshold

In [289]:
from sklearn.feature_selection import VarianceThreshold

In [334]:
variance_selector = VarianceThreshold(threshold=0.01)

In [335]:
X_selected = variance_selector.fit_transform(X)

In [336]:
X_selected.shape

(138, 12553)

In [292]:
selected_columns = X.columns[variance_selector.get_support()]

In [293]:
X_selected_df = pd.DataFrame(data=X_selected, columns=selected_columns, index=X.index)

## Power transform on y

In [309]:
from sklearn.preprocessing import PowerTransformer

In [295]:
power_transformer = PowerTransformer(method='box-cox', standardize=True)

In [310]:
y_trans = power_transformer.fit_transform(y.values.reshape(-1, 1))[:, 0]

In [323]:
# something to test real quick:
# y_trans = y
# y_trans = 350

# Machine learning

In [298]:
from sklearn.model_selection import train_test_split

In [312]:
#NOTE: X not X_selected_df
# X_train, X_test, y_train, y_test = train_test_split(X_selected_df, y_trans, test_size=0.25, random_state=_random_seed_)
X_train, X_test, y_train, y_test = train_test_split(X, y_trans, test_size=0.25, random_state=_random_seed_)

## SVR

In [300]:
from sklearn.svm import LinearSVR

Ideally I will use a much more principled approach to the hyperparameters for the SVR. The libsvm implementation of SVR that is used allows a lot of flexibility. I will come back to this...

In [313]:
svr = LinearSVR(max_iter=10000, random_state=_random_seed_)

In [314]:
svr.fit(X_train, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=10000,
     random_state=42, tol=0.0001, verbose=0)

# Evaluation

In [315]:
y_pred = svr.predict(X_test)
y_true = y_test

In [316]:
svr.score(X_test, y_test)

-0.32861525073509523