In [6]:
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn import decomposition
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.multioutput import MultiOutputRegressor
from matplotlib import pyplot as plt
import matplotlib.style as style
import time

In [7]:
import plotly
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
plotly.plotly.sign_in('spersad', 'oNkuP1yzbpN734Ag8M9P')
import plotly.graph_objs as go

from IPython.core.display import display, HTML

In [8]:
style.use('fivethirtyeight')
imputed = 'PATIENT_DATA_ALL_3.csv'
therapy = 'PATIENT_DATA_THERAPY.csv'
merged = pd.read_csv(imputed,sep='\t')
therapy = pd.read_csv(therapy,sep='\t')
cna = pd.read_csv('data_CNA.xls.txt',sep='\t')

In [9]:
cna.set_index('Hugo_Symbol', inplace=True)
cna.drop(['Entrez_Gene_Id'], axis=1,inplace=True)
cna = cna.transpose()
cna.sort_index(inplace=True)
#cna = cna[:-1] # getting rid of the last row

#merged.sort_values('SAMPLE_ID', ascending=False)
merged.set_index('SAMPLE_ID',inplace=True)
merged.drop('PATIENT_ID', axis=1,inplace=True)
merged.drop(['Unnamed: 0'], axis=1,inplace=True)
merged.sort_index(inplace=True)

therapy.drop('PATIENT_ID', axis=1,inplace=True)
therapy.drop(['Unnamed: 0'], axis=1,inplace=True)
therapy.set_index('SAMPLE_ID', inplace=True)
therapy.sort_index(inplace=True)

clinical_n_cna = pd.concat([merged, cna], axis=1)

columns = list(therapy.columns)
categories = list(range(len(columns)))
category_dict = dict(zip(columns, categories))

clinical_n_cna = clinical_n_cna.dropna(axis=0, how='any')

In [10]:
X = clinical_n_cna.as_matrix() 
Y = therapy.as_matrix()

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import time

In [12]:
randomize = True
if randomize:
    random_indices = np.random.permutation(len(X))
    # Randomized data
    X_r = X[random_indices]
    Y_r = Y[random_indices]

# Select a training, test and validation set
n_train = int(0.5*X.shape[0])
n_val = int(0.1*X.shape[0])

X_train, y_train = X_r[:n_train], Y_r[:n_train]
X_val, y_val = X_r[n_train:n_train+n_val], Y_r[n_train:n_train+n_val]
X_test, y_test = X_r[n_train+n_val:], Y_r[n_train+n_val:]

In [13]:
# Turn down for faster run time
n_samples = 10000

penalty = 'l1'

train_samples, n_features = X_train.shape
n_classes = np.unique(Y).shape[0]

t0 = time.clock()

print('Dataset 2CN, train_samples=%i, n_features=%i, n_classes=%i'
      % (train_samples, n_features, n_classes))

models = {'ovr': {'name': 'One versus Rest', 'iters': [1, 3, 7, 11, 13, 15]},
          'multinomial': {'name': 'Multinomial', 'iters': [1, 3, 7, 11, 13, 15]}}

# Solver which handles multiclass and l1 penalty
solver = 'newton-cg'

if penalty == 'l1':
    print('Using l1 regression')
    models = {'ovr': {'name': 'One versus Rest', 'iters': [1, 3]}}
    solver = 'liblinear'
    
for model in models:
    # Add initial chance-level values for plotting purpose
    test_accuracies = [1 / n_classes]
    tr_accuracies = [1/n_classes]
    val_accuracies = [1/n_classes]
    times = [0]
    densities = [1]

    model_params = models[model]

    # Small number of epochs for fast runtime
    for this_max_iter in model_params['iters']:
        print('[model=%s, solver=%s] Number of epochs: %s' %
              (model_params['name'], solver, this_max_iter))
        #lr = LogisticRegression(solver=solver,multi_class=model,C=1,penalty=penalty,fit_intercept=True, max_iter=this_max_iter, random_state=42)
        max_depth = 30
        lr = MultiOutputRegressor(RandomForestRegressor(max_depth=max_depth,
                                                          random_state=0))
        t1 = time.clock()
        lr.fit(X_train, y_train)
        train_time = time.clock() - t1

        y_pred = lr.predict(X_test)
        accuracy = np.sum(y_pred == y_test) / y_test.shape[0]
        #density = np.mean(lr.get_params(deep=True) != 0, axis=1) * 100
        tr_accuracy = np.sum(y_train == lr.predict(X_train)) / y_train.shape[0]
        val_accuracy = np.sum(y_val == lr.predict(X_val)) / y_val.shape[0]
        
        test_accuracies.append(accuracy)
        #densities.append(density)
        tr_accuracies.append(tr_accuracy)
        val_accuracies.append(val_accuracy)
        times.append(train_time)
    models[model]['times'] = times
    #models[model]['densities'] = densities
    models[model]['test_accuracies'] = test_accuracies
    models[model]['tr_accuracies'] = tr_accuracies
    models[model]['val_accuracies'] = val_accuracies
    
    print('Test accuracy for model %s: %.4f' % (model, test_accuracies[-1]))
    print('Train accuracy for model %s: %.4f' % (model, tr_accuracies[-1]))
    print('Validation accuracy for model %s: %.4f' % (model, val_accuracies[-1]))
    #print('%% non-zero coefficients for model %s, '
    #     'per class:\n %s' % (model, densities[-1]))
    print('Run time (%i epochs) for model %s:'
          '%.2f' % (model_params['iters'][-1], model, times[-1]))

fig = plt.figure()
ax = fig.add_subplot(111)

for model in models:
    data = []
    for accuracy in ['tr_accuracies', 'val_accuracies','test_accuracies']:
        
        trace = go.Scatter(
                            x = models[model]['times'],
                            y = models[model][accuracy],
                            name = accuracy
                        )

        data.append(trace)
    layout = go.Layout(
        title = 'Accuracy for {0} model'.format(model),
        xaxis = dict(title = 'Time'),
        yaxis = dict(title = 'Accuracy'),
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
        
run_time = time.clock() - t0
print('Example run in %.3f s' % run_time)

Dataset 2CN, train_samples=51, n_features=23168, n_classes=2
Using l1 regression
[model=One versus Rest, solver=liblinear] Number of epochs: 1
[model=One versus Rest, solver=liblinear] Number of epochs: 3
Test accuracy for model ovr: 35.1707
Train accuracy for model ovr: 40.7451
Validation accuracy for model ovr: 33.8000
Run time (3 epochs) for model ovr:5.18


NameError: name 't0' is not defined