In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from mpl_toolkits.mplot3d import Axes3D
import sklearn
from datetime import datetime, date
import warnings
import timeit
from collections import defaultdict, OrderedDict
import time
from skimage import io, transform
import glob
import tables


timeit.template = """
def inner(_it, _timer{init}):
    {setup}
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""

matplotlib.style.use('ggplot')

RANDOM_SEED = 33

In [3]:
from iterative_lda import IterativeLDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

## Load P53 data

Start by clearing any row with a question mark

In [None]:
RAW_DATA = './data/iterative_lda/p53/p53_old_2010/K8.data' 
CLEANED_DATA = './data/iterative_lda/p53/p53_old_2010/K8.data_cleaned.csv'

raise ValueError("Don't run again unless I remove this")
with open(RAW_DATA, 'r') as raw_data:
    with open(CLEANED_DATA, 'w') as output:
        line = 'start'
        while line != '':
            line = raw_data.readline()
            if '?' not in line:
                line = line.replace('inactive,', '1').replace('active,', '0')
                output.write(line)

In [None]:
p53 = pandas.read_csv(CLEANED_DATA, header=None)

In [None]:
p53.head(n=20)

## Fit an LDA model to a 100 rows, just to see it fits well

In [None]:
minimal_p53 = p53.head(n=100)
minimal_p53_X = minimal_p53.iloc[:,:-1]
minimal_p53_y = minimal_p53.iloc[:,-1:].as_matrix().ravel()
print(minimal_p53_X.shape, minimal_p53_y.shape)

lda = LinearDiscriminantAnalysis()
lda.fit(minimal_p53_X, minimal_p53_y)
np.sum(lda.predict(minimal_p53_X) == minimal_p53_y) / minimal_p53_y.shape[0]

## Fit an LDA model to the entire data-set

In [None]:
p53_X = p53.iloc[:,:-1]
p53_y = p53.iloc[:,-1:].as_matrix().ravel()

X_train, X_test, y_train, y_test = train_test_split(p53_X, p53_y, test_size=0.1, 
                                        random_state=RANDOM_SEED,
                                       stratify=p53_y)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
print(np.sum(lda.predict(X_train) == y_train) / y_train.shape[0])
print(np.sum(lda.predict(X_test) == y_test) / y_test.shape[0])

In [None]:
raise ValueError('Dont run again')
ilda = IterativeLDA(10)
transformed_X_train = ilda.fit_transform(X_train, y_train)
transformed_X_test = ilda.transform(X_test)

In [None]:
rfc = RandomForestClassifier(verbose=True)
rfc.fit(transformed_X_train, y_train)

print(np.sum(rfc.predict(transformed_X_train) == y_train) / y_train.shape[0])
print(np.sum(rfc.predict(transformed_X_test) == y_test) / y_test.shape[0])

In [None]:
svm = SVC(verbose=True)
svm.fit(transformed_X_train, y_train)

print(np.sum(svm.predict(transformed_X_train) == y_train) / y_train.shape[0])
print(np.sum(svm.predict(transformed_X_test) == y_test) / y_test.shape[0])

## Looks like LDA handles the older p53 dataset with ease. Let's try the newer?

In [4]:
NEW_P53_DATA_RAW = './data/iterative_lda/p53/p53_new_2012/K9.data' 
NEW_P53_DATA = './data/iterative_lda/p53/p53_new_2012/K9.data_cleaned.csv'

raise ValueError("Don't run again unless I remove this")
with open(NEW_P53_DATA_RAW, 'r') as raw_data:
    with open(NEW_P53_DATA, 'w') as output:
        line = 'start'
        while line != '':
            line = raw_data.readline()
            if '?' not in line:
                line = line.replace('inactive,', '1').replace('active,', '0')
                output.write(line)

ValueError: Don't run again unless I remove this

In [5]:
new_p53 = pandas.read_csv(NEW_P53_DATA, header=None)
new_p53.shape

(31159, 5409)

In [6]:
new_p53.head(n=20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5399,5400,5401,5402,5403,5404,5405,5406,5407,5408
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,-0.015,...,0.006,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,1
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,-0.002,...,0.002,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,1
2,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.019,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,1
3,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,-0.019,...,0.051,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,1
4,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,0.002,...,-0.011,0.012,0.009,0.003,-0.001,0.002,-0.006,0.009,0.013,1
5,-0.15,0.016,-0.014,0.0,0.016,-0.123,-0.004,-0.002,-0.005,0.01,...,0.0,-0.033,-0.032,0.029,-0.052,-0.029,-0.006,0.004,0.023,1
6,-0.158,0.002,-0.019,-0.028,-0.008,-0.101,0.011,0.005,0.001,0.003,...,0.015,-0.025,-0.003,0.017,-0.017,-0.009,-0.001,0.017,0.012,1
7,-0.152,0.009,-0.015,-0.008,0.004,-0.12,-0.002,-0.002,-0.007,0.004,...,-0.014,-0.009,-0.007,0.009,-0.027,-0.012,-0.009,0.008,0.021,1
8,-0.172,-0.028,0.003,-0.045,-0.055,-0.078,0.039,0.011,0.006,-0.02,...,0.016,0.031,-0.059,0.05,0.01,-0.005,0.137,0.098,-0.015,1
9,-0.164,-0.019,-0.011,-0.037,-0.031,-0.081,0.029,0.012,0.007,-0.01,...,0.012,0.006,0.019,0.022,0.011,-0.012,0.002,0.014,-0.009,1


In [9]:
new_p53_X = new_p53.iloc[:,:-1]
new_p53_y = new_p53.iloc[:,-1:].as_matrix().ravel()

new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_p53_X, new_p53_y, 
                                                                    train_size=0.6, test_size=0.2, 
                                        random_state=RANDOM_SEED,
                                       stratify=new_p53_y)

new_X_train.shape, new_y_train.shape, new_X_test.shape, new_y_test.shape

((18695, 5408), (18695,), (6232, 5408), (6232,))

In [10]:
new_lda = LinearDiscriminantAnalysis()
new_lda.fit(new_X_train, new_y_train)
print(np.sum(new_lda.predict(new_X_train) == new_y_train) / new_y_train.shape[0])
print(np.sum(new_lda.predict(new_X_test) == new_y_test) / new_y_test.shape[0])

0.997378978336
0.989409499358


In [11]:
# raise ValueError('Dont run again')
new_ilda = IterativeLDA(5)
new_transformed_X_train = new_ilda.fit_transform(new_X_train, new_y_train)
new_transformed_X_test = new_ilda.transform(new_X_test)

In [12]:
rfc = RandomForestClassifier(verbose=True)
rfc.fit(new_transformed_X_train, new_y_train)

print(np.sum(rfc.predict(new_transformed_X_train) == new_y_train) / new_y_train.shape[0])
print(np.sum(rfc.predict(new_transformed_X_test) == new_y_test) / new_y_test.shape[0])

0.999893019524
0.987804878049


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [13]:
svm = SVC(verbose=True)
svm.fit(new_transformed_X_train, new_y_train)

print(np.sum(svm.predict(new_transformed_X_train) == new_y_train) / new_y_train.shape[0])
print(np.sum(svm.predict(new_transformed_X_test) == new_y_test) / new_y_test.shape[0])

[LibSVM]0.999037175715
0.992458279846


## An SVM trained on the 5-d Iterative LDA data is about 1% better, but it's the last 1%...