In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
import glob
import os
import time
import timeit
import warnings
from collections import defaultdict, OrderedDict
from datetime import datetime, date

import matplotlib
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas
import sklearn
import tables
import tabulate
from mpl_toolkits.mplot3d import Axes3D
from skimage import io, transform
from sklearn_pandas import DataFrameMapper


# Replace timeit's code template so the generated `inner` function returns a
# (elapsed_time, retval) tuple, where `retval` is the value of the timed
# statement from the last loop iteration, instead of only the elapsed time.
timeit.template = """
def inner(_it, _timer{init}):
    {setup}
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""

# Use the ggplot style sheet for every figure in this notebook.
matplotlib.style.use('ggplot')

# Seed passed as `random_state` to the train/test splits below so they are reproducible.
RANDOM_SEED = 33

In [64]:
from iterative_lda import IterativeLDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

## Load P53 data

Start by dropping every row that contains a question mark, and encode the class label as a number (`inactive` → 1, `active` → 0).

In [23]:
RAW_DATA = './data/iterative_lda/p53/p53_old_2010/K8.data'
CLEANED_DATA = './data/iterative_lda/p53/p53_old_2010/K8.data_cleaned.csv'

# Build the cleaned CSV only when it does not exist yet. This keeps the
# "don't redo the work" intent of the old unconditional `raise`, but lets the
# notebook survive Restart Kernel -> Run All. Delete CLEANED_DATA to force a rebuild.
if not os.path.exists(CLEANED_DATA):
    # Write to a temporary file first so an interrupted run never leaves a
    # truncated CLEANED_DATA behind that the guard above would then trust.
    partial_output = CLEANED_DATA + '.tmp'
    with open(RAW_DATA, 'r') as raw_data, open(partial_output, 'w') as output:
        for line in raw_data:
            # Rows containing '?' are dropped entirely.
            if '?' in line:
                continue
            # 'inactive,' must be replaced before 'active,' because the latter
            # is a substring of the former. Labels become 1 (inactive) / 0 (active).
            output.write(line.replace('inactive,', '1').replace('active,', '0'))
    os.replace(partial_output, CLEANED_DATA)

In [24]:
# header=None: the file is read without a header row, so columns get integer labels.
p53 = pandas.read_csv(
    CLEANED_DATA,
    header=None,
)

In [25]:
# Preview the first 20 rows of the cleaned frame.
p53.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5399,5400,5401,5402,5403,5404,5405,5406,5407,5408
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,-0.015,...,0.006,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,1
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,-0.002,...,0.002,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,1
2,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.019,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,1
3,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,-0.019,...,0.051,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,1
4,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,0.002,...,-0.011,0.012,0.009,0.003,-0.001,0.002,-0.006,0.009,0.013,1
5,-0.15,0.016,-0.014,0.0,0.016,-0.123,-0.004,-0.002,-0.005,0.01,...,0.0,-0.033,-0.032,0.029,-0.052,-0.029,-0.006,0.004,0.023,1
6,-0.158,0.002,-0.019,-0.028,-0.008,-0.101,0.011,0.005,0.001,0.003,...,0.015,-0.025,-0.003,0.017,-0.017,-0.009,-0.001,0.017,0.012,1
7,-0.152,0.009,-0.015,-0.008,0.004,-0.12,-0.002,-0.002,-0.007,0.004,...,-0.014,-0.009,-0.007,0.009,-0.027,-0.012,-0.009,0.008,0.021,1
8,-0.172,-0.028,0.003,-0.045,-0.055,-0.078,0.039,0.011,0.006,-0.02,...,0.016,0.031,-0.059,0.05,0.01,-0.005,0.137,0.098,-0.015,1
9,-0.164,-0.019,-0.011,-0.037,-0.031,-0.081,0.029,0.012,0.007,-0.01,...,0.012,0.006,0.019,0.022,0.011,-0.012,0.002,0.014,-0.009,1


## Fit an LDA model to the first 100 rows, just to check that it fits well

In [56]:
# Sanity check: fit LDA on the first 100 rows and report training accuracy.
minimal_p53 = p53.head(n=100)
# Every column except the last is a feature; the last column is the label
# (1 = inactive, 0 = active, as encoded by the cleaning step).
minimal_p53_X = minimal_p53.iloc[:, :-1]
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
# Selecting the single column with `-1` already yields a 1-D array, so no ravel().
minimal_p53_y = minimal_p53.iloc[:, -1].values
print(minimal_p53_X.shape, minimal_p53_y.shape)

lda = LinearDiscriminantAnalysis()
lda.fit(minimal_p53_X, minimal_p53_y)
# Fraction of the 100 training rows that are predicted correctly.
np.mean(lda.predict(minimal_p53_X) == minimal_p53_y)

(100, 5408) (100,)




0.97999999999999998

## Fit an LDA model to the entire data-set

In [61]:
# Features are every column but the last; the last column is the class label.
p53_X = p53.iloc[:, :-1]
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
# Selecting the single column with `-1` already yields a 1-D array, so no ravel().
p53_y = p53.iloc[:, -1].values

# Hold out 10% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
X, X_test, y, y_test = train_test_split(
    p53_X, p53_y,
    test_size=0.1,
    random_state=RANDOM_SEED,
    stratify=p53_y,
)

X.shape, y.shape, X_test.shape, y_test.shape

((14932, 5408), (14932,), (1660, 5408), (1660,))

In [62]:
# Baseline: plain LDA fitted on the training split.
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)

# Print accuracy on the training split first, then on the held-out split.
for features, labels in ((X, y), (X_test, y_test)):
    print(np.mean(lda.predict(features) == labels))

0.996182694883
0.977710843373


In [73]:
# Run the Iterative LDA projection at most once per kernel session. The old
# unconditional `raise` stopped Restart Kernel -> Run All here and left
# `transformed_X` undefined for the cells below. Accidental re-execution is
# still a no-op; to force a refit, `del transformed_X` and re-run this cell.
# NOTE(review): the argument 10 is presumably the number of output dimensions
# (the closing heading calls the result "10-d") - confirm against iterative_lda.
if 'transformed_X' not in globals() or 'transformed_X_test' not in globals():
    ilda = IterativeLDA(10)
    # Fit on the training split only, then apply the fitted transform to the test split.
    transformed_X = ilda.fit_transform(X, y)
    transformed_X_test = ilda.transform(X_test)

In [74]:
# Random forest on the Iterative-LDA features of the older dataset.
# The forest is stochastic, so it is seeded with the notebook-wide RANDOM_SEED
# to make the reported accuracies reproducible. Previously recorded numbers
# came from an unseeded run and may differ slightly.
rfc = RandomForestClassifier(verbose=True, random_state=RANDOM_SEED)
rfc.fit(transformed_X, y)

# Accuracy on the training split, then on the held-out split.
print(np.mean(rfc.predict(transformed_X) == y))
print(np.mean(rfc.predict(transformed_X_test) == y_test))

0.99986605947
0.968674698795


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [75]:
# Support-vector classifier on the Iterative-LDA features of the older dataset.
svm = SVC(verbose=True)
svm.fit(transformed_X, y)

# Print accuracy on the training split first, then on the held-out split.
for features, labels in ((transformed_X, y), (transformed_X_test, y_test)):
    print(np.mean(svm.predict(features) == labels))

[LibSVM]0.997923921779
0.945180722892


## Looks like LDA handles the older p53 dataset with ease. Let's try the newer one.

In [77]:
# Path of the newer dataset's CSV; it is read without a header row.
NEW_P53_DATA = './data/iterative_lda/p53/Data Sets/K9.data_cleaned.csv'
new_p53 = pandas.read_csv(
    NEW_P53_DATA,
    header=None,
)
# Display (rows, columns) as the cell output.
new_p53.shape

(31159, 5409)

In [78]:
# Preview the first 20 rows of the newer dataset.
new_p53.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5399,5400,5401,5402,5403,5404,5405,5406,5407,5408
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,-0.015,...,0.006,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,1
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,-0.002,...,0.002,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,1
2,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.019,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,1
3,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,-0.019,...,0.051,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,1
4,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,0.002,...,-0.011,0.012,0.009,0.003,-0.001,0.002,-0.006,0.009,0.013,1
5,-0.15,0.016,-0.014,0.0,0.016,-0.123,-0.004,-0.002,-0.005,0.01,...,0.0,-0.033,-0.032,0.029,-0.052,-0.029,-0.006,0.004,0.023,1
6,-0.158,0.002,-0.019,-0.028,-0.008,-0.101,0.011,0.005,0.001,0.003,...,0.015,-0.025,-0.003,0.017,-0.017,-0.009,-0.001,0.017,0.012,1
7,-0.152,0.009,-0.015,-0.008,0.004,-0.12,-0.002,-0.002,-0.007,0.004,...,-0.014,-0.009,-0.007,0.009,-0.027,-0.012,-0.009,0.008,0.021,1
8,-0.172,-0.028,0.003,-0.045,-0.055,-0.078,0.039,0.011,0.006,-0.02,...,0.016,0.031,-0.059,0.05,0.01,-0.005,0.137,0.098,-0.015,1
9,-0.164,-0.019,-0.011,-0.037,-0.031,-0.081,0.029,0.012,0.007,-0.01,...,0.012,0.006,0.019,0.022,0.011,-0.012,0.002,0.014,-0.009,1


In [79]:
# Features are every column but the last; the last column is the class label.
new_p53_X = new_p53.iloc[:, :-1]
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
# Selecting the single column with `-1` already yields a 1-D array, so no ravel().
new_p53_y = new_p53.iloc[:, -1].values

# Hold out 10% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
new_X, new_X_test, new_y, new_y_test = train_test_split(
    new_p53_X, new_p53_y,
    test_size=0.1,
    random_state=RANDOM_SEED,
    stratify=new_p53_y,
)

new_X.shape, new_y.shape, new_X_test.shape, new_y_test.shape

((28043, 5408), (28043,), (3116, 5408), (3116,))

In [81]:
# Baseline for the newer dataset: plain LDA fitted on its training split.
new_lda = LinearDiscriminantAnalysis()
new_lda.fit(new_X, new_y)

# Print accuracy on the training split first, then on the held-out split.
for features, labels in ((new_X, new_y), (new_X_test, new_y_test)):
    print(np.mean(new_lda.predict(features) == labels))

0.996041792961
0.991655969191


In [82]:
# Project the newer dataset with Iterative LDA: fit on the training split only,
# then apply the fitted transform to the held-out split.
# NOTE(review): the argument 10 is presumably the number of output dimensions
# (the closing heading calls the result "10-d") - confirm against iterative_lda.
new_ilda = IterativeLDA(10)
new_transformed_X = new_ilda.fit_transform(new_X, new_y)
new_transformed_X_test = new_ilda.transform(new_X_test)

In [84]:
# Random forest on the Iterative-LDA features of the newer dataset.
# The forest is stochastic, so it is seeded with the notebook-wide RANDOM_SEED
# to make the reported accuracies reproducible. Previously recorded numbers
# came from an unseeded run and may differ slightly.
rfc = RandomForestClassifier(verbose=True, random_state=RANDOM_SEED)
rfc.fit(new_transformed_X, new_y)

# Accuracy on the training split, then on the held-out split.
print(np.mean(rfc.predict(new_transformed_X) == new_y))
print(np.mean(rfc.predict(new_transformed_X_test) == new_y_test))

0.99975038334
0.98973042362


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [85]:
# Support-vector classifier on the Iterative-LDA features of the newer dataset.
svm = SVC(verbose=True)
svm.fit(new_transformed_X, new_y)

# Print accuracy on the training split first, then on the held-out split.
for features, labels in ((new_transformed_X, new_y), (new_transformed_X_test, new_y_test)):
    print(np.mean(svm.predict(features) == labels))

[LibSVM]0.99814570481
0.994544287548


## An SVM trained on the 10-d Iterative LDA data is about 0.3 percentage points better than plain LDA on the test set (99.45% vs. 99.17%)