In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
import pandas
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from mpl_toolkits.mplot3d import Axes3D
import sklearn
from datetime import datetime, date
from sklearn_pandas import DataFrameMapper
import warnings
import timeit
from collections import defaultdict, OrderedDict
import tabulate
import time
from skimage import io, transform
import glob
import tables


timeit.template = """
def inner(_it, _timer{init}):
    {setup}
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""

matplotlib.style.use('ggplot')

RANDOM_SEED = 33

In [4]:
from iterative_lda import IterativeLDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

## Load MNIST


In [5]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
mnist.data.shape

(70000, 784)

In [6]:
def extract_data(mnist, first_label, second_label):
    indices = np.logical_or(mnist.target == first_label, mnist.target == second_label)
    return mnist.data[indices], mnist.target[indices]

X, y = extract_data(mnist, 7, 8)
X.shape, y.shape

((14118, 784), (14118,))

## Fit an LDA model to the entire data-set

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, 
                                                    random_state=RANDOM_SEED,
                                                    stratify=y)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((12706, 784), (12706,), (1412, 784), (1412,))

In [17]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
print(np.sum(lda.predict(X_train) == y_train) / y_train.shape[0])
print(np.sum(lda.predict(X_test) == y_test) / y_test.shape[0])

0.990240831103
0.987252124646




In [18]:
# raise ValueError('Dont run again')
ilda = IterativeLDA(10)
transformed_X_train = ilda.fit_transform(X_train, y_train)
transformed_X_test = ilda.transform(X_test)



In [19]:
rfc = RandomForestClassifier(verbose=True)
rfc.fit(transformed_X_train, y_train)

print(np.sum(rfc.predict(transformed_X_train) == y_train) / y_train.shape[0])
print(np.sum(rfc.predict(transformed_X_test) == y_test) / y_test.shape[0])

0.998662049425
0.978753541076


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [20]:
svm = SVC(verbose=True)
svm.fit(transformed_X_train, y_train)

print(np.sum(svm.predict(transformed_X_train) == y_train) / y_train.shape[0])
print(np.sum(svm.predict(transformed_X_test) == y_test) / y_test.shape[0])

[LibSVM]0.990713048953
0.983711048159
