In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
import pandas as pd
import time

## Reducing the Breast Cancer Dataset
Loading the Breast Cancer Dataset

In [42]:
data_cancer = datasets.load_breast_cancer()

X_c = data_cancer.data
Y_c = data_cancer.target

#Splitting into Train and Test Set for the original dataset
Xc_train, Xc_test, Yc_train, Yc_test = train_test_split(X_c, Y_c, test_size = .3)

To Reduce the Breast Cancer dataset, we are going to use the Random Forest Classifier. If you look at the dataset, each data point is a real valued number and the target of the dataset is a bool (cancer or no cancer). This is the type of dataset that Random Forest can handle well.

In [43]:
RFCc = RandomForestClassifier()
RFCc.fit(X_c, Y_c)
CFI = RFCc.feature_importances_
cfi = CFI.copy()

#getting the top quarter of the features
quarterc = []
for i in range(len(cfi)/4):
    sort_cfi = sorted(cfi)
    quarterc.append(sort_cfi[len(sort_cfi)-1 - i])
    
#getting the arguments of the top quarter
argc = []
for j in quarterc:
    argc.append(CFI.tolist().index(j))
newX_c = X_c[:, argc]

#Splitting into Train and Test Set for the reduced dataset
newXc_train = train_test_split(newX_c, Y_c, test_size = .3)[0]
newXc_test = train_test_split(newX_c, Y_c, test_size = .3)[1]

Finding the runtime and the accuracy of the Original Cancer Dataset using MLP.

In [44]:
begin = time.time()
MLP_oc = MLPClassifier()
MLP_oc.fit(Xc_train, Yc_train)
Yc_predict = MLP_oc.predict(Xc_test)
print "Accuracy on original cancer dataset is \t\t" + str(MLP_oc.score(Xc_test, Yc_test))
end = time.time()
print "Runtime of MLP on original cancer dataset \t" + str(end - begin)

Accuracy on original cancer dataset is 		0.707602339181
Runtime of MLP on original cancer dataset 	0.0309998989105


Finding the runtime and the accuracy of the Reduced Cancer Dataset using MLP.

In [45]:
begin = time.time()
MLP_rc = MLPClassifier()
MLP_rc.fit(newXc_train, Yc_train)

print "Accuracy on reduced cancer dataset is \t\t" + str(MLP_rc.score(newXc_test, Yc_test))
end = time.time()
print "Runtime of MLP on reduced cancer dataset \t" + str(end - begin)

Accuracy on reduced cancer dataset is 		0.415204678363
Runtime of MLP on reduced cancer dataset 	0.0130000114441


## Reducing the Digit Dataset
Loading the Digit Dataset

In [46]:
data_digits = datasets.load_digits()

X_d = data_digits.data
Y_d = data_digits.target

Xd_train, Xd_test, Yd_train, Yd_test = train_test_split(X_d, Y_d, test_size = .3, random_state= 42)

Using Random Forest on this a reduced version of the Digit Dataset would give you 10% or less accuracy, because this dataset is made of images. What Random Forest would do on this dataset is pick out the pixels with the best luminance, which is not what we're looking for. Here we use PCA and .fit_transform to reduce the dimensionality of the dataset and get more accurate results.

In [55]:
PCAd = PCA()
PCAd.n_components = X_d.shape[1]/4.
newX_d = PCAd.fit_transform(X_d)

newXd_train = train_test_split(newX_d, Y_d, test_size = .3, random_state= 42)[0]
newXd_test = train_test_split(newX_d, Y_d, test_size = .3, random_state= 42)[1]

Finding the runtime and the accuracy of the Original Digit Dataset using MLP.

In [48]:
begin = time.time()
MLP_od = MLPClassifier()
MLP_od.fit(Xd_train, Yd_train)
Yd_predict = MLP_od.predict(Xd_test)

print "Accuracy on original digit dataset is \t\t" + str(MLP_od.score(Xd_test, Yd_test))
end = time.time()
print "Runtime of MLP on original digit dataset \t" + str(end - begin)

Accuracy on original digit dataset is 		0.975925925926
Runtime of MLP on original digit dataset 	1.20499992371


Finding the runtime and the accuracy of the Reduced Digit Dataset using MLP.

In [49]:
begin = time.time()
MLP_rd = MLPClassifier()
MLP_rd.fit(newXd_train, Yd_train)

print "Accuracy on reduced digit dataset is \t\t" + str(MLP_rd.score(newXd_test, Yd_test))
end = time.time()
print "Runtime of MLP on reduced digit dataset \t" + str(end - begin)

Accuracy on reduced digit dataset is 		0.981481481481
Runtime of MLP on reduced digit dataset 	0.94000005722


## Reducing the Olivetti Faces Dataset
Loading the Olivetti Dataset

In [50]:
data_faces = datasets.fetch_olivetti_faces()

X_f = data_faces.data
Y_f = data_faces.target

Xf_train, Xf_test, Yf_train, Yf_test = train_test_split(X_f, Y_f, test_size = .3, random_state= 42)

Here we also use PCA to reducing the Olivetti Faces Dataset, because the dataset is made of images. Pca will treat each image as a matrix and use SVD to transform the original dataset to obtain a k-dimensional (where k is the number of eigenvecctors) feature subspace.

In [51]:
PCAf = PCA()
PCAf.n_components = X_f.shape[1]/4.
newX_f = PCAf.fit_transform(X_f)

newXf_train = train_test_split(newX_f, Y_f, test_size = .3, random_state= 42)[0]
newXf_test = train_test_split(newX_f, Y_f, test_size = .3, random_state= 42)[1]

  self.components_ = components_[:n_components]
  self.explained_variance_ = explained_variance_[:n_components]
  explained_variance_ratio_[:n_components]


Finding the runtime and the accuracy of the Original Faces Dataset using MLP.

In [54]:
begin = time.time()
MLP_of = MLPClassifier()
MLP_of.fit(Xf_train, Yf_train)
Yf_predict = MLP_of.predict(Xf_test)
print "Accuracy on original faces dataset is \t\t" + str(MLP_of.score(Xf_test, Yf_test))
end = time.time()
print "Runtime of MLP on original faces dataset \t" + str(end - begin)

Accuracy on original faces dataset is 		0.833333333333
Runtime of MLP on original faces dataset 	16.8499999046


Finding the runtime and the accuracy of the Reduced Faces Dataset using MLP.

In [53]:
begin = time.time()
MLP_rf = MLPClassifier()
MLP_rf.fit(newXf_train, Yf_train)

print "Accuracy on reduced faces dataset is \t\t" + str(MLP_rf.score(newXf_test, Yf_test))
end = time.time()
print "Runtime of MLP on reduced faces dataset \t" + str(end - begin)

Accuracy on reduced faces dataset is 		0.891666666667
Runtime of MLP on reduced faces dataset 	1.20000004768
