## Dataset

In [9]:
import numpy as np 
from sklearn.datasets import fetch_openml
# Fetch the "mnist_784" dataset (version 1) from OpenML, reusing the local cache when present.
mnist = fetch_openml("mnist_784", version=1, cache=True)

# Split the bunch into the feature matrix and the label vector,
# casting the labels to unsigned 8-bit integers.
X, y = mnist.data, mnist.target.astype(np.uint8)


In [10]:
from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default split proportion applies;
# the fixed seed makes the shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [11]:
from sklearn.ensemble import RandomForestClassifier
import time

# Baseline: a random forest trained on the full-dimensional training set.
rf_clf = RandomForestClassifier(random_state=42)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring intervals, whereas time.time() is the wall clock and
# can jump if the system time is adjusted mid-run.
t0 = time.perf_counter()
rf_clf.fit(X_train, y_train)
t1 = time.perf_counter()

print("Training took {:.3f}s".format(t1 - t0))

Training took 41.398s


In [12]:
from sklearn.metrics import accuracy_score

# Score the baseline forest on the held-out split and report its class name with the accuracy.
y_pred = rf_clf.predict(X_test)
print(type(rf_clf).__name__, accuracy_score(y_test, y_pred))

RandomForestClassifier 0.9653142857142857


## With PCA

In [13]:
from sklearn.decomposition import PCA

# Per scikit-learn's PCA documentation, a float n_components in (0, 1) keeps
# as many components as are needed to explain that fraction of the variance.
pca = PCA(n_components=0.95)
# Fit the projection on the training split only and project it in the same call.
X_train_reduced = pca.fit_transform(X_train)

In [14]:
# Same random forest configuration as the baseline, trained on the PCA-reduced features.
rnd_clf2 = RandomForestClassifier(random_state=42)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring intervals, whereas time.time() is the wall clock and
# can jump if the system time is adjusted mid-run.
t0 = time.perf_counter()
rnd_clf2.fit(X_train_reduced, y_train)
t1 = time.perf_counter()

print("Training took {:.3f}s".format(t1 - t0))

Training took 78.506s


In [15]:
# Project the test split with the already-fitted PCA (transform only, no refit).
X_test_reduced = pca.transform(X_test)

# Score the forest trained on reduced features against the held-out labels.
y_pred = rnd_clf2.predict(X_test_reduced)
print(type(rnd_clf2).__name__, accuracy_score(y_test, y_pred))

RandomForestClassifier 0.9449142857142857


# Result: training is about 2× slower after dimensionality reduction!

## PCA really did not help the random forest: training time roughly doubled (41.4 s → 78.5 s) and test accuracy dropped (96.5% → 94.5%). :(

# Let's try Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the full-dimensional training set.
# NOTE(review): the recorded output for this cell shows the solver stopping at
# its iteration limit, so the timing below is for an unconverged fit. Consider
# raising max_iter or scaling the inputs before drawing conclusions from it.
lr_clf = LogisticRegression(random_state=42)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring intervals, whereas time.time() is the wall clock and
# can jump if the system time is adjusted mid-run.
t0 = time.perf_counter()
lr_clf.fit(X_train, y_train)
t1 = time.perf_counter()

print("Training took {:.3f}s".format(t1 - t0))

Training took 16.031s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [18]:
# Score the full-dimensional logistic regression on the held-out split.
y_pred = lr_clf.predict(X_test)
print(type(lr_clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.9210285714285714


In [19]:
# Logistic regression trained on the PCA-reduced features.
# NOTE(review): the recorded output for this cell shows the solver stopping at
# its iteration limit, so the timing below is for an unconverged fit. Consider
# raising max_iter or scaling the inputs before drawing conclusions from it.
lr_clf2 = LogisticRegression(random_state=42)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring intervals, whereas time.time() is the wall clock and
# can jump if the system time is adjusted mid-run.
t0 = time.perf_counter()
lr_clf2.fit(X_train_reduced, y_train)
t1 = time.perf_counter()

print("Training took {:.3f}s".format(t1 - t0))

Training took 6.326s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
# Score the logistic regression trained on reduced features, using the reduced test split.
y_pred = lr_clf2.predict(X_test_reduced)
print(type(lr_clf2).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.9154285714285715
