In [6]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [4]:
X, y = mnist["data"], mnist["target"]
X.shape, y.shape

((70000, 784), (70000,))

In [26]:
import matplotlib as mpl
import matplotlib.pyplot as plt
some_digit = X[69999]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap = mpl.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()

<Figure size 640x480 with 1 Axes>

In [7]:
y = y.astype(np.uint8)

In [8]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [9]:
y_train_6 = (y_train == 6) # True for all 5s, False for all other digits.
y_test_6 = (y_test == 6)

In [27]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_6)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [28]:
sgd_clf.predict([some_digit])

array([ True])

In [29]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train_6)

GaussianNB(priors=None, var_smoothing=1e-09)

In [30]:
gnb.predict([some_digit])

array([ True])

In [31]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train_6)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
estimators=[('sgd', sgd_clf), ('gnb', gnb), ('rnd', rnd_clf)],
voting='hard')
voting_clf.fit(X_train, y_train_6)

VotingClassifier(estimators=[('sgd',
                              SGDClassifier(alpha=0.0001, average=False,
                                            class_weight=None,
                                            early_stopping=False, epsilon=0.1,
                                            eta0=0.0, fit_intercept=True,
                                            l1_ratio=0.15,
                                            learning_rate='optimal',
                                            loss='hinge', max_iter=1000,
                                            n_iter_no_change=5, n_jobs=None,
                                            penalty='l2', power_t=0.5,
                                            random_state=42, shuffle=True,
                                            tol=0.001, validation_fraction=0.1,
                                            verbose=0,...
                                                     class_weight=None,
                                        

In [40]:
voting_clf2 = VotingClassifier(
estimators=[('lr', sgd_clf), ('rf', gnb), ('svc', rnd_clf)],
voting='soft')
voting_clf2.fit(X_train, y_train_6)

VotingClassifier(estimators=[('lr',
                              SGDClassifier(alpha=0.0001, average=False,
                                            class_weight=None,
                                            early_stopping=False, epsilon=0.1,
                                            eta0=0.0, fit_intercept=True,
                                            l1_ratio=0.15,
                                            learning_rate='optimal',
                                            loss='hinge', max_iter=1000,
                                            n_iter_no_change=5, n_jobs=None,
                                            penalty='l2', power_t=0.5,
                                            random_state=42, shuffle=True,
                                            tol=0.001, validation_fraction=0.1,
                                            verbose=0, w...
                                                     class_weight=None,
                                       

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
bag_clf = BaggingClassifier(
RandomForestClassifier(), n_estimators=10,
max_samples=10, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train_6)
y_pred = bag_clf.predict(X_test)

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=70,
algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train_6)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [23]:
from xgboost import XGBClassifier
#import xgboost 
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train_6)
y_pred = xgb_clf.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score
for clf in (sgd_clf, gnb,rnd_clf,voting_clf,bag_clf,ada_clf,xgb_clf):
    clf.fit(X_train, y_train_6)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

SGDClassifier 0.0981
GaussianNB 0.1291
RandomForestClassifier 0.098
VotingClassifier 0.098
BaggingClassifier 0.098
AdaBoostClassifier 0.097
XGBClassifier 0.098
