# Voting Classifier on MNIST
### Load MNIST, split into train, validation, test.

In [1]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Download MNIST (70,000 28x28 digit images flattened to 784 features) from OpenML.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

# Carve out two 10,000-sample hold-out sets (test, then validation); the rest is
# the training set. Integer sizes are exact, unlike the 1/7 and 1/6 fractions,
# and a fixed random_state makes the split -- and therefore every score reported
# further down -- reproducible under Restart & Run All.
X_tmp, X_test, y_tmp, y_test = train_test_split(
    X, y, test_size=10_000, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_tmp, y_tmp, test_size=10_000, random_state=42
)

# Sanity-check the partition sizes: test, train, validation.
print(len(X_test))
print(len(X_train))
print(len(X_val))

10000
50000
10000


### Train a random forest classifier

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Random forest configured with the hyper-parameters picked by the grid search
# recorded below. Only non-default settings are spelled out: the original call
# was a pasted estimator repr that also passed `min_impurity_split` and
# `max_features='auto'`, both of which newer scikit-learn releases reject.
rnd_forest = RandomForestClassifier(
    n_estimators=1000,
    max_depth=15,
    max_leaf_nodes=25,
    min_samples_leaf=2,
    min_samples_split=6,
    max_features='sqrt',  # what 'auto' meant for classifiers
    n_jobs=-1,            # 1000 trees: build them on all cores
    random_state=42,      # reproducible bootstrap samples / feature draws
)
rnd_forest.fit(X_train, y_train)

# Hyper-parameter search that produced the settings above. It is slow, so it is
# kept as a record rather than re-run on every execution of the notebook.
#grid = {'n_estimators':[10,100,1000],'max_depth':[8,10,15], 'min_samples_split':[6,8],
#        'min_samples_leaf':[1,2],'min_weight_fraction_leaf':[0,.1],'max_leaf_nodes':[20,25]}
#searcher = GridSearchCV(estimator=rnd_forest,param_grid=grid, n_jobs=-1, cv=3, verbose=10)
#searcher.fit(X_train,y_train)
#searcher.best_estimator_  # -> n_estimators=1000, max_depth=15, max_leaf_nodes=25,
#                          #    min_samples_leaf=2, min_samples_split=6,
#                          #    min_weight_fraction_leaf=0

# Validation accuracy (the original, unseeded run scored .8618).
accuracy_score(y_val, rnd_forest.predict(X_val))

0.8618

### Train Extra Trees Classifier

In [3]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Extra-trees ensemble with the settings chosen by the grid search recorded below.
ext_trees = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=35,
    min_samples_split=5,
    n_jobs=-1,        # fit the 300 trees on all cores
    random_state=42,  # reproducible random split thresholds
)
ext_trees.fit(X_train, y_train)

# Hyper-parameter search used to pick the values above; kept as a record
# rather than re-run on every execution of the notebook.
#grid = {'n_estimators':[300,500], 'max_depth':[30,35]}
#searcher = GridSearchCV(estimator= ext_trees, param_grid = grid, n_jobs=-1,cv=3,verbose=8)
#searcher.fit(X_train,y_train)
#searcher.best_estimator_

# Validation accuracy (the original, unseeded run scored .9701).
accuracy_score(y_val, ext_trees.predict(X_val))

0.9701

### Train SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Candidate SVM settings: every kernel crossed with both gamma heuristics
# (8 candidates); the remaining entries are pinned to a single value.
grid = {
    'C': [.5],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'degree': [3],
    'gamma': ['scale', 'auto'],
    'coef0': [0.0],
    'tol': [0.001],
}

# 3-fold cross-validated search over the grid, parallelised across all cores.
searcher = GridSearchCV(estimator=SVC(), param_grid=grid, n_jobs=-1, cv=3, verbose=9)
searcher.fit(X_train, y_train)

# Bind the tuned model to `svc`. Previously the search result was only
# displayed, leaving `svc` as an untuned default SVC() -- and `svc` is the
# object the voting-ensemble cell passes to VotingClassifier.
svc = searcher.best_estimator_
svc

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


### Collect into Voting Classifier ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score  # fixed typo: was `impoert` (SyntaxError)

# Combine the three classifiers trained above into a single ensemble.
# No `voting` argument is passed, so VotingClassifier's default mode is used.
voter = VotingClassifier([
    ('rnd_forest', rnd_forest),
    ('ext_trees', ext_trees),
    ('svc', svc),
])

voter.fit(X_train, y_train)

# Validation accuracy of the ensemble, comparable with the per-model scores above.
accuracy_score(y_val, voter.predict(X_val))