In [1]:
import numpy as np
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
mnist_dataset = datasets.fetch_openml('mnist_784')

random_state = 0


X = mnist_dataset.data
y = mnist_dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify = y, test_size = 0.1)

In [4]:
X = X_test
y = y_test

In [5]:
# Reduce dimension to 2 with PCA
pca = make_pipeline(StandardScaler(),
                    PCA(0.95))

# Use a nearest neighbor classifier to evaluate the methods
decision_tree_classifier = DecisionTreeClassifier(max_depth=10, min_samples_split=5)

dim_reduction_methods = [('PCA', pca)]
        

dataset_columns = np.array(['zero','one','two','three','four','five','six','seven','eight','nine'])


In [6]:
for i, (name, model) in enumerate(dim_reduction_methods):
    # Fit the method's model
    model.fit(X_train, y_train)

    # Fit a decision tree classifier on the training set
    decision_tree_classifier.fit(model.transform(X_train), y_train)

    accuracy_on_tranining = accuracy_score(decision_tree_classifier.predict(model.fit_transform(X_train)), y_train)
    
    
    accuracy_on_testing = accuracy_score(decision_tree_classifier.predict(model.transform(X_test)), y_test)
    
    print('Training accuracy ', accuracy_on_tranining)
    print('Testing accuracy ',  accuracy_on_testing)
    

Training accuracy  0.8416825396825397
Testing accuracy  0.811


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
        ("kpca", KernelPCA(n_components=2)),
        ("log_reg", DecisionTreeClassifier())
    ])

param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf", "sigmoid"]
    }]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(model.transform(X_test), y_test)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('kpca',
                                        KernelPCA(alpha=1.0, coef0=1,
                                                  copy_X=True, degree=3,
                                                  eigen_solver='auto',
                                                  fit_inverse_transform=False,
                                                  gamma=None, kernel='linear',
                                                  kernel_params=None,
                                                  max_iter=None, n_components=2,
                                                  n_jobs=None,
                                                  random_state=None,
                                                  remove_zero_eig=False,
                                                  tol=0)),
                                       ('log_reg',
                                 

In [7]:
print(grid_search.best_params_)

{'kpca__gamma': 0.04555555555555556, 'kpca__kernel': 'sigmoid'}


In [8]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=2)
kf.get_n_splits(X)


KFold(n_splits=2, random_state=None, shuffle=False)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


In [9]:
print('X_train shape', X_train.shape)
print('X_test shape', X_test.shape)

X_train shape (35000, 784)
X_test shape (35000, 784)


In [10]:
print('y_train shape', y_train.shape)
print('y_test shape', y_test.shape)

y_train shape (35000,)
y_test shape (35000,)


In [7]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn import metrics

decision_tree_classifier = DecisionTreeClassifier(criterion = 'gini', max_depth=8, min_samples_split=4)
print('The number of fold is ', 10)
print('Here is a list of accuries for each fold')

scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(metrics.precision_score,  average = 'micro'),
           'recall': make_scorer(metrics.recall_score, average = 'micro'),
           'f1_macro': make_scorer(metrics.f1_score, average = 'micro')}

cross_validate_result = cross_validate(decision_tree_classifier, X, y,cv=10, scoring=scoring, return_train_score=True)
        


The number of fold is  10
Here is a list of accuries for each fold


In [8]:
cross_validate_result.keys()

dict_keys(['fit_time', 'score_time', 'test_accuracy', 'train_accuracy', 'test_precision', 'train_precision', 'test_recall', 'train_recall', 'test_f1_macro', 'train_f1_macro'])