In [1]:
import numpy as np
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
mnist_dataset = datasets.fetch_openml('mnist_784')

random_state = 0


X = mnist_dataset.data
y = mnist_dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify = y, test_size = 0.1)

In [4]:
X = X_testt
y = y_test

In [3]:
# Reduce dimension to 2 with PCA
pca = make_pipeline(StandardScaler(),
                    PCA(0.95))

# Use a nearest neighbor classifier to evaluate the methods
decision_tree_classifier = DecisionTreeClassifier(max_depth=10, min_samples_split=5)

dim_reduction_methods = [('PCA', pca)]
        

dataset_columns = np.array(['zero','one','two','three','four','five','six','seven','eight','nine'])


In [4]:
for i, (name, model) in enumerate(dim_reduction_methods):
    # Fit the method's model
    model.fit(X_train, y_train)

    # Fit a decision tree classifier on the training set
    decision_tree_classifier.fit(model.transform(X_train), y_train)

    accuracy_on_tranining = accuracy_score(decision_tree_classifier.predict(model.fit_transform(X_train)), y_train)
    
    
    accuracy_on_testing = accuracy_score(decision_tree_classifier.predict(model.transform(X_test)), y_test)
    
    print('Training accuracy ', accuracy_on_tranining)
    print('Testing accuracy ',  accuracy_on_testing)
    

Training accuracy  0.8269183673469388
Testing accuracy  0.7865714285714286


In [9]:
print(classification_report(y_test, decision_tree_classifier.predict(model.transform(X_test)), target_names = dataset_columns))
print ('Confussion matrix:\n', confusion_matrix(y_test, decision_tree_classifier.predict(model.transform(X_test))))          

              precision    recall  f1-score   support

        zero       0.91      0.84      0.88      2071
         one       0.97      0.91      0.94      2363
         two       0.75      0.82      0.78      2097
       three       0.80      0.74      0.77      2142
        four       0.71      0.78      0.74      2047
        five       0.70      0.70      0.70      1894
         six       0.90      0.85      0.87      2063
       seven       0.87      0.79      0.83      2188
       eight       0.63      0.70      0.66      2048
        nine       0.70      0.76      0.73      2087

    accuracy                           0.79     21000
   macro avg       0.79      0.79      0.79     21000
weighted avg       0.80      0.79      0.79     21000

Confussion matrix:
 [[1746    0   34   18   26  128   29    6   54   30]
 [   1 2144   39   22    8   11   15   14  100    9]
 [  25    5 1719   62   62   21   53   24  102   24]
 [  13    3   96 1588   22  109   21   33  221   36]
 [  15   

In [10]:
print(classification_report(y_test, decision_tree_classifier.predict(model.transform(X_test)), target_names = dataset_columns))
print ('Confussion matrix:\n', confusion_matrix(y_train, decision_tree_classifier.predict(model.fit_transform(X_train))))          

              precision    recall  f1-score   support

        zero       0.91      0.84      0.88      2071
         one       0.97      0.91      0.94      2363
         two       0.75      0.82      0.78      2097
       three       0.80      0.74      0.77      2142
        four       0.71      0.78      0.74      2047
        five       0.70      0.70      0.70      1894
         six       0.90      0.85      0.87      2063
       seven       0.87      0.79      0.83      2188
       eight       0.63      0.70      0.66      2048
        nine       0.70      0.76      0.73      2087

    accuracy                           0.79     21000
   macro avg       0.79      0.79      0.79     21000
weighted avg       0.80      0.79      0.79     21000

Confussion matrix:
 [[4243    3   56   16   48  231   39   18  134   44]
 [   2 5095  107   56   12   14   29    6  191    2]
 [  33    4 4237  138   86   43   33   29  241   49]
 [  29    6  223 3837   61  278   29   28  426   82]
 [  17   

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
        ("kpca", KernelPCA(n_components=2)),
        ("log_reg", DecisionTreeClassifier())
    ])

param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf", "sigmoid"]
    }]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(model.transform(X_test), y_test)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('kpca',
                                        KernelPCA(alpha=1.0, coef0=1,
                                                  copy_X=True, degree=3,
                                                  eigen_solver='auto',
                                                  fit_inverse_transform=False,
                                                  gamma=None, kernel='linear',
                                                  kernel_params=None,
                                                  max_iter=None, n_components=2,
                                                  n_jobs=None,
                                                  random_state=None,
                                                  remove_zero_eig=False,
                                                  tol=0)),
                                       ('log_reg',
                                 

In [7]:
print(grid_search.best_params_)

{'kpca__gamma': 0.04555555555555556, 'kpca__kernel': 'sigmoid'}


In [8]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=2)
kf.get_n_splits(X)


KFold(n_splits=2, random_state=None, shuffle=False)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


In [9]:
print('X_train shape', X_train.shape)
print('X_test shape', X_test.shape)

X_train shape (35000, 784)
X_test shape (35000, 784)


In [10]:
print('y_train shape', y_train.shape)
print('y_test shape', y_test.shape)

y_train shape (35000,)
y_test shape (35000,)


In [15]:
from sklearn.model_selection import cross_validate
decision_tree_classifier = DecisionTreeClassifier(criterion = 'gini', max_depth=8, min_samples_split=4)
print('The number of fold is ', 10)
print('Here is a list of accuries for each fold')
scoring = {'acc': 'accuracy',
           'precision': 'precision_macro',
           'recall': 'recall_macro',
            'f1_score':'f1'}

cross_validate_result = cross_validate(decision_tree_classifier, X, y,cv=10, scoring=scoring, return_train_score=True)
        


The number of fold is  10
Here is a list of accuries for each fold


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [None]:
cross_validate_result.keys()

In [None]:
print('fit-time', cross_validate_result['fit_time'])
print('accuracy', cross_validate_result['accuracy'])
print('f1', cross_validate_result['f1'])
print('precision', cross_validate_result['precision'])
print('recall', cross_validate_result['recall'])