In [1]:
import h5py
import warnings

import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 1. Load Dataset

In [2]:
# Read the four HDF5 files (train/test images and labels) into memory.
# np.copy materialises each dataset as an in-memory array, so the data stays
# usable after the `with` block has closed the underlying file.
with h5py.File('input/data/train/images_training.h5','r') as H:
    data_train = np.copy(H['datatrain'])

with h5py.File('input/data/train/labels_training.h5','r') as H:
    label_train = np.copy(H['labeltrain'])

with h5py.File('input/data/test/images_testing.h5','r') as H:
    data_test = np.copy(H['datatest'])

# NOTE: per the file name and the shapes displayed in the following cells,
# this file holds labels for only part of the test images.
with h5py.File('input/data/test/labels_testing_2000.h5','r') as H:
    label_test = np.copy(H['labeltest'])

In [3]:
# Sanity check: dimensions of the training image matrix.
data_train.shape

(30000, 784)

In [4]:
# Sanity check: the number of training labels should match the number of training images.
label_train.shape

(30000,)

In [5]:
# Sanity check: dimensions of the test image matrix.
data_test.shape

(10000, 784)

In [6]:
# Sanity check: number of available test labels (fewer than the number of test images).
label_test.shape

(2000,)

## 2. Data Preprocessing

### 2.1 Training / validation split

In [7]:
def train_validate_split(X, y, valid_ratio, seed=None):
    """Randomly split samples and labels into training and validation subsets.

    Parameters
    ----------
    X : np.ndarray
        Samples, indexed along the first axis.
    y : np.ndarray
        Labels aligned with ``X``; must have the same length as ``X``.
    valid_ratio : float
        Fraction of the samples, in ``[0, 1]``, assigned to the validation set.
    seed : int or None, optional
        When given, the shuffle is drawn from a dedicated generator seeded with
        this value, making the split reproducible. When ``None`` (default) the
        global NumPy random state is used, so ``np.random.seed`` still applies.

    Returns
    -------
    tuple
        ``(X_train, X_valid, y_train, y_valid)``.

    Raises
    ------
    ValueError
        If ``valid_ratio`` is outside ``[0, 1]`` or ``X`` and ``y`` differ in length.
    """
    if (valid_ratio > 1) or (valid_ratio < 0):
        raise ValueError("The ratio of validation dataset must be between 0 and 1.")

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples.")

    # Shuffle the indices of the data actually passed in. The size must come
    # from X, not from a notebook-level global, or the function only works for
    # one specific dataset.
    if seed is None:
        shuffled_idx = np.random.permutation(n_samples)
    else:
        shuffled_idx = np.random.default_rng(seed).permutation(n_samples)

    # The first (1 - valid_ratio) share of the shuffled indices is the training set.
    train_len = int(n_samples * (1 - valid_ratio))

    train_idx = shuffled_idx[:train_len]
    valid_idx = shuffled_idx[train_len:]

    return X[train_idx], X[valid_idx], y[train_idx], y[valid_idx]

In [8]:
# train_validate_split shuffles with NumPy's global random state, so seed it
# here; otherwise the split (and every downstream score) changes on each run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
X_train, X_valid, y_train, y_valid = train_validate_split(data_train, label_train, 0.2)

# Labels are available only for the leading part of the test images, so keep
# exactly as many test samples as there are labels instead of hard-coding the count.
X_test = data_test[:len(label_test)]
y_test = label_test

print("X_train.shape", X_train.shape)
print("y_train.shape", y_train.shape)

print("X_valid.shape", X_valid.shape)
print("y_valid.shape", y_valid.shape)

X_train.shape (24000, 784)
y_train.shape (24000,)
X_valid.shape (6000, 784)
y_valid.shape (6000,)


### 2.2 PCA

In [9]:
# PCA is imported from a module named `PCA` that is not shown in this notebook
# (presumably a project-local implementation; verify).
from PCA import PCA

# Keep 100 components. The fit uses the training split only; the validation
# and test data are later transformed with this same fitted object.
pca = PCA(n_components=100)
pca.fit(X_train)

PCA(n_components = 100)

In [10]:
# Apply the fitted PCA transform to the training split and display the resulting shape.
X_train_reduction = pca.transform(X_train)
X_train_reduction.shape

(24000, 100)

In [11]:
# Apply the same fitted PCA transform (fitted on X_train) to the validation split.
X_valid_reduction = pca.transform(X_valid)
X_valid_reduction.shape

(6000, 100)

In [12]:
# Apply the same fitted PCA transform (fitted on X_train) to the labelled test samples.
X_test_reduction = pca.transform(X_test)
X_test_reduction.shape

(2000, 100)

In [13]:
# Map the reduced training data back to the original feature dimension so the
# reconstruction can be compared against X_train in the next cells.
X_train_reduction_inv = pca.inverse_transform(X_train_reduction)
X_train_reduction_inv.shape

(24000, 784)

In [14]:
def calc_similarity(X, X_inv):
    """Return how closely a reconstruction ``X_inv`` matches the original ``X``.

    The score is ``1 - ||X - X_inv||^2 / ||X||^2`` (sums taken over all
    elements): 1.0 for a perfect reconstruction, 0.0 when ``X_inv`` is all zeros.

    Parameters
    ----------
    X : array-like
        Original data.
    X_inv : array-like
        Reconstructed data, broadcast-compatible with ``X``.

    Returns
    -------
    numpy.float64
        The similarity score. If ``X`` has zero total energy the division is
        undefined and NumPy yields ``nan``/``inf`` (with a runtime warning).
    """
    # Promote to float64 before squaring: with integer inputs (e.g. uint8 pixel
    # data) ``X ** 2`` would be evaluated in the integer dtype and wrap around,
    # silently corrupting the denominator.
    X = np.asarray(X, dtype=np.float64)
    X_inv = np.asarray(X_inv, dtype=np.float64)

    residual = X - X_inv
    return 1 - np.sum(residual ** 2) / np.sum(X ** 2)

In [15]:
# Reconstruction quality of the PCA round trip on the training split (1.0 would be a perfect reconstruction).
calc_similarity(X_train, X_train_reduction_inv)

0.9626695994419884

## 3. Model

In this section, we use a `DecisionTreeClassifier` to perform the classification task.

In [16]:
%%time
# DecisionTreeClassifier is imported from a module of the same name that is not
# shown in this notebook (presumably a project-local implementation; verify).
from DecisionTreeClassifier import DecisionTreeClassifier

# Train on the PCA-reduced training split with the tree depth capped at 25.
# NOTE: the recorded run of this cell took about 16 minutes of wall time.
dt_clf = DecisionTreeClassifier(max_depth=25)
dt_clf.fit(X_train_reduction, y_train)

CPU times: user 16min 17s, sys: 1.73 s, total: 16min 19s
Wall time: 16min 22s


DecisionTreeClassifier()

In [17]:
%%time
# Predict labels for the PCA-reduced validation and test sets with the fitted tree.
y_pred_valid = dt_clf.predict(X_valid_reduction)
y_pred_test = dt_clf.predict(X_test_reduction)

CPU times: user 96.4 ms, sys: 4.28 ms, total: 101 ms
Wall time: 100 ms


## 4. Metrics

Since the labels are evenly distributed across classes (balanced), accuracy is used here to evaluate performance.

### accuracy_score

In [18]:
# accuracy_score comes from a module named `metrics` that is not shown in this
# notebook (presumably project-local; verify).
from metrics import accuracy_score

# Score the validation predictions against the held-out validation labels.
accuracy_score_valid = accuracy_score(y_valid, y_pred_valid)
accuracy_score_valid

0.7438333333333333

In [19]:
# Score the test predictions against the available test labels.
accuracy_score_test = accuracy_score(y_test, y_pred_test)
accuracy_score_test

0.7425