In [1]:
import numpy as np
import pandas as pd

In [2]:
train_images = np.load('winter2020-mais-202/train_images.npy')
test_images = np.load('winter2020-mais-202/test_images.npy')

train_labels = np.loadtxt('winter2020-mais-202/train_labels.csv', delimiter=',', skiprows=1)[:, 1]

In [3]:
train_images = train_images.reshape(50000, 28*28)
test_images = test_images.reshape(20000, 28*28)

In [4]:
from sklearn.model_selection import train_test_split #to split the dataset
from sklearn.ensemble import RandomForestClassifier #RFC

In [5]:
#split the larger part of the dataset to two parts: 75 % (= 60 % of the total data) to training set, 25 % (= 20 % of the total)
#to the validation set
X_train, X_validation, y_train, y_validation = train_test_split(train_images, train_labels, test_size=0.25)

We will tune the following hyperparameters:
* n_estimators: number of trees in the forest, we will let this range from 1 to 50
* min_samples_split: minimum number of samples required to split an internal node, we let this range from 2 to 10

In [6]:
def rfc_acc(n_estimators, min_samples_split):
    rfc = RandomForestClassifier(n_estimators = n_estimators, min_samples_split = min_samples_split, class_weight='balanced')
    rfc.fit(X_train, y_train) #fits to training set
    
    #make predictions
    #train_predict = rfc.predict(X_train)
    #validation_predict = rfc.predict(X_validation)
    tr_acc = rfc.score(X_train, y_train) #training data accuracy
    val_acc = rfc.score(X_validation, y_validation) #validation data accuracy
    
    return tr_acc, val_acc

In [7]:
n_vals = np.arange(1,51) #n_estimators
min_sample_split_vals = np.arange(2,11) #min_sample_split

In [8]:
train_accuracies = np.empty((50, 10))
validation_accuracies = np.empty((50, 10))

In [9]:
for i in range(50): #loop through values of n_estimators
    print(i)
    for j in range(10): #loop through values of min_sample_split
        train_accuracies[i,j], validation_accuracies[i,j] = rfc_acc(n_vals[i], min_sample_split_vals[j])
        print(j)

0
0
1
2
3
4
1
0
1
2
3
4
2
0
1
2
3
4
3
0
1
2
3
4
4
0
1
2
3
4
5
0
1
2
3
4
6
0
1
2
3
4
7
0
1
2
3
4
8
0
1
2
3
4
9
0
1
2
3
4
10
0
1
2
3
4
11
0
1
2
3
4
12
0
1
2
3
4
13
0
1
2
3
4
14
0
1
2
3
4
15
0
1
2
3
4
16
0
1
2
3
4
17
0
1
2
3
4
18
0
1
2
3
4


In [10]:
#flatten the arrays
tr_acc_flat = np.ravel(train_accuracies)
val_acc_flat = np.ravel(validation_accuracies)

In [13]:
max_index_rfc = np.argmax(val_acc_flat) #the index corresponding to the greatest accuracy for the validation data

In [14]:
val_acc_flat[max_index_rfc] #the accuracy at this index

0.81728

In [15]:
tr_acc_flat[max_index_rfc] #the training accuracy at this index

0.9994666666666666

In [16]:
print(max_index_rfc)

86


In [17]:
np.unravel_index(max_index_rfc, validation_accuracies.shape)

(17, 1)

The choice of parameters that maximize the score are n_estimators = 33, min_sample_split = 3.

In [18]:
rfc = RandomForestClassifier(n_estimators = 33, min_samples_split = 3, class_weight='balanced')
rfc.fit(X_train, y_train) #fits to training set

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=33,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
y_test = rfc.predict(test_images)

In [20]:
df_test = pd.read_csv('winter2020-mais-202/sample_submission.csv')
df_test['label'] = y_test
df_test.to_csv('winter2020-mais-202/submission.csv', index=False)