In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import os
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from emnist_prediction.metrics import min_f1_score
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler

# Directory holding the input .npy arrays and the sample submission.
DATA_DIR = Path('../data/input_data')

# Directory where the artefacts of this experiment (pickled models, CSVs) are stored.
# Created on first run; a no-op if it already exists.
RANDOM_FOREST_DIR = Path('../data/random_forest_experiment')
RANDOM_FOREST_DIR.mkdir(parents=True, exist_ok=True)

# List every file found under DATA_DIR (os is already imported at the top of the cell).
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

..\data\input_data\sample_submission.csv
..\data\input_data\X_test.npy
..\data\input_data\X_train.npy
..\data\input_data\X_val.npy
..\data\input_data\y_train.npy
..\data\input_data\y_val.npy
..\data\input_data\subdata\X_subtest.npy
..\data\input_data\subdata\X_subval.npy
..\data\input_data\subdata\y_subtest.npy
..\data\input_data\subdata\y_subval.npy


In [3]:
# Training and validation arrays, read from DATA_DIR.
X_train, y_train = np.load(DATA_DIR / 'X_train.npy'), np.load(DATA_DIR / 'y_train.npy')
X_val, y_val = np.load(DATA_DIR / 'X_val.npy'), np.load(DATA_DIR / 'y_val.npy')

# Small development subset: the first 200 training samples and their label rows,
# used for quick smoke tests before launching long-running fits.
DEV_ROWS = slice(None, 200)
X_devs = X_train[DEV_ROWS, :]
y_devs = y_train[DEV_ROWS, :]

### Baseline Random Forest
Train a random forest on the data as it is.

Note: when the labels are passed in their one-hot encoded form, scikit-learn treats the task as a multi-output problem, i.e. every class column is predicted as a separate output.

In [4]:
# NOTE: an earlier experiment fit a forest directly on the one-hot labels
# (RandomForestClassifier(random_state=42) fitted on the flattened X_train and y_train,
# pickled as 'random_forest_clf_1hot'). That model is not used in the cells shown here.

BASELINE_RFC_PATH = RANDOM_FOREST_DIR / 'baseline_random_forest_clf'

if BASELINE_RFC_PATH.exists():
    # SECURITY: pickle.load can execute arbitrary code; only load files produced by
    # this project. Unpickling with a different scikit-learn version than the one
    # used for saving is also unsupported (see the scikit-learn persistence docs).
    with open(BASELINE_RFC_PATH, 'rb') as f:
        baseline_rfc = pickle.load(f)
else:
    # Fallback so that "Restart & Run All" works without the cached artefact.
    # NOTE(review): the original training recipe of the baseline is not recorded in
    # this notebook. This assumes default hyper-parameters with random_state=42 and
    # integer class labels (the baseline predictions are compared against
    # y_val.argmax(axis=-1) below) -- confirm before relying on a retrained model.
    baseline_rfc = RandomForestClassifier(random_state=42)
    baseline_rfc.fit(X_train.reshape(len(X_train), -1), y_train.argmax(axis=-1))
    with open(BASELINE_RFC_PATH, 'wb') as f:
        pickle.dump(baseline_rfc, f)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [7]:
from emnist_prediction.metrics import get_classification_report

# Validation labels as class indices (argmax over the one-hot axis) and the baseline
# predictions on the validation images flattened to one row per sample.
y_val_labels = y_val.argmax(axis=-1)
y_val_pred = baseline_rfc.predict(X_val.reshape(len(X_val), -1))

# Per-class report, displayed with the weakest classes (lowest F1) first.
baseline_clf_report = get_classification_report(y_val_labels, y_val_pred)
baseline_clf_report.sort_values(by=['f1_score'])

Unnamed: 0,precision,recall,f1_score,support
D,0.876179,0.814693,0.844318,912
H,0.947566,0.803175,0.869416,630
Q,0.934641,0.823417,0.87551,521
J,0.902439,0.885638,0.89396,752
G,0.949115,0.852883,0.898429,503
K,0.928726,0.870445,0.898642,494
V,0.920582,0.88781,0.903899,927
R,0.938606,0.88867,0.912955,1015
A,0.885082,0.967994,0.924683,1281
N,0.909571,0.940498,0.924776,1647


In [6]:
from emnist_prediction.random_forest_utils import get_tree_depths, get_leaves_count

# Size statistics of the baseline forest.
# NOTE(review): presumably one depth / leaf count per tree -- confirm in random_forest_utils.
tree_depths = get_tree_depths(baseline_rfc)
leaves_count = get_leaves_count(baseline_rfc)

print(f"mean tree depth = {np.mean(tree_depths)}")
print(f"std tree depth = {np.std(tree_depths)}")
print(f"mean leaves count = {np.mean(leaves_count)}")
print(f"std leaves count = {np.std(leaves_count)}")

mean tree depth = 54.24
std tree depth = 5.779480945552118
mean leaves count = 15544.74
std leaves count = 179.97186557903987


In [16]:
# Load the test array (X_test.npy) from the input data directory.
X_test = np.load(DATA_DIR / 'X_test.npy')

In [17]:
# Predict with the baseline forest; each test sample is flattened to a single row first.
test_predictions = baseline_rfc.predict(X_test.reshape(len(X_test), -1))

In [18]:
# Wrap the baseline test predictions in a DataFrame.
# NOTE(review): test_pred_df is not referenced again in the cells shown here.
test_pred_df = pd.DataFrame(test_predictions)

In [19]:
# Display the sample submission to see the expected file layout (an index and a class column).
pd.read_csv(DATA_DIR / 'sample_submission.csv')

Unnamed: 0,index,class
0,0,4
1,1,7
2,2,1
3,3,24
4,4,0
...,...,...
31341,31341,25
31342,31342,4
31343,31343,8
31344,31344,19


In [21]:
from emnist_prediction.random_forest import CustomizedRandomForest

In [24]:
# Smoke test on the small dev subset, just to check that CustomizedRandomForest runs
# end to end. The model is evaluated on the same samples it was fitted on, so the
# scores below say nothing about generalisation.
y_devs_labels = y_devs.argmax(axis=-1)  # one-hot rows -> class indices

devs_rfc = CustomizedRandomForest(
    dimensionality=100,
    class_weight='balanced',
    max_features=100,
)
devs_rfc.fit(X_devs, y_devs_labels)
y_pred_devs = devs_rfc.predict(X_devs)

print(classification_report(y_devs_labels, y_pred_devs))

print(f'Score = {devs_rfc.score(X_devs, y_devs_labels)}')

# Same predictions, as the project's tabular per-class report.
clf_unb_report = get_classification_report(y_devs_labels, y_pred_devs)
clf_unb_report

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00        11
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         8
           5       1.00      1.00      1.00        15
           6       1.00      1.00      1.00         2
           7       1.00      1.00      1.00         9
           8       1.00      1.00      1.00        10
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         8
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         6
          13       1.00      1.00      1.00         9
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00         7
          16       1.00      1.00      1.00         1
          17       1.00    

Unnamed: 0,precision,recall,f1_score,support
A,1.0,1.0,1.0,6
B,1.0,1.0,1.0,2
C,1.0,1.0,1.0,11
D,1.0,1.0,1.0,2
E,1.0,1.0,1.0,8
F,1.0,1.0,1.0,15
G,1.0,1.0,1.0,2
H,1.0,1.0,1.0,9
I,1.0,1.0,1.0,10
J,1.0,1.0,1.0,1


### Grid Search

In [25]:
# Three independent sub-grids. GridSearchCV explores each dict separately (a union,
# not a cross product), i.e. 2*3*2 + 3 + 3 = 18 candidates:
#   1. dimensionality / tree depth / features per split,
#   2. resampling strategy,
#   3. class weighting.
# Every resampler is seeded so that repeated runs evaluate the same candidates.
param_grid = [
    {'dimensionality': [400, 100], 'max_depth': [None, 20, 30], 'max_features': ['sqrt', 0.3]},
    {'resampler': [RandomUnderSampler(random_state=42), RandomOverSampler(random_state=42), SMOTE(random_state=42)]},
    {'class_weight': [None, 'balanced', 'balanced_subsample']}
]

In [18]:
from sklearn.model_selection import PredefinedSplit

# PredefinedSplit convention: -1 keeps a sample out of every test fold (training only),
# a non-negative value assigns it to the test fold with that number.
train_indices = np.full(len(X_train), -1, dtype=int)
val_indices = np.zeros(len(X_val), dtype=int)

# Fold assignment for the concatenated (train, val) data, in that order.
indices = np.concatenate((train_indices, val_indices))

ps = PredefinedSplit(indices)

In [19]:
# Grid search over param_grid using the single predefined train/validation split.
grid_search = GridSearchCV(
    estimator=CustomizedRandomForest(),
    param_grid=param_grid,
    cv=ps,
    refit=False,              # no final refit on the full data; only cv_results_ is needed
    return_train_score=True,  # also record the scores on the training part
    verbose=3,
)

In [20]:
# The fit is left commented out because it is slow: each candidate took roughly
# 13-14 minutes in the log recorded below. The fitted search object is loaded from
# disk further down. Uncomment to re-run the search on the concatenated train + val data.
# grid_search.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))

Fitting 1 folds for each of 18 candidates, totalling 18 fits
[CV 1/1] END dimensionality=400, max_depth=None, max_features=sqrt; avg_f1: (train=1.000, test=0.844) avg_weighted_f1: (train=1.000, test=0.894) min_f1: (train=1.000, test=0.604) total_f1: (train=1.000, test=0.900) total time=13.2min
[CV 1/1] END dimensionality=400, max_depth=None, max_features=0.3; avg_f1: (train=1.000, test=0.845) avg_weighted_f1: (train=1.000, test=0.895) min_f1: (train=1.000, test=0.615) total_f1: (train=1.000, test=0.901) total time=14.0min
[CV 1/1] END dimensionality=400, max_depth=20, max_features=sqrt; avg_f1: (train=1.000, test=0.843) avg_weighted_f1: (train=1.000, test=0.893) min_f1: (train=1.000, test=0.610) total_f1: (train=1.000, test=0.899) total time=13.8min
[CV 1/1] END dimensionality=400, max_depth=20, max_features=0.3; avg_f1: (train=1.000, test=0.844) avg_weighted_f1: (train=1.000, test=0.894) min_f1: (train=1.000, test=0.599) total_f1: (train=1.000, test=0.900) total time=14.5min
[CV 1/1] 

In [26]:
GRID_SEARCH_PATH = RANDOM_FOREST_DIR / 'grid_search_random_forest'

if GRID_SEARCH_PATH.exists():
    # Reuse the cached search instead of repeating the long-running fit.
    # SECURITY: pickle.load can execute arbitrary code; only load files produced by
    # this project.
    with open(GRID_SEARCH_PATH, 'rb') as f:
        grid_search = pickle.load(f)
else:
    # No cached result yet: run the search on the concatenated train + val data
    # (the predefined split separates them again) and cache the fitted object.
    grid_search.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    with open(GRID_SEARCH_PATH, 'wb') as f:
        pickle.dump(grid_search, f)

In [27]:
# Tabulate the grid-search results (cv_results_) as a DataFrame for inspection.
cv_results = pd.DataFrame(grid_search.cv_results_)

In [28]:
# List the columns available in the grid-search results table.
cv_results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_dimensionality', 'param_max_depth', 'param_max_features',
       'param_resampler', 'param_class_weight', 'params',
       'split0_test_avg_weighted_f1', 'mean_test_avg_weighted_f1',
       'std_test_avg_weighted_f1', 'rank_test_avg_weighted_f1',
       'split0_train_avg_weighted_f1', 'mean_train_avg_weighted_f1',
       'std_train_avg_weighted_f1', 'split0_test_min_f1', 'mean_test_min_f1',
       'std_test_min_f1', 'rank_test_min_f1', 'split0_train_min_f1',
       'mean_train_min_f1', 'std_train_min_f1', 'split0_test_total_f1',
       'mean_test_total_f1', 'std_test_total_f1', 'rank_test_total_f1',
       'split0_train_total_f1', 'mean_train_total_f1', 'std_train_total_f1',
       'split0_test_avg_f1', 'mean_test_avg_f1', 'std_test_avg_f1',
       'rank_test_avg_f1', 'split0_train_avg_f1', 'mean_train_avg_f1',
       'std_train_avg_f1'],
      dtype='object')

In [29]:
# Columns of cv_results that hold the hyper-parameter values of each candidate.
param_cols = [
    'param_dimensionality',
    'param_max_depth',
    'param_max_features',
    'param_resampler',
    'param_class_weight',
]

In [30]:
# Show which hyper-parameter values each candidate used (NaN = not part of that sub-grid).
cv_results[param_cols]

Unnamed: 0,param_dimensionality,param_max_depth,param_max_features,param_resampler,param_class_weight
0,400.0,,sqrt,,
1,400.0,,0.3,,
2,400.0,20.0,sqrt,,
3,400.0,20.0,0.3,,
4,400.0,30.0,sqrt,,
5,400.0,30.0,0.3,,
6,100.0,,sqrt,,
7,100.0,,0.3,,
8,100.0,20.0,sqrt,,
9,100.0,20.0,0.3,,


In [31]:
# Mean test/train score columns of cv_results, one test/train pair per metric.
metric_cols = [
    f'mean_{split}_{metric}'
    for metric in ('avg_weighted_f1', 'min_f1', 'total_f1', 'avg_f1')
    for split in ('test', 'train')
]

In [32]:
# Show the mean test and train scores of each candidate.
cv_results[metric_cols]

Unnamed: 0,mean_test_avg_weighted_f1,mean_train_avg_weighted_f1,mean_test_min_f1,mean_train_min_f1,mean_test_total_f1,mean_train_total_f1,mean_test_avg_f1,mean_train_avg_f1
0,0.894106,0.999987,0.604278,0.99952,0.900206,0.999987,0.844148,0.999969
1,0.89468,0.999993,0.61457,0.999867,0.900728,0.999993,0.844585,0.99999
2,0.893251,0.999987,0.609854,0.999774,0.899319,0.999987,0.842991,0.99998
3,0.894267,0.999993,0.598658,0.999868,0.900441,0.999993,0.843621,0.99999
4,0.893848,0.999974,0.611921,0.999726,0.89983,0.999974,0.844023,0.999964
5,0.894389,0.999993,0.594086,0.999752,0.900617,0.999993,0.843322,0.999989
6,0.894201,0.99999,0.594886,0.999736,0.900367,0.99999,0.843405,0.999987
7,0.89359,0.99999,0.60719,0.99976,0.899776,0.99999,0.842595,0.999986
8,0.893062,0.999987,0.598658,0.999752,0.899177,0.999987,0.84241,0.999981
9,0.893996,0.999983,0.60241,0.99952,0.900137,0.999983,0.843333,0.999964


In [33]:
# Keep only the hyper-parameter and mean-score columns, parameters first.
cv_results_subset = cv_results.loc[:, [*param_cols, *metric_cols]]

#### Conclusion
Comparing the table below with the baseline random forest, none of the tested changes (dimensionality reduction, depth and feature limits, resampling, class weighting) gives an improvement, so the baseline remains the best model.

In [34]:
# Display the combined table of hyper-parameters and mean scores.
cv_results_subset

Unnamed: 0,param_dimensionality,param_max_depth,param_max_features,param_resampler,param_class_weight,mean_test_avg_weighted_f1,mean_train_avg_weighted_f1,mean_test_min_f1,mean_train_min_f1,mean_test_total_f1,mean_train_total_f1,mean_test_avg_f1,mean_train_avg_f1
0,400.0,,sqrt,,,0.894106,0.999987,0.604278,0.99952,0.900206,0.999987,0.844148,0.999969
1,400.0,,0.3,,,0.89468,0.999993,0.61457,0.999867,0.900728,0.999993,0.844585,0.99999
2,400.0,20.0,sqrt,,,0.893251,0.999987,0.609854,0.999774,0.899319,0.999987,0.842991,0.99998
3,400.0,20.0,0.3,,,0.894267,0.999993,0.598658,0.999868,0.900441,0.999993,0.843621,0.99999
4,400.0,30.0,sqrt,,,0.893848,0.999974,0.611921,0.999726,0.89983,0.999974,0.844023,0.999964
5,400.0,30.0,0.3,,,0.894389,0.999993,0.594086,0.999752,0.900617,0.999993,0.843322,0.999989
6,100.0,,sqrt,,,0.894201,0.99999,0.594886,0.999736,0.900367,0.99999,0.843405,0.999987
7,100.0,,0.3,,,0.89359,0.99999,0.60719,0.99976,0.899776,0.99999,0.842595,0.999986
8,100.0,20.0,sqrt,,,0.893062,0.999987,0.598658,0.999752,0.899177,0.999987,0.84241,0.999981
9,100.0,20.0,0.3,,,0.893996,0.999983,0.60241,0.99952,0.900137,0.999983,0.843333,0.999964


In [9]:
# Load the test array and predict with the baseline forest,
# flattening each test sample to a single row first.
X_test = np.load(DATA_DIR / 'X_test.npy')
y_pred = baseline_rfc.predict(X_test.reshape((len(X_test), -1)))

In [13]:
# Build the submission table with its final column names ('index', 'class'), matching
# the sample submission, so the in-memory frame and the written CSV agree.
predictions_df = pd.DataFrame({'index': np.arange(len(y_pred)), 'class': y_pred})
predictions_df.to_csv(RANDOM_FOREST_DIR / 'rfc_test.csv', index=False)

In [12]:
# Sanity check: (rows, columns) of the submission table.
predictions_df.shape

(31346, 2)