In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# Data Reading

In [2]:
# Read the 'features' and 'label' arrays of the training and validation
# archives. np.load on an .npz returns a lazy NpzFile that keeps a file handle
# open, so each archive is opened in a `with` block and closed as soon as its
# arrays have been materialised.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code. Only load trusted files, and drop the flag if the
# archives turn out to hold plain numeric arrays.
with np.load("datasets/train/train_feature.npz", allow_pickle=True) as train_feat:
    train_feat_X = train_feat['features']
    train_feat_Y = train_feat['label']
with np.load("datasets/valid/valid_feature.npz", allow_pickle=True) as val_feat:
    val_feat_X = val_feat['features']
    val_feat_Y = val_feat['label']

In [3]:
# Turn every sample's feature array into a 1-D vector and stack the vectors
# into one array per split (the classifier is fitted on these below).
# The right-hand tuple is evaluated before either name is rebound.
train_feat_X, val_feat_X = (
    np.array([sample.flatten() for sample in split])
    for split in (train_feat_X, val_feat_X)
)

# Randomized search for the best hyperparameters

In [4]:
# Baseline random forest. It is only constructed here (not fitted) and serves
# as the estimator handed to the hyperparameter search.
clf = RandomForestClassifier(
    n_estimators=150,
    max_depth=8,
    random_state=2,
)

In [7]:
# Candidate values for each random-forest hyperparameter; every search
# iteration evaluates one combination drawn from these.
param_dist = dict(
    n_estimators=np.arange(50, 300, 50),
    max_depth=np.arange(4, 16, 2),
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
)

# Search configuration: 20 sampled parameter settings, each scored with
# 5-fold cross-validation, seeded for repeatable sampling, and run on all
# available cores (n_jobs=-1).
search_settings = {
    'estimator': clf,
    'param_distributions': param_dist,
    'n_iter': 20,
    'cv': 5,
    'random_state': 2,
    'n_jobs': -1,
}
random_search = RandomizedSearchCV(**search_settings)

# Run the search on the flattened training features and report the winner.
# A previous run reported:
#   {'n_estimators': 250, 'min_samples_split': 2, 'min_samples_leaf': 1,
#    'max_features': 'sqrt', 'max_depth': 10}
random_search.fit(train_feat_X, train_feat_Y)
print("Best parameters found: ", random_search.best_params_)

Best parameters found:  {'n_estimators': 250, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}


In [8]:
# Rebuild a forest from the hyperparameters selected by the random search and
# fit it on the full training set.
# best_params_ only contains the searched keys, so random_state must be passed
# explicitly; without it every run grows different trees and the validation
# accuracy reported below is not reproducible. The seed matches the one used
# for the other forests in this notebook.
clf = RandomForestClassifier(**random_search.best_params_, random_state=2)
clf.fit(train_feat_X, train_feat_Y)

In [9]:
# Score the tuned forest on the held-out validation split.
y = clf.predict(X=val_feat_X)
print("Accuracy: ", accuracy_score(y_true=val_feat_Y, y_pred=y))

Accuracy:  0.9856850715746421


# Performance Analysis and model training

In [5]:
# Build training subsets of increasing size (20 %, 40 %, 60 %, 80 % of the
# training data). The subset size is expressed through the held-out fraction,
# e.g. holding out 0.8 leaves 20 % for training. All splits share seed 42.
def _split_holding_out(held_out_fraction):
    """Split the training arrays, holding out `held_out_fraction` of the rows."""
    return train_test_split(
        train_feat_X,
        train_feat_Y,
        test_size=held_out_fraction,
        random_state=42,
    )

X_train_20, X_test_20, y_train_20, y_test_20 = _split_holding_out(0.8)
X_train_40, X_test_40, y_train_40, y_test_40 = _split_holding_out(0.6)
X_train_60, X_test_60, y_train_60, y_test_60 = _split_holding_out(0.4)
X_train_80, X_test_80, y_train_80, y_test_80 = _split_holding_out(0.2)

In [6]:
# One shared configuration for all five forests; the values are hard-coded
# here rather than read from the search object.
RF_PARAMS = dict(
    n_estimators=250,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=10,
    random_state=2,
)

# Five separate, identically configured estimators: one per training-set size.
clf1, clf2, clf3, clf4, clf5 = (
    RandomForestClassifier(**RF_PARAMS) for _ in range(5)
)

In [7]:
# Fit clf1..clf4 on the 20/40/60/80 % subsets, in that order.
partial_fits = (
    (clf1, X_train_20, y_train_20),
    (clf2, X_train_40, y_train_40),
    (clf3, X_train_60, y_train_60),
    (clf4, X_train_80, y_train_80),
)
for model, X_subset, y_subset in partial_fits:
    model.fit(X_subset, y_subset)

# clf5 sees the complete training set; kept as the cell's last expression.
clf5.fit(train_feat_X, train_feat_Y)

In [8]:
# Validation-set predictions of each fitted forest, in model order.
y1, y2, y3, y4, y5 = (
    model.predict(val_feat_X)
    for model in (clf1, clf2, clf3, clf4, clf5)
)


In [9]:
# Report validation accuracy per training-set size, smallest subset first.
predictions_by_label = {
    "Accuracy 20%: ": y1,
    "Accuracy 40%: ": y2,
    "Accuracy 60%: ": y3,
    "Accuracy 80%: ": y4,
    "Accuracy 100%: ": y5,
}
for label, predicted in predictions_by_label.items():
    print(label, accuracy_score(val_feat_Y, predicted))

Accuracy 20%:  0.9591002044989775
Accuracy 40%:  0.9693251533742331
Accuracy 60%:  0.9795501022494888
Accuracy 80%:  0.9856850715746421
Accuracy 100%:  0.9877300613496932


# Prediction

In [13]:
# Open the test archive. At this point test_feat_X is the object returned by
# np.load, not a feature array; the 'features' entry is read from it in the
# next cell, so it has to stay open here.
# NOTE(review): allow_pickle=True can execute arbitrary code while unpickling;
# only load trusted files.
test_archive_path = "datasets/test/test_feature.npz"
test_feat_X = np.load(test_archive_path, allow_pickle=True)

In [14]:
# Pull the 'features' entry out of the loaded test archive and flatten every
# sample into a 1-D vector.
# NOTE(review): this rebinds test_feat_X from the archive to an array, so the
# cell cannot be re-run without re-running the loading cell first.
test_samples = test_feat_X['features']
test_feat_X = np.array([sample.flatten() for sample in test_samples])

In [15]:
# Label the test features with clf5, the forest fitted on the full training set.
y = clf5.predict(X=test_feat_X)

In [16]:
# Write the test predictions to a text file for submission, formatted as
# integers ('%d').
np.savetxt(fname="pred_deepfeat.txt", X=y, fmt='%d')