In [1]:
import os
import pathlib
import joblib
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
import sklearn.model_selection as skm

import matplotlib.pyplot as plt

In [2]:
# Load data: read train.csv from ~/stat-5610-project/data, then separate the
# "Y" column (target) from every other column (features).
data = pd.read_csv(pathlib.Path.home() / "stat-5610-project" / "data" / "train.csv")
x_data = data.loc[:, data.columns.drop("Y")].to_numpy(copy=True)
y_data = data.loc[:, "Y"].values

# Train/test split: split the row positions with a fixed seed, then use the
# resulting position lists to index the feature matrix and the target vector.
idx = list(range(y_data.shape[0]))
train_idx, test_idx = skm.train_test_split(idx, random_state=2)
x_train, x_test = x_data[train_idx], x_data[test_idx]
y_train, y_test = y_data[train_idx], y_data[test_idx]

In [4]:
# Trial: 600 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, no max_depth given.
clf = RandomForestClassifier(
    n_estimators=600,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7020746887966807


In [5]:
# Trial: 600 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=40.
clf = RandomForestClassifier(
    n_estimators=600,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=40,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7020746887966807


In [6]:
# Trial: 600 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=30.
clf = RandomForestClassifier(
    n_estimators=600,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=30,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7020746887966807


In [7]:
# Trial: 300 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=30.
clf = RandomForestClassifier(
    n_estimators=300,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=30,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.700414937759336


In [8]:
# Trial: 800 trees, entropy criterion, sqrt feature subsampling, max_samples=0.5, max_depth=30.
clf = RandomForestClassifier(
    n_estimators=800,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.5,
    random_state=2,
    max_depth=30,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7016666666666667


In [9]:
# Trial: 800 trees, entropy criterion, sqrt feature subsampling, max_samples=0.5, max_depth=50.
clf = RandomForestClassifier(
    n_estimators=800,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.5,
    random_state=2,
    max_depth=50,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7050000000000001


In [10]:
# Trial: 800 trees, entropy criterion, sqrt feature subsampling, max_samples=0.5, max_depth=70.
clf = RandomForestClassifier(
    n_estimators=800,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.5,
    random_state=2,
    max_depth=70,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7050000000000001


In [11]:
# Trial: 800 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=70.
clf = RandomForestClassifier(
    n_estimators=800,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=70,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7004991680532446


In [3]:
# Trial: 200 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=20.
clf = RandomForestClassifier(
    n_estimators=200,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=20,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7057851239669422


In [4]:
# Trial: 400 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=20.
clf = RandomForestClassifier(
    n_estimators=400,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=20,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7019867549668874


In [5]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=20.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=20,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7043189368770764


In [6]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.9, max_depth=20.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.9,
    random_state=2,
    max_depth=20,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7040395713107996


In [7]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.5, max_depth=20.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.5,
    random_state=2,
    max_depth=20,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.699581589958159


In [8]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=15.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=15,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7051070840197694


In [9]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=10.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=10,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7178217821782178


In [10]:
# Trial: 400 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=10.
clf = RandomForestClassifier(
    n_estimators=400,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=10,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7123966942148761


In [11]:
# Trial: 200 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=10.
clf = RandomForestClassifier(
    n_estimators=200,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=10,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7144032921810699


In [12]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.75, max_depth=10.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.75,
    random_state=2,
    max_depth=10,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.71


In [13]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.65, max_depth=10.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.65,
    random_state=2,
    max_depth=10,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7165160230073951


In [14]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.65, max_depth=8.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.65,
    random_state=2,
    max_depth=8,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.6785714285714286


In [15]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.68, max_depth=10.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.68,
    random_state=2,
    max_depth=10,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7135761589403973


In [16]:
# Trial: 100 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=9.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=9,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7162944582299421


In [17]:
# Trial: 110 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=9.
clf = RandomForestClassifier(
    n_estimators=110,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=9,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.714759535655058


In [18]:
# Trial: 90 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=9.
clf = RandomForestClassifier(
    n_estimators=90,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=9,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.715702479338843


In [19]:
# Trial: 90 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=10.
clf = RandomForestClassifier(
    n_estimators=90,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=10,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7178217821782178


In [20]:
# Trial: 80 trees, entropy criterion, sqrt feature subsampling, max_samples=0.7, max_depth=10.
clf = RandomForestClassifier(
    n_estimators=80,
    criterion="entropy",
    max_features="sqrt",
    max_samples=0.7,
    random_state=2,
    max_depth=10,
)
results = clf.fit(x_train, y_train)
# Score the held-out split with the object returned by fit().
preds = results.predict(x_test)
f1 = f1_score(y_true=y_test, y_pred=preds)
print(f"F1 Score: {f1}")

F1 Score: 0.7140495867768595
