In [1]:
from setup_env import setup_environment

# Run the project's environment bootstrap before any other cell.
# NOTE(review): the helper's effects are not visible in this notebook
# (presumably it fixes the working directory / import paths so the relative
# "data/..." and "weights/..." paths below resolve) — confirm in setup_env.
setup_environment()

In [10]:
import pickle

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Relative path of the CSV used for training; it is resolved against the
# kernel's current working directory.
dataset = "data/history/concat/history_3x3-500.csv"
# Load the whole file into memory. Later cells expect a "success" column
# (used as the target) and treat every other column as a feature.
df = pd.read_csv(dataset)

In [8]:
# Single random seed shared by the data splits and both estimators so that
# repeated runs produce the same partitions and models.
SEED = 42

In [9]:
# Target vector: the "success" column. Feature matrix: every other column.
y = df["success"].values
X = df.drop(columns="success").values

# Step 1: carve off 20% of all rows as the validation set, preserving the
# class balance of `y` in both parts.
X_train_test, X_val, y_train_test, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Step 2: split the remaining 80% into train/test at 75/25, i.e. roughly
# 60% / 20% of the full data set, again stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_test,
    y_train_test,
    test_size=0.25,
    stratify=y_train_test,
    random_state=SEED,
)

# Report the number of rows that ended up in each partition.
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")

Training set size: 338
Test set size: 113
Validation set size: 113


In [16]:
# Candidate hyper-parameters for a single decision tree
# (5 * 3 * 3 = 45 combinations).
dt_param_grid = {
    "max_depth": [3, 5, 10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Exhaustive search over the grid, scoring each candidate by 5-fold
# cross-validated accuracy on the training split; n_jobs=-1 uses all cores.
dt_grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=SEED),
    param_grid=dt_param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# `fit` returns the search object itself; the best estimator is the winning
# configuration refitted on all of X_train.
dt_best = dt_grid.fit(X_train, y_train).best_estimator_

# Cross-validated score of the winner, then its accuracy on the test split.
print("Best Decision Tree params:", dt_grid.best_params_)
print("Best Decision Tree score:", dt_grid.best_score_)
print("Decision Tree Test Accuracy:", dt_best.score(X_test, y_test))

# Persist the tuned tree.
# NOTE(review): "best_df.pkl" looks like a typo for "best_dt.pkl"; the path is
# left unchanged because other code may load this exact filename — confirm.
with open("weights/best_df.pkl", "wb") as file:
    pickle.dump(dt_best, file)

Best Decision Tree params: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Decision Tree score: 0.6593942054433714
Decision Tree Test Accuracy: 0.6548672566371682


In [17]:
# Candidate hyper-parameters for the random forest
# (3 * 4 * 3 = 36 combinations).
rf_param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 20, None],
    "min_samples_split": [2, 5, 10],
}

# Exhaustive search over the grid, scoring each candidate by 5-fold
# cross-validated accuracy on the training split; n_jobs=-1 uses all cores.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=SEED),
    rf_param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

rf_grid.fit(X_train, y_train)
# Winning configuration, refitted on all of X_train by GridSearchCV.
rf_best = rf_grid.best_estimator_

# Cross-validated score of the winner, then its accuracy on the test split.
print("Best Random Forest params:", rf_grid.best_params_)
print("Best Random Forest score:", rf_grid.best_score_)
print("Random Forest Test Accuracy:", rf_best.score(X_test, y_test))

# Persist the tuned forest. It must be `rf_best` that is dumped here:
# writing `dt_best` would leave best_rf.pkl holding the decision tree.
with open("weights/best_rf.pkl", "wb") as file:
    pickle.dump(rf_best, file)

Best Random Forest params: {'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 50}
Best Random Forest score: 0.6685250219490781
Random Forest Test Accuracy: 0.6637168141592921
