In [1]:
# Project-local bootstrap, executed before any other cell.
# NOTE(review): what setup_environment() does is not visible from this notebook
# (presumably it configures paths / the working directory) — confirm in setup_env.
from setup_env import setup_environment

setup_environment()

In [2]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Input CSV, given as a path relative to the notebook's working directory.
dataset = "data/history/concat/history_5x5-13k.csv"
# Load the full table; the next cell uses its "success" column as the target
# and every other column as a feature.
df = pd.read_csv(dataset)

In [8]:
# Features are every column except the "success" target.
X = df.drop(columns="success").values
y = df["success"].values

# Step 1: hold out 20% of the data as the validation split, stratified on y.
X_train_test, X_val, y_train_test, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 2: take 25% of the remainder (i.e. 20% of the full data) as the test
# split, which leaves 60% of the full data for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_test,
    y_train_test,
    test_size=0.25,
    stratify=y_train_test,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Validation set size: {len(X_val)}")

Training set size: 16398
Test set size: 5466
Validation set size: 5466


In [9]:
def test_all_classifiers(
    X_train, y_train, X_test, y_test, X_val, y_val, seed=42, weights_dir="weights"
):
    """
    Fit several scikit-learn classifiers and compare their accuracy.

    Each classifier is fitted on the training split, scored on the test and
    validation splits, and additionally scored with 5-fold cross-validation on
    the training split. A report is printed per classifier, and every
    successfully evaluated model is pickled to
    ``<weights_dir>/<name lower-cased, spaces replaced by underscores>.pkl``.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for fitting and for cross-validation.
    X_test, y_test
        Features and labels used for "Test Accuracy" and the printed
        classification report.
    X_val, y_val
        Features and labels used for "Validation Accuracy".
    seed : int, default 42
        ``random_state`` given to the classifiers that are constructed with one.
    weights_dir : str or path-like, default "weights"
        Directory the fitted models are pickled into; created if missing.

    Returns
    -------
    pandas.DataFrame
        One row per classifier, sorted by "Test Accuracy" (descending), with
        the columns "Classifier", "Test Accuracy", "Validation Accuracy",
        "CV Mean", "CV Std" and "Model". A classifier whose fitting or
        evaluation raised gets zeros for the metrics, ``None`` as the model
        and the message in an extra "Error" column.

    Notes
    -----
    Saving a model is best-effort: a failure to write the pickle is reported
    on stdout but does not alter that classifier's row in the results.
    """

    # Define classifiers to test
    classifiers = {
        "Decision Tree": DecisionTreeClassifier(random_state=seed),
        "Random Forest": RandomForestClassifier(random_state=seed, n_estimators=100),
        "Gradient Boosting": GradientBoostingClassifier(
            random_state=seed, n_estimators=100
        ),
        "AdaBoost": AdaBoostClassifier(random_state=seed, n_estimators=100),
        "Logistic Regression": LogisticRegression(random_state=seed, max_iter=1000),
        # NOTE(review): SVC() without `kernel=` uses scikit-learn's default
        # kernel (RBF), so this entry is not a linear SVM despite its label.
        # The label is left unchanged because it determines the pickle file
        # name and the "Classifier" value reported in the results.
        "SVM (Linear)": SVC(random_state=seed, probability=True),
        "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
        "Naive Bayes": GaussianNB(),
    }

    weights_path = Path(weights_dir)
    results = []

    for name, clf in classifiers.items():
        print(f"\n{'='*60}")
        print(f"Testing: {name}")
        print(f"{'='*60}")

        # --- Fit and evaluate -------------------------------------------------
        # Only this part decides whether the classifier is recorded as a
        # success or as an error; exactly one row is appended per classifier.
        try:
            clf.fit(X_train, y_train)

            y_pred_test = clf.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_pred_test)

            y_pred_val = clf.predict(X_val)
            val_accuracy = accuracy_score(y_val, y_pred_val)

            # cross_val_score fits clones, so the fitted `clf` is not modified.
            cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring="accuracy")

            test_report = classification_report(y_test, y_pred_test)
        except Exception as e:
            print(f"Error with {name}: {e}")
            results.append(
                {
                    "Classifier": name,
                    "Test Accuracy": 0,
                    "Validation Accuracy": 0,
                    "CV Mean": 0,
                    "CV Std": 0,
                    "Model": None,
                    "Error": str(e),
                }
            )
            continue

        results.append(
            {
                "Classifier": name,
                "Test Accuracy": test_accuracy,
                "Validation Accuracy": val_accuracy,
                "CV Mean": cv_scores.mean(),
                "CV Std": cv_scores.std(),
                "Model": clf,
            }
        )

        print(f"\nTest Accuracy: {test_accuracy:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(
            f"5-Fold CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})"
        )

        print("\nTest Classification Report:")
        print(test_report)

        # --- Persist the fitted model (best-effort) ---------------------------
        # Kept outside the evaluation `try` so that an I/O problem (e.g. a
        # missing or read-only directory) cannot add a bogus all-zero error row
        # for a classifier that was evaluated successfully.
        model_path = weights_path / f"{name.lower().replace(' ', '_')}.pkl"
        try:
            weights_path.mkdir(parents=True, exist_ok=True)
            with open(model_path, "wb") as file:
                pickle.dump(clf, file)
        except Exception as e:
            print(f"Could not save {name} to {model_path}: {e}")

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values("Test Accuracy", ascending=False)

    return results_df

In [10]:
# Run the comparison on the splits built above; the returned DataFrame is the
# cell's last expression, so it is rendered as the cell output.
test_all_classifiers(X_train, y_train, X_test, y_test, X_val, y_val)


Testing: Decision Tree

Test Accuracy: 0.9654
Validation Accuracy: 0.9629
5-Fold CV Accuracy: 0.9648 (+/- 0.0017)

Test Classification Report:
              precision    recall  f1-score   support

       False       0.95      0.98      0.97      2733
        True       0.98      0.95      0.96      2733

    accuracy                           0.97      5466
   macro avg       0.97      0.97      0.97      5466
weighted avg       0.97      0.97      0.97      5466


Testing: Random Forest

Test Accuracy: 0.9687
Validation Accuracy: 0.9689
5-Fold CV Accuracy: 0.9673 (+/- 0.0008)

Test Classification Report:
              precision    recall  f1-score   support

       False       0.96      0.98      0.97      2733
        True       0.98      0.96      0.97      2733

    accuracy                           0.97      5466
   macro avg       0.97      0.97      0.97      5466
weighted avg       0.97      0.97      0.97      5466


Testing: Gradient Boosting

Test Accuracy: 0.9241
Validat

Unnamed: 0,Classifier,Test Accuracy,Validation Accuracy,CV Mean,CV Std,Model
1,Random Forest,0.968716,0.968899,0.967252,0.000829,"(DecisionTreeClassifier(max_features='sqrt', r..."
0,Decision Tree,0.965423,0.962861,0.964813,0.001667,DecisionTreeClassifier(random_state=42)
5,SVM (Linear),0.946945,0.949872,0.945481,0.004048,"SVC(probability=True, random_state=42)"
6,K-Nearest Neighbors,0.944201,0.942554,0.944871,0.003573,KNeighborsClassifier()
2,Gradient Boosting,0.924076,0.926271,0.924869,0.00251,([DecisionTreeRegressor(criterion='friedman_ms...
3,AdaBoost,0.840102,0.842481,0.84181,0.00265,"(DecisionTreeClassifier(max_depth=1, random_st..."
4,Logistic Regression,0.810648,0.807903,0.815648,0.002666,"LogisticRegression(max_iter=1000, random_state..."
7,Naive Bayes,0.767472,0.771131,0.776132,0.003707,GaussianNB()
