In [1]:
from setup_env import setup_environment

# Run the project-local environment setup imported from setup_env.
# NOTE(review): what setup_environment() actually configures is not visible
# in this notebook — presumably working directory / paths; confirm in setup_env.
setup_environment()

In [2]:
import os
import pickle

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Relative path of the CSV to load; it is resolved against the kernel's
# current working directory.
dataset = 'data/history/concat/history_3x3-500.csv'
# Read the whole file into a DataFrame. The next cell uses its 'success'
# column as the target and every other column as a feature.
df = pd.read_csv(dataset)

In [5]:
# Target vector is the 'success' column; every remaining column is a feature.
y = df['success'].values
X = df.drop(columns='success').values

# Step 1: hold out 20% of all rows as the validation split (stratified on y).
X_train_test, X_val, y_train_test, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: take 25% of the remaining 80% as the test split (20% of all rows),
# which leaves 60% of the rows for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_test,
    y_train_test,
    test_size=0.25,
    random_state=42,
    stratify=y_train_test,
)

# Report how many rows ended up in each split.
print(f'Training set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')
print(f'Validation set size: {X_val.shape[0]}')

Training set size: 338
Test set size: 113
Validation set size: 113


In [6]:
def test_all_classifiers(X_train, y_train, X_test, y_test, X_val, y_val, seed=42):
    """
    Fit a fixed set of scikit-learn classifiers and compare their accuracy.

    For each classifier the function fits on the training split, scores the
    fitted model on the test and validation splits, runs 5-fold
    cross-validation (accuracy) on the training split, prints a short report
    and pickles the fitted model to ``weights/<name>.pkl`` (name lower-cased,
    spaces replaced by underscores).

    Parameters
    ----------
    X_train, y_train
        Features and labels used for ``fit`` and for ``cross_val_score``.
    X_test, y_test
        Features and labels used for 'Test Accuracy' and the printed
        classification report.
    X_val, y_val
        Features and labels used for 'Validation Accuracy'.
    seed : int, default 42
        Passed as ``random_state`` to every classifier that accepts one.

    Returns
    -------
    pandas.DataFrame
        Exactly one row per classifier, sorted by 'Test Accuracy' descending,
        with columns 'Classifier', 'Test Accuracy', 'Validation Accuracy',
        'CV Mean', 'CV Std' and 'Model'. A classifier whose training or
        evaluation raised gets zeros for the metrics, ``None`` as 'Model' and
        the message in an 'Error' column (the column only exists when at
        least one classifier failed).

    Notes
    -----
    Saving the model is best-effort: a failure to write the pickle is printed
    but does not alter the classifier's result row. The ``weights`` directory
    is created on demand.
    """
    weights_dir = "weights"

    # Define classifiers to test
    classifiers = {
        'Decision Tree': DecisionTreeClassifier(random_state=seed),
        'Random Forest': RandomForestClassifier(random_state=seed, n_estimators=100),
        'Gradient Boosting': GradientBoostingClassifier(random_state=seed, n_estimators=100),
        'AdaBoost': AdaBoostClassifier(random_state=seed, n_estimators=100),
        'Logistic Regression': LogisticRegression(random_state=seed, max_iter=1000),
        'SVM (Linear)': SVC(kernel='linear', random_state=seed, probability=True),
        'SVM (RBF)': SVC(kernel='rbf', random_state=seed, probability=True),
        'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
        'Naive Bayes': GaussianNB()
    }

    results = []

    for name, clf in classifiers.items():
        print(f"\n{'='*60}")
        print(f"Testing: {name}")
        print(f"{'='*60}")

        # Everything that can fail during training/evaluation happens here,
        # *before* a result row is recorded, so a failure can never leave
        # both a success row and an error row for the same classifier.
        try:
            clf.fit(X_train, y_train)

            y_pred_test = clf.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_pred_test)

            y_pred_val = clf.predict(X_val)
            val_accuracy = accuracy_score(y_val, y_pred_val)

            cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')

            report = classification_report(y_test, y_pred_test)
        except Exception as e:
            # Deliberately broad: one failing model must not abort the comparison.
            print(f"Error with {name}: {e}")
            results.append({
                'Classifier': name,
                'Test Accuracy': 0,
                'Validation Accuracy': 0,
                'CV Mean': 0,
                'CV Std': 0,
                'Model': None,
                'Error': str(e)
            })
            continue

        results.append({
            'Classifier': name,
            'Test Accuracy': test_accuracy,
            'Validation Accuracy': val_accuracy,
            'CV Mean': cv_scores.mean(),
            'CV Std': cv_scores.std(),
            'Model': clf
        })

        print(f"\nTest Accuracy: {test_accuracy:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(f"5-Fold CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

        print("\nTest Classification Report:")
        print(report)

        # Persisting the model is independent of its evaluation: a write
        # failure is reported but leaves the metrics recorded above intact.
        weights_path = f"{weights_dir}/{name.lower().replace(' ', '_')}.pkl"
        try:
            os.makedirs(weights_dir, exist_ok=True)
            with open(weights_path, "wb") as file:
                pickle.dump(clf, file)
        except Exception as e:
            print(f"Error saving {name} to {weights_path}: {e}")

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('Test Accuracy', ascending=False)

    return results_df

In [7]:
# Run the comparison on the three splits; the returned DataFrame is the
# cell's last expression, so it is displayed below the printed reports.
test_all_classifiers(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    X_val=X_val,
    y_val=y_val,
)


Testing: Decision Tree

Test Accuracy: 0.4956
Validation Accuracy: 0.5310
5-Fold CV Accuracy: 0.5710 (+/- 0.0340)

Test Classification Report:
              precision    recall  f1-score   support

       False       0.31      0.39      0.34        38
        True       0.64      0.55      0.59        75

    accuracy                           0.50       113
   macro avg       0.47      0.47      0.47       113
weighted avg       0.53      0.50      0.51       113


Testing: Random Forest

Test Accuracy: 0.6106
Validation Accuracy: 0.6726
5-Fold CV Accuracy: 0.6154 (+/- 0.0202)

Test Classification Report:
              precision    recall  f1-score   support

       False       0.38      0.26      0.31        38
        True       0.68      0.79      0.73        75

    accuracy                           0.61       113
   macro avg       0.53      0.52      0.52       113
weighted avg       0.58      0.61      0.59       113


Testing: Gradient Boosting

Test Accuracy: 0.6106
Validat

Unnamed: 0,Classifier,Test Accuracy,Validation Accuracy,CV Mean,CV Std,Model
4,Logistic Regression,0.707965,0.663717,0.683231,0.043778,"LogisticRegression(max_iter=1000, random_state..."
6,SVM (RBF),0.690265,0.707965,0.665628,0.020683,"SVC(probability=True, random_state=42)"
7,K-Nearest Neighbors,0.672566,0.707965,0.641923,0.038742,KNeighborsClassifier()
3,AdaBoost,0.654867,0.716814,0.718832,0.053977,"(DecisionTreeClassifier(max_depth=1, random_st..."
5,SVM (Linear),0.654867,0.672566,0.668481,0.037032,"SVC(kernel='linear', probability=True, random_..."
1,Random Forest,0.610619,0.672566,0.615408,0.020193,"(DecisionTreeClassifier(max_features='sqrt', r..."
2,Gradient Boosting,0.610619,0.672566,0.648025,0.022132,([DecisionTreeRegressor(criterion='friedman_ms...
8,Naive Bayes,0.59292,0.654867,0.683406,0.027769,GaussianNB()
0,Decision Tree,0.495575,0.530973,0.570983,0.034025,DecisionTreeClassifier(random_state=42)
