In [18]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.utils import class_weight

target = 'champion'

def load_data(file_path):
    data = pd.read_csv(file_path)
    data= data[[target,'seed','rank','net_rating','off_rating','off_rating_rank','def_rating','def_rating_rank','adj_tempo','adj_tempo_rank','free_throws','free_throws_rank','two_pt_fg','two_pt_fg_rank','three_pt_fg','three_pt_fg_rank','def_free_throws','def_free_throws_rank','def_two_pt_fg','def_two_pt_fg_rank','def_three_pt_fg','def_three_pt_fg_rank','avg_hgt','avg_hgt_rank','eff_hgt','eff_hgt_rank','c_hgt','c_hgt_rank','pf_hgt','pf_hgt_rank','sf_hgt','sf_hgt_rank','sg_hgt','sg_hgt_rank','pg_hgt','pg_hgt_rank','experience','experience_rank','bench','bench_rank']]
    return data

def train_and_evaluate(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    class_weights = None
    if len(np.unique(y)) > 1:
        class_counts = np.bincount(y)
        imbalance_ratio = max(class_counts) / min(class_counts)
        if imbalance_ratio > 1.5:
            print("Detected class imbalance. Adjusting weights.")
            class_weights = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)))
    
    models = {
        'Logistic Regression': LogisticRegression(class_weight=class_weights),
        'Random Forest': RandomForestClassifier(class_weight=class_weights),
        'Gradient Boosting': GradientBoostingClassifier(),
        'SVM': SVC(class_weight=class_weights)
    }
    
    param_grids = {
        'Logistic Regression': {'classifier__C': [0.01, 0.1, 1, 10]},
        'Random Forest': {'classifier__n_estimators': [50, 100, 200]},
        'Gradient Boosting': {'classifier__n_estimators': [50, 100, 200]},
        'SVM': {'classifier__C': [0.1, 1, 10], 'classifier__kernel': ['linear', 'rbf']}
    }
    
    best_model = None
    best_f1 = 0
    best_pipeline = None
    
    for name, model in models.items():
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', model)
        ])
        
        grid_search = GridSearchCV(pipeline, param_grids[name], scoring='f1', cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        y_pred = grid_search.best_estimator_.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        print(f"Model: {name}, F1 Score: {f1:.4f}")
        
        if f1 > best_f1:
            best_f1 = f1
            best_model = grid_search.best_estimator_
    
    print("\nBest Model Selected:")
    print(best_model)
    
    y_pred = best_model.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    joblib.dump(best_model, f'best_classification_model_{target}.pkl')
    print("Best model saved as best_classification_model.pkl")

if __name__ == "__main__":
    file_path = "../data/training.csv"  # Update with your dataset path
    data = load_data(file_path)
    
    X = data.drop(columns=[target])  # Update 'target' with your actual target column
    y = data[target]
    
    train_and_evaluate(X, y)

Detected class imbalance. Adjusting weights.
Model: Logistic Regression, F1 Score: 0.9549
Model: Random Forest, F1 Score: 0.9931
Model: Gradient Boosting, F1 Score: 0.9907
Model: SVM, F1 Score: 0.9574

Best Model Selected:
Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 RandomForestClassifier(class_weight={0: np.float64(0.5075258701787394),
                                                      1: np.float64(33.71875)},
                                        n_estimators=50))])

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       215
           1       0.00      0.00      0.00         1

    accuracy                           1.00       216
   macro avg       0.50      0.50      0.50       216
weighted avg       0.99      1.00      0.99       216

Best model saved as best_classification_model.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
