### Importing Packages

In [1]:
import sys
from os.path import abspath, dirname, join

# Make the project-local `src` package importable without hardcoding a
# machine-specific absolute path. The project root is taken to be the parent
# of the current working directory -- the same convention the data-loading
# cell uses to locate `data/processed/`.
# NOTE(review): assumes the notebook is run from a direct sub-folder of the
# repository root (e.g. <root>/notebooks) -- confirm.
PROJECT_ROOT = dirname(abspath(''))
if PROJECT_ROOT not in sys.path:  # avoid stacking duplicates on cell re-runs
    sys.path.insert(0, PROJECT_ROOT)

from src.models.train_test import train_test_split, X_y_split

import pandas as pd
import numpy as np

### Importing Fully Processed Data

In [2]:
# Resolve the processed-stats CSV relative to the parent of the working
# directory, then load it into a DataFrame.
parent_dir = dirname(abspath(''))
stats_csv_path = abspath(join(parent_dir, 'data/', 'processed/', 'extracted_stats.csv'))
df = pd.read_csv(stats_csv_path)

In [3]:
# Parse dates up front so downstream time-series logic receives real datetimes.
df['date'] = pd.to_datetime(df['date'])

# Drop ties first: keep every row whose result is not 'D'.
# A boolean filter (instead of an in-place drop) leaves no hidden mutation.
df = df.loc[df['result'] != 'D'].copy()

# Leading columns up to and including 'referee' are never used as features
# (strings, formats, etc.).
leading_cols = df.loc[:, :'referee'].columns.to_list()

# Columns that contain nothing but NaN.
all_nan_cols = df.columns[df.isna().all()].to_list()

# Every column without the 'precomp_' identifier ...
post_comp_cols = df.columns[~df.columns.str.contains('precomp_')].to_list()
# ... except these, which are known before the fight but lack the identifier.
for known_pre_fight in ('height', 'reach', 'age'):
    post_comp_cols.remove(known_pre_fight)  # raises ValueError if the column is absent

# Assemble the final label set by plain list concatenation. Index.insert() is
# documented for inserting a single item; feeding it whole lists only worked
# through NumPy's flattening behaviour. The element order matches what the
# chained inserts used to produce (duplicates included).
drop_cols = pd.Index(post_comp_cols + all_nan_cols + leading_cols)

### Splitting Dataset into training and test sets

In [5]:
# `train_test_split` is the project helper imported from src.models.train_test
# (not scikit-learn's function of the same name).
train_df, test_df = train_test_split(df)

# Copies are handed over so X_y_split works on its own frames; 'result' is the
# target column and `drop_cols` lists the columns excluded from the features.
xy_parts = X_y_split(train_df.copy(), test_df.copy(), 'result', drop_cols)
X_train, y_train, X_test, y_test = xy_parts

In [15]:
# Display the dtype of every feature column in the test matrix
# (sanity check that the remaining features are numeric).
X_test.dtypes

height                                         int64
reach                                          int64
age                                          float64
precomp_height_prior                         float64
precomp_height_avg                           float64
                                              ...   
precomp_elo_differential_var_vs_opp          float64
precomp_elo_differential_windowvar_vs_opp    float64
precomp_elo_differential_peak_vs_opp         float64
precomp_elo_differential_low_vs_opp          float64
precomp_elo_differential_delta_vs_opp        float64
Length: 3415, dtype: object

In [7]:
# Encode the target as integer NumPy arrays: 'W' -> 1, anything else -> 0.
#
# One vectorised comparison replaces the former in-place mask assignments,
# which wrote ints into a string-valued Series. That pattern is rejected by
# pandas string dtypes and can trigger SettingWithCopy warnings when the
# labels are a slice of another frame.
y_train = (y_train == 'W').to_numpy(dtype=int)
y_test = (y_test == 'W').to_numpy(dtype=int)

## MODEL 1: XGBoost 

In [8]:
import xgboost as xgb

# Small, untuned baseline: 10 shallow trees with a learning rate of 1 and a
# binary logistic objective.
baseline_params = {
    'n_estimators': 10,
    'max_depth': 3,
    'learning_rate': 1,
    'objective': 'binary:logistic',
}
bst = xgb.XGBClassifier(**baseline_params)

In [9]:
# Train on the training split, then predict on the training split first and
# the test split second.
bst.fit(X_train, y_train)
y_pred_train, y_pred_test = (bst.predict(features) for features in (X_train, X_test))

### Accuracy Metrics

In [10]:
from sklearn.metrics import accuracy_score

# Accuracy of the predictions against the true labels, per split
# (training first, test second).
train_score, test_score = (
    accuracy_score(y_true, y_hat)
    for y_true, y_hat in ((y_train, y_pred_train), (y_test, y_pred_test))
)

In [11]:
# Training accuracy on the first line, test accuracy on the second.
print(train_score, test_score, sep='\n')

0.6834841964683187
0.5529243937232525


### Not that good, let's optimize using Optuna

In [None]:
# Hyper-parameter search with the project's Optuna wrapper.
from src.models.Optuna import OptunaTuning

# NOTE(review): the held-out test split is handed to the tuner. If OptunaTuning
# scores its trials on X_test / y_test, the test accuracy reported for the
# tuned model is optimistically biased -- confirm, and consider carving a
# separate validation split out of the training data instead.
# NOTE(review): the argument order (test first, then train) is presumably what
# OptunaTuning expects -- verify against its signature.
tuner = OptunaTuning(X_test, y_test, X_train, y_train)
tuner.run()

### Optimal Model

In [12]:
# Hyper-parameters for the tuned model (presumably the best trial of the
# Optuna search -- values kept verbatim).
# NOTE(review): both `learning_rate` and `eta` are set, with different values.
# XGBoost documents `eta` as an alias of `learning_rate`, so only one of them
# can take effect -- confirm which value is intended and drop the other.
# NOTE(review): 'gpu_hist' requires a CUDA-enabled XGBoost build and is
# deprecated in XGBoost 2.x in favour of tree_method='hist' with
# device='cuda' -- confirm the targeted version.
tuned_params = {
    'reg_lambda': 0.21262699261144707,
    'alpha': 9.603744709436778,
    'tree_method': 'gpu_hist',
    'objective': 'binary:logistic',
    'verbosity': 0,
    'n_jobs': -1,
    'learning_rate': 0.01830371431723197,
    'min_child_weight': 12,
    'max_depth': 6,
    'max_delta_step': 5,
    'subsample': 0.12516270393991097,
    'colsample_bytree': 0.39799515236683536,
    'gamma': 0.225275077908943,
    'n_estimators': 315,
    'eta': 0.11452245768637671,
}
bst = xgb.XGBClassifier(**tuned_params)

# Same evaluation as for the baseline: fit, predict on both splits, report
# training accuracy followed by test accuracy.
bst.fit(X_train, y_train)
y_pred_train, y_pred_test = bst.predict(X_train), bst.predict(X_test)
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_test, y_pred_test)
print(train_score, test_score, sep='\n')

0.671316219023594
0.585734664764622
