In [2]:
# CatBoost is the Python library behind our boosting model. Install it only once, before the first run: https://catboost.ai/en/docs/installation/python-installation-method-pip-install
#!pip install catboost 

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

import catboost
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold


In [2]:
print(np.__version__)
print(pd.__version__)
print(catboost.__version__)
!python --version

1.26.4
2.2.1
1.2.3
Python 3.12.2


In [3]:
# Load the raw fight records (path is relative to the notebook's working
# directory) and preview the first two rows.
data = pd.read_csv(filepath_or_buffer='../raw_data/data.csv')
data.head(n=2)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Adrian Yanez,Gustavo Lopez,Chris Tognoni,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Bantamweight,0.0,0.0,...,0,1,0,0,Orthodox,170.18,177.8,135.0,31.0,27.0
1,Trevin Giles,Roman Dolidze,Herb Dean,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Middleweight,0.5,0.0,...,0,3,0,0,Orthodox,182.88,187.96,185.0,32.0,28.0


In [10]:
# Separate the target column from the feature columns.
y = data['Winner']
X = data.drop(columns=['Winner'])

In [11]:
# Convert literal 'NaN' strings into real missing values.
# NOTE(review): pd.read_csv already parses 'NaN' as missing by default, so
# this is probably a no-op on freshly loaded data — confirm.
X = X.replace(to_replace='NaN', value=np.nan)

In [12]:
# Binarise the target for 2-class classification: 1 when the Winner column
# says 'Red', 0 for every other value.
# The encoding is derived from data['Winner'] rather than from y itself so the
# cell is idempotent: re-applying the mapping to an already-encoded y would
# compare 0/1 against 'Red' and silently turn every label into 0.
y = (data['Winner'] == 'Red').astype('int64')
y.dtype

dtype('int64')

In [13]:
# Share of each class in the binary target (fractions sum to 1).
class_shares = y.value_counts(normalize=True)
class_shares

Winner
1    0.661843
0    0.338157
Name: proportion, dtype: float64

In [14]:
# Drop the columns with the most missing values plus fight metadata that is
# not used as a feature.
# NOTE(review): 109 is a hand-picked cut-off tied to this dataset's column
# count — confirm it still selects the intended columns if the raw data changes.
n_sparsest_columns = 109
# 'date' used to be listed twice; each label only needs to appear once.
metadata_columns = ['date', 'location', 'title_bout', 'weight_class']

# Ascending sort puts the columns with the most NaNs at the end.
missing_counts = X.isna().sum().sort_values()
columns_to_drop = missing_counts.iloc[-n_sparsest_columns:].index.to_list() + metadata_columns

# `columns=` already names the axis, so no separate `axis=1` is needed.
X = X.drop(columns=columns_to_drop)

# Positional indices of the remaining string-typed columns; these are handed
# to CatBoost as `cat_features` further down.
categorical_column_names = X.select_dtypes(include=['object']).columns.to_list()
categorical_indices = [i for i, v in enumerate(X.columns) if v in categorical_column_names]
categorical_indices

[0, 1]

In [16]:
# Numeric branch: log-compress, fill gaps with the column median, then scale
# with statistics that are robust to outliers.
# np.log1p (log(1 + x)) replaces np.log because the count features (wins,
# streaks, ...) contain zeros: np.log(0) is -inf, and SimpleImputer only
# tolerates NaN, not infinite values.
# NOTE(review): assumes all numeric features are >= 0 — confirm.
num_preproc = Pipeline([
    ("to_log", FunctionTransformer(np.log1p)),
    ("num_imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

In [17]:
# Categorical branch: missing strings become the explicit label "Unknown".
cat_preproc = Pipeline([
    ("cat_imputer", SimpleImputer(strategy="constant", fill_value="Unknown"))
])

# Boolean branch: fill gaps with the most frequent value, then cast every
# element to its string form.
# FunctionTransformer calls func(X, **kw_args) on the whole array, so the
# previous `FunctionTransformer(str)` produced a single string (the array's
# repr) instead of an array of strings. np.asarray(X, dtype=str) converts
# element-wise and, unlike a lambda, stays picklable with the pipeline.
bool_preproc = Pipeline([
    ("bool_imputer", SimpleImputer(strategy="most_frequent")),
    ("to_str", FunctionTransformer(np.asarray, kw_args={"dtype": str}))
])

In [19]:

# Send each column to the branch matching its dtype; columns matched by no
# selector are passed through unchanged.
# NOTE(review): ColumnTransformer emits its output in transformer order
# (num, cat, bool, remainder), not in the input column order, so the
# `categorical_indices` computed on X presumably only stay valid while the
# string columns still end up first — confirm.
preproc = ColumnTransformer(
    transformers=[
        ("num_tr", num_preproc, make_column_selector(dtype_include=["float64", "int64"])),
        ("cat_tr", cat_preproc, make_column_selector(dtype_include=["object"])),
        ("bool_tr", bool_preproc, make_column_selector(dtype_include=["bool"])),
    ],
    remainder="passthrough",
)

preproc

In [20]:
# 5-fold stratified cross-validation splitter (default settings otherwise).
cv = StratifiedKFold(n_splits=5)

# CatBoost classifier; `cat_features` holds the positional indices of the
# string columns and training output is silenced.
model3 = catboost.CatBoostClassifier(
    n_estimators=2500,
    learning_rate=0.04,
    depth=5,
    eval_metric='AUC',
    cat_features=categorical_indices,
    silent=True,
)

In [21]:
# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): X.values on a frame that mixes string and numeric columns
# yields an object array, so the frames rebuilt below presumably carry object
# dtype in every column and only the "object" selector of the preprocessing
# matches — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    train_size=0.8,
    random_state=42,
    stratify=y,
)

# Re-attach the column names that were dropped by splitting raw arrays.
X_train, X_test = (
    pd.DataFrame(data=split, columns=X.columns) for split in (X_train, X_test)
)

In [22]:
# Chain the preprocessing and the classifier into a single estimator so both
# are fitted, cross-validated and pickled together.
model3_pipe = Pipeline(steps=[
    ("preproc", preproc),
    ("model3_classifier", model3),
])

model3_pipe

In [23]:
# Mean accuracy over the cross-validation folds of the training split.
fold_accuracies = cross_val_score(model3_pipe, X_train, y=y_train, scoring='accuracy', cv=cv)
model3_pipe_mean_accuracy = fold_accuracies.mean()
model3_pipe_mean_accuracy

0.7373670877312918

In [45]:
# Fit on the whole training split, then measure accuracy on the held-out
# test split (fit returns the pipeline itself, so the calls can be chained).
y_pred = model3_pipe.fit(X_train, y_train).predict(X_test)
model3_pipe_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
model3_pipe_test_accuracy

0.7946799667497921

In [46]:
# Export the fitted pipeline as a pickle file. The file name embeds the
# hold-out test accuracy (e.g. 0.79468 -> 'acc079468').
# The path previously used `model3_pipe_mean_test_accuracy`, a name that is
# never defined in this notebook, so the cell raised NameError on a fresh
# kernel. The name is now built once from `model3_pipe_test_accuracy`, so the
# file written and the message printed always agree.
model_file_name = f'of_model3_acc0{round(model3_pipe_test_accuracy*100000)}.pkl'
with open(f'../models/{model_file_name}', 'wb') as file:
    pickle.dump(model3_pipe, file)
print(f"model3_pipe is successfully saved as '{model_file_name}'")


model3_pipe is successfully saved as 'of_model3_acc079468.pkl'


In [None]:
### Test API logic

In [27]:
def preprocessed_df(red_fighter, blue_fighter, fighters_csv="all_fighters.csv"):
    """Build the single-row feature frame for a red-vs-blue matchup.

    Reads the per-fighter statistics table, takes the first row whose
    ``fighter`` value equals each name, prefixes the red fighter's columns
    with ``R_`` and the blue fighter's with ``B_``, and returns the feature
    columns that are passed to the model pipeline.

    Parameters
    ----------
    red_fighter, blue_fighter : str
        Fighter names, matched exactly against the ``fighter`` column.
    fighters_csv : str or path-like, default "all_fighters.csv"
        CSV file with a ``fighter`` column and the career statistics columns.
        The default keeps the original behaviour of reading the file from the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with 30 columns: ``R_fighter``, ``B_fighter`` and
        the ``B_*`` / ``R_*`` statistics. Every column has object dtype.

    Raises
    ------
    IndexError
        If either fighter is not present in the table. This is the exception
        type the bare ``.iloc[0]`` lookup raised before; it now names the
        missing fighter.
    KeyError
        If the table lacks the ``fighter`` column or an expected statistic.
    """
    all_fighters = pd.read_csv(fighters_csv)
    # all_fighters = get_data()

    def _corner_stats(fighter_name, prefix):
        """Return the first stats row for one fighter with prefixed labels."""
        matches = all_fighters[all_fighters['fighter'] == fighter_name]
        if matches.empty:
            raise IndexError(
                f"fighter {fighter_name!r} not found in {fighters_csv!r}"
            )
        return matches.iloc[0].add_prefix(prefix)

    red_stats = _corner_stats(red_fighter, "R_")
    blue_stats = _corner_stats(blue_fighter, "B_")

    # One row, red columns first then blue. dtype=object reproduces the
    # all-object frame the previous list-then-transpose construction produced.
    fight_df = pd.DataFrame(
        [list(red_stats) + list(blue_stats)],
        columns=list(red_stats.index) + list(blue_stats.index),
        dtype=object,
    )

    # Keep only the model features, in the order the pipeline receives them.
    feature_columns = [
        'R_fighter', 'B_fighter', 'B_total_rounds_fought',
        'B_total_title_bouts', 'B_current_win_streak', 'B_current_lose_streak',
        'B_longest_win_streak', 'B_wins', 'B_losses', 'B_draw',
        'B_win_by_Decision_Majority', 'B_win_by_Decision_Split',
        'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission',
        'B_win_by_TKO_Doctor_Stoppage', 'R_total_rounds_fought',
        'R_total_title_bouts', 'R_current_win_streak', 'R_current_lose_streak',
        'R_longest_win_streak', 'R_wins', 'R_losses', 'R_draw',
        'R_win_by_Decision_Majority', 'R_win_by_Decision_Split',
        'R_win_by_Decision_Unanimous', 'R_win_by_KO/TKO', 'R_win_by_Submission',
        'R_win_by_TKO_Doctor_Stoppage',
    ]
    return fight_df[feature_columns]

In [36]:
# Smoke-test the feature builder on a sample matchup.
test_df = preprocessed_df(red_fighter='Gustavo Lopez', blue_fighter='Max Griffin')
test_df

Unnamed: 0,R_fighter,B_fighter,B_total_rounds_fought,B_total_title_bouts,B_current_win_streak,B_current_lose_streak,B_longest_win_streak,B_wins,B_losses,B_draw,...,R_longest_win_streak,R_wins,R_losses,R_draw,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage
0,Gustavo Lopez,Max Griffin,28,0,0,1,1,4,6,0,...,1,1,1,0,0,0,0,0,1,0


In [37]:
# Predicted class for the sample matchup: 1 = Red wins, 0 = Red does not win.
y_pred = model3_pipe.predict(X=test_df)
y_pred[0]

0

In [38]:
# Highest class probability for the sample matchup, i.e. the model's
# confidence in whichever class it predicts.
model3_pipe.predict_proba(test_df).max()


0.7723815869861491

In [50]:
def predict(red_fighter: str, blue_fighter: str,
            model_path: str = '../models/of_model3_acc079468.pkl'):
    """Predict whether the fighter in the RED corner wins the fight.

    Parameters
    ----------
    red_fighter, blue_fighter : str
        Names of the fighters selected for the red and blue corners.
    model_path : str, default '../models/of_model3_acc079468.pkl'
        Pickled, fitted pipeline to load. The default is the path that was
        previously hard-coded.

    Returns
    -------
    dict
        ``fight_outcome``: "<red_fighter> wins" or
        "<red_fighter> will not win".
        ``confidence_rate``: the highest class probability, rounded to three
        decimals and returned as a plain Python float.
    """
    # Build the single-row feature frame for this matchup.
    fight_data = preprocessed_df(red_fighter, blue_fighter)

    # SECURITY: pickle.load can execute arbitrary code stored in the file;
    # only load model files produced by this project.
    with open(model_path, 'rb') as file:
        model = pickle.load(file)

    # 1 means "Red wins", 0 means "Red does not win".
    red_wins = bool(model.predict(fight_data)[0])
    # float() turns the numpy scalar into a plain float for the API response.
    win_rate = float(np.max(model.predict_proba(fight_data)))

    # The previous list-indexing f-string left a double space in the negative
    # message ("will not  win"); spell both messages out instead.
    if red_wins:
        outcome = f'{red_fighter} wins'
    else:
        outcome = f'{red_fighter} will not win'

    return {'fight_outcome': outcome,
            'confidence_rate': round(win_rate, 3)}

In [51]:
# End-to-end check of the API logic on a sample matchup.
predict(red_fighter='Amanda Lemos', blue_fighter='Max Griffin')

{'fight_outcome': 'Amanda Lemos will not  win', 'confidence_rate': 0.808}