In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# import scipy.stats as st

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Raise pandas' display limits so long/wide frames are rendered without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# DATA OVERVIEW

In [3]:
fight_df = pd.read_csv('/kaggle/input/ufcdata/raw_total_fight_data.csv', sep=';')
fight_df.head()

In [4]:
fighter_df = pd.read_csv('/kaggle/input/ufcdata/raw_fighter_details.csv')
fighter_df.head()

In [5]:
# Attach fighter details to every fight twice: once keyed on the red-corner
# fighter and once on the blue-corner fighter. The second merge sees the
# detail columns a second time, so its suffixes tag the first set as `_Red`
# and the second set as `_Blue`.
df = (
    fight_df
    .merge(fighter_df, how='left', left_on='R_fighter', right_on='fighter_name')
    .merge(
        fighter_df,
        how='left',
        left_on='B_fighter',
        right_on='fighter_name',
        suffixes=('_Red', '_Blue'),
    )
)
df.head()

In [6]:
df.info()

In [7]:
df.shape

In [8]:
df.duplicated().sum()

In [9]:
df.isna().sum()

In [10]:
df.describe()

In [11]:
df = df.drop(columns=['Referee', 'Reach_Red', 'Reach_Blue', 'DOB_Red', 'DOB_Blue'])
df.isna().sum()

In [12]:
# Rows lacking a winner or either fighter's height/weight are discarded.
required_columns = ['Winner', 'Height_Blue', 'Height_Red', 'Weight_Blue', 'Weight_Red']
df = df.dropna(subset=required_columns)

# Missing stances are filled with the most frequent stance of the same corner.
for stance_column in ('Stance_Red', 'Stance_Blue'):
    most_common_stance = df[stance_column].mode()[0]
    df[stance_column] = df[stance_column].fillna(most_common_stance)

df.isna().sum()

In [13]:
df.shape

In [14]:
df.head()

# FEATURE ENGINEERING

In [15]:
def winner(x, red, blue):
    """Encode which corner won a fight.

    Parameters
    ----------
    x : str
        Name recorded as the winner of the fight.
    red : str
        Name of the red-corner fighter.
    blue : str
        Name of the blue-corner fighter.

    Returns
    -------
    int or None
        1 if the red fighter won, 0 if the blue fighter won, and None when
        the winner cannot be attributed to either corner (including a
        non-string / missing `x`).
    """
    # A missing winner (e.g. NaN) cannot be matched against any name.
    if not isinstance(x, str):
        return None

    red_is_text = isinstance(red, str)
    blue_is_text = isinstance(blue, str)
    winner_name = x.strip()

    # Exact matches take priority: with plain substring checks a red fighter
    # whose name is contained in the blue fighter's name would always "win".
    if red_is_text and winner_name == red.strip():
        return 1
    if blue_is_text and winner_name == blue.strip():
        return 0

    # Fall back to the original containment test for winner strings that
    # carry extra text around the fighter name.
    if red_is_text and red in x:
        return 1
    if blue_is_text and blue in x:
        return 0
    return None

In [16]:
def _winner_from_row(row):
    """Adapt one fight row to the `winner(x, red, blue)` call."""
    return winner(row['Winner'], row['R_fighter'], row['B_fighter'])


# Target column: the value `winner` returns for each fight row.
outcome_inputs = df[['Winner', 'R_fighter', 'B_fighter']]
df['winner_color'] = outcome_inputs.apply(_winner_from_row, axis=1)
df['winner_color'].value_counts()

In [17]:
df = df.drop(columns=['R_fighter','B_fighter'])

In [18]:
df.sample(5)

In [19]:
# Each raw stat column holds text whose 1st and 3rd whitespace-separated
# tokens are integers (presumably "<landed> of <attempted>" — confirm against
# the raw CSV). Every such column is split into a *_LANDED and a *_TOTAL
# integer column.
#
# Maps the raw column stem to the stem used for the derived columns. The
# derived names (including the "TD_STRIKE" one) are kept exactly as before so
# the resulting frame has the same columns in the same order.
attempt_stat_stems = [
    ('SIG_STR.', 'SIG_STRIKE'),
    ('TOTAL_STR.', 'TOTAL_STRIKE'),
    ('TD', 'TD_STRIKE'),
    ('HEAD', 'HEAD'),
    ('BODY', 'BODY'),
    ('LEG', 'LEG'),
    ('DISTANCE', 'DISTANCE'),
    ('CLINCH', 'CLINCH'),
    ('GROUND', 'GROUND'),
]

for raw_stem, derived_stem in attempt_stat_stems:
    for corner in ('R', 'B'):
        # Split once per source column instead of once per derived column.
        tokens = df[corner + '_' + raw_stem].str.split()
        df[corner + '_' + derived_stem + '_LANDED'] = tokens.str[0].astype(int)
        df[corner + '_' + derived_stem + '_TOTAL'] = tokens.str[2].astype(int)

In [20]:
# Remove the raw text stat columns for both corners.
raw_attempt_columns = [
    corner + '_' + stem
    for stem in ('SIG_STR.', 'TOTAL_STR.', 'TD', 'HEAD', 'BODY', 'LEG', 'DISTANCE', 'CLINCH', 'GROUND')
    for corner in ('R', 'B')
]
df = df.drop(columns=raw_attempt_columns)

In [21]:
# Columns stored as text with a '%' sign; strip the sign so they can be cast later.
percent_columns = [
    'R_SIG_STR_pct', 'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct',
    'Str_Acc_Red', 'Str_Def_Red', 'TD_Acc_Red', 'TD_Def_Red',
    'Str_Acc_Blue', 'Str_Def_Blue', 'TD_Acc_Blue', 'TD_Def_Blue',
]
for percent_column in percent_columns:
    df[percent_column] = df[percent_column].str.strip('%')

In [22]:
from datetime import timedelta
def seconder(x):
    """Convert a "minutes:seconds" string into a whole number of seconds.

    The placeholder '--' is mapped to 0. Any other value must contain exactly
    two ':'-separated numeric parts; the total is truncated to an int.
    """
    if x == '--':
        return 0
    minutes, seconds = (float(part) for part in x.split(':'))
    duration = timedelta(minutes=minutes, seconds=seconds)
    return int(duration.total_seconds())

In [23]:
# Control time for both corners is converted from "m:ss" text to seconds.
for control_column in ('R_CTRL', 'B_CTRL'):
    df[control_column] = df[control_column].apply(seconder)

In [24]:
df = df.drop(columns=['Winner','Format','win_by','last_round','last_round_time','date','location','fighter_name_Red','fighter_name_Blue'])

In [25]:
def weight_class(x):
    """Map a free-text fight description to a normalised weight-class label.

    The label is picked by substring search in `x`. Flyweight, bantamweight
    and featherweight get a "womens " prefix when 'Women' also appears;
    'Heavyweight' becomes 'light heavyweight' when 'Light' also appears.
    Anything that matches no known class is reported as 'other'.
    """
    # Checked first and as a full phrase, exactly like the original chain.
    if "Women's Strawweight" in x:
        return 'womens strawweight'

    # Classes that exist for both men and women.
    for token in ('Flyweight', 'Bantamweight', 'Featherweight'):
        if token in x:
            label = token.lower()
            return 'womens ' + label if 'Women' in x else label

    # Classes reported without a women's variant.
    for token in ('Lightweight', 'Welterweight', 'Middleweight'):
        if token in x:
            return token.lower()

    if 'Heavyweight' in x:
        return 'light heavyweight' if 'Light' in x else 'heavyweight'

    return 'other'

In [26]:
# Derive the weight class and a 0/1 title-bout flag from the fight description.
fight_type = df['Fight_type']
df['weightclass'] = fight_type.apply(weight_class)
df['titlebout'] = fight_type.apply(lambda description: int('Title' in description))

In [27]:
df = df.drop(columns=['Fight_type'])

In [28]:
def _height_to_inches(text):
    """Convert a two-token feet/inches string into total inches.

    The first whitespace-separated token is read as feet and the second as
    inches, after stripping the quote and slash characters around them.
    """
    parts = text.split()
    feet = int(parts[0].strip("/'"))
    inches = int(parts[1].strip('/""'))
    return feet * 12 + inches


for height_column in ('Height_Red', 'Height_Blue'):
    df[height_column] = df[height_column].apply(_height_to_inches)

In [29]:
# Keep only the first whitespace-separated token of each weight string.
for weight_column in ('Weight_Red', 'Weight_Blue'):
    df[weight_column] = df[weight_column].apply(lambda text: text.split()[0])

In [30]:
df.sample(5)

In [31]:
# Per-fight percentages use '---' as a placeholder; it is replaced by '0'
# before the cast to int.
for fight_pct_column in ('R_TD_pct', 'B_TD_pct', 'R_SIG_STR_pct', 'B_SIG_STR_pct'):
    df[fight_pct_column] = df[fight_pct_column].str.replace('---', '0').astype(int)

# The remaining cleaned text columns are cast to int directly.
plain_int_columns = [
    'Weight_Red', 'Weight_Blue',
    'Str_Acc_Red', 'Str_Acc_Blue',
    'Str_Def_Red', 'Str_Def_Blue',
    'TD_Acc_Red', 'TD_Acc_Blue',
    'TD_Def_Red', 'TD_Def_Blue',
]
for plain_int_column in plain_int_columns:
    df[plain_int_column] = df[plain_int_column].astype(int)

In [32]:
df.info()

# SPLITTING DATA

In [33]:
# Name of the column to predict.
target_label = 'winner_color'

# Every column except the target is a candidate feature. The comparison must
# be `!=`: `col not in target_label` is a substring test against the string
# 'winner_color' and would also exclude a column named e.g. 'winner' or 'color'.
columns = [col for col in df.columns if col != target_label]

# Object-dtype columns are treated as categorical, everything else as numeric.
cats = [col for col in columns if df[col].dtype == 'object']
nums = [col for col in columns if df[col].dtype != 'object']

In [34]:
X = df[columns].copy()
y = df[target_label].copy()
X.shape, y.shape

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X.copy(), y.copy(), test_size=0.25, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# FEATURE ENCODING / SCALING

## Train Data

In [36]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [37]:
# Fit the scaler on the training numeric columns only, then overwrite those
# columns with their scaled values.
X_train_scaled = scaler.fit_transform(X_train[nums])
X_train[nums] = X_train_scaled
X_train.head()

In [38]:
from sklearn.preprocessing import OneHotEncoder
# All categorical columns are one-hot encoded.
onehotcats = list(cats)

# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and later
# removed the old spelling; try the new name first and fall back for older
# installations so the cell runs on either.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)

# Fitted on the training split only.
ohe = ohe.fit(X_train[onehotcats])

In [39]:
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever the installed version provides.
_ohe_feature_names = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
_ohe_feature_names(onehotcats)

In [40]:
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; resolve the available one once and reuse the names.
_ohe_feature_names = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
ohe_columns = list(_ohe_feature_names(onehotcats))

X_train_ohe = ohe.transform(X_train[onehotcats])
to_merge = pd.DataFrame(X_train_ohe, columns=ohe_columns)

# The encoded frame has a fresh 0..n-1 index, so X_train is re-indexed the
# same way before the two are joined column-wise.
# NOTE: y_train keeps its original index and must be used positionally from here on.
X_train = X_train.reset_index(drop=True)

# Replace the raw categorical columns by their one-hot columns (appended last).
X_train = pd.concat([X_train.drop(columns=onehotcats), to_merge], axis=1)
X_train.head()

## Test Data

In [41]:
X_test_scaled = scaler.transform(X_test[nums])
X_test[nums] = X_test_scaled
X_test.head()

In [42]:
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; resolve the available one once and reuse the names.
_ohe_feature_names = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
ohe_columns = list(_ohe_feature_names(onehotcats))

# Only `transform` is called here: the encoder was fitted on the training split.
X_test_ohe = ohe.transform(X_test[onehotcats])
to_merge = pd.DataFrame(X_test_ohe, columns=ohe_columns)

# The encoded frame has a fresh 0..n-1 index, so X_test is re-indexed the
# same way before the two are joined column-wise.
# NOTE: y_test keeps its original index and must be used positionally from here on.
X_test = X_test.reset_index(drop=True)

# Replace the raw categorical columns by their one-hot columns (appended last).
X_test = pd.concat([X_test.drop(columns=onehotcats), to_merge], axis=1)
X_test.head()

# FEATURE SELECTION

In [43]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=None, random_state=42)
rf.fit(X_train, y_train)

In [44]:
brt = BorutaPy(rf, n_estimators='auto', random_state=42)
brt.fit(np.array(X_train), np.array(y_train))
brt_ranking = brt.ranking_

In [45]:
plt.figure(figsize=(16,16))
sns.scatterplot(y=[col for col in X_train.columns.values], x=brt_ranking, hue=brt_ranking)

In [46]:
# Keep every training column whose Boruta rank is at most 22, mapped to its rank.
selected_features = {
    column: rank
    for column, rank in zip(X_train.columns, brt_ranking)
    if rank <= 22
}
selected_features

In [47]:
features = [k for k in selected_features.keys()]

In [48]:
# Correlation matrix of the selected features together with the target.
temp = X_train[features].copy()

# X_train was re-indexed to 0..n-1 during one-hot encoding, while y_train still
# carries the row labels of the original frame. Assigning the Series directly
# would align on those labels, pairing features with the wrong targets and
# leaving NaN elsewhere, so the values are attached positionally instead
# (row order is unchanged since the split).
temp[target_label] = y_train.to_numpy()
corr = temp.corr(method='pearson')

In [49]:
plt.figure(figsize=(24,24))
ax = sns.heatmap(corr, annot=True, fmt='.2f', vmin=-1, vmax=1, cmap='coolwarm')
plt.tight_layout()
plt.show()

In [50]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.9)
pca.fit(X_train[features])
pcs_train = pca.transform(X_train[features])

In [51]:
pdf_train = pd.DataFrame(data = pcs_train)
pdf_train.head()

In [52]:
# Share of the total variance captured by each component, in percent.
# NOTE(review): the original multiplied the raw `explained_variance_` by 100,
# which is not a percentage; `explained_variance_ratio_` is assumed to be what
# was intended — confirm.
pca.explained_variance_ratio_*100

In [53]:
# Bar chart of the percentage of variance explained per principal component.
# NOTE(review): the original plotted raw `explained_variance_` times 100, which
# is not a percentage; `explained_variance_ratio_` is assumed to be what was
# intended — confirm.
plt.bar(pdf_train.columns, pca.explained_variance_ratio_*100)
plt.xlabel('principal component')
plt.ylabel('explained variance (%)')
plt.show()

In [54]:
pcs_test = pca.transform(X_test[features])

In [55]:
pdf_test = pd.DataFrame(data = pcs_test)
pdf_test.head()

In [56]:
X_train = pdf_train.copy()
X_test = pdf_test.copy()

In [57]:
feat = X_train.columns
feat

In [58]:
# feat = features
# feat

# MACHINE LEARNING MODELLING

## Model Selection - lazypredict

In [59]:
# !pip install lazypredict --user

In [60]:
# from lazypredict.Supervised import LazyClassifier

In [61]:
# clf = LazyClassifier(verbose=0, ignore_warnings=False, custom_metric=None)
# models, predictions = clf.fit(X_train[feat], X_test[feat], y_train, y_test)
# print(models)

## Cross Validation

In [160]:
from sklearn.metrics import precision_score, roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [161]:
from sklearn.model_selection import cross_val_score
def model_evaluation(model, X_train, y_train, scoring='roc_auc', cv=5):
    """Return the absolute value of the mean cross-validated score of `model`.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X_train : feature frame; it is indexed with the notebook-level `feat`
        column list inside this function, so `feat` must already be defined.
    y_train : target values.
    scoring : scorer name forwarded to `cross_val_score`.
    cv : cross-validation setting forwarded to `cross_val_score`.
    """
    fold_scores = cross_val_score(model, X_train[feat], y_train, scoring=scoring, cv=cv)
    return abs(fold_scores.mean())

In [162]:
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier

# Candidate classifiers compared below. The stochastic ones are seeded with 42
# (the seed already used for the split and for Boruta) so that the reported
# scores are reproducible between runs. SVC is left unseeded: in the notebook's
# pinned environment (sklearn < 1.2) its seed only affects probability=True fits.
lgbc = LGBMClassifier(random_state=42)
svc = SVC()
etc = ExtraTreesClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

models = [
    lgbc, svc, etc, rfc, xgb
]

In [165]:
# Cross-validated weighted precision of every candidate on the training data.
# The scores are also collected in `results` (same order as `models`); the
# original created the list but never filled it.
results = []
print('WEIGHTED PRECISION SCORES ON TRAINING DATA:\n')
for model in models:
    result = model_evaluation(model, X_train[feat], y_train, scoring='precision_weighted')
    results.append(result)
    print(model, ':', result)

In [166]:
# Cross-validated ROC AUC (the default scoring of `model_evaluation`) of every
# candidate on the training data. The scores are also collected in `results`
# (same order as `models`); the original created the list but never filled it.
results = []
print('ROC AUC SCORES ON TRAINING DATA:\n')
for model in models:
    result = model_evaluation(model, X_train[feat], y_train)
    results.append(result)
    print(model, ':', result)

## Prediction

In [167]:
# Fit every candidate on the training data and score it on the held-out test
# data. The metric is weighted precision, so the header says so (it previously
# claimed "ACCURACY").
test_results = []
print('WEIGHTED PRECISION SCORES ON TEST DATA:\n')
for model in models:
    model.fit(X_train[feat], y_train)
    y_pred = model.predict(X_test[feat])
    test_results.append(precision_score(y_test, y_pred, average='weighted'))

# Printed after all fits so the scores are not interleaved with training logs.
for model, score in zip(models, test_results):
    print(model, ':', score)

In [168]:
# Fit every candidate on the training data and compute ROC AUC on the test data.
# ROC AUC ranks samples by a continuous score; feeding it hard 0/1 predictions
# collapses the curve to a single operating point and distorts the metric. Use
# the positive-class probability when the model exposes one, otherwise the
# decision-function margin (e.g. SVC without probability=True).
test_results = []
print('ROC AUC SCORES ON TEST DATA:\n')
for model in models:
    model.fit(X_train[feat], y_train)
    if hasattr(model, 'predict_proba'):
        # Column 1 corresponds to the larger class label, i.e. the positive class.
        y_score = model.predict_proba(X_test[feat])[:, 1]
    else:
        y_score = model.decision_function(X_test[feat])
    test_results.append(roc_auc_score(y_test, y_score))

# Printed after all fits so the scores are not interleaved with training logs.
for model, score in zip(models, test_results):
    print(model, ':', score)

## Hyperparameter Tuning

In [169]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [361]:
# Parameter grid for the SVC grid search; every value is a list of candidates.
hyperparameters = {
    'C': [4, 10],
    'kernel': ['poly'],
    'degree': [2, 3],
    'gamma': ['scale'],
    'coef0': [1.02, 6],
    'random_state': [42],
    # A single candidate weighting: class 1 at 1, class 0 at 0.57.
    'class_weight': [{1: 1, 0: 0.57}],
}

In [362]:
svc = SVC()
gs = GridSearchCV(estimator=svc, param_grid=hyperparameters, verbose=1, cv=5, n_jobs=-1, scoring='precision_weighted')
model1 = gs.fit(X_train[feat], y_train)

In [363]:
print(model1.best_estimator_)
print(model1.best_params_)
print(model1.scorer_, abs(model1.best_score_))

In [364]:
# Hard class predictions of the tuned model (y_pred_test is reused by later cells).
y_pred_train = model1.predict(X_train[feat])
y_pred_test = model1.predict(X_test[feat])

# Weighted precision on train, then test.
print(precision_score(y_train, y_pred_train, average='weighted'))
print(precision_score(y_test, y_pred_test, average='weighted'))

# ROC AUC needs a continuous ranking score, not 0/1 labels; the grid search
# forwards `decision_function` to the refitted SVC.
y_score_train = model1.decision_function(X_train[feat])
y_score_test = model1.decision_function(X_test[feat])

# ROC AUC on train, then test.
print(roc_auc_score(y_train, y_score_train))
print(roc_auc_score(y_test, y_score_test))

In [365]:
print(classification_report(y_test, y_pred_test, labels=sorted(model1.classes_, reverse=True)))

In [366]:
# Confusion matrix with the classes in descending order (class 1 first).
cm_labels = sorted(model1.classes_, reverse=True)
cm = confusion_matrix(y_test, y_pred_test, labels=cm_labels)

# The same label order must be given to the display: without `display_labels`
# the ticks default to 0, 1, ... and would be attached to the wrong rows/columns.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
disp.plot(cmap='Blues')
plt.tight_layout()
plt.show()