# Preprocessing Workflow


## I) Imports


In [22]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector


## 2)  Loading Data ##

In [23]:
os.getcwd()

'/Users/alexfourdan/code/git-cguinel/ufc_combatiq'

In [24]:
csv_path = os.path.join(os.getcwd(), "raw_data/")
csv_path

data = pd.read_csv(os.path.join(csv_path, 'UFC_Fight_historical_data/data.csv'))
data.head()

fighter_data = pd.read_csv(os.path.join(csv_path, 'UFC_Fight_historical_data/raw_fighter_details.csv'), index_col= 'fighter_name')

## 3) Data Exploration ##

In [25]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_rows', 200)


In [26]:
data.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Adrian Yanez,Gustavo Lopez,Chris Tognoni,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Bantamweight,0.0,0.0,...,0,1,0,0,Orthodox,170.18,177.8,135.0,31.0,27.0
1,Trevin Giles,Roman Dolidze,Herb Dean,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Middleweight,0.5,0.0,...,0,3,0,0,Orthodox,182.88,187.96,185.0,32.0,28.0
2,Tai Tuivasa,Harry Hunsucker,Herb Dean,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Heavyweight,,,...,1,3,0,0,Southpaw,187.96,190.5,264.0,32.0,28.0
3,Cheyanne Buys,Montserrat Conejo,Mark Smith,2021-03-20,"Las Vegas, Nevada, USA",Blue,False,WomenStrawweight,,,...,0,0,0,0,Switch,160.02,160.02,115.0,28.0,25.0
4,Marion Reneau,Macy Chiasson,Mark Smith,2021-03-20,"Las Vegas, Nevada, USA",Blue,False,WomenBantamweight,0.125,0.0,...,1,2,2,0,Orthodox,167.64,172.72,135.0,29.0,43.0


In [27]:
data.shape

(6012, 144)

In [28]:
data.dtypes

R_fighter                        object
B_fighter                        object
Referee                          object
date                             object
location                         object
Winner                           object
title_bout                         bool
weight_class                     object
B_avg_KD                        float64
B_avg_opp_KD                    float64
B_avg_SIG_STR_pct               float64
B_avg_opp_SIG_STR_pct           float64
B_avg_TD_pct                    float64
B_avg_opp_TD_pct                float64
B_avg_SUB_ATT                   float64
B_avg_opp_SUB_ATT               float64
B_avg_REV                       float64
B_avg_opp_REV                   float64
B_avg_SIG_STR_att               float64
B_avg_SIG_STR_landed            float64
B_avg_opp_SIG_STR_att           float64
B_avg_opp_SIG_STR_landed        float64
B_avg_TOTAL_STR_att             float64
B_avg_TOTAL_STR_landed          float64
B_avg_opp_TOTAL_STR_att         float64


In [29]:
data.columns.tolist()

['R_fighter',
 'B_fighter',
 'Referee',
 'date',
 'location',
 'Winner',
 'title_bout',
 'weight_class',
 'B_avg_KD',
 'B_avg_opp_KD',
 'B_avg_SIG_STR_pct',
 'B_avg_opp_SIG_STR_pct',
 'B_avg_TD_pct',
 'B_avg_opp_TD_pct',
 'B_avg_SUB_ATT',
 'B_avg_opp_SUB_ATT',
 'B_avg_REV',
 'B_avg_opp_REV',
 'B_avg_SIG_STR_att',
 'B_avg_SIG_STR_landed',
 'B_avg_opp_SIG_STR_att',
 'B_avg_opp_SIG_STR_landed',
 'B_avg_TOTAL_STR_att',
 'B_avg_TOTAL_STR_landed',
 'B_avg_opp_TOTAL_STR_att',
 'B_avg_opp_TOTAL_STR_landed',
 'B_avg_TD_att',
 'B_avg_TD_landed',
 'B_avg_opp_TD_att',
 'B_avg_opp_TD_landed',
 'B_avg_HEAD_att',
 'B_avg_HEAD_landed',
 'B_avg_opp_HEAD_att',
 'B_avg_opp_HEAD_landed',
 'B_avg_BODY_att',
 'B_avg_BODY_landed',
 'B_avg_opp_BODY_att',
 'B_avg_opp_BODY_landed',
 'B_avg_LEG_att',
 'B_avg_LEG_landed',
 'B_avg_opp_LEG_att',
 'B_avg_opp_LEG_landed',
 'B_avg_DISTANCE_att',
 'B_avg_DISTANCE_landed',
 'B_avg_opp_DISTANCE_att',
 'B_avg_opp_DISTANCE_landed',
 'B_avg_CLINCH_att',
 'B_avg_CLINCH_lande

In [30]:
data.describe()

Unnamed: 0,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,B_avg_opp_REV,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
count,4585.0,4585.0,4585.0,4585.0,4585.0,4585.0,4585.0,4585.0,4585.0,4585.0,...,6012.0,6012.0,6012.0,6012.0,6012.0,6008.0,5606.0,6010.0,5840.0,5949.0
mean,0.247476,0.176818,0.45331,0.43429,0.29265,0.268742,0.478884,0.409276,0.15673,0.141306,...,0.306886,1.251331,1.297572,0.796241,0.069195,178.991788,183.562347,170.686356,29.293151,29.598252
std,0.378509,0.324633,0.130458,0.132618,0.273628,0.267178,0.724229,0.653826,0.34199,0.322623,...,0.619485,1.681376,1.859955,1.355801,0.27398,8.812985,10.585313,35.724626,4.063297,4.167755
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,152.4,152.4,115.0,18.0,19.0
25%,0.0,0.0,0.376489,0.351045,0.03125,0.033203,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,172.72,177.8,145.0,26.0,27.0
50%,0.015625,0.0,0.45,0.4275,0.25,0.2,0.148468,0.098389,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,180.34,182.88,170.0,29.0,29.0
75%,0.5,0.25,0.5275,0.51,0.5,0.422812,0.75,0.53125,0.125,0.125,...,0.0,2.0,2.0,1.0,0.0,185.42,190.5,185.0,32.0,32.0
max,5.0,3.0,1.0,1.0,1.0,1.0,8.0,7.0,4.0,3.0,...,5.0,10.0,11.0,13.0,2.0,210.82,213.36,345.0,51.0,47.0


In [31]:
data.duplicated().sum()

0

In [32]:
data.isnull().sum().sort_values(ascending=False).head(100)

B_avg_opp_GROUND_att            1427
B_avg_opp_TD_att                1427
B_avg_HEAD_att                  1427
B_avg_HEAD_landed               1427
B_avg_opp_HEAD_att              1427
B_avg_opp_HEAD_landed           1427
B_avg_BODY_att                  1427
B_avg_BODY_landed               1427
B_avg_opp_BODY_att              1427
B_avg_opp_BODY_landed           1427
B_avg_LEG_landed                1427
B_avg_opp_LEG_att               1427
B_avg_opp_LEG_landed            1427
B_avg_DISTANCE_att              1427
B_avg_DISTANCE_landed           1427
B_avg_opp_DISTANCE_att          1427
B_avg_opp_DISTANCE_landed       1427
B_avg_CLINCH_att                1427
B_avg_CLINCH_landed             1427
B_avg_opp_CLINCH_att            1427
B_avg_opp_CLINCH_landed         1427
B_avg_GROUND_att                1427
B_avg_GROUND_landed             1427
B_avg_opp_GROUND_landed         1427
B_avg_CTRL_time(seconds)        1427
B_avg_opp_CTRL_time(seconds)    1427
B_total_time_fought(seconds)    1427
B

## 4) Preprocessing ##

In [33]:
data_preproc = data.copy()

#### Removing all non-essential columns ####

In [34]:
data_preproc.drop(columns=['Referee', 'location'], inplace=True)

In [35]:
data_preproc.dtypes

R_fighter                        object
B_fighter                        object
date                             object
Winner                           object
title_bout                         bool
weight_class                     object
B_avg_KD                        float64
B_avg_opp_KD                    float64
B_avg_SIG_STR_pct               float64
B_avg_opp_SIG_STR_pct           float64
B_avg_TD_pct                    float64
B_avg_opp_TD_pct                float64
B_avg_SUB_ATT                   float64
B_avg_opp_SUB_ATT               float64
B_avg_REV                       float64
B_avg_opp_REV                   float64
B_avg_SIG_STR_att               float64
B_avg_SIG_STR_landed            float64
B_avg_opp_SIG_STR_att           float64
B_avg_opp_SIG_STR_landed        float64
B_avg_TOTAL_STR_att             float64
B_avg_TOTAL_STR_landed          float64
B_avg_opp_TOTAL_STR_att         float64
B_avg_opp_TOTAL_STR_landed      float64
B_avg_TD_att                    float64


In [36]:
data_preproc.head()

Unnamed: 0,R_fighter,B_fighter,date,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Adrian Yanez,Gustavo Lopez,2021-03-20,Red,False,Bantamweight,0.0,0.0,0.42,0.495,...,0,1,0,0,Orthodox,170.18,177.8,135.0,31.0,27.0
1,Trevin Giles,Roman Dolidze,2021-03-20,Red,False,Middleweight,0.5,0.0,0.66,0.305,...,0,3,0,0,Orthodox,182.88,187.96,185.0,32.0,28.0
2,Tai Tuivasa,Harry Hunsucker,2021-03-20,Red,False,Heavyweight,,,,,...,1,3,0,0,Southpaw,187.96,190.5,264.0,32.0,28.0
3,Cheyanne Buys,Montserrat Conejo,2021-03-20,Blue,False,WomenStrawweight,,,,,...,0,0,0,0,Switch,160.02,160.02,115.0,28.0,25.0
4,Marion Reneau,Macy Chiasson,2021-03-20,Blue,False,WomenBantamweight,0.125,0.0,0.535625,0.57875,...,1,2,2,0,Orthodox,167.64,172.72,135.0,29.0,43.0


In [37]:
data_preproc['R_Reach_cms']

0       177.80
1       187.96
2       190.50
3       160.02
4       172.72
         ...  
6007       NaN
6008       NaN
6009       NaN
6010       NaN
6011       NaN
Name: R_Reach_cms, Length: 6012, dtype: float64

In [38]:
data_preproc['R_Height_cms']

0       170.18
1       182.88
2       187.96
3       160.02
4       167.64
         ...  
6007    190.50
6008    177.80
6009    182.88
6010    187.96
6011    177.80
Name: R_Height_cms, Length: 6012, dtype: float64

In [42]:
data_preproc['R_Reach_cms'].fillna(data_preproc['R_Height_cms'], inplace=True)
data_preproc['B_Reach_cms'].fillna(data_preproc['B_Height_cms'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_preproc['B_Reach_cms'].fillna(data_preproc['B_Height_cms'], inplace=True)


In [43]:
numerical_columns = data_preproc.select_dtypes(include='number')
data_preproc[numerical_columns.columns] = numerical_columns.fillna(numerical_columns())

TypeError: 'DataFrame' object is not callable

In [44]:
data_preproc.isnull().sum().sort_values(ascending=False).head(100)


B_avg_opp_GROUND_landed         1427
B_avg_DISTANCE_landed           1427
B_avg_HEAD_landed               1427
B_avg_opp_HEAD_att              1427
B_avg_opp_HEAD_landed           1427
B_avg_BODY_att                  1427
B_avg_BODY_landed               1427
B_avg_opp_BODY_att              1427
B_avg_opp_BODY_landed           1427
B_avg_LEG_att                   1427
B_avg_opp_LEG_att               1427
B_avg_opp_LEG_landed            1427
B_avg_DISTANCE_att              1427
B_avg_opp_DISTANCE_att          1427
B_avg_opp_TD_landed             1427
B_avg_opp_DISTANCE_landed       1427
B_avg_CLINCH_att                1427
B_avg_CLINCH_landed             1427
B_avg_opp_CLINCH_att            1427
B_avg_opp_CLINCH_landed         1427
B_avg_GROUND_att                1427
B_avg_GROUND_landed             1427
B_avg_opp_GROUND_att            1427
B_avg_CTRL_time(seconds)        1427
B_avg_opp_CTRL_time(seconds)    1427
B_total_time_fought(seconds)    1427
B_avg_HEAD_att                  1427
B

In [45]:
data_preproc['B_Stance'].value_counts()

B_Stance
Orthodox       4530
Southpaw       1165
Switch          238
Open Stance       9
Sideways          4
Name: count, dtype: int64

In [46]:
data_preproc['R_Stance'].value_counts()

R_Stance
Orthodox       4538
Southpaw       1231
Switch          197
Open Stance      15
Sideways          2
Name: count, dtype: int64

In [47]:
data_preproc['R_Stance'].fillna('Orthodox', inplace=True)
data_preproc['B_Stance'].fillna('Orthodox', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_preproc['R_Stance'].fillna('Orthodox', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_preproc['B_Stance'].fillna('Orthodox', inplace=True)


In [48]:
data_preproc.isnull().sum()

R_fighter                          0
B_fighter                          0
date                               0
Winner                             0
title_bout                         0
weight_class                       0
B_avg_KD                        1427
B_avg_opp_KD                    1427
B_avg_SIG_STR_pct               1427
B_avg_opp_SIG_STR_pct           1427
B_avg_TD_pct                    1427
B_avg_opp_TD_pct                1427
B_avg_SUB_ATT                   1427
B_avg_opp_SUB_ATT               1427
B_avg_REV                       1427
B_avg_opp_REV                   1427
B_avg_SIG_STR_att               1427
B_avg_SIG_STR_landed            1427
B_avg_opp_SIG_STR_att           1427
B_avg_opp_SIG_STR_landed        1427
B_avg_TOTAL_STR_att             1427
B_avg_TOTAL_STR_landed          1427
B_avg_opp_TOTAL_STR_att         1427
B_avg_opp_TOTAL_STR_landed      1427
B_avg_TD_att                    1427
B_avg_TD_landed                 1427
B_avg_opp_TD_att                1427
B

## 5) Encoding Winner ##


In [49]:
data_preproc['Winner'].value_counts()

Winner
Red     3979
Blue    1923
Draw     110
Name: count, dtype: int64

#### a) Option 1: Reassign Draws with Blue Winners ####

In [50]:
data_preproc.loc[data_preproc['Winner']== 'Draw', 'Winner'] == 'Blue'

12      False
14      False
39      False
132     False
208     False
236     False
263     False
271     False
307     False
337     False
348     False
401     False
435     False
485     False
546     False
598     False
602     False
615     False
659     False
694     False
714     False
716     False
727     False
733     False
822     False
1095    False
1178    False
1195    False
1227    False
1369    False
1565    False
1568    False
1613    False
1695    False
1696    False
1761    False
1880    False
1939    False
1941    False
1987    False
1994    False
2027    False
2099    False
2116    False
2186    False
2206    False
2252    False
2283    False
2334    False
2360    False
2485    False
2522    False
2547    False
2613    False
2638    False
2795    False
2892    False
2910    False
2944    False
2977    False
3009    False
3097    False
3192    False
3233    False
3244    False
3375    False
3380    False
3411    False
3455    False
3495    False
3515    False
3566  

#### b) Option 2: Drop the rows with Draws since not very likely

In [51]:
data_preproc.drop(data_preproc.index[data_preproc['Winner'] == 'Draw'], inplace = True)

In [52]:
data_preproc['Winner'].value_counts()

Winner
Red     3979
Blue    1923
Name: count, dtype: int64

## 6) One hot encoding Categorical Features ##

### a) Weight Class ###

In [53]:
data_preproc['weight_class'].unique()

array(['Bantamweight', 'Middleweight', 'Heavyweight', 'WomenStrawweight',
       'WomenBantamweight', 'Lightweight', 'Welterweight', 'Flyweight',
       'LightHeavyweight', 'Featherweight', 'WomenFlyweight',
       'WomenFeatherweight', 'CatchWeight', 'OpenWeight'], dtype=object)

In [54]:
ohe = OneHotEncoder(sparse_output= False)
ohe.fit(data_preproc[["weight_class"]])
weight_class_encoded = ohe.transform(data_preproc[['weight_class']])

data_preproc["Bantamweight"], data_preproc['Middleweight'], data_preproc['Heavyweight'], data_preproc['WomenStrawweight'], data_preproc['WomenBantamweight'], data_preproc['Lightweight'], data_preproc['Welterweight'],data_preproc['Flyweight'], data_preproc['LightHeavyweight'], data_preproc['Featherweight'],data_preproc['WomenFlyweight'], data_preproc['WomenFeatherweight'], data_preproc['CatchWeight'],data_preproc['OpenWeight']  = weight_class_encoded.T
data_preproc.head(10)

data_preproc.drop(columns='weight_class', inplace = True)

### b) B Stance ###

In [55]:
data_preproc['B_Stance'].unique()

array(['Orthodox', 'Southpaw', 'Switch', 'Open Stance', 'Sideways'],
      dtype=object)

In [56]:
ohe = OneHotEncoder(sparse_output= False)
ohe.fit(data_preproc[["B_Stance"]])
b_stance_encoded = ohe.transform(data_preproc[['B_Stance']])


data_preproc["B_Stance_Orthodox"], data_preproc['B_Stance_Southpaw'], data_preproc['B_Stance_Switch'], data_preproc['B_Stance_Open Stance'], data_preproc['B_Stance_Sideways']  = b_stance_encoded.T
data_preproc.head(10)

data_preproc.drop(columns='B_Stance', inplace = True)

In [57]:
ohe = OneHotEncoder(sparse_output= False)
ohe.fit(data_preproc[["R_Stance"]])
r_stance_encoded = ohe.transform(data_preproc[['R_Stance']])


data_preproc["R_Stance_Orthodox"], data_preproc['R_Stance_Southpaw'], data_preproc['R_Stance_Switch'], data_preproc['R_Stance_Open Stance'], data_preproc['R_Stance_Sideways']  = r_stance_encoded.T
data_preproc.head(10)

data_preproc.drop(columns='R_Stance', inplace = True)

In [58]:
data_preproc.head()


Unnamed: 0,R_fighter,B_fighter,date,Winner,title_bout,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,...,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,B_Stance_Open Stance,B_Stance_Sideways,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch,R_Stance_Open Stance,R_Stance_Sideways
0,Adrian Yanez,Gustavo Lopez,2021-03-20,Red,False,0.0,0.0,0.42,0.495,0.33,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,Trevin Giles,Roman Dolidze,2021-03-20,Red,False,0.5,0.0,0.66,0.305,0.3,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,Tai Tuivasa,Harry Hunsucker,2021-03-20,Red,False,,,,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Cheyanne Buys,Montserrat Conejo,2021-03-20,Blue,False,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Marion Reneau,Macy Chiasson,2021-03-20,Blue,False,0.125,0.0,0.535625,0.57875,0.185,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## 7) Saving the data ##


In [59]:
data_preproc.to_csv(os.path.join(csv_path, 'data_preprocessed_file.csv'), index=False)

# Pipeline Preprocessing

In [60]:
num_transformer = make_pipeline(SimpleImputer(strategy="Median"))
num_col = make_column_selector(dtype_include=['float64'])

cat_transformer = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object', 'bool'])

preproc_pipe = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough'
)

preproc_pipe

# Data Manipulation



Loading preprocessed dataset as preproc_data


In [61]:
preproc_data = pd.read_csv(os.path.join(csv_path, 'data_preprocessed_file.csv'))

In [62]:
preproc_data.columns.tolist()

['R_fighter',
 'B_fighter',
 'date',
 'Winner',
 'title_bout',
 'B_avg_KD',
 'B_avg_opp_KD',
 'B_avg_SIG_STR_pct',
 'B_avg_opp_SIG_STR_pct',
 'B_avg_TD_pct',
 'B_avg_opp_TD_pct',
 'B_avg_SUB_ATT',
 'B_avg_opp_SUB_ATT',
 'B_avg_REV',
 'B_avg_opp_REV',
 'B_avg_SIG_STR_att',
 'B_avg_SIG_STR_landed',
 'B_avg_opp_SIG_STR_att',
 'B_avg_opp_SIG_STR_landed',
 'B_avg_TOTAL_STR_att',
 'B_avg_TOTAL_STR_landed',
 'B_avg_opp_TOTAL_STR_att',
 'B_avg_opp_TOTAL_STR_landed',
 'B_avg_TD_att',
 'B_avg_TD_landed',
 'B_avg_opp_TD_att',
 'B_avg_opp_TD_landed',
 'B_avg_HEAD_att',
 'B_avg_HEAD_landed',
 'B_avg_opp_HEAD_att',
 'B_avg_opp_HEAD_landed',
 'B_avg_BODY_att',
 'B_avg_BODY_landed',
 'B_avg_opp_BODY_att',
 'B_avg_opp_BODY_landed',
 'B_avg_LEG_att',
 'B_avg_LEG_landed',
 'B_avg_opp_LEG_att',
 'B_avg_opp_LEG_landed',
 'B_avg_DISTANCE_att',
 'B_avg_DISTANCE_landed',
 'B_avg_opp_DISTANCE_att',
 'B_avg_opp_DISTANCE_landed',
 'B_avg_CLINCH_att',
 'B_avg_CLINCH_landed',
 'B_avg_opp_CLINCH_att',
 'B_avg_opp_C

### Fighter Data Scraping ###


In [63]:
df = preproc_data.copy()

In [64]:
df.head()

Unnamed: 0,R_fighter,B_fighter,date,Winner,title_bout,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,...,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,B_Stance_Open Stance,B_Stance_Sideways,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch,R_Stance_Open Stance,R_Stance_Sideways
0,Adrian Yanez,Gustavo Lopez,2021-03-20,Red,False,0.0,0.0,0.42,0.495,0.33,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,Trevin Giles,Roman Dolidze,2021-03-20,Red,False,0.5,0.0,0.66,0.305,0.3,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,Tai Tuivasa,Harry Hunsucker,2021-03-20,Red,False,,,,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Cheyanne Buys,Montserrat Conejo,2021-03-20,Blue,False,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Marion Reneau,Macy Chiasson,2021-03-20,Blue,False,0.125,0.0,0.535625,0.57875,0.185,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [65]:
# Select only features describing a fighter by prefix R or B
blue = df[[col for col in df.columns if col.startswith('B_') or col.startswith('date')]]
red = df[[col for col in df.columns if col.startswith('R_') or col.startswith('date')]]

# Remove prefixes in column names
blue.columns = [col.replace('B_', '') if col.startswith('B_') else col for col in blue.columns]
red.columns = [col.replace('R_', '') if col.startswith('R_') else col for col in red.columns]

# Concat all fighters
all_fighters = pd.concat([blue,red])

# Keep only first occurence of fighter (most recent stats)
all_fighters = all_fighters.drop_duplicates(subset='fighter', keep='first')

# List all columns with  val
(all_fighters.isnull().sum()/len(all_fighters)).sort_values().head(60)

fighter                       0.000000
total_rounds_fought           0.000000
total_title_bouts             0.000000
current_win_streak            0.000000
current_lose_streak           0.000000
longest_win_streak            0.000000
wins                          0.000000
draw                          0.000000
win_by_Decision_Majority      0.000000
win_by_Decision_Split         0.000000
win_by_Decision_Unanimous     0.000000
win_by_KO/TKO                 0.000000
losses                        0.000000
win_by_TKO_Doctor_Stoppage    0.000000
win_by_Submission             0.000000
Stance_Sideways               0.000000
Stance_Open Stance            0.000000
Stance_Switch                 0.000000
Stance_Southpaw               0.000000
date                          0.000000
Stance_Orthodox               0.000000
Weight_lbs                    0.004690
Reach_cms                     0.005629
Height_cms                    0.006098
age                           0.065666
avg_opp_DISTANCE_landed  

In [66]:
all_fighters = all_fighters.dropna(axis=1)


In [67]:
features = all_fighters.columns

# Select columns that contain any of the patterns
filtered_columns = [col for col in df.columns if any(feature in col for feature in features)]

filtered_df = df[filtered_columns+['Winner']]

filtered_df.sample(10)

Unnamed: 0,R_fighter,B_fighter,date,B_total_rounds_fought,B_total_title_bouts,B_current_win_streak,B_current_lose_streak,B_longest_win_streak,B_wins,B_losses,...,B_Stance_Southpaw,B_Stance_Switch,B_Stance_Open Stance,B_Stance_Sideways,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch,R_Stance_Open Stance,R_Stance_Sideways,Winner
3668,Lucas Martins,Jeremy Larsen,2013-05-18,1,0,0,1,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Red
4208,Jose Aldo,Kenny Florian,2011-10-08,37,3,0,1,6,12,4,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,Red
1988,Alan Jouban,Mike Perry,2016-12-17,4,0,2,0,2,2,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Red
4332,Michael McDonald,Chris Cariaso,2011-05-28,3,0,1,0,1,1,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,Red
597,Antonio Arroyo,Andre Muniz,2019-11-16,0,0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,Blue
1686,Krzysztof Jotko,Uriah Hall,2017-09-16,22,1,0,2,3,5,6,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Blue
2331,Jon Jones,Ovince Saint Preux,2016-04-23,20,0,4,0,4,7,2,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,Red
1054,Glover Teixeira,Karl Roberson,2019-01-19,5,0,1,0,1,2,1,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,Red
3187,Aleksei Oleinik,Anthony Hamilton,2014-06-28,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Red
1246,Drew Dober,Jon Tuck,2018-08-25,20,0,1,0,1,4,4,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Red


In [68]:
X = filtered_df.drop(columns=['R_fighter','B_fighter','Winner','date'])
y = filtered_df['Winner']


filtered_df.head()

Unnamed: 0,R_fighter,B_fighter,date,B_total_rounds_fought,B_total_title_bouts,B_current_win_streak,B_current_lose_streak,B_longest_win_streak,B_wins,B_losses,...,B_Stance_Southpaw,B_Stance_Switch,B_Stance_Open Stance,B_Stance_Sideways,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch,R_Stance_Open Stance,R_Stance_Sideways,Winner
0,Adrian Yanez,Gustavo Lopez,2021-03-20,4,0,0,1,1,1,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Red
1,Trevin Giles,Roman Dolidze,2021-03-20,4,0,2,0,2,2,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Red
2,Tai Tuivasa,Harry Hunsucker,2021-03-20,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Red
3,Cheyanne Buys,Montserrat Conejo,2021-03-20,0,0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,Blue
4,Marion Reneau,Macy Chiasson,2021-03-20,11,1,3,0,3,4,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Blue


In [69]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

cv_results = cross_validate(LogisticRegression(max_iter=10000), X,y, cv=10)

cv_results['test_score'].mean()

0.6730018067624538

## User Interaction


In [70]:
model = LogisticRegression(max_iter=10000).fit(X,y)

In [71]:
all_fighters['fighter'].value_counts().sort_values()

fighter
Gustavo Lopez        1
Montserrat Conejo    1
Macy Chiasson        1
Grant Dawson         1
Max Griffin          1
                    ..
Josh Bryant          1
Jamie Yager          1
James McSweeney      1
Kazushi Sakuraba     1
Frank Hamaker        1
Name: count, Length: 2132, dtype: int64

In [86]:
fighter_red = "Colby Covington"
fighter_blue = 'Kamaru Usman'

In [87]:
X_blue = all_fighters[all_fighters['fighter'] == fighter_blue]
X_blue.columns = ["B_"+col for col in X_blue.columns]
X_blue

Unnamed: 0,B_fighter,B_date,B_total_rounds_fought,B_total_title_bouts,B_current_win_streak,B_current_lose_streak,B_longest_win_streak,B_wins,B_losses,B_draw,...,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,B_Stance_Open Stance,B_Stance_Sideways
994,Kamaru Usman,2019-03-02,28,1,9,0,9,9,0,0,...,0,7,1,1,0,0.0,0.0,0.0,0.0,1.0


In [88]:
X_red = all_fighters[all_fighters['fighter'] == fighter_red]
X_red.columns = ["R_"+col for col in X_red.columns]
X_red

Unnamed: 0,R_fighter,R_date,R_total_rounds_fought,R_total_title_bouts,R_current_win_streak,R_current_lose_streak,R_longest_win_streak,R_wins,R_losses,R_draw,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch,R_Stance_Open Stance,R_Stance_Sideways
575,Colby Covington,2019-12-14,33,1,3,0,7,10,1,0,...,0,6,2,2,0,0.0,1.0,0.0,0.0,0.0


In [89]:
new_column_names = pd.concat([X_red, X_blue]).columns
data = list(X_red.iloc[0]) + list(X_blue.iloc[0])

new_df = pd.DataFrame(data).T
new_df.columns = new_column_names

new_df = new_df[X.columns]
new_df

Unnamed: 0,B_total_rounds_fought,B_total_title_bouts,B_current_win_streak,B_current_lose_streak,B_longest_win_streak,B_wins,B_losses,B_draw,B_win_by_Decision_Majority,B_win_by_Decision_Split,...,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,B_Stance_Open Stance,B_Stance_Sideways,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch,R_Stance_Open Stance,R_Stance_Sideways
0,28,1,9,0,9,9,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [90]:
model.predict(new_df)

array(['Blue'], dtype=object)

In [None]:
df2 = df.copy()

In [None]:
df2.drop(columns=['R_date', 'B_date'], inplace=True)

KeyError: "['R_fighter', 'B_fighter', 'R_date', 'B_date'] not found in axis"

## 7) Feature selection trying out different methods of feature selection ##

In [None]:
correlation_matrix = df2.corr()
column_names = correlation_matrix.columns
sns.heatmap(correlation_matrix, xticklabels=column_names, yticklabels=column_names, cmap='bwr')

ValueError: could not convert string to float: '2021-03-20'

In [92]:
corr_df = correlation_matrix.unstack().reset_index()

corr_df.columns = ['feature_1','feature_2', 'correlation']

no_self_correlation = (corr_df['feature_1'] != corr_df['feature_2'])
corr_df = corr_df[no_self_correlation]
corr_df['absolute_correlation'] = np.abs(corr_df['correlation'])


corr_df.sort_values(by="absolute_correlation", ascending=False).head(30)

NameError: name 'correlation_matrix' is not defined

In [None]:
corr_df[corr_df['feature_1']=='Winner'].sort_values(by="absolute_correlation", ascending = False)

Unnamed: 0,feature_1,feature_2,correlation,absolute_correlation
123,Winner,R_losses,-0.159319,0.159319
135,Winner,R_age,-0.156431,0.156431
81,Winner,R_avg_opp_SIG_STR_landed,-0.144787,0.144787
93,Winner,R_avg_opp_HEAD_landed,-0.141462,0.141462
105,Winner,R_avg_opp_DISTANCE_landed,-0.133272,0.133272
...,...,...,...,...
6,Winner,B_avg_TD_pct,0.002260,0.002260
10,Winner,B_avg_REV,-0.001890,0.001890
11,Winner,B_avg_opp_REV,-0.001193,0.001193
58,Winner,B_draw,,


In [None]:
data_preproc['age_difference'] = (data_preproc['B_age'] - data_preproc['R_age'])
data_preproc

Unnamed: 0,Winner,title_bout,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch,age_difference
0,1.0,False,0.000000,0.0,0.420000,0.49500,0.330,0.36000,0.500000,1.000000,...,1,0,0,0,0,1,0,0,0,4.0
1,1.0,False,0.500000,0.0,0.660000,0.30500,0.300,0.50000,1.500000,0.000000,...,1,0,0,0,0,1,0,0,0,4.0
2,1.0,False,0.015625,0.0,0.450000,0.42750,0.250,0.20000,0.148468,0.098389,...,1,0,0,0,0,0,0,1,0,4.0
3,0.0,False,0.015625,0.0,0.450000,0.42750,0.250,0.20000,0.148468,0.098389,...,0,0,1,0,0,0,0,0,1,3.0
4,0.0,False,0.125000,0.0,0.535625,0.57875,0.185,0.16625,0.125000,0.187500,...,1,0,0,0,0,1,0,0,0,-14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5897,1.0,False,0.015625,0.0,0.450000,0.42750,0.250,0.20000,0.148468,0.098389,...,1,0,0,0,0,0,0,1,0,0.0
5898,1.0,False,0.015625,0.0,0.450000,0.42750,0.250,0.20000,0.148468,0.098389,...,1,0,0,0,0,0,0,1,0,0.0
5899,1.0,False,0.015625,0.0,0.450000,0.42750,0.250,0.20000,0.148468,0.098389,...,1,0,0,0,0,1,0,0,0,0.0
5900,1.0,False,0.015625,0.0,0.450000,0.42750,0.250,0.20000,0.148468,0.098389,...,1,0,0,0,0,1,0,0,0,-1.0


In [None]:
corr_df[corr_df['feature_1']=='Winner'].sort_values(by="absolute_correlation", ascending = False).head(30)

Unnamed: 0,feature_1,feature_2,correlation,absolute_correlation
123,Winner,R_losses,-0.159319,0.159319
135,Winner,R_age,-0.156431,0.156431
81,Winner,R_avg_opp_SIG_STR_landed,-0.144787,0.144787
93,Winner,R_avg_opp_HEAD_landed,-0.141462,0.141462
105,Winner,R_avg_opp_DISTANCE_landed,-0.133272,0.133272
37,Winner,B_avg_DISTANCE_landed,-0.126577,0.126577
126,Winner,R_win_by_Decision_Split,-0.126547,0.126547
13,Winner,B_avg_SIG_STR_landed,-0.12599,0.12599
12,Winner,B_avg_SIG_STR_att,-0.125394,0.125394
117,Winner,R_total_rounds_fought,-0.124428,0.124428


In [None]:
y = data_preproc['Winner']
y

0       1.0
1       1.0
2       1.0
3       0.0
4       0.0
       ... 
5897    1.0
5898    1.0
5899    1.0
5900    1.0
5901    1.0
Name: Winner, Length: 5902, dtype: float64

## 7) Trying to reduce number of columns to relevant ones ##

In [None]:
X = data_preproc[[
                        'title_bout',

                        'R_avg_KD',
                        'B_avg_KD',


                        'R_avg_SIG_STR_pct',
                        'B_avg_SIG_STR_pct',

                        'R_avg_opp_SIG_STR_pct',
                        'B_avg_opp_SIG_STR_pct',

                        'R_avg_TD_pct',
                        'B_avg_TD_pct',

                        'R_avg_SUB_ATT',
                        'B_avg_SUB_ATT',


                        'B_avg_opp_SUB_ATT',
                        'R_avg_opp_SUB_ATT',



                        'B_avg_TOTAL_STR_att',
                        'R_avg_TOTAL_STR_att',

                        'B_avg_TOTAL_STR_landed',
                        'R_avg_TOTAL_STR_landed',



                        'B_avg_HEAD_att',
                        'B_avg_HEAD_landed',
                        'B_avg_BODY_att',
                        'B_avg_BODY_landed',
                        'B_avg_LEG_att',
                        'B_avg_LEG_landed',
                        'B_avg_DISTANCE_att',
                        'B_avg_DISTANCE_landed',
                        'B_avg_CLINCH_att',
                        'B_avg_CLINCH_landed',
                        'B_avg_GROUND_att',
                        'B_avg_GROUND_landed',
                        'B_avg_CTRL_time(seconds)',
                        'B_total_time_fought(seconds)',
                        'B_total_rounds_fought',
                        'B_total_title_bouts',
                        'B_current_win_streak',
                        'B_current_lose_streak',
                        'B_longest_win_streak',
                        'B_wins',
                        'B_losses',
                        'B_draw',
                        'B_Height_cms',
                        'B_Reach_cms',
                        'B_Weight_lbs',


                        'R_avg_HEAD_att',
                        'R_avg_HEAD_landed',
                        'R_avg_BODY_att',
                        'R_avg_BODY_landed',
                        'R_avg_LEG_att',
                        'R_avg_LEG_landed',
                        'R_avg_DISTANCE_att',
                        'R_avg_DISTANCE_landed',
                        'R_avg_CLINCH_att',
                        'R_avg_CLINCH_landed',
                        'R_avg_GROUND_att',
                        'R_avg_GROUND_landed',
                        'R_avg_CTRL_time(seconds)',
                        'R_total_time_fought(seconds)',
                        'R_total_rounds_fought',
                        'R_total_title_bouts',
                        'R_current_win_streak',
                        'R_current_lose_streak',
                        'R_longest_win_streak',
                        'R_wins',
                        'R_losses',
                        'R_draw',
                        'R_Height_cms',
                        'R_Reach_cms',
                        'R_Weight_lbs',

                        'B_age',
                        'R_age',

                        'R_Stance_Open Stance',
                        'B_Stance_Open Stance',

                        'R_Stance_Orthodox',
                        'B_Stance_Orthodox',

                        'R_Stance_Sideways',
                        'B_Stance_Sideways',

                        'R_Stance_Southpaw',
                        'B_Stance_Southpaw',

                        'R_Stance_Switch',
                        'B_Stance_Switch',

                        'age_difference'
                        ]]

In [None]:
log_reg = LogisticRegression(max_iter = 1000)
log_reg

In [None]:
scores = cross_val_score(log_reg, X, y, cv=10)

scores.mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.67165820642978

## 8) Trying to compare with whole dataset ##

In [None]:
X1 = data_preproc.drop(columns=['Winner'])

In [None]:
log_reg_x1 = LogisticRegression(max_iter = 1000)
log_reg_x1

In [None]:

scores_x1 = cross_val_score(log_reg_x1, X1, y, cv=10)

scores_x1.mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.6726757291577046

## 9) Feature Permutations on simplified df ##


In [None]:
log_model = LogisticRegression().fit(X,y)
permutation_score = permutation_importance(log_model, X, y, n_repeats=10)

permutation_score

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'importances_mean': array([-1.69434090e-05,  1.69434090e-05,  5.08302270e-05,  0.00000000e+00,
         0.00000000e+00,  1.69434090e-05,  0.00000000e+00,  2.37207726e-04,
         6.77736361e-05, -1.18603863e-04,  2.71094544e-04, -1.35547272e-04,
         0.00000000e+00,  3.54117248e-03,  7.20094883e-03,  4.40528634e-04,
         7.40426974e-03,  2.91426635e-03,  2.45679431e-03,  3.04981362e-04,
         1.69434090e-05,  1.86377499e-04, -2.03320908e-04,  9.09861064e-03,
         2.01626567e-03, -1.18603863e-04,  6.60792952e-04,  1.01660454e-03,
         1.69434090e-03,  4.57472043e-04,  1.05049136e-03,  2.15181294e-03,
         1.18603863e-04,  3.72754998e-04,  6.77736361e-05, -1.18603863e-04,
         4.74415452e-04,  4.40528634e-04,  0.00000000e+00,  3.82921044e-03,
         2.06201288e-02,  2.55845476e-03,  2.10606574e-02,  1.15215181e-03,
         2.71094544e-03,  3.21924771e-04, -5.08302270e-04, -1.38935954e-03,
         2.41951881e-02,  9.31887496e-04, -1.11022302e-17,  7.116231

In [None]:
importance_df = pd.DataFrame(np.vstack((X.columns,
                                       permutation_score.importances_mean)).T)

importance_df.columns=['feature', 'score_decrease']


In [None]:

importance_df.sort_values(by="score_decrease", ascending = False)

Unnamed: 0,feature,score_decrease
48,R_avg_DISTANCE_att,0.024195
42,R_avg_HEAD_att,0.021061
40,B_Reach_cms,0.02062
23,B_avg_DISTANCE_att,0.009099
79,age_difference,0.008556
...,...,...
67,B_age,-0.000237
57,R_total_title_bouts,-0.000288
46,R_avg_LEG_att,-0.000508
60,R_longest_win_streak,-0.000661


## 10) Feature Permutations on Whole Dataset ##

In [None]:
log_model_x1 = LogisticRegression().fit(X1,y)
permutation_score_x1 = permutation_importance(log_model_x1, X1, y, n_repeats=10)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
importance_df_x1 = pd.DataFrame(np.vstack((X1.columns,
                                       permutation_score_x1.importances_mean)).T)

importance_df_x1.columns=['feature', 'score_decrease']

In [None]:
importance_df_x1.sort_values(by="score_decrease", ascending = False).tail()

Unnamed: 0,feature,score_decrease
41,B_avg_opp_CLINCH_att,-0.001457
26,B_avg_opp_HEAD_landed,-0.001542
50,B_total_rounds_fought,-0.001593
77,R_avg_SIG_STR_att,-0.001644
36,B_avg_DISTANCE_landed,-0.001677
