<h1>After Scraping: Cleaning and Feature Engineering</h1>

- Acknowledgements:
    - ufcstats for comprehensive data sets on past MMA bouts: http://ufcstats.com/
    - Rajeev Warrier for providing the groundwork for this prediction project: https://github.com/WarrierRajeev/UFC-Predictions

In [2352]:
import pandas as pd
import numpy as np

DATA_PATH ='./data'
df_fighters = pd.read_csv(DATA_PATH+'/fighter_details_20200510.csv')
df_fights = pd.read_csv(DATA_PATH+'/total_fight_data_20200510.csv', sep=';')

In [2353]:
df_fighters.head(3)

Unnamed: 0,fighter_name,Height,Weight,Reach,Stance,DOB
0,Tom Aaron,,155 lbs.,,,"Jul 13, 1978"
1,Danny Abbadi,"5' 11""",155 lbs.,,Orthodox,"Jul 03, 1983"
2,David Abbott,"6' 0""",265 lbs.,,Switch,


In [2354]:
df_fights.head(3)

Unnamed: 0,R_fighter,B_fighter,R_KD,B_KD,R_SIG_STR.,B_SIG_STR.,R_SIG_STR_pct,B_SIG_STR_pct,R_TOTAL_STR.,B_TOTAL_STR.,...,B_GROUND,win_by,last_round,last_round_time,Format,Referee,date,location,Fight_type,Winner
0,Israel Adesanya,Yoel Romero,0,0,48 of 132,40 of 89,36%,44%,48 of 132,40 of 89,...,0 of 0,Decision - Unanimous,5,5:00,5 Rnd (5-5-5-5-5),Dan Miragliotta,"March 07, 2020","Las Vegas, Nevada, USA",UFC Middleweight Title Bout,Israel Adesanya
1,Weili Zhang,Joanna Jedrzejczyk,0,0,165 of 408,186 of 360,40%,51%,170 of 413,196 of 370,...,0 of 0,Decision - Split,5,5:00,5 Rnd (5-5-5-5-5),Keith Peterson,"March 07, 2020","Las Vegas, Nevada, USA",UFC Women's Strawweight Title Bout,Weili Zhang
2,Beneil Dariush,Drakkar Klose,1,0,12 of 20,15 of 25,60%,60%,17 of 28,27 of 37,...,0 of 0,KO/TKO,2,1:00,3 Rnd (5-5-5),Jason Herzog,"March 07, 2020","Las Vegas, Nevada, USA",Lightweight Bout,Beneil Dariush


<h3>Processing Fighter data set</h3> 

In [2355]:
df_fighters.dtypes

fighter_name    object
Height          object
Weight          object
Reach           object
Stance          object
DOB             object
dtype: object

In [2356]:
df_fighters.isna().sum()

fighter_name       0
Height           257
Weight            74
Reach           1714
Stance           840
DOB              741
dtype: int64

- fighters with NaN Weight values have little to no useful data
    - therefore, these rows will be excluded

In [2357]:
df_fighters[pd.isnull(df_fighters['Weight'])].isna().sum()

fighter_name     0
Height          68
Weight          74
Reach           74
Stance          65
DOB             72
dtype: int64

In [2358]:
df_fighters = df_fighters[df_fighters['Weight'].notna()]

- to fill NaN values in bodily metrics, find:
    - average reach for each height increment
    - average height for each weight increment

In [2359]:
df_fighters['Weight'] = df_fighters['Weight'].apply(lambda x: x.split(' ')[0])
df_fighters['Weight'] = df_fighters['Weight'].astype(float)

In [2360]:
df_fighters['Height'] = df_fighters['Height'].fillna('0\' 0\"')
df_fighters['Height'] = df_fighters['Height'].apply(lambda x: int(x.split('\' ')[0])*12 + int(x.split('\' ')[1].replace('\"','')))
df_fighters['Height'] = df_fighters['Height'].replace(0, np.nan).astype(float)

In [2361]:
df_fighters['Height'] = df_fighters.groupby('Weight')['Height'].apply(lambda x: x.fillna(x.mean()))
df_fighters['Height'] = df_fighters['Height'].fillna(df_fighters['Height'].mean())

In [2362]:
df_fighters['Reach'] = df_fighters['Reach'].fillna('0')
df_fighters['Reach'] = df_fighters['Reach'].apply(lambda x: x.replace('\"',''))
df_fighters['Reach'] = df_fighters['Reach'].replace('0', np.nan).astype(float)

In [2363]:
df_fighters['Reach'] = df_fighters.groupby('Height')['Reach'].apply(lambda x: x.fillna(x.mean()))
df_fighters['Reach'] = df_fighters['Reach'].fillna(df_fighters['Reach'].mean())

In [2364]:
df_fighters['Stance'].value_counts()

Orthodox       1999
Southpaw        450
Switch          100
Open Stance       7
Sideways          3
Name: Stance, dtype: int64

<h3>Processing Fight data set</h3>

In [2365]:
df_fights.dtypes

R_fighter          object
B_fighter          object
R_KD                int64
B_KD                int64
R_SIG_STR.         object
B_SIG_STR.         object
R_SIG_STR_pct      object
B_SIG_STR_pct      object
R_TOTAL_STR.       object
B_TOTAL_STR.       object
R_TD               object
B_TD               object
R_TD_pct           object
B_TD_pct           object
R_SUB_ATT           int64
B_SUB_ATT           int64
R_PASS              int64
B_PASS              int64
R_REV               int64
B_REV               int64
R_HEAD             object
B_HEAD             object
R_BODY             object
B_BODY             object
R_LEG              object
B_LEG              object
R_DISTANCE         object
B_DISTANCE         object
R_CLINCH           object
B_CLINCH           object
R_GROUND           object
B_GROUND           object
win_by             object
last_round          int64
last_round_time    object
Format             object
Referee            object
date               object
location    

- split attack stats into attempts/landed numerical format

In [2366]:
df_fights.columns
attack_cols = ['R_SIG_STR.', 'B_SIG_STR.','R_TOTAL_STR.', 'B_TOTAL_STR.',
       'R_TD', 'B_TD', 'R_HEAD', 'B_HEAD', 'R_BODY',
       'B_BODY', 'R_LEG', 'B_LEG', 'R_DISTANCE', 'B_DISTANCE', 'R_CLINCH',
       'B_CLINCH', 'R_GROUND', 'B_GROUND']

In [2367]:
for col in attack_cols:
    df_fights[col+'_ATT'] = df_fights[col].apply(lambda x: int(x.split('of')[1]))
    df_fights[col+'_LANDED'] = df_fights[col].apply(lambda x: int(x.split('of')[0]))

In [2368]:
df_fights.drop(attack_cols, axis=1, inplace=True)

- check for NULL values

In [2369]:
for col in df_fights:
    if df_fights[col].isnull().sum()!=0:
        print(f'Null count in {col} = {df_fights[col].isnull().sum()}')

Null count in Referee = 25
Null count in Winner = 92


In [2370]:
df_fights[df_fights['Winner'].isnull()]['win_by'].value_counts()

Overturned              37
Decision - Majority     22
Could Not Continue      15
Decision - Split        11
Decision - Unanimous     5
Other                    2
Name: win_by, dtype: int64

In [2371]:
df_fights['Winner'].fillna('Draw', inplace=True)

- convert percentages to decimal values

In [2372]:
percentage_columns = ['R_SIG_STR_pct', 'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct']

for col in percentage_columns:
    df_fights[col] = df_fights[col].apply(lambda x : float(x.replace('%',''))/100)

- isolating Title fights and weight classes

In [2373]:
df_fights['Fight_type'].value_counts()[df_fights['Fight_type'].value_counts() > 1].index

Index(['Lightweight Bout', 'Welterweight Bout', 'Middleweight Bout',
       'Light Heavyweight Bout', 'Heavyweight Bout', 'Featherweight Bout',
       'Bantamweight Bout', 'Flyweight Bout', 'Women's Strawweight Bout',
       'Women's Bantamweight Bout', 'Open Weight Bout',
       'Women's Flyweight Bout', 'UFC Light Heavyweight Title Bout',
       'UFC Welterweight Title Bout', 'UFC Heavyweight Title Bout',
       'UFC Middleweight Title Bout', 'UFC Lightweight Title Bout',
       'Catch Weight Bout', 'UFC Flyweight Title Bout',
       'UFC Women's Bantamweight Title Bout', 'UFC Featherweight Title Bout',
       'UFC Bantamweight Title Bout', 'UFC Women's Strawweight Title Bout',
       'Women's Featherweight Bout', 'UFC Interim Heavyweight Title Bout',
       'UFC Women's Flyweight Title Bout',
       'UFC Women's Featherweight Title Bout',
       'UFC Superfight Championship Bout',
       'UFC Interim Bantamweight Title Bout',
       'UFC Interim Middleweight Title Bout',
       'UFC

In [2374]:
df_fights['title_bout'] = df_fights['Fight_type'].apply(lambda x: 1 if 'Title Bout' in x else 0) 

In [2375]:
weight_classes = ['Women\'s Strawweight', 'Women\'s Bantamweight', 
                  'Women\'s Featherweight', 'Women\'s Flyweight', 'Lightweight', 
                  'Welterweight', 'Middleweight','Light Heavyweight', 
                  'Heavyweight', 'Featherweight','Bantamweight', 'Flyweight', 'Open Weight']

def make_weight_class(x):
    for weight_class in weight_classes:
        if weight_class in x:
            return weight_class
    if x == 'Catch Weight Bout' or 'Catchweight Bout':
        return 'Catch Weight'
    else:
        return 'Open Weight'

In [2376]:
df_fights['weight_class'] = df_fights['Fight_type'].apply(make_weight_class)

In [2377]:
df_fights['weight_class'].value_counts()

Lightweight              1039
Welterweight             1023
Middleweight              762
Heavyweight               537
Light Heavyweight         535
Featherweight             488
Bantamweight              421
Flyweight                 204
Women's Strawweight       164
Women's Bantamweight      129
Open Weight                92
Women's Flyweight          77
Catch Weight               39
Women's Featherweight      14
Name: weight_class, dtype: int64

- isolate total fight time (seconds)

In [2378]:
df_fights['Format'].value_counts()

3 Rnd (5-5-5)           4847
5 Rnd (5-5-5-5-5)        458
1 Rnd + OT (12-3)         79
No Time Limit             37
3 Rnd + OT (5-5-5-5)      22
1 Rnd + 2OT (15-3-3)      20
1 Rnd (20)                20
2 Rnd (5-5)               11
1 Rnd (15)                 8
1 Rnd (10)                 6
1 Rnd (12)                 4
1 Rnd + OT (30-5)          3
1 Rnd (18)                 2
1 Rnd + OT (15-3)          2
1 Rnd (30)                 1
1 Rnd + 2OT (24-3-3)       1
1 Rnd + OT (27-3)          1
1 Rnd + OT (30-3)          1
1 Rnd + OT (31-5)          1
Name: Format, dtype: int64

In [2379]:
time_in_first_round = {'3 Rnd (5-5-5)': 5*60, 
                       '5 Rnd (5-5-5-5-5)': 5*60, 
                       '1 Rnd + OT (12-3)': 12*60,
                       'No Time Limit': 1, 
                       '3 Rnd + OT (5-5-5-5)': 5*60, 
                       '1 Rnd (20)': 1*20,
                       '2 Rnd (5-5)': 5*60, 
                       '1 Rnd (15)': 15*60, 
                       '1 Rnd (10)': 10*60,
                       '1 Rnd (12)':12*60, 
                       '1 Rnd + OT (30-5)': 30*60, 
                       '1 Rnd (18)': 18*60, 
                       '1 Rnd + OT (15-3)': 15*60,
                       '1 Rnd (30)': 30*60, 
                       '1 Rnd + OT (31-5)': 31*5,
                       '1 Rnd + OT (27-3)': 27*60, 
                       '1 Rnd + OT (30-3)': 30*60}

exception_format_time = {'1 Rnd + 2OT (15-3-3)': [15*60, 3*60], 
                         '1 Rnd + 2OT (24-3-3)': [24*60, 3*60]}

# '1 Rnd + 2OT (15-3-3)' and '1 Rnd + 2OT (24-3-3)' is not included because it has 3 uneven timed rounds. 
# We'll have to deal with it separately

In [2380]:
# Converting to seconds
df_fights['last_round_time'] = df_fights['last_round_time'].apply(lambda x: int(x.split(':')[0])*60 + int(x.split(':')[1]))

In [2381]:
def get_total_time(row):
    if row['Format'] in time_in_first_round.keys():
        return (row['last_round'] - 1) * time_in_first_round[row['Format']] + row['last_round_time']
    elif row['Format'] in exception_format_time.keys():
        if (row['last_round'] - 1) >= 2:
            return exception_format_time[row['Format']][0] + (row['last_round'] - 2) * \
                    exception_format_time[row['Format']][1] + row['last_round_time']
        else:
            return (row['last_round'] - 1) * exception_format_time[row['Format']][0] + row['last_round_time']

In [2382]:
df_fights['total_time_fought(sec)'] = df_fights.apply(get_total_time, axis=1)

In [2383]:
def get_num_rounds(x):
    if x == 'No Time Limit':
        return 1
    else:
        return len((x.split('(')[1].replace(')','').split('-')))
    
df_fights['no_of_rounds'] = df_fights['Format'].apply(get_num_rounds)

<h3>Create master DataFrame by integrating fight/fighter data</h3>

<h4>Consolidate red/blue corner stats to align them with the correct fighter</h4>

In [2384]:
df_master = df_fights.copy(deep=True)

In [2385]:
df_master.columns

Index(['R_fighter', 'B_fighter', 'R_KD', 'B_KD', 'R_SIG_STR_pct',
       'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct', 'R_SUB_ATT', 'B_SUB_ATT',
       'R_PASS', 'B_PASS', 'R_REV', 'B_REV', 'win_by', 'last_round',
       'last_round_time', 'Format', 'Referee', 'date', 'location',
       'Fight_type', 'Winner', 'R_SIG_STR._ATT', 'R_SIG_STR._LANDED',
       'B_SIG_STR._ATT', 'B_SIG_STR._LANDED', 'R_TOTAL_STR._ATT',
       'R_TOTAL_STR._LANDED', 'B_TOTAL_STR._ATT', 'B_TOTAL_STR._LANDED',
       'R_TD_ATT', 'R_TD_LANDED', 'B_TD_ATT', 'B_TD_LANDED', 'R_HEAD_ATT',
       'R_HEAD_LANDED', 'B_HEAD_ATT', 'B_HEAD_LANDED', 'R_BODY_ATT',
       'R_BODY_LANDED', 'B_BODY_ATT', 'B_BODY_LANDED', 'R_LEG_ATT',
       'R_LEG_LANDED', 'B_LEG_ATT', 'B_LEG_LANDED', 'R_DISTANCE_ATT',
       'R_DISTANCE_LANDED', 'B_DISTANCE_ATT', 'B_DISTANCE_LANDED',
       'R_CLINCH_ATT', 'R_CLINCH_LANDED', 'B_CLINCH_ATT', 'B_CLINCH_LANDED',
       'R_GROUND_ATT', 'R_GROUND_LANDED', 'B_GROUND_ATT', 'B_GROUND_LANDED',
       'titl

In [2386]:
df_master.drop(['R_KD', 'B_KD', 'R_SIG_STR_pct',
       'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct', 'R_SUB_ATT', 'B_SUB_ATT',
       'R_PASS', 'B_PASS', 'R_REV', 'B_REV', 'win_by', 'last_round',
       'last_round_time', 'Format',
       'Fight_type', 'R_SIG_STR._ATT', 'R_SIG_STR._LANDED',
       'B_SIG_STR._ATT', 'B_SIG_STR._LANDED', 'R_TOTAL_STR._ATT',
       'R_TOTAL_STR._LANDED', 'B_TOTAL_STR._ATT', 'B_TOTAL_STR._LANDED',
       'R_TD_ATT', 'R_TD_LANDED', 'B_TD_ATT', 'B_TD_LANDED', 'R_HEAD_ATT',
       'R_HEAD_LANDED', 'B_HEAD_ATT', 'B_HEAD_LANDED', 'R_BODY_ATT',
       'R_BODY_LANDED', 'B_BODY_ATT', 'B_BODY_LANDED', 'R_LEG_ATT',
       'R_LEG_LANDED', 'B_LEG_ATT', 'B_LEG_LANDED', 'R_DISTANCE_ATT',
       'R_DISTANCE_LANDED', 'B_DISTANCE_ATT', 'B_DISTANCE_LANDED',
       'R_CLINCH_ATT', 'R_CLINCH_LANDED', 'B_CLINCH_ATT', 'B_CLINCH_LANDED',
       'R_GROUND_ATT', 'R_GROUND_LANDED', 'B_GROUND_ATT', 'B_GROUND_LANDED'], axis = 1, inplace=True)

In [2387]:
df_master.head(3)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,total_time_fought(sec),no_of_rounds
0,Israel Adesanya,Yoel Romero,Dan Miragliotta,"March 07, 2020","Las Vegas, Nevada, USA",Israel Adesanya,1,Middleweight,1500,5
1,Weili Zhang,Joanna Jedrzejczyk,Keith Peterson,"March 07, 2020","Las Vegas, Nevada, USA",Weili Zhang,1,Women's Strawweight,1500,5
2,Beneil Dariush,Drakkar Klose,Jason Herzog,"March 07, 2020","Las Vegas, Nevada, USA",Beneil Dariush,0,Lightweight,360,3


In [2388]:
df_red = df_fights[['R_fighter','R_KD', 'R_SIG_STR_pct',
       'R_TD_pct', 'R_SUB_ATT',
       'R_PASS', 'R_REV', 'win_by', 'last_round',
       'last_round_time', 'Format', 'Referee', 'date', 'location',
       'Fight_type', 'Winner', 'R_SIG_STR._ATT', 'R_SIG_STR._LANDED',
       'R_TOTAL_STR._ATT',
       'R_TOTAL_STR._LANDED',
       'R_TD_ATT', 'R_TD_LANDED', 'R_HEAD_ATT',
       'R_HEAD_LANDED', 'R_BODY_ATT',
       'R_BODY_LANDED',  'R_LEG_ATT',
       'R_LEG_LANDED',  'R_DISTANCE_ATT',
       'R_DISTANCE_LANDED', 
       'R_CLINCH_ATT', 'R_CLINCH_LANDED',
       'R_GROUND_ATT', 'R_GROUND_LANDED',
       'title_bout', 'weight_class', 'total_time_fought(sec)', 'no_of_rounds']]

df_blue = df_fights[['B_fighter',  'B_KD',
       'B_SIG_STR_pct','B_TD_pct', 'B_SUB_ATT',
       'B_PASS',  'B_REV', 'win_by', 'last_round',
       'last_round_time', 'Format', 'Referee', 'date', 'location',
       'Fight_type', 'Winner',
       'B_SIG_STR._ATT', 'B_SIG_STR._LANDED',
       'B_TOTAL_STR._ATT', 'B_TOTAL_STR._LANDED',
       'B_TD_ATT', 'B_TD_LANDED',
       'B_HEAD_ATT', 'B_HEAD_LANDED', 
       'B_BODY_ATT', 'B_BODY_LANDED', 
       'B_LEG_ATT', 'B_LEG_LANDED', 
       'B_DISTANCE_ATT', 'B_DISTANCE_LANDED',
       'B_CLINCH_ATT', 'B_CLINCH_LANDED',
       'B_GROUND_ATT', 'B_GROUND_LANDED',
       'title_bout', 'weight_class', 'total_time_fought(sec)', 'no_of_rounds']]

- get rid of red/blue corner prefixes in order to union fighter history

In [2389]:
def drop_prefix(self, prefix):
    self.columns = self.columns.str.replace('^'+prefix,'')
    return self

pd.core.frame.DataFrame.drop_prefix = drop_prefix

In [2390]:
union = pd.concat([df_red.drop_prefix('R_'), df_blue.drop_prefix('B_')])

- join this combined fight history DataFrame to the originial fighter DataFrame

In [2391]:
union

Unnamed: 0,fighter,KD,SIG_STR_pct,TD_pct,SUB_ATT,PASS,REV,win_by,last_round,last_round_time,...,DISTANCE_ATT,DISTANCE_LANDED,CLINCH_ATT,CLINCH_LANDED,GROUND_ATT,GROUND_LANDED,title_bout,weight_class,total_time_fought(sec),no_of_rounds
0,Israel Adesanya,0,0.36,0.00,0,0,0,Decision - Unanimous,5,300,...,132,48,0,0,0,0,1,Middleweight,1500,5
1,Weili Zhang,0,0.40,0.12,0,0,0,Decision - Split,5,300,...,393,158,15,7,0,0,1,Women's Strawweight,1500,5
2,Beneil Dariush,1,0.60,0.33,2,1,0,KO/TKO,2,60,...,20,12,0,0,0,0,0,Lightweight,360,3
3,Neil Magny,0,0.53,0.57,0,2,0,Decision - Unanimous,3,300,...,96,34,20,18,23,22,0,Welterweight,900,3
4,Alex Oliveira,0,0.51,0.20,0,2,0,Decision - Split,3,300,...,67,35,8,3,2,2,0,Welterweight,900,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5519,Kevin Rosier,0,0.00,0.00,0,0,0,KO/TKO,1,59,...,3,0,0,0,0,0,0,Open Weight,59,1
5520,Patrick Smith,0,0.50,0.00,0,0,0,Submission,1,109,...,1,1,1,1,6,2,0,Open Weight,109,1
5521,Art Jimmerson,0,0.00,0.00,0,0,0,Submission,1,138,...,0,0,0,0,0,0,0,Open Weight,138,1
5522,Zane Frazier,0,0.42,0.00,0,0,0,KO/TKO,1,260,...,7,0,19,10,2,2,0,Open Weight,260,1


In [2392]:
union[union['fighter']=='Daniel Cormier'].head(3)

Unnamed: 0,fighter,KD,SIG_STR_pct,TD_pct,SUB_ATT,PASS,REV,win_by,last_round,last_round_time,...,DISTANCE_ATT,DISTANCE_LANDED,CLINCH_ATT,CLINCH_LANDED,GROUND_ATT,GROUND_LANDED,title_bout,weight_class,total_time_fought(sec),no_of_rounds
268,Daniel Cormier,0,0.68,0.33,0,2,0,KO/TKO,4,249,...,209,139,27,21,27,21,1,Heavyweight,1149,5
698,Daniel Cormier,0,0.76,1.0,1,4,0,Submission,2,134,...,6,4,1,1,18,14,1,Heavyweight,434,5
1052,Daniel Cormier,0,0.52,0.66,1,3,0,KO/TKO,2,120,...,46,24,8,3,7,5,1,Light Heavyweight,420,5


In [2393]:
df_fighter_history = pd.merge(df_fighters, union, left_on='fighter_name', right_on='fighter', how='left', indicator=True)

- 1,330 fighters without any fight stats (in original fighter dataset)
    - However, every fighter involved in a historical bout is contained in the original fighter dataset
    - UPDATE: after analysis using the above 1,330 fighters, they will be dropped to ensure data quality and avoid "garbage in, garbage out

In [2399]:
df_fighter_history._merge.value_counts()

both          11040
right_only        0
left_only         0
Name: _merge, dtype: int64

In [2401]:
df_fighter_history = df_fighter_history[df_fighter_history._merge != 'left_only']

In [2402]:
union.shape

(11048, 38)

In [2403]:
df_fighter_history.shape

(11040, 45)

- lack of depth in individual fight history presents a problem for forecasting fighter performance

In [2404]:
df_fighter_history['fighter_name'].value_counts()

Donald Cerrone      34
Jim Miller          34
Jeremy Stephens     32
Demian Maia         31
Andrei Arlovski     31
                    ..
Tyler Toner          1
Fred Ettish          1
Wang Guan            1
Tristan Connelly     1
Jermaine Andre       1
Name: fighter_name, Length: 2004, dtype: int64

<h3>Feature Engineering</h3>

In [2405]:
df_fighter_history.head(6).T

Unnamed: 0,1,2,3,4,5,6
fighter_name,Danny Abbadi,Danny Abbadi,David Abbott,David Abbott,David Abbott,David Abbott
Height,71,71,72,72,72,72
Weight,155,155,265,265,265,265
Reach,72.6813,72.6813,73.75,73.75,73.75,73.75
Stance,Orthodox,Orthodox,Switch,Switch,Switch,Switch
DOB,"Jul 03, 1983","Jul 03, 1983",,,,
fighter,Danny Abbadi,Danny Abbadi,David Abbott,David Abbott,David Abbott,David Abbott
KD,0,0,0,0,1,0
SIG_STR_pct,0.38,0.33,0.68,0.41,0.52,0.44
TD_pct,0,0,0,0.75,1,0


In [2406]:
df_fighter_history.drop(columns = ['fighter','Format','Referee','Fight_type'], inplace=True)

- there are too many distinct locations
    - in order to create a more signifcant feature, location is adapted to a binary indicator of whether or not the fight took place in Las Vegas, Nevada (i.e. the most popular fight location)

In [2407]:
df_fighter_history['location'].value_counts()

Las Vegas, Nevada, USA             2528
London, England, United Kingdom     228
Newark, New Jersey, USA             178
Anaheim, California, USA            168
Chicago, Illinois, USA              162
                                   ... 
Bossier City, Louisiana, USA         16
Mobile, Alabama, USA                 15
Chiba, Japan                         14
Dothan, Alabama, USA                 14
Yokohama, Kanagawa, Japan            14
Name: location, Length: 165, dtype: int64

In [2408]:
df_fighter_history['location']=df_fighter_history['location'].apply(lambda x: 1 if str(x).find('Las Vegas')!=-1 else 0)

- NaN values in fight stats columns were resulting from the left merge
    - remaining NaN values only exist from fighter data set

In [2410]:
df_fighter_history.isna().sum()

fighter_name                0
Height                      0
Weight                      0
Reach                       0
Stance                    304
DOB                       227
KD                          0
SIG_STR_pct                 0
TD_pct                      0
SUB_ATT                     0
PASS                        0
REV                         0
win_by                      0
last_round                  0
last_round_time             0
date                        0
location                    0
Winner                      0
SIG_STR._ATT                0
SIG_STR._LANDED             0
TOTAL_STR._ATT              0
TOTAL_STR._LANDED           0
TD_ATT                      0
TD_LANDED                   0
HEAD_ATT                    0
HEAD_LANDED                 0
BODY_ATT                    0
BODY_LANDED                 0
LEG_ATT                     0
LEG_LANDED                  0
DISTANCE_ATT                0
DISTANCE_LANDED             0
CLINCH_ATT                  0
CLINCH_LAN

- recode winner column to binary

In [2411]:
df_fighter_history['won'] = df_fighter_history.apply(lambda row: 1 if row['Winner'] == row['fighter_name'] else 0, axis=1)

df_fighter_history.drop(columns = ['Winner'], inplace=True)

- change Date of Birth and fight date from string to datetime

In [2412]:
from datetime import datetime

month_code = {'Jan ': 'January ', 
      'Feb ': 'February ', 
      'Mar ': 'March ', 
      'Apr ': 'April ', 
      'May ': 'May ', 
      'Jun ': 'June ', 
      'Jul ': 'July ', 
      'Aug ': 'August ', 
      'Sep ': 'September ', 
      'Oct ': 'October ', 
      'Nov ': 'November ', 
      'Dec ': 'December '}

for k, v in month_code.items():
    df_fighter_history['DOB'] = df_fighter_history['DOB'].apply(lambda x: x.replace(k, v) if type(x) == str else x)

df_fighter_history['DOB'] = df_fighter_history['DOB'].apply(lambda row: datetime.strptime(row, '%B %d, %Y') if type(row) == str else row)
df_fighter_history['date'] = df_fighter_history['date'].apply(lambda row: datetime.strptime(row, '%B %d, %Y') if type(row) == str else row)

- replacing NaN values:
    - numerical: column mean
    - categorica: column mode
    - date: column mean

In [2413]:
df_fighter_history.fillna(df_fighter_history.mean(), inplace=True)
df_fighter_history = df_fighter_history.apply(lambda x:x.fillna(x.value_counts().index[0]))
df_fighter_history['date'] = df_fighter_history['date'].apply(lambda row: datetime.strptime(row, '%B %d, %Y') if type(row) == str else row)

- creating age (at fight date) feature

In [2414]:
df_fighter_history['age'] = df_fighter_history['date'] - df_fighter_history['DOB']
df_fighter_history['age']=df_fighter_history['age']/np.timedelta64(1,'Y')
df_fighter_history['age']=df_fighter_history['age'].apply(lambda x: 25 if x <=18 else x)

In [2416]:
df_fighter_history.isna().sum()

fighter_name              0
Height                    0
Weight                    0
Reach                     0
Stance                    0
DOB                       0
KD                        0
SIG_STR_pct               0
TD_pct                    0
SUB_ATT                   0
PASS                      0
REV                       0
win_by                    0
last_round                0
last_round_time           0
date                      0
location                  0
SIG_STR._ATT              0
SIG_STR._LANDED           0
TOTAL_STR._ATT            0
TOTAL_STR._LANDED         0
TD_ATT                    0
TD_LANDED                 0
HEAD_ATT                  0
HEAD_LANDED               0
BODY_ATT                  0
BODY_LANDED               0
LEG_ATT                   0
LEG_LANDED                0
DISTANCE_ATT              0
DISTANCE_LANDED           0
CLINCH_ATT                0
CLINCH_LANDED             0
GROUND_ATT                0
GROUND_LANDED             0
title_bout          

In [2417]:
df_fighter_history.drop(columns='_merge', inplace=True)

In [2418]:
df_fighter_history.columns

Index(['fighter_name', 'Height', 'Weight', 'Reach', 'Stance', 'DOB', 'KD',
       'SIG_STR_pct', 'TD_pct', 'SUB_ATT', 'PASS', 'REV', 'win_by',
       'last_round', 'last_round_time', 'date', 'location', 'SIG_STR._ATT',
       'SIG_STR._LANDED', 'TOTAL_STR._ATT', 'TOTAL_STR._LANDED', 'TD_ATT',
       'TD_LANDED', 'HEAD_ATT', 'HEAD_LANDED', 'BODY_ATT', 'BODY_LANDED',
       'LEG_ATT', 'LEG_LANDED', 'DISTANCE_ATT', 'DISTANCE_LANDED',
       'CLINCH_ATT', 'CLINCH_LANDED', 'GROUND_ATT', 'GROUND_LANDED',
       'title_bout', 'weight_class', 'total_time_fought(sec)', 'no_of_rounds',
       'won', 'age'],
      dtype='object')

- create features for 1) # of fights they've been in, 2) what % they won, and 3) the ranked order of past fights

In [2419]:
df_fighter_history['num_fights'] = df_fighter_history['date'].groupby(df_fighter_history['fighter_name']).transform('count')

In [2420]:
df_fighter_history['num_wins'] = df_fighter_history['won'].groupby(df_fighter_history['fighter_name']).transform('sum')

In [2421]:
df_fighter_history['record'] = df_fighter_history['num_wins']/df_fighter_history['num_fights']

In [2422]:
df_fighter_history['title_bout']=df_fighter_history['title_bout'].apply(lambda x: 1 if x == 1 else 0)

In [2435]:
df_fighter_history['fight_rank']=df_fighter_history.groupby('fighter_name')['date'].rank(ascending=True, method='first')

In [2436]:
df_fighter_history[df_fighter_history['fighter_name']=='David Abbott'].T

Unnamed: 0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
fighter_name,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott
Height,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72
Weight,265,265,265,265,265,265,265,265,265,265,265,265,265,265,265,265,265,265
Reach,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75,73.75
Stance,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch,Switch
DOB,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00
KD,0,0,1,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0
SIG_STR_pct,0.68,0.41,0.52,0.44,0.88,1,0.66,0.58,0.42,0,0.2,0.33,0.22,0.4,0.46,0.42,0.5,0.37
TD_pct,0,0.75,1,0,1,1,1,0,0,0,0,0.5,0,0,0,0,0,0.66
SUB_ATT,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1


- define feature groups by broad data type

In [2423]:
num_features = ['fighter_name','KD', 'SIG_STR_pct', 'TD_pct', 'SUB_ATT', 'PASS', 'REV',
       'last_round', 'last_round_time','SIG_STR._ATT',
       'SIG_STR._LANDED', 'TOTAL_STR._ATT', 'TOTAL_STR._LANDED', 'TD_ATT',
       'TD_LANDED', 'HEAD_ATT', 'HEAD_LANDED', 'BODY_ATT', 'BODY_LANDED',
       'LEG_ATT', 'LEG_LANDED', 'DISTANCE_ATT', 'DISTANCE_LANDED',
       'CLINCH_ATT', 'CLINCH_LANDED', 'GROUND_ATT', 'GROUND_LANDED','num_fights','num_wins','record',
       'total_time_fought(sec)', 'no_of_rounds']

categorical_features = ['Stance','win_by',
       'last_round', 'last_round_time', 'location',
       'title_bout', 'weight_class']

date_features = ['DOB','date']

- creating data frame of estimated fighter stats
    - avg. from all fights
    - avg. from last 5 fights
    - values from last fight

In [2424]:
df_fighter_estimates = df_fighter_history[num_features].groupby('fighter_name',as_index=False).mean()

- removing unnecessary stat features
    - total_time_fought(sec) and no_of_rounds are not fighter-specific
    - num_wins is removed to prevent multicollinearity with num_fights and record

In [2425]:
df_fighter_estimates.drop(columns=['total_time_fought(sec)','no_of_rounds','num_wins'], inplace=True)

- Check top fighters by record (i.e. undefeated fighters)

In [2426]:
df_fighter_estimates[df_fighter_estimates['num_fights']>5].sort_values(by='record', ascending=False).head()

Unnamed: 0,fighter_name,KD,SIG_STR_pct,TD_pct,SUB_ATT,PASS,REV,last_round,last_round_time,SIG_STR._ATT,...,LEG_ATT,LEG_LANDED,DISTANCE_ATT,DISTANCE_LANDED,CLINCH_ATT,CLINCH_LANDED,GROUND_ATT,GROUND_LANDED,num_fights,record
1068,Kamaru Usman,0.454545,0.54,0.356364,0.181818,3.727273,0.0,3.454545,263.363636,144.909091,...,8.181818,6.909091,96.545455,38.909091,16.272727,13.727273,32.090909,24.0,11,1.0
1114,Khabib Nurmagomedov,0.166667,0.513333,0.419167,0.666667,4.583333,0.0,3.0,226.75,113.666667,...,3.166667,2.916667,67.0,22.75,3.333333,2.25,43.333333,31.583333,12,1.0
764,Israel Adesanya,1.25,0.4975,0.0,0.375,0.25,0.0,3.25,277.625,129.0,...,20.625,17.625,119.5,55.375,6.375,5.125,3.125,2.375,8,1.0
154,Arnold Allen,0.285714,0.384286,0.321429,0.428571,1.0,0.0,3.0,251.714286,110.285714,...,5.285714,3.142857,100.285714,39.571429,4.0,1.571429,6.0,4.142857,7,1.0
1513,Petr Yan,1.166667,0.498333,0.413333,0.166667,0.666667,0.166667,2.5,241.833333,140.333333,...,3.333333,2.666667,113.0,43.833333,14.0,11.333333,13.333333,10.0,6,1.0


In [2249]:
df_fighter_estimates.head()

Unnamed: 0,fighter_name,KD,SIG_STR_pct,TD_pct,SUB_ATT,PASS,REV,last_round,last_round_time,SIG_STR._ATT,...,LEG_ATT,LEG_LANDED,DISTANCE_ATT,DISTANCE_LANDED,CLINCH_ATT,CLINCH_LANDED,GROUND_ATT,GROUND_LANDED,num_fights,record
0,AJ Fonseca,0.204225,0.448521,0.210563,0.140845,1.014085,0.140845,2.492958,229.267606,89.105634,...,7.746479,6.112676,74.295775,28.528169,7.471831,5.34507,7.338028,5.119718,1,0.0
1,AJ Matthews,0.204225,0.448521,0.210563,0.140845,1.014085,0.140845,2.492958,229.267606,89.105634,...,7.746479,6.112676,74.295775,28.528169,7.471831,5.34507,7.338028,5.119718,1,0.0
2,AJ McKee,0.204225,0.448521,0.210563,0.140845,1.014085,0.140845,2.492958,229.267606,89.105634,...,7.746479,6.112676,74.295775,28.528169,7.471831,5.34507,7.338028,5.119718,1,0.0
3,AJ Siscoe,0.204225,0.448521,0.210563,0.140845,1.014085,0.140845,2.492958,229.267606,89.105634,...,7.746479,6.112676,74.295775,28.528169,7.471831,5.34507,7.338028,5.119718,1,0.0
4,Aalon Cruz,0.204225,0.448521,0.210563,0.140845,1.014085,0.140845,2.492958,229.267606,89.105634,...,7.746479,6.112676,74.295775,28.528169,7.471831,5.34507,7.338028,5.119718,1,0.0


In [2250]:
df_fighter_history.head()

Unnamed: 0,fighter_name,Height,Weight,Reach,Stance,DOB,KD,SIG_STR_pct,TD_pct,SUB_ATT,...,GROUND_LANDED,title_bout,weight_class,total_time_fought(sec),no_of_rounds,won,age,num_fights,num_wins,record
0,Tom Aaron,69.579741,155.0,70.0,Orthodox,1978-07-13,0.204225,0.448521,0.210563,0.140845,...,5.119718,0,Featherweight,677.15493,3.197183,0,41.019323,1,0,0.0
1,Danny Abbadi,71.0,155.0,72.68125,Orthodox,1983-07-03,0.204225,0.448521,0.210563,0.140845,...,5.119718,0,Featherweight,677.15493,3.197183,0,36.047284,1,0,0.0
2,David Abbott,72.0,265.0,73.75,Switch,1988-08-08,0.204225,0.448521,0.210563,0.140845,...,5.119718,0,Featherweight,677.15493,3.197183,0,30.946563,1,0,0.0
3,Shamil Abdurakhimov,75.0,235.0,76.0,Orthodox,1981-09-02,0.204225,0.448521,0.210563,0.140845,...,5.119718,0,Featherweight,677.15493,3.197183,0,37.878943,1,0,0.0
4,Hiroyuki Abe,66.0,145.0,67.401786,Orthodox,1988-08-08,0.204225,0.448521,0.210563,0.140845,...,5.119718,0,Featherweight,677.15493,3.197183,0,30.946563,1,0,0.0


- Quickly repeat some relevant actions for the fight dataset and master dataset
    - R_Win will end up being our target variable

In [2251]:
df_fights['R_Win'] = df_fights.apply(lambda row: 1 if row['Winner'] == row['R_fighter'] else 0, axis=1)

In [2252]:
df_fights.drop(columns=['last_round','last_round_time', 'Format', 'Referee','Fight_type','Winner'], inplace=True)

In [2253]:
df_fights.columns

Index(['R_fighter', 'B_fighter', 'R_KD', 'B_KD', 'R_SIG_STR_pct',
       'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct', 'R_SUB_ATT', 'B_SUB_ATT',
       'R_PASS', 'B_PASS', 'R_REV', 'B_REV', 'win_by', 'date', 'location',
       'R_SIG_STR._ATT', 'R_SIG_STR._LANDED', 'B_SIG_STR._ATT',
       'B_SIG_STR._LANDED', 'R_TOTAL_STR._ATT', 'R_TOTAL_STR._LANDED',
       'B_TOTAL_STR._ATT', 'B_TOTAL_STR._LANDED', 'R_TD_ATT', 'R_TD_LANDED',
       'B_TD_ATT', 'B_TD_LANDED', 'R_HEAD_ATT', 'R_HEAD_LANDED', 'B_HEAD_ATT',
       'B_HEAD_LANDED', 'R_BODY_ATT', 'R_BODY_LANDED', 'B_BODY_ATT',
       'B_BODY_LANDED', 'R_LEG_ATT', 'R_LEG_LANDED', 'B_LEG_ATT',
       'B_LEG_LANDED', 'R_DISTANCE_ATT', 'R_DISTANCE_LANDED', 'B_DISTANCE_ATT',
       'B_DISTANCE_LANDED', 'R_CLINCH_ATT', 'R_CLINCH_LANDED', 'B_CLINCH_ATT',
       'B_CLINCH_LANDED', 'R_GROUND_ATT', 'R_GROUND_LANDED', 'B_GROUND_ATT',
       'B_GROUND_LANDED', 'title_bout', 'weight_class',
       'total_time_fought(sec)', 'no_of_rounds', 'R_Win'],
      dt

In [2254]:
df_fights['win_by'].value_counts()

Decision - Unanimous    34
KO/TKO                  26
Decision - Split         5
Submission               4
Overturned               1
Decision - Majority      1
Name: win_by, dtype: int64

In [2255]:
df_master.columns

Index(['R_fighter', 'B_fighter', 'Referee', 'date', 'location', 'Winner',
       'title_bout', 'weight_class', 'total_time_fought(sec)', 'no_of_rounds'],
      dtype='object')

In [2256]:
df_master.drop(columns=['Referee'], inplace=True)

In [2257]:
df_fights.head(3).T

Unnamed: 0,0,1,2
R_fighter,Max Holloway,Cris Cyborg,Geoff Neal
B_fighter,Frankie Edgar,Felicia Spencer,Niko Price
R_KD,0,0,0
B_KD,0,0,1
R_SIG_STR_pct,0.39,0.56,0.67
B_SIG_STR_pct,0.36,0.44,0.42
R_TD_pct,0,0,0.5
B_TD_pct,0.06,0,0
R_SUB_ATT,0,0,0
B_SUB_ATT,0,0,0


In [2258]:
df_master.head(3).T

Unnamed: 0,0,1,2
R_fighter,Max Holloway,Cris Cyborg,Geoff Neal
B_fighter,Frankie Edgar,Felicia Spencer,Niko Price
date,"July 27, 2019","July 27, 2019","July 27, 2019"
location,"Edmonton, Alberta, Canada","Edmonton, Alberta, Canada","Edmonton, Alberta, Canada"
Winner,Max Holloway,Cris Cyborg,Geoff Neal
title_bout,1,0,0
weight_class,Featherweight,Women's Featherweight,Welterweight
total_time_fought(sec),1500,900,459
no_of_rounds,5,3,3


In [2259]:
df_fighter_estimates.drop(columns=['last_round_time','last_round'])
df_fighter_estimates.head(3).T #fighter_estimates is good to go

Unnamed: 0,0,1,2
fighter_name,AJ Fonseca,AJ Matthews,AJ McKee
KD,0.204225,0.204225,0.204225
SIG_STR_pct,0.448521,0.448521,0.448521
TD_pct,0.210563,0.210563,0.210563
SUB_ATT,0.140845,0.140845,0.140845
PASS,1.01408,1.01408,1.01408
REV,0.140845,0.140845,0.140845
last_round,2.49296,2.49296,2.49296
last_round_time,229.268,229.268,229.268
SIG_STR._ATT,89.1056,89.1056,89.1056


In [2260]:
df_fighter_history.head(3).T

Unnamed: 0,0,1,2
fighter_name,Tom Aaron,Danny Abbadi,David Abbott
Height,69.5797,71,72
Weight,155,155,265
Reach,70,72.6813,73.75
Stance,Orthodox,Orthodox,Switch
DOB,1978-07-13 00:00:00,1983-07-03 00:00:00,1988-08-08 00:00:00
KD,0.204225,0.204225,0.204225
SIG_STR_pct,0.448521,0.448521,0.448521
TD_pct,0.210563,0.210563,0.210563
SUB_ATT,0.140845,0.140845,0.140845
