<h1>Cleaning, EDA, and Feature Engineering after Scraping</h1>

- Acknowledgements:
    - ufcstats for comprehensive data sets on past MMA bouts: http://ufcstats.com/
    - Rajeev Warrier for providing the groundwork for this prediction project: https://github.com/WarrierRajeev/UFC-Predictions

In [1480]:
import pandas as pd
import numpy as np

DATA_PATH ='./data'
df_fighters = pd.read_csv(DATA_PATH+'/fighter_details.csv')
df_fights = pd.read_csv(DATA_PATH+'/total_fight_data.csv', sep=';')

In [1481]:
df_fighters.head()

Unnamed: 0,fighter_name,Height,Weight,Reach,Stance,DOB
0,Tom Aaron,,155 lbs.,,,"Jul 13, 1978"
1,Danny Abbadi,"5' 11""",155 lbs.,,Orthodox,"Jul 03, 1983"
2,David Abbott,"6' 0""",265 lbs.,,Switch,
3,Shamil Abdurakhimov,"6' 3""",235 lbs.,"76""",Orthodox,"Sep 02, 1981"
4,Hiroyuki Abe,"5' 6""",145 lbs.,,Orthodox,


In [1482]:
df_fights.head(3).T

Unnamed: 0,0,1,2
R_fighter,Israel Adesanya,Weili Zhang,Beneil Dariush
B_fighter,Yoel Romero,Joanna Jedrzejczyk,Drakkar Klose
R_KD,0,0,1
B_KD,0,0,0
R_SIG_STR.,48 of 132,165 of 408,12 of 20
B_SIG_STR.,40 of 89,186 of 360,15 of 25
R_SIG_STR_pct,36%,40%,60%
B_SIG_STR_pct,44%,51%,60%
R_TOTAL_STR.,48 of 132,170 of 413,17 of 28
B_TOTAL_STR.,40 of 89,196 of 370,27 of 37


<h3>Processing Fighter data set</h3> 

In [1483]:
df_fighters.dtypes

fighter_name    object
Height          object
Weight          object
Reach           object
Stance          object
DOB             object
dtype: object

In [1484]:
df_fighters.isna().sum()

fighter_name       0
Height           257
Weight            74
Reach           1714
Stance           840
DOB              741
dtype: int64

- fighters with NaN Weight values have little to no useful data
    - therefore, these rows will be excluded

In [1485]:
df_fighters[pd.isnull(df_fighters['Weight'])]

Unnamed: 0,fighter_name,Height,Weight,Reach,Stance,DOB
37,Juan Alcain,,,,,
53,Levi Alford,,,,,
77,Anthony Alves,,,,,
125,Joey Armstrong,,,,,
180,Yohan Banks,,,,,
...,...,...,...,...,...,...
3085,Carl Toomey,,,,,
3228,Jeremy Wallace,"5' 9""",,,,
3304,Karl Willis,,,,,
3317,Ray Wizard,,,,,


In [1486]:
df_fighters = df_fighters[df_fighters['Weight'].notna()]

- to fill NaN values in bodily metrics, find:
    - average reach for each height increment
    - average height for each weight increment

In [1487]:
df_fighters['Weight'] = df_fighters['Weight'].apply(lambda x: x.split(' ')[0])
df_fighters['Weight'] = df_fighters['Weight'].astype(float)

In [1488]:
df_fighters['Height'] = df_fighters['Height'].fillna('0\' 0\"')
df_fighters['Height'] = df_fighters['Height'].apply(lambda x: int(x.split('\' ')[0])*12 + int(x.split('\' ')[1].replace('\"','')))
df_fighters['Height'] = df_fighters['Height'].replace(0, np.nan).astype(float)

In [1489]:
df_fighters['Height'] = df_fighters.groupby('Weight')['Height'].apply(lambda x: x.fillna(x.mean()))
df_fighters['Height'] = df_fighters['Height'].fillna(df_fighters['Height'].mean())

In [1490]:
df_fighters['Reach'] = df_fighters['Reach'].fillna('0')
df_fighters['Reach'] = df_fighters['Reach'].apply(lambda x: x.replace('\"',''))
df_fighters['Reach'] = df_fighters['Reach'].replace('0', np.nan).astype(float)

In [1491]:
df_fighters['Reach'] = df_fighters.groupby('Height')['Reach'].apply(lambda x: x.fillna(x.mean()))
df_fighters['Reach'] = df_fighters['Reach'].fillna(df_fighters['Reach'].mean())

In [1492]:
df_fighters['Stance'].value_counts()

Orthodox       1999
Southpaw        450
Switch          100
Open Stance       7
Sideways          3
Name: Stance, dtype: int64

<h3>Processing Fight data set</h3>

In [1493]:
df_fights.dtypes

R_fighter          object
B_fighter          object
R_KD                int64
B_KD                int64
R_SIG_STR.         object
B_SIG_STR.         object
R_SIG_STR_pct      object
B_SIG_STR_pct      object
R_TOTAL_STR.       object
B_TOTAL_STR.       object
R_TD               object
B_TD               object
R_TD_pct           object
B_TD_pct           object
R_SUB_ATT           int64
B_SUB_ATT           int64
R_PASS              int64
B_PASS              int64
R_REV               int64
B_REV               int64
R_HEAD             object
B_HEAD             object
R_BODY             object
B_BODY             object
R_LEG              object
B_LEG              object
R_DISTANCE         object
B_DISTANCE         object
R_CLINCH           object
B_CLINCH           object
R_GROUND           object
B_GROUND           object
win_by             object
last_round          int64
last_round_time    object
Format             object
Referee            object
date               object
location    

- split attack stats into attempts/landed numerical format

In [1494]:
df_fights.columns
attack_cols = ['R_SIG_STR.', 'B_SIG_STR.','R_TOTAL_STR.', 'B_TOTAL_STR.',
       'R_TD', 'B_TD', 'R_HEAD', 'B_HEAD', 'R_BODY',
       'B_BODY', 'R_LEG', 'B_LEG', 'R_DISTANCE', 'B_DISTANCE', 'R_CLINCH',
       'B_CLINCH', 'R_GROUND', 'B_GROUND']

In [1495]:
for col in attack_cols:
    df_fights[col+'_ATT'] = df_fights[col].apply(lambda x: int(x.split('of')[1]))
    df_fights[col+'_LANDED'] = df_fights[col].apply(lambda x: int(x.split('of')[0]))

In [1496]:
df_fights.drop(attack_cols, axis=1, inplace=True)

In [1497]:
df_fights.head().T

Unnamed: 0,0,1,2,3,4
R_fighter,Israel Adesanya,Weili Zhang,Beneil Dariush,Neil Magny,Alex Oliveira
B_fighter,Yoel Romero,Joanna Jedrzejczyk,Drakkar Klose,Jingliang Li,Max Griffin
R_KD,0,0,1,0,0
B_KD,0,0,0,0,0
R_SIG_STR_pct,36%,40%,60%,53%,51%
B_SIG_STR_pct,44%,51%,60%,25%,46%
R_TD_pct,0%,12%,33%,57%,20%
B_TD_pct,0%,0%,0%,66%,66%
R_SUB_ATT,0,0,2,0,0
B_SUB_ATT,0,0,0,0,0


- check for NULL values

In [1498]:
for col in df_fights:
    if df_fights[col].isnull().sum()!=0:
        print(f'Null count in {col} = {df_fights[col].isnull().sum()}')

Null count in Referee = 25
Null count in Winner = 92


In [1499]:
df_fights[df_fights['Winner'].isnull()]['win_by'].value_counts()

Overturned              37
Decision - Majority     22
Could Not Continue      15
Decision - Split        11
Decision - Unanimous     5
Other                    2
Name: win_by, dtype: int64

In [1500]:
df_fights['Winner'].fillna('Draw', inplace=True)

- convert percentages to decimal values

In [1501]:
percentage_columns = ['R_SIG_STR_pct', 'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct']

for col in percentage_columns:
    df_fights[col] = df_fights[col].apply(lambda x : float(x.replace('%',''))/100)

- isolating Title fights and weight classes

In [1502]:
df_fights['Fight_type'].value_counts()[df_fights['Fight_type'].value_counts() > 1].index

Index(['Lightweight Bout', 'Welterweight Bout', 'Middleweight Bout',
       'Light Heavyweight Bout', 'Heavyweight Bout', 'Featherweight Bout',
       'Bantamweight Bout', 'Flyweight Bout', 'Women's Strawweight Bout',
       'Women's Bantamweight Bout', 'Open Weight Bout',
       'Women's Flyweight Bout', 'UFC Light Heavyweight Title Bout',
       'UFC Welterweight Title Bout', 'UFC Heavyweight Title Bout',
       'UFC Middleweight Title Bout', 'UFC Lightweight Title Bout',
       'Catch Weight Bout', 'UFC Flyweight Title Bout',
       'UFC Women's Bantamweight Title Bout', 'UFC Featherweight Title Bout',
       'UFC Bantamweight Title Bout', 'UFC Women's Strawweight Title Bout',
       'Women's Featherweight Bout', 'UFC Interim Heavyweight Title Bout',
       'UFC Women's Flyweight Title Bout',
       'UFC Women's Featherweight Title Bout',
       'UFC Superfight Championship Bout',
       'UFC Interim Bantamweight Title Bout',
       'UFC Interim Middleweight Title Bout',
       'UFC

In [1503]:
df_fights['title_bout'] = df_fights['Fight_type'].apply(lambda x: 1 if 'Title Bout' in x else 0) 

In [1504]:
weight_classes = ['Women\'s Strawweight', 'Women\'s Bantamweight', 
                  'Women\'s Featherweight', 'Women\'s Flyweight', 'Lightweight', 
                  'Welterweight', 'Middleweight','Light Heavyweight', 
                  'Heavyweight', 'Featherweight','Bantamweight', 'Flyweight', 'Open Weight']

def make_weight_class(x):
    for weight_class in weight_classes:
        if weight_class in x:
            return weight_class
    if x == 'Catch Weight Bout' or 'Catchweight Bout':
        return 'Catch Weight'
    else:
        return 'Open Weight'

In [1505]:
df_fights['weight_class'] = df_fights['Fight_type'].apply(make_weight_class)

In [1506]:
df_fights['weight_class'].value_counts()

Lightweight              1039
Welterweight             1023
Middleweight              762
Heavyweight               537
Light Heavyweight         535
Featherweight             488
Bantamweight              421
Flyweight                 204
Women's Strawweight       164
Women's Bantamweight      129
Open Weight                92
Women's Flyweight          77
Catch Weight               39
Women's Featherweight      14
Name: weight_class, dtype: int64

- isolate total fight time (seconds)

In [1507]:
df_fights['Format'].value_counts()

3 Rnd (5-5-5)           4847
5 Rnd (5-5-5-5-5)        458
1 Rnd + OT (12-3)         79
No Time Limit             37
3 Rnd + OT (5-5-5-5)      22
1 Rnd + 2OT (15-3-3)      20
1 Rnd (20)                20
2 Rnd (5-5)               11
1 Rnd (15)                 8
1 Rnd (10)                 6
1 Rnd (12)                 4
1 Rnd + OT (30-5)          3
1 Rnd (18)                 2
1 Rnd + OT (15-3)          2
1 Rnd (30)                 1
1 Rnd + 2OT (24-3-3)       1
1 Rnd + OT (27-3)          1
1 Rnd + OT (30-3)          1
1 Rnd + OT (31-5)          1
Name: Format, dtype: int64

In [1508]:
time_in_first_round = {'3 Rnd (5-5-5)': 5*60, 
                       '5 Rnd (5-5-5-5-5)': 5*60, 
                       '1 Rnd + OT (12-3)': 12*60,
                       'No Time Limit': 1, 
                       '3 Rnd + OT (5-5-5-5)': 5*60, 
                       '1 Rnd (20)': 1*20,
                       '2 Rnd (5-5)': 5*60, 
                       '1 Rnd (15)': 15*60, 
                       '1 Rnd (10)': 10*60,
                       '1 Rnd (12)':12*60, 
                       '1 Rnd + OT (30-5)': 30*60, 
                       '1 Rnd (18)': 18*60, 
                       '1 Rnd + OT (15-3)': 15*60,
                       '1 Rnd (30)': 30*60, 
                       '1 Rnd + OT (31-5)': 31*5,
                       '1 Rnd + OT (27-3)': 27*60, 
                       '1 Rnd + OT (30-3)': 30*60}

exception_format_time = {'1 Rnd + 2OT (15-3-3)': [15*60, 3*60], 
                         '1 Rnd + 2OT (24-3-3)': [24*60, 3*60]}

# '1 Rnd + 2OT (15-3-3)' and '1 Rnd + 2OT (24-3-3)' is not included because it has 3 uneven timed rounds. 
# We'll have to deal with it separately

In [1509]:
# Converting to seconds
df_fights['last_round_time'] = df_fights['last_round_time'].apply(lambda x: int(x.split(':')[0])*60 + int(x.split(':')[1]))

In [1510]:
def get_total_time(row):
    if row['Format'] in time_in_first_round.keys():
        return (row['last_round'] - 1) * time_in_first_round[row['Format']] + row['last_round_time']
    elif row['Format'] in exception_format_time.keys():
        if (row['last_round'] - 1) >= 2:
            return exception_format_time[row['Format']][0] + (row['last_round'] - 2) * \
                    exception_format_time[row['Format']][1] + row['last_round_time']
        else:
            return (row['last_round'] - 1) * exception_format_time[row['Format']][0] + row['last_round_time']
    
# So if the fight ended in round 1, we only need last_round_time. 
# If it ended in round 2, we need the full time of round 1 and the last_round_time
# This works for fights with same time in each round and fights with only two rounds.

In [1511]:
df_fights['total_time_fought(sec)'] = df_fights.apply(get_total_time, axis=1)

In [1512]:
def get_num_rounds(x):
    if x == 'No Time Limit':
        return 1
    else:
        return len((x.split('(')[1].replace(')','').split('-')))
    
df_fights['no_of_rounds'] = df_fights['Format'].apply(get_num_rounds)

<h3>Create master DataFrame by integrating fight/fighter data</h3>

<h4>Consolidate red/blue corner stats to align them with the correct fighter</h4>

In [1513]:
df_master = df_fights.copy(deep=True)

In [1514]:
df_master.columns

Index(['R_fighter', 'B_fighter', 'R_KD', 'B_KD', 'R_SIG_STR_pct',
       'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct', 'R_SUB_ATT', 'B_SUB_ATT',
       'R_PASS', 'B_PASS', 'R_REV', 'B_REV', 'win_by', 'last_round',
       'last_round_time', 'Format', 'Referee', 'date', 'location',
       'Fight_type', 'Winner', 'R_SIG_STR._ATT', 'R_SIG_STR._LANDED',
       'B_SIG_STR._ATT', 'B_SIG_STR._LANDED', 'R_TOTAL_STR._ATT',
       'R_TOTAL_STR._LANDED', 'B_TOTAL_STR._ATT', 'B_TOTAL_STR._LANDED',
       'R_TD_ATT', 'R_TD_LANDED', 'B_TD_ATT', 'B_TD_LANDED', 'R_HEAD_ATT',
       'R_HEAD_LANDED', 'B_HEAD_ATT', 'B_HEAD_LANDED', 'R_BODY_ATT',
       'R_BODY_LANDED', 'B_BODY_ATT', 'B_BODY_LANDED', 'R_LEG_ATT',
       'R_LEG_LANDED', 'B_LEG_ATT', 'B_LEG_LANDED', 'R_DISTANCE_ATT',
       'R_DISTANCE_LANDED', 'B_DISTANCE_ATT', 'B_DISTANCE_LANDED',
       'R_CLINCH_ATT', 'R_CLINCH_LANDED', 'B_CLINCH_ATT', 'B_CLINCH_LANDED',
       'R_GROUND_ATT', 'R_GROUND_LANDED', 'B_GROUND_ATT', 'B_GROUND_LANDED',
       'titl

In [1515]:
df_master.drop(['R_KD', 'B_KD', 'R_SIG_STR_pct',
       'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct', 'R_SUB_ATT', 'B_SUB_ATT',
       'R_PASS', 'B_PASS', 'R_REV', 'B_REV', 'win_by', 'last_round',
       'last_round_time', 'Format',
       'Fight_type', 'R_SIG_STR._ATT', 'R_SIG_STR._LANDED',
       'B_SIG_STR._ATT', 'B_SIG_STR._LANDED', 'R_TOTAL_STR._ATT',
       'R_TOTAL_STR._LANDED', 'B_TOTAL_STR._ATT', 'B_TOTAL_STR._LANDED',
       'R_TD_ATT', 'R_TD_LANDED', 'B_TD_ATT', 'B_TD_LANDED', 'R_HEAD_ATT',
       'R_HEAD_LANDED', 'B_HEAD_ATT', 'B_HEAD_LANDED', 'R_BODY_ATT',
       'R_BODY_LANDED', 'B_BODY_ATT', 'B_BODY_LANDED', 'R_LEG_ATT',
       'R_LEG_LANDED', 'B_LEG_ATT', 'B_LEG_LANDED', 'R_DISTANCE_ATT',
       'R_DISTANCE_LANDED', 'B_DISTANCE_ATT', 'B_DISTANCE_LANDED',
       'R_CLINCH_ATT', 'R_CLINCH_LANDED', 'B_CLINCH_ATT', 'B_CLINCH_LANDED',
       'R_GROUND_ATT', 'R_GROUND_LANDED', 'B_GROUND_ATT', 'B_GROUND_LANDED'], axis = 1, inplace=True)

In [1516]:
df_master.head(2)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,total_time_fought(sec),no_of_rounds
0,Israel Adesanya,Yoel Romero,Dan Miragliotta,"March 07, 2020","Las Vegas, Nevada, USA",Israel Adesanya,1,Middleweight,1500,5
1,Weili Zhang,Joanna Jedrzejczyk,Keith Peterson,"March 07, 2020","Las Vegas, Nevada, USA",Weili Zhang,1,Women's Strawweight,1500,5


In [1517]:
df_fights.head()

Unnamed: 0,R_fighter,B_fighter,R_KD,B_KD,R_SIG_STR_pct,B_SIG_STR_pct,R_TD_pct,B_TD_pct,R_SUB_ATT,B_SUB_ATT,...,B_CLINCH_ATT,B_CLINCH_LANDED,R_GROUND_ATT,R_GROUND_LANDED,B_GROUND_ATT,B_GROUND_LANDED,title_bout,weight_class,total_time_fought(sec),no_of_rounds
0,Israel Adesanya,Yoel Romero,0,0,0.36,0.44,0.0,0.0,0,0,...,7,5,0,0,0,0,1,Middleweight,1500,5
1,Weili Zhang,Joanna Jedrzejczyk,0,0,0.4,0.51,0.12,0.0,0,0,...,18,15,0,0,0,0,1,Women's Strawweight,1500,5
2,Beneil Dariush,Drakkar Klose,1,0,0.6,0.6,0.33,0.0,2,0,...,3,2,0,0,0,0,0,Lightweight,360,3
3,Neil Magny,Jingliang Li,0,0,0.53,0.25,0.57,0.66,0,0,...,9,3,23,22,0,0,0,Welterweight,900,3
4,Alex Oliveira,Max Griffin,0,0,0.51,0.46,0.2,0.66,0,0,...,3,3,2,2,0,0,0,Welterweight,900,3


In [1518]:
df_red = df_fights[['R_fighter','R_KD', 'R_SIG_STR_pct',
       'R_TD_pct', 'R_SUB_ATT',
       'R_PASS', 'R_REV', 'win_by', 'last_round',
       'last_round_time', 'Format', 'Referee', 'date', 'location',
       'Fight_type', 'Winner', 'R_SIG_STR._ATT', 'R_SIG_STR._LANDED',
       'R_TOTAL_STR._ATT',
       'R_TOTAL_STR._LANDED',
       'R_TD_ATT', 'R_TD_LANDED', 'R_HEAD_ATT',
       'R_HEAD_LANDED', 'R_BODY_ATT',
       'R_BODY_LANDED',  'R_LEG_ATT',
       'R_LEG_LANDED',  'R_DISTANCE_ATT',
       'R_DISTANCE_LANDED', 
       'R_CLINCH_ATT', 'R_CLINCH_LANDED',
       'R_GROUND_ATT', 'R_GROUND_LANDED',
       'title_bout', 'weight_class', 'total_time_fought(sec)', 'no_of_rounds']]

df_blue = df_fights[['B_fighter',  'B_KD',
       'B_SIG_STR_pct','B_TD_pct', 'B_SUB_ATT',
       'B_PASS',  'B_REV', 'win_by', 'last_round',
       'last_round_time', 'Format', 'Referee', 'date', 'location',
       'Fight_type', 'Winner',
       'B_SIG_STR._ATT', 'B_SIG_STR._LANDED',
       'B_TOTAL_STR._ATT', 'B_TOTAL_STR._LANDED',
       'B_TD_ATT', 'B_TD_LANDED',
       'B_HEAD_ATT', 'B_HEAD_LANDED', 
       'B_BODY_ATT', 'B_BODY_LANDED', 
       'B_LEG_ATT', 'B_LEG_LANDED', 
       'B_DISTANCE_ATT', 'B_DISTANCE_LANDED',
       'B_CLINCH_ATT', 'B_CLINCH_LANDED',
       'B_GROUND_ATT', 'B_GROUND_LANDED',
       'title_bout', 'weight_class', 'total_time_fought(sec)', 'no_of_rounds']]

- get rid of red/blue corner prefixes in order to union fighter history

In [1519]:
def drop_prefix(self, prefix):
    self.columns = self.columns.str.replace('^'+prefix,'')
    return self

pd.core.frame.DataFrame.drop_prefix = drop_prefix

In [1520]:
union = pd.concat([df_red.drop_prefix('R_'), df_blue.drop_prefix('B_')])

- join this combined fight history DataFrame to the originial fighter DataFrame

In [1521]:
union[union['fighter']=='Daniel Cormier'].T

Unnamed: 0,268,698,1052,1269,1426,1790,2130,2822,2932,3091,838,2314,2485,3269
fighter,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier,Daniel Cormier
KD,0,0,0,0,0,0,0,0,1,0,1,0,0,0
SIG_STR_pct,0.68,0.76,0.52,0.41,0.87,0.51,0.49,0.62,0.48,0.5,0.67,0.56,0.35,0.61
TD_pct,0.33,1,0.66,0,0.5,1,0.2,1,0,0.6,0,0.37,0.12,0
SUB_ATT,0,1,1,0,1,0,0,2,0,0,0,3,0,0
PASS,2,4,3,0,1,3,2,11,0,0,0,4,0,0
REV,0,0,0,0,0,0,0,0,0,0,0,0,0,0
win_by,KO/TKO,Submission,KO/TKO,Overturned,Submission,Decision - Unanimous,Decision - Split,Submission,KO/TKO,Decision - Unanimous,KO/TKO,Submission,Decision - Unanimous,Decision - Unanimous
last_round,4,2,2,3,2,3,5,3,1,3,1,3,5,3
last_round_time,249,134,120,181,217,300,300,233,79,300,273,159,300,300


In [1522]:
df_fighter_history = pd.merge(df_fighters, union, left_on='fighter_name', right_on='fighter', how='left', indicator=True)

- 1,330 fighters without any fight stats (in original fighter dataset)
    - However, every fighter involved in a historical bout is contained in the original fighter dataset

In [1523]:
union.shape

(11048, 38)

In [1524]:
df_fighter_history.shape

(12370, 45)

In [1525]:
df_fighter_history._merge.value_counts()

both          11040
left_only      1330
right_only        0
Name: _merge, dtype: int64

- lack of depth in individual fight history presents a problem for forecasting fighter performance

In [1526]:
df_fighter_history['fighter_name'].value_counts()

Donald Cerrone      34
Jim Miller          34
Jeremy Stephens     32
Andrei Arlovski     31
Diego Sanchez       31
                    ..
Ricardo Palacios     1
Matt Horning         1
Kelly Dullanty       1
Phil Cardella        1
Alex Gorgees         1
Name: fighter_name, Length: 3334, dtype: int64

<h3>Feature Engineering</h3>

In [1527]:
df_fighter_history.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12360,12361,12362,12363,12364,12365,12366,12367,12368,12369
fighter_name,Tom Aaron,Danny Abbadi,Danny Abbadi,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,...,Cat Zingano,Cat Zingano,Cat Zingano,Cat Zingano,Igor Zinoviev,Dave Zitanick,Alex Zuniga,George Zuniga,Allan Zuniga,Virgil Zwicker
Height,69.5797,71,71,72,72,72,72,72,72,72,...,66,66,66,66,73,71.0858,68.4629,69,67,74
Weight,155,155,155,265,265,265,265,265,265,265,...,145,145,145,145,199,170,145,185,155,205
Reach,70,72.6813,72.6813,73.75,73.75,73.75,73.75,73.75,73.75,73.75,...,68,68,68,68,74.7083,72.018,72.018,70.8698,70,74
Stance,,Orthodox,Orthodox,Switch,Switch,Switch,Switch,Switch,Switch,Switch,...,Southpaw,Southpaw,Southpaw,Southpaw,Orthodox,,,,Orthodox,
DOB,"Jul 13, 1978","Jul 03, 1983","Jul 03, 1983",,,,,,,,...,"Jul 01, 1982","Jul 01, 1982","Jul 01, 1982","Jul 01, 1982",,"Mar 05, 1980",,,"Apr 04, 1992","Jun 26, 1982"
fighter,,Danny Abbadi,Danny Abbadi,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,...,Cat Zingano,Cat Zingano,Cat Zingano,Cat Zingano,Igor Zinoviev,,,,Allan Zuniga,
KD,,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,,,,0,
SIG_STR_pct,,0.38,0.33,0.68,0.41,0.52,0.44,0.88,1,0.66,...,0.47,0.66,0,0.83,0,,,,0.52,
TD_pct,,0,0,0,0.75,1,0,1,1,1,...,0.75,1,0,1,0,,,,0,


In [1528]:
df_fighter_history.drop(columns = ['fighter','Format','Referee','Fight_type'], inplace=True)

- there are too many distinct locations
    - in order to create a more signifcant feature, location is adapted to a binary indicator of whether or not the fight took place in Las Vegas, Nevada (i.e. the most popular fight location)

In [1562]:
df_fighter_history['location'].value_counts()

0    8512
1    3858
Name: location, dtype: int64

In [None]:
df_fighter_history['location']=df_fighter_history['location'].apply(lambda x: 1 if x.find('Las Vegas')!=-1 else 0)

- can see that NaN values in fight stats columns are resulting from the left merge
    - need to populate these values (e.g. averages)
    - this will not detract from overall data quality because these fighters will not be included in the training data (i.e. real historical bouts)

In [1546]:
df_fighter_history.isna().sum()

fighter_name              0
Height                    0
Weight                    0
Reach                     0
Stance                    0
DOB                       0
KD                        0
SIG_STR_pct               0
TD_pct                    0
SUB_ATT                   0
PASS                      0
REV                       0
win_by                    0
last_round                0
last_round_time           0
date                      0
location                  0
SIG_STR._ATT              0
SIG_STR._LANDED           0
TOTAL_STR._ATT            0
TOTAL_STR._LANDED         0
TD_ATT                    0
TD_LANDED                 0
HEAD_ATT                  0
HEAD_LANDED               0
BODY_ATT                  0
BODY_LANDED               0
LEG_ATT                   0
LEG_LANDED                0
DISTANCE_ATT              0
DISTANCE_LANDED           0
CLINCH_ATT                0
CLINCH_LANDED             0
GROUND_ATT                0
GROUND_LANDED             0
title_bout          

- recode winner column to binary

In [1530]:
df_fighter_history['won'] = df_fighter_history.apply(lambda row: 1 if row['Winner'] == row['fighter_name'] else 0, axis=1)

df_fighter_history.drop(columns = ['Winner'], inplace=True)

- change Date of Birth and fight date from string to datetime

In [1532]:
from datetime import datetime

month_code = {'Jan ': 'January ', 
      'Feb ': 'February ', 
      'Mar ': 'March ', 
      'Apr ': 'April ', 
      'May ': 'May ', 
      'Jun ': 'June ', 
      'Jul ': 'July ', 
      'Aug ': 'August ', 
      'Sep ': 'September ', 
      'Oct ': 'October ', 
      'Nov ': 'November ', 
      'Dec ': 'December '}

for k, v in month_code.items():
    df_fighter_history['DOB'] = df_fighter_history['DOB'].apply(lambda x: x.replace(k, v) if type(x) == str else x)

df_fighter_history['DOB'] = df_fighter_history['DOB'].apply(lambda row: datetime.strptime(row, '%B %d, %Y') if type(row) == str else row)
df_fighter_history['date'] = df_fighter_history['date'].apply(lambda row: datetime.strptime(row, '%B %d, %Y') if type(row) == str else row)

- define feature groups by broad data type

In [1533]:
num_features = ['KD', 'SIG_STR_pct', 'TD_pct', 'SUB_ATT', 'PASS', 'REV',
       'last_round', 'last_round_time','SIG_STR._ATT',
       'SIG_STR._LANDED', 'TOTAL_STR._ATT', 'TOTAL_STR._LANDED', 'TD_ATT',
       'TD_LANDED', 'HEAD_ATT', 'HEAD_LANDED', 'BODY_ATT', 'BODY_LANDED',
       'LEG_ATT', 'LEG_LANDED', 'DISTANCE_ATT', 'DISTANCE_LANDED',
       'CLINCH_ATT', 'CLINCH_LANDED', 'GROUND_ATT', 'GROUND_LANDED',
       'total_time_fought(sec)', 'no_of_rounds']

categorical_features = ['Stance','win_by',
       'last_round', 'last_round_time', 'location',
       'title_bout', 'weight_class']

date_features = ['DOB','date']

- replacing NaN values:
    - numerical: column mean
    - categorica: column mode
    - date: column mean

In [1540]:
df_fighter_history.fillna(df_fighter_history.mean(), inplace=True)
df_fighter_history.fillna(df.mode().iloc[0], inplace=True)
df_fighter_history = df_fighter_history.apply(lambda x:x.fillna(x.value_counts().index[0]))
df_fighter_history['date'] = df_fighter_history['date'].apply(lambda row: datetime.strptime(row, '%B %d, %Y') if type(row) == str else row)

- creating age (at fight date) feature

In [None]:
df_fighter_history['age'] = df_fighter_history['date'] - df_fighter_history['DOB']
df_fighter_history['age']=df_fighter_history['age']/np.timedelta64(1,'Y')
df_fighter_history['age']=df_fighter_history['age'].apply(lambda x: 25 if x <=18 else x)

In [1552]:
df_fighter_history.isna().sum()

fighter_name              0
Height                    0
Weight                    0
Reach                     0
Stance                    0
DOB                       0
KD                        0
SIG_STR_pct               0
TD_pct                    0
SUB_ATT                   0
PASS                      0
REV                       0
win_by                    0
last_round                0
last_round_time           0
date                      0
location                  0
SIG_STR._ATT              0
SIG_STR._LANDED           0
TOTAL_STR._ATT            0
TOTAL_STR._LANDED         0
TD_ATT                    0
TD_LANDED                 0
HEAD_ATT                  0
HEAD_LANDED               0
BODY_ATT                  0
BODY_LANDED               0
LEG_ATT                   0
LEG_LANDED                0
DISTANCE_ATT              0
DISTANCE_LANDED           0
CLINCH_ATT                0
CLINCH_LANDED             0
GROUND_ATT                0
GROUND_LANDED             0
title_bout          

In [1554]:
df_fighter_history.drop(columns='_merge', inplace=True)

In [1555]:
df_fighter_history.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12360,12361,12362,12363,12364,12365,12366,12367,12368,12369
fighter_name,Tom Aaron,Danny Abbadi,Danny Abbadi,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,...,Cat Zingano,Cat Zingano,Cat Zingano,Cat Zingano,Igor Zinoviev,Dave Zitanick,Alex Zuniga,George Zuniga,Allan Zuniga,Virgil Zwicker
Height,69.5797,71,71,72,72,72,72,72,72,72,...,66,66,66,66,73,71.0858,68.4629,69,67,74
Weight,155,155,155,265,265,265,265,265,265,265,...,145,145,145,145,199,170,145,185,155,205
Reach,70,72.6813,72.6813,73.75,73.75,73.75,73.75,73.75,73.75,73.75,...,68,68,68,68,74.7083,72.018,72.018,70.8698,70,74
Stance,Orthodox,Orthodox,Orthodox,Switch,Switch,Switch,Switch,Switch,Switch,Switch,...,Southpaw,Southpaw,Southpaw,Southpaw,Orthodox,Orthodox,Orthodox,Orthodox,Orthodox,Orthodox
DOB,1978-07-13 00:00:00,1983-07-03 00:00:00,1983-07-03 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,...,1982-07-01 00:00:00,1982-07-01 00:00:00,1982-07-01 00:00:00,1982-07-01 00:00:00,1984-09-26 00:00:00,1980-03-05 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1992-04-04 00:00:00,1982-06-26 00:00:00
KD,0.215127,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0.215127,0.215127,0.215127,0,0.215127
SIG_STR_pct,0.440723,0.38,0.33,0.68,0.41,0.52,0.44,0.88,1,0.66,...,0.47,0.66,0,0.83,0,0.440723,0.440723,0.440723,0.52,0.440723
TD_pct,0.275527,0,0,0,0.75,1,0,1,1,1,...,0.75,1,0,1,0,0.275527,0.275527,0.275527,0,0.275527
SUB_ATT,0.422011,0,0,0,0,0,1,1,1,0,...,0,3,0,1,0,0.422011,0.422011,0.422011,1,0.422011


In [None]:
df_fighter_history.columns

In [1564]:
df_fighter_history.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12360,12361,12362,12363,12364,12365,12366,12367,12368,12369
fighter_name,Tom Aaron,Danny Abbadi,Danny Abbadi,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,David Abbott,...,Cat Zingano,Cat Zingano,Cat Zingano,Cat Zingano,Igor Zinoviev,Dave Zitanick,Alex Zuniga,George Zuniga,Allan Zuniga,Virgil Zwicker
Height,69.5797,71,71,72,72,72,72,72,72,72,...,66,66,66,66,73,71.0858,68.4629,69,67,74
Weight,155,155,155,265,265,265,265,265,265,265,...,145,145,145,145,199,170,145,185,155,205
Reach,70,72.6813,72.6813,73.75,73.75,73.75,73.75,73.75,73.75,73.75,...,68,68,68,68,74.7083,72.018,72.018,70.8698,70,74
Stance,Orthodox,Orthodox,Orthodox,Switch,Switch,Switch,Switch,Switch,Switch,Switch,...,Southpaw,Southpaw,Southpaw,Southpaw,Orthodox,Orthodox,Orthodox,Orthodox,Orthodox,Orthodox
DOB,1978-07-13 00:00:00,1983-07-03 00:00:00,1983-07-03 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,...,1982-07-01 00:00:00,1982-07-01 00:00:00,1982-07-01 00:00:00,1982-07-01 00:00:00,1984-09-26 00:00:00,1980-03-05 00:00:00,1984-09-26 00:00:00,1984-09-26 00:00:00,1992-04-04 00:00:00,1982-06-26 00:00:00
KD,0.215127,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0.215127,0.215127,0.215127,0,0.215127
SIG_STR_pct,0.440723,0.38,0.33,0.68,0.41,0.52,0.44,0.88,1,0.66,...,0.47,0.66,0,0.83,0,0.440723,0.440723,0.440723,0.52,0.440723
TD_pct,0.275527,0,0,0,0.75,1,0,1,1,1,...,0.75,1,0,1,0,0.275527,0.275527,0.275527,0,0.275527
SUB_ATT,0.422011,0,0,0,0,0,1,1,1,0,...,0,3,0,1,0,0.422011,0.422011,0.422011,1,0.422011


In [1563]:
df_fighter_history['win_by'].value_counts()

Decision - Unanimous       5115
KO/TKO                     3517
Submission                 2260
Decision - Split           1066
TKO - Doctor's Stoppage     148
Decision - Majority         122
Overturned                   74
DQ                           34
Could Not Continue           30
Other                         4
Name: win_by, dtype: int64