<h1>Cleaning, EDA, and Feature Engineering after Scraping</h1>

- Acknowledgements:
    - ufcstats for comprehensive data sets on past MMA bouts: http://ufcstats.com/
    - Rajeev Warrier for providing the groundwork for this prediction project: https://github.com/WarrierRajeev/UFC-Predictions

In [24]:
import pandas as pd
DATA_PATH ='./data'
df_fighters = pd.read_csv(DATA_PATH+'/fighter_details.csv')
df_fights = pd.read_csv(DATA_PATH+'/total_fight_data.csv', sep=';')

In [25]:
df_fighters.head()

Unnamed: 0,fighter_name,Height,Weight,Reach,Stance,DOB
0,Tom Aaron,,155 lbs.,,,"Jul 13, 1978"
1,Danny Abbadi,"5' 11""",155 lbs.,,Orthodox,"Jul 03, 1983"
2,David Abbott,"6' 0""",265 lbs.,,Switch,
3,Shamil Abdurakhimov,"6' 3""",235 lbs.,"76""",Orthodox,"Sep 02, 1981"
4,Hiroyuki Abe,"5' 6""",145 lbs.,,Orthodox,


In [26]:
df_fights.head(3).T

Unnamed: 0,0,1,2
R_fighter,Israel Adesanya,Weili Zhang,Beneil Dariush
B_fighter,Yoel Romero,Joanna Jedrzejczyk,Drakkar Klose
R_KD,0,0,1
B_KD,0,0,0
R_SIG_STR.,48 of 132,165 of 408,12 of 20
B_SIG_STR.,40 of 89,186 of 360,15 of 25
R_SIG_STR_pct,36%,40%,60%
B_SIG_STR_pct,44%,51%,60%
R_TOTAL_STR.,48 of 132,170 of 413,17 of 28
B_TOTAL_STR.,40 of 89,196 of 370,27 of 37


<h3>Processing Fight data set</h3>

In [27]:
df_fights.dtypes

R_fighter          object
B_fighter          object
R_KD                int64
B_KD                int64
R_SIG_STR.         object
B_SIG_STR.         object
R_SIG_STR_pct      object
B_SIG_STR_pct      object
R_TOTAL_STR.       object
B_TOTAL_STR.       object
R_TD               object
B_TD               object
R_TD_pct           object
B_TD_pct           object
R_SUB_ATT           int64
B_SUB_ATT           int64
R_PASS              int64
B_PASS              int64
R_REV               int64
B_REV               int64
R_HEAD             object
B_HEAD             object
R_BODY             object
B_BODY             object
R_LEG              object
B_LEG              object
R_DISTANCE         object
B_DISTANCE         object
R_CLINCH           object
B_CLINCH           object
R_GROUND           object
B_GROUND           object
win_by             object
last_round          int64
last_round_time    object
Format             object
Referee            object
date               object
location    

- split attack stats into attempts/landed numerical format

In [28]:
df_fights.columns
attack_cols = ['R_SIG_STR.', 'B_SIG_STR.','R_TOTAL_STR.', 'B_TOTAL_STR.',
       'R_TD', 'B_TD', 'R_HEAD', 'B_HEAD', 'R_BODY',
       'B_BODY', 'R_LEG', 'B_LEG', 'R_DISTANCE', 'B_DISTANCE', 'R_CLINCH',
       'B_CLINCH', 'R_GROUND', 'B_GROUND']

In [29]:
for col in attack_cols:
    df_fights[col+'_ATT'] = df_fights[col].apply(lambda x: int(x.split('of')[1]))
    df_fights[col+'_LANDED'] = df_fights[col].apply(lambda x: int(x.split('of')[0]))

In [30]:
df_fights.drop(attack_cols, axis=1, inplace=True)

In [31]:
df_fights.head().T

Unnamed: 0,0,1,2,3,4
R_fighter,Israel Adesanya,Weili Zhang,Beneil Dariush,Neil Magny,Alex Oliveira
B_fighter,Yoel Romero,Joanna Jedrzejczyk,Drakkar Klose,Jingliang Li,Max Griffin
R_KD,0,0,1,0,0
B_KD,0,0,0,0,0
R_SIG_STR_pct,36%,40%,60%,53%,51%
B_SIG_STR_pct,44%,51%,60%,25%,46%
R_TD_pct,0%,12%,33%,57%,20%
B_TD_pct,0%,0%,0%,66%,66%
R_SUB_ATT,0,0,2,0,0
B_SUB_ATT,0,0,0,0,0


- check for NULL values

In [34]:
for col in df_fights:
    if df_fights[col].isnull().sum()!=0:
        print(f'Null count in {col} = {df_fights[col].isnull().sum()}')

Null count in Referee = 25
Null count in Winner = 92


In [38]:
df_fights[df_fights['Winner'].isnull()]['win_by'].value_counts()

Overturned              37
Decision - Majority     22
Could Not Continue      15
Decision - Split        11
Decision - Unanimous     5
Other                    2
Name: win_by, dtype: int64

In [40]:
df_fights['Winner'].fillna('Draw', inplace=True)

- convert percentages to decimal values

In [44]:
percentage_columns = ['R_SIG_STR_pct', 'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct']

for col in percentage_columns:
    df_fights[col] = df_fights[col].apply(lambda x : float(x.replace('%',''))/100)

- isolating Title fights and weight classes

In [50]:
df_fights['Fight_type'].value_counts()[df_fights['Fight_type'].value_counts() > 1].index

Index(['Lightweight Bout', 'Welterweight Bout', 'Middleweight Bout',
       'Light Heavyweight Bout', 'Heavyweight Bout', 'Featherweight Bout',
       'Bantamweight Bout', 'Flyweight Bout', 'Women's Strawweight Bout',
       'Women's Bantamweight Bout', 'Open Weight Bout',
       'Women's Flyweight Bout', 'UFC Light Heavyweight Title Bout',
       'UFC Welterweight Title Bout', 'UFC Heavyweight Title Bout',
       'UFC Middleweight Title Bout', 'UFC Lightweight Title Bout',
       'Catch Weight Bout', 'UFC Flyweight Title Bout',
       'UFC Women's Bantamweight Title Bout', 'UFC Featherweight Title Bout',
       'UFC Women's Strawweight Title Bout', 'UFC Bantamweight Title Bout',
       'Women's Featherweight Bout', 'UFC Interim Heavyweight Title Bout',
       'UFC Superfight Championship Bout', 'UFC Women's Flyweight Title Bout',
       'UFC Women's Featherweight Title Bout',
       'UFC Interim Middleweight Title Bout',
       'UFC Interim Featherweight Title Bout',
       'UFC Inter

In [51]:
df_fights['title_bout'] = df_fights['Fight_type'].apply(lambda x: 1 if 'Title Bout' in x else 0) 

In [56]:
weight_classes = ['Women\'s Strawweight', 'Women\'s Bantamweight', 
                  'Women\'s Featherweight', 'Women\'s Flyweight', 'Lightweight', 
                  'Welterweight', 'Middleweight','Light Heavyweight', 
                  'Heavyweight', 'Featherweight','Bantamweight', 'Flyweight', 'Open Weight']

def make_weight_class(x):
    for weight_class in weight_classes:
        if weight_class in x:
            return weight_class
    if x == 'Catch Weight Bout' or 'Catchweight Bout':
        return 'Catch Weight'
    else:
        return 'Open Weight'

In [58]:
df_fights['weight_class'] = df_fights['Fight_type'].apply(make_weight_class)

In [60]:
df_fights['weight_class'].value_counts()

Lightweight              1039
Welterweight             1023
Middleweight              762
Heavyweight               537
Light Heavyweight         535
Featherweight             488
Bantamweight              421
Flyweight                 204
Women's Strawweight       164
Women's Bantamweight      129
Open Weight                92
Women's Flyweight          77
Catch Weight               39
Women's Featherweight      14
Name: weight_class, dtype: int64

- isolate total fight time (seconds)

In [63]:
df_fights['Format'].value_counts()

3 Rnd (5-5-5)           4847
5 Rnd (5-5-5-5-5)        458
1 Rnd + OT (12-3)         79
No Time Limit             37
3 Rnd + OT (5-5-5-5)      22
1 Rnd (20)                20
1 Rnd + 2OT (15-3-3)      20
2 Rnd (5-5)               11
1 Rnd (15)                 8
1 Rnd (10)                 6
1 Rnd (12)                 4
1 Rnd + OT (30-5)          3
1 Rnd (18)                 2
1 Rnd + OT (15-3)          2
1 Rnd + OT (27-3)          1
1 Rnd + OT (31-5)          1
1 Rnd + OT (30-3)          1
1 Rnd (30)                 1
1 Rnd + 2OT (24-3-3)       1
Name: Format, dtype: int64

In [64]:
time_in_first_round = {'3 Rnd (5-5-5)': 5*60, 
                       '5 Rnd (5-5-5-5-5)': 5*60, 
                       '1 Rnd + OT (12-3)': 12*60,
                       'No Time Limit': 1, 
                       '3 Rnd + OT (5-5-5-5)': 5*60, 
                       '1 Rnd (20)': 1*20,
                       '2 Rnd (5-5)': 5*60, 
                       '1 Rnd (15)': 15*60, 
                       '1 Rnd (10)': 10*60,
                       '1 Rnd (12)':12*60, 
                       '1 Rnd + OT (30-5)': 30*60, 
                       '1 Rnd (18)': 18*60, 
                       '1 Rnd + OT (15-3)': 15*60,
                       '1 Rnd (30)': 30*60, 
                       '1 Rnd + OT (31-5)': 31*5,
                       '1 Rnd + OT (27-3)': 27*60, 
                       '1 Rnd + OT (30-3)': 30*60}

exception_format_time = {'1 Rnd + 2OT (15-3-3)': [15*60, 3*60], 
                         '1 Rnd + 2OT (24-3-3)': [24*60, 3*60]}

# '1 Rnd + 2OT (15-3-3)' and '1 Rnd + 2OT (24-3-3)' is not included because it has 3 uneven timed rounds. 
# We'll have to deal with it separately

In [66]:
# Converting to seconds
df_fights['last_round_time'] = df_fights['last_round_time'].apply(lambda x: int(x.split(':')[0])*60 + int(x.split(':')[1]))

In [67]:
def get_total_time(row):
    if row['Format'] in time_in_first_round.keys():
        return (row['last_round'] - 1) * time_in_first_round[row['Format']] + row['last_round_time']
    elif row['Format'] in exception_format_time.keys():
        if (row['last_round'] - 1) >= 2:
            return exception_format_time[row['Format']][0] + (row['last_round'] - 2) * \
                    exception_format_time[row['Format']][1] + row['last_round_time']
        else:
            return (row['last_round'] - 1) * exception_format_time[row['Format']][0] + row['last_round_time']
    
# So if the fight ended in round 1, we only need last_round_time. 
# If it ended in round 2, we need the full time of round 1 and the last_round_time
# This works for fights with same time in each round and fights with only two rounds.

In [68]:
df_fights['total_time_fought(sec)'] = df_fights.apply(get_total_time, axis=1)

In [79]:
def get_num_rounds(x):
    if x == 'No Time Limit':
        return 1
    else:
        return len((x.split('(')[1].replace(')','').split('-')))
    
df_fights['no_of_rounds'] = df_fights['Format'].apply(get_num_rounds)

<h3>Create prediction DataFrame by integrating fighter data</h3>

In [80]:
df = df_fights.copy()

In [81]:
df.columns

Index(['R_fighter', 'B_fighter', 'R_KD', 'B_KD', 'R_SIG_STR_pct',
       'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct', 'R_SUB_ATT', 'B_SUB_ATT',
       'R_PASS', 'B_PASS', 'R_REV', 'B_REV', 'win_by', 'last_round',
       'last_round_time', 'Format', 'Referee', 'date', 'location',
       'Fight_type', 'Winner', 'R_SIG_STR._ATT', 'R_SIG_STR._LANDED',
       'B_SIG_STR._ATT', 'B_SIG_STR._LANDED', 'R_TOTAL_STR._ATT',
       'R_TOTAL_STR._LANDED', 'B_TOTAL_STR._ATT', 'B_TOTAL_STR._LANDED',
       'R_TD_ATT', 'R_TD_LANDED', 'B_TD_ATT', 'B_TD_LANDED', 'R_HEAD_ATT',
       'R_HEAD_LANDED', 'B_HEAD_ATT', 'B_HEAD_LANDED', 'R_BODY_ATT',
       'R_BODY_LANDED', 'B_BODY_ATT', 'B_BODY_LANDED', 'R_LEG_ATT',
       'R_LEG_LANDED', 'B_LEG_ATT', 'B_LEG_LANDED', 'R_DISTANCE_ATT',
       'R_DISTANCE_LANDED', 'B_DISTANCE_ATT', 'B_DISTANCE_LANDED',
       'R_CLINCH_ATT', 'R_CLINCH_LANDED', 'B_CLINCH_ATT', 'B_CLINCH_LANDED',
       'R_GROUND_ATT', 'R_GROUND_LANDED', 'B_GROUND_ATT', 'B_GROUND_LANDED',
       'titl

In [82]:
df.drop(['R_KD', 'B_KD', 'R_SIG_STR_pct',
       'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct', 'R_SUB_ATT', 'B_SUB_ATT',
       'R_PASS', 'B_PASS', 'R_REV', 'B_REV', 'win_by', 'last_round',
       'last_round_time', 'Format',
       'Fight_type', 'R_SIG_STR._ATT', 'R_SIG_STR._LANDED',
       'B_SIG_STR._ATT', 'B_SIG_STR._LANDED', 'R_TOTAL_STR._ATT',
       'R_TOTAL_STR._LANDED', 'B_TOTAL_STR._ATT', 'B_TOTAL_STR._LANDED',
       'R_TD_ATT', 'R_TD_LANDED', 'B_TD_ATT', 'B_TD_LANDED', 'R_HEAD_ATT',
       'R_HEAD_LANDED', 'B_HEAD_ATT', 'B_HEAD_LANDED', 'R_BODY_ATT',
       'R_BODY_LANDED', 'B_BODY_ATT', 'B_BODY_LANDED', 'R_LEG_ATT',
       'R_LEG_LANDED', 'B_LEG_ATT', 'B_LEG_LANDED', 'R_DISTANCE_ATT',
       'R_DISTANCE_LANDED', 'B_DISTANCE_ATT', 'B_DISTANCE_LANDED',
       'R_CLINCH_ATT', 'R_CLINCH_LANDED', 'B_CLINCH_ATT', 'B_CLINCH_LANDED',
       'R_GROUND_ATT', 'R_GROUND_LANDED', 'B_GROUND_ATT', 'B_GROUND_LANDED',
       'total_time_fought(sec)'], axis = 1, inplace=True)

In [83]:
df

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds
0,Israel Adesanya,Yoel Romero,Dan Miragliotta,"March 07, 2020","Las Vegas, Nevada, USA",Israel Adesanya,1,Middleweight,5
1,Weili Zhang,Joanna Jedrzejczyk,Keith Peterson,"March 07, 2020","Las Vegas, Nevada, USA",Weili Zhang,1,Women's Strawweight,5
2,Beneil Dariush,Drakkar Klose,Jason Herzog,"March 07, 2020","Las Vegas, Nevada, USA",Beneil Dariush,0,Lightweight,3
3,Neil Magny,Jingliang Li,Keith Peterson,"March 07, 2020","Las Vegas, Nevada, USA",Neil Magny,0,Welterweight,3
4,Alex Oliveira,Max Griffin,Mark Smith,"March 07, 2020","Las Vegas, Nevada, USA",Alex Oliveira,0,Welterweight,3
...,...,...,...,...,...,...,...,...,...
5519,Gerard Gordeau,Kevin Rosier,Joao Alberto Barreto,"November 12, 1993","Denver, Colorado, USA",Gerard Gordeau,0,Open Weight,1
5520,Ken Shamrock,Patrick Smith,Joao Alberto Barreto,"November 12, 1993","Denver, Colorado, USA",Ken Shamrock,0,Open Weight,1
5521,Royce Gracie,Art Jimmerson,Joao Alberto Barreto,"November 12, 1993","Denver, Colorado, USA",Royce Gracie,0,Open Weight,1
5522,Kevin Rosier,Zane Frazier,Joao Alberto Barreto,"November 12, 1993","Denver, Colorado, USA",Kevin Rosier,0,Open Weight,1


In [84]:
red_fighters = df['R_fighter'].value_counts().index
blue_fighters = df['B_fighter'].value_counts().index

fighters = list(set(red_fighters) | set(blue_fighters))