In [75]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import pandas as pd
import seaborn as sns


# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ann"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [16]:
# Define a function for opening the dataset
def load_dataset(file_path):
    return pd.read_csv(file_path)

In [22]:
# Open dataset for processing:
DATASET_PATH_CSV = "../data/data.csv"

# ufc dataset:
ufc = load_dataset(DATASET_PATH_CSV)

In [48]:
ufc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5144 entries, 0 to 5143
Columns: 145 entries, R_fighter to R_age
dtypes: bool(1), float64(134), int64(1), object(9)
memory usage: 5.7+ MB


Obtain relevant information about the dataset's datatypes, this way we can know if we have to change something within the dataset for further processing.

In [52]:
ufc.describe()

Unnamed: 0,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,B_avg_DISTANCE_att,B_avg_DISTANCE_landed,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
count,5144.0,5144.0,5144.0,5144.0,3879.0,3879.0,3879.0,3879.0,3879.0,3879.0,...,5144.0,5144.0,5144.0,5144.0,5144.0,5140.0,4828.0,5141.0,4972.0,5080.0
mean,3.119362,0.429821,0.837286,0.0,8.689387,6.083457,8.240461,5.556352,53.156807,19.329445,...,1.177294,1.25486,0.777605,0.071345,3.598173,179.274089,183.664412,172.075861,29.171963,29.442323
std,0.631457,0.731325,1.306203,0.0,7.087703,5.074464,7.575422,5.429531,41.252426,15.961112,...,1.600135,1.801056,1.326056,0.277771,3.709519,8.638978,10.304375,35.164075,4.078538,4.141927
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,152.4,152.4,115.0,18.0,19.0
25%,3.0,0.0,0.0,0.0,3.5,2.333333,3.0,2.0,22.0,7.666667,...,0.0,0.0,0.0,0.0,1.0,172.72,177.8,145.0,26.0,26.0
50%,3.0,0.0,0.0,0.0,7.0,5.0,6.333333,4.2,44.666667,15.2,...,1.0,1.0,0.0,0.0,2.0,180.34,182.88,170.0,29.0,29.0
75%,3.0,1.0,1.0,0.0,12.225,8.5,11.422222,7.738636,74.333333,27.142857,...,2.0,2.0,1.0,0.0,5.0,185.42,190.5,185.0,32.0,32.0
max,5.0,6.0,13.0,0.0,49.0,39.0,87.0,68.0,271.0,130.0,...,10.0,11.0,13.0,2.0,20.0,210.82,213.36,345.0,51.0,47.0


In [51]:
ufc.describe(include=['O'])

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,weight_class,B_Stance,R_Stance
count,5144,5144,5121,5144,5144,5144,5144,4985,5010
unique,1334,1774,190,476,157,3,14,5,5
top,Jim Miller,Jeremy Stephens,Herb Dean,2016-11-19,"Las Vegas, Nevada, USA",Red,Lightweight,Orthodox,Orthodox
freq,23,19,726,25,1216,3470,989,3829,3807


# Important notes to observe
1.- Red side wins more frequently than blue side (3470/5144 = 67%).

2.- Jeremy Stephens fights more frequently in the blue side than any other

3.- Jim Miller fights in the red side more frequently than any other

4.- Herb Dean has been the most frequently appearing referee, being in 726 of 5121 total fights.

5.- 2016 has been the year with most fights (476 of 5144)




Let's fill up NaN values using pandas

In [55]:
ufc.fillna(value=0, inplace=True)

In [56]:
# Let's see only the first entires in the dataset:
ufc.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Henry Cejudo,Marlon Moraes,Marc Goddard,2019-06-08,"Chicago, Illinois, USA",Red,True,Bantamweight,5,0.0,...,2.0,0.0,0.0,8.0,Orthodox,162.56,162.56,135.0,31.0,32.0
1,Valentina Shevchenko,Jessica Eye,Robert Madrigal,2019-06-08,"Chicago, Illinois, USA",Red,True,Women's Flyweight,5,0.0,...,0.0,2.0,0.0,5.0,Southpaw,165.1,167.64,125.0,32.0,31.0
2,Tony Ferguson,Donald Cerrone,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Red,False,Lightweight,3,0.0,...,3.0,6.0,1.0,14.0,Orthodox,180.34,193.04,155.0,36.0,35.0
3,Jimmie Rivera,Petr Yan,Kevin MacDonald,2019-06-08,"Chicago, Illinois, USA",Blue,False,Bantamweight,3,0.0,...,1.0,0.0,0.0,6.0,Orthodox,162.56,172.72,135.0,26.0,29.0
4,Tai Tuivasa,Blagoy Ivanov,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Blue,False,Heavyweight,3,0.0,...,2.0,0.0,0.0,3.0,Southpaw,187.96,190.5,264.0,32.0,26.0


In [57]:
ufc.tail()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
5139,Gerard Gordeau,Kevin Rosier,Joao Alberto Barreto,1993-11-12,"Denver, Colorado, USA",Red,False,Open Weight,1,0.0,...,1.0,0.0,0.0,1.0,Orthodox,195.58,0.0,216.0,0.0,34.0
5140,Ken Shamrock,Patrick Smith,Joao Alberto Barreto,1993-11-12,"Denver, Colorado, USA",Red,False,Open Weight,1,0.0,...,0.0,0.0,0.0,0.0,Orthodox,185.42,182.88,205.0,30.0,29.0
5141,Royce Gracie,Art Jimmerson,Joao Alberto Barreto,1993-11-12,"Denver, Colorado, USA",Red,False,Open Weight,1,0.0,...,0.0,0.0,0.0,0.0,Southpaw,185.42,0.0,175.0,30.0,26.0
5142,Kevin Rosier,Zane Frazier,Joao Alberto Barreto,1993-11-12,"Denver, Colorado, USA",Red,False,Open Weight,1,0.0,...,0.0,0.0,0.0,0.0,Orthodox,193.04,0.0,275.0,0.0,0.0
5143,Gerard Gordeau,Teila Tuli,Joao Alberto Barreto,1993-11-12,"Denver, Colorado, USA",Red,False,Open Weight,1,0.0,...,0.0,0.0,0.0,0.0,Orthodox,195.58,0.0,216.0,24.0,34.0


In [78]:
# Lets make a correlation matrix:
corr_matrix = ufc.corr(()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_matrix, vmax=.8, square=True);


TypeError: corr() got an unexpected keyword argument 'include'

In [36]:
# Correlation matrix to see what variables relate the most
corr_matrix = ufc.corr()
corr_matrix['B_avg_BODY_landed'].sort_values(ascending=False)

B_avg_BODY_landed       1.000000
B_avg_BODY_att          0.956776
B_avg_SIG_STR_landed    0.747965
B_avg_SIG_STR_att       0.658123
B_avg_TOTAL_STR_att     0.646417
                          ...   
R_Height_cms           -0.199331
B_Weight_lbs           -0.216211
R_Weight_lbs           -0.217841
B_draw                       NaN
R_draw                       NaN
Name: B_avg_BODY_landed, Length: 136, dtype: float64

This, as a first glimpse of the dataset, could give us really relevant information: We now know that the number of body shots landed, is directly and is most correlated to the number of body shots attempted.
We also know that the most SIGNIFICANT shots the fighter has given another, the most likely it is to land more shots, but this might not come as the most new information