# Process UFC events/fights dataset 

In [1]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

<IPython.core.display.Javascript object>

In [5]:
import json
import ufc_events_eda.utils.paths as path
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from ufc_events_eda.data.parse_json import parse_json

<IPython.core.display.Javascript object>

### Read dataset into dataframe

In [6]:
# Load the raw events file through the project's parse_json helper and bind the
# result to `df`, the frame every later cell works from.
# NOTE(review): path.data_raw_dir presumably resolves the file name inside the
# project's raw-data directory — confirm in ufc_events_eda.utils.paths.
df = parse_json(path.data_raw_dir('ufc_events.json'))
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,fighter_1,fighter_2,fighter_1_kd,fighter_2_kd,fighter_1_str,fighter_2_str,fighter_1_td,fighter_2_td,fighter_1_sub,fighter_2_sub,weigh_class,method,method_detail,round,time,closure,is_main_event,event_name,event_date,event_location
0,Israel Adesanya,Marvin Vettori,0,0,96,58,0,4,0,0,Middleweight,U-DEC,,5,5:00,win,True,UFC 263: Adesanya vs. Vettori 2,"June 12, 2021","Glendale, Arizona, USA"
1,Brandon Moreno,Deiveson Figueiredo,1,0,47,24,2,1,2,0,Flyweight,SUB,Rear Naked Choke,3,2:26,win,False,UFC 263: Adesanya vs. Vettori 2,"June 12, 2021","Glendale, Arizona, USA"
2,Leon Edwards,Nate Diaz,0,0,77,76,4,0,0,0,Welterweight,U-DEC,,5,5:00,win,False,UFC 263: Adesanya vs. Vettori 2,"June 12, 2021","Glendale, Arizona, USA"
3,Belal Muhammad,Demian Maia,0,0,45,21,0,1,0,0,Welterweight,U-DEC,,3,5:00,win,False,UFC 263: Adesanya vs. Vettori 2,"June 12, 2021","Glendale, Arizona, USA"
4,Paul Craig,Jamahal Hill,0,0,23,1,0,0,3,0,Light Heavyweight,KO/TKO,Punches,1,1:59,win,False,UFC 263: Adesanya vs. Vettori 2,"June 12, 2021","Glendale, Arizona, USA"


<IPython.core.display.Javascript object>

### Print shape, columns and data types
Check references/columns.md to see what each column represents in detail

In [5]:
# Emit the (rows, columns) tuple and the column index on separate lines.
print(df.shape, df.columns, sep="\n")

(6510, 20)
Index(['fighter_1', 'fighter_2', 'fighter_1_kd', 'fighter_2_kd',
       'fighter_1_str', 'fighter_2_str', 'fighter_1_td', 'fighter_2_td',
       'fighter_1_sub', 'fighter_2_sub', 'weigh_class', 'method',
       'method_detail', 'round', 'time', 'closure', 'is_main_event',
       'event_name', 'event_date', 'event_location'],
      dtype='object')


<IPython.core.display.Javascript object>

In [6]:
# Inspect the dtype pandas assigned to each column of the loaded frame.
df.dtypes

fighter_1         object
fighter_2         object
fighter_1_kd      object
fighter_2_kd      object
fighter_1_str     object
fighter_2_str     object
fighter_1_td      object
fighter_2_td      object
fighter_1_sub     object
fighter_2_sub     object
weigh_class       object
method            object
method_detail     object
round             object
time              object
closure           object
is_main_event       bool
event_name        object
event_date        object
event_location    object
dtype: object

<IPython.core.display.Javascript object>

### Convert numeric strings to float or int

In [8]:
# Per-fighter stat columns plus the round number: all of them are meant to be
# numeric but are still object dtype at this point.
num_columns = [
    "fighter_1_kd",
    "fighter_2_kd",
    "fighter_1_str",
    "fighter_2_str",
    "fighter_1_td",
    "fighter_2_td",
    "fighter_1_sub",
    "fighter_2_sub",
    "round",
]

# Convert column by column. Unparseable entries are coerced to NaN, and pandas
# downcasts to the smallest unsigned integer type only where the parsed column
# allows it (otherwise the column stays float).
df[num_columns] = df[num_columns].apply(
    lambda column: pd.to_numeric(column, errors="coerce", downcast="unsigned")
)


<IPython.core.display.Javascript object>

In [5]:
# Check which dtype each column listed in num_columns has after conversion.
df[num_columns].dtypes

fighter_1_kd     float64
fighter_2_kd     float64
fighter_1_str    float64
fighter_2_str    float64
fighter_1_td     float64
fighter_2_td     float64
fighter_1_sub    float64
fighter_2_sub    float64
round              uint8
dtype: object

<IPython.core.display.Javascript object>

### Separate fights from events (keeping event name in fights dataframe)

In [9]:
# One row per event: keep the first row seen for each event_name and only the
# event-level columns. The explicit .copy() makes df_events an independent
# frame, so assigning columns on it in later cells does not raise pandas'
# SettingWithCopyWarning.
df_events = df.drop_duplicates(subset=["event_name"])[
    ["event_name", "event_date", "event_location"]
].copy()

# Fight-level frame: the event-level columns are dropped, while event_name is
# kept so each fight can still be linked back to its row in df_events.
df_fights = df.drop(columns=["event_location", "event_date"])

<IPython.core.display.Javascript object>

### Cast event_date to datetime

In [10]:
# Parse dates written like "June 12, 2021". The strings are read from
# df_events' own column (not from the fight-level df), so only one value per
# event is parsed and the result does not depend on index alignment between
# the two frames. assign() returns a new frame, which avoids writing into a
# slice of another DataFrame.
df_events = df_events.assign(
    event_date=pd.to_datetime(df_events["event_date"], format="%B %d, %Y")
)

<IPython.core.display.Javascript object>

### Fix inconsistent location entries

In [11]:
def fix_inconsistent_location(df):
    """Return a copy of ``df`` with inconsistent ``event_location`` entries unified.

    Any location whose text contains one of the known city names
    (case-insensitive, literal match) is replaced by that city's canonical
    "City, Country" string. When several city names match, the first one in
    the table below wins. Locations that match none of them, including
    missing values, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``event_location`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``; the input
        frame is not modified.
    """
    # (substring to look for, canonical replacement), in priority order.
    city_patterns = [
        ("Abu Dhabi", "Abu Dhabi, United Arab Emirates"),
        ("Sao Paulo", "Sao Paulo, Brazil"),
        ("Rio de Janeiro", "Rio de Janeiro, Brazil"),
        ("Berlin", "Berlin, Germany"),
        ("Saitama", "Saitama, Japan"),
    ]

    location = df["event_location"]
    # regex=False: the city names are plain text, not patterns.
    # na=False: a missing location counts as "no match", which keeps every
    # mask strictly boolean as np.select requires.
    city_criteria = [
        location.str.contains(city, case=False, regex=False, na=False)
        for city, _ in city_patterns
    ]
    city_values = [canonical for _, canonical in city_patterns]

    # None marks rows where no city matched.
    normalized = pd.Series(
        np.select(city_criteria, city_values, None), index=df.index, dtype=object
    )

    # Work on a copy so the caller's frame is never altered; unmatched rows
    # fall back to their original location.
    result = df.copy()
    result["event_location"] = normalized.where(normalized.notna(), location)
    return result


<IPython.core.display.Javascript object>

In [12]:
# Apply the location fixes; df_events is rebound to the frame the function returns.
df_events = fix_inconsistent_location(df_events)

<IPython.core.display.Javascript object>

### Merge df_events with df_city to add latitude and longitude for event_location

In [13]:
# City lookup table stored as parquet.
# NOTE(review): path.data_interim_dir presumably points at the project's
# interim-data directory — confirm in ufc_events_eda.utils.paths.
df_city = pd.read_parquet(path.data_interim_dir("cities.parquet"))

# Left join keeps every event even when its location has no match in df_city;
# afterwards only the first row per event_name is kept, in case a location
# matched more than one city row.
df_events_merged = df_events.merge(
    df_city, how="left", on="event_location"
).drop_duplicates(subset="event_name")

<IPython.core.display.Javascript object>

### Check null values

In [16]:
# Count missing values per column of the fights frame.
df_fights.isna().sum()

fighter_1           0
fighter_2           0
fighter_1_kd       21
fighter_2_kd       21
fighter_1_str      21
fighter_2_str      21
fighter_1_td       21
fighter_2_td       21
fighter_1_sub      21
fighter_2_sub      21
weigh_class         0
method              0
method_detail    3217
round               0
time                0
closure             0
is_main_event       0
event_name          0
event_date          0
dtype: int64

<IPython.core.display.Javascript object>

We can see there are 3217 fights that do not have method_detail. Also, there are 21 fights that do not have strikes, knockdowns, takedowns and submissions data. Let's see if we should keep those fights in the dataset.

In [17]:
# Rows of the fights frame whose fighter_1_kd value is missing.
df_fights.loc[df_fights["fighter_1_kd"].isna()]

Unnamed: 0,fighter_1,fighter_2,fighter_1_kd,fighter_2_kd,fighter_1_str,fighter_2_str,fighter_1_td,fighter_2_td,fighter_1_sub,fighter_2_sub,weigh_class,method,method_detail,round,time,closure,is_main_event,event_name,event_date
306,Marcus Bossett,Eldo Xavier Dias,,,,,,,,,Open Weight,KO/TKO,,1,4:55,win,False,UFC 4: Revenge of the Warriors,"December 16, 1994"
307,Joe Charles,Kevin Rosier,,,,,,,,,Open Weight,SUB,Armbar,1,0:14,win,False,UFC 4: Revenge of the Warriors,"December 16, 1994"
326,Anthony Macias,He-Man Gipson,,,,,,,,,Open Weight,KO/TKO,Headbutts,1,3:06,win,False,UFC 6: Clash of the Titans,"July 14, 1995"
327,Joel Sutton,Jack McGlaughlin,,,,,,,,,Open Weight,KO/TKO,Punches,1,2:01,win,False,UFC 6: Clash of the Titans,"July 14, 1995"
337,Onassis Parungao,Francesco Maturi,,,,,,,,,Open Weight,KO/TKO,,1,5:26,win,False,UFC 7: The Brawl in Buffalo,"September 08, 1995"
338,Joel Sutton,Geza Kalman,,,,,,,,,Open Weight,KO/TKO,,1,0:48,win,False,UFC 7: The Brawl in Buffalo,"September 08, 1995"
346,Mark Hall,Trent Jenkins,,,,,,,,,Open Weight,SUB,,1,5:29,win,False,UFC - Ultimate Ultimate '95,"December 16, 1995"
347,Joe Charles,Scott Bessac,,,,,,,,,Open Weight,SUB,Armbar,1,4:38,win,False,UFC - Ultimate Ultimate '95,"December 16, 1995"
356,Sam Adkins,Keith Mielke,,,,,,,,,Open Weight,KO/TKO,Punches,1,0:50,win,False,UFC 8: David vs Goliath,"February 16, 1996"
371,Sam Adkins,Felix Lee Mitchell,,,,,,,,,Open Weight,U-DEC,,2,3:00,win,False,UFC 10: The Tournament,"July 12, 1996"


<IPython.core.display.Javascript object>

Although we have no data on what happened **during** those fights, we do have data on how they ended, where they took place, etc. Therefore, I keep those 21 fights in the dataset.

### Save processed data to parquet files

In [14]:
# Write both result frames to parquet files.
# NOTE(review): path.data_processed_dir presumably resolves each file name
# inside the project's processed-data directory — confirm in
# ufc_events_eda.utils.paths.
df_events_merged.to_parquet(path.data_processed_dir('events_processed.parquet'))
df_fights.to_parquet(path.data_processed_dir('fights_processed.parquet'))

<IPython.core.display.Javascript object>