# Process UFC events/fights dataset 

In [1]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

<IPython.core.display.Javascript object>

In [2]:
import json
import ufc_events_eda.utils.paths as path
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

<IPython.core.display.Javascript object>

### Read dataset into dataframe

In [3]:
# Read the raw export. The encoding is passed explicitly so that decoding does
# not depend on the platform's default locale.
# NOTE(review): assumes the file is UTF-8 (the standard JSON encoding) - confirm.
with open(path.data_raw_dir('ufc_events.json'), 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten to one row per entry of each event's "fights" list, repeating the
# event-level fields (name, date, location) on every row.
df = pd.json_normalize(data, record_path='fights', meta=['event_name', 'event_date', 'event_location'])
df.head()

Unnamed: 0,fighter_1,fighter_2,fighter_1_kd,fighter_2_kd,fighter_1_str,fighter_2_str,fighter_1_td,fighter_2_td,fighter_1_sub,fighter_2_sub,weigh_class,method,method_detail,round,time,closure,is_main_event,event_name,event_date,event_location
0,Marina Rodriguez,Michelle Waterson,0,0,125,88,0,1,0,0,Women's Flyweight,U-DEC,,5,5:00,win,True,UFC Fight Night: Rodriguez vs. Waterson,"May 08, 2021","Las Vegas, Nevada, USA"
1,Alex Morono,Donald Cerrone,0,0,35,17,0,0,0,0,Welterweight,KO/TKO,Punches,1,4:40,win,False,UFC Fight Night: Rodriguez vs. Waterson,"May 08, 2021","Las Vegas, Nevada, USA"
2,Neil Magny,Geoff Neal,0,0,55,35,2,1,0,0,Welterweight,U-DEC,,3,5:00,win,False,UFC Fight Night: Rodriguez vs. Waterson,"May 08, 2021","Las Vegas, Nevada, USA"
3,Marcos Rogerio de Lima,Maurice Greene,0,0,14,5,3,0,0,0,Heavyweight,U-DEC,,3,5:00,win,False,UFC Fight Night: Rodriguez vs. Waterson,"May 08, 2021","Las Vegas, Nevada, USA"
4,Gregor Gillespie,Diego Ferreira,0,0,45,31,4,0,0,0,Lightweight,KO/TKO,Punches,2,4:51,win,False,UFC Fight Night: Rodriguez vs. Waterson,"May 08, 2021","Las Vegas, Nevada, USA"


<IPython.core.display.Javascript object>

### Print shape, columns and data types
Check references/columns.md to see what each column represents in detail

In [4]:
# Show the (rows, columns) tuple followed by the column index, one per line.
print(df.shape, df.columns, sep="\n")

(6474, 20)
Index(['fighter_1', 'fighter_2', 'fighter_1_kd', 'fighter_2_kd',
       'fighter_1_str', 'fighter_2_str', 'fighter_1_td', 'fighter_2_td',
       'fighter_1_sub', 'fighter_2_sub', 'weigh_class', 'method',
       'method_detail', 'round', 'time', 'closure', 'is_main_event',
       'event_name', 'event_date', 'event_location'],
      dtype='object')


<IPython.core.display.Javascript object>

In [5]:
# Display the dtype pandas assigned to each column of df.
df.dtypes

fighter_1         object
fighter_2         object
fighter_1_kd      object
fighter_2_kd      object
fighter_1_str     object
fighter_2_str     object
fighter_1_td      object
fighter_2_td      object
fighter_1_sub     object
fighter_2_sub     object
weigh_class       object
method            object
method_detail     object
round             object
time              object
closure           object
is_main_event       bool
event_name        object
event_date        object
event_location    object
dtype: object

<IPython.core.display.Javascript object>

### Convert numeric strings to float or int

In [4]:
# Per-fighter counters plus the round number; all of them are converted by
# pd.to_numeric below.
count_columns = [
    "fighter_1_kd",
    "fighter_2_kd",
    "fighter_1_str",
    "fighter_2_str",
    "fighter_1_td",
    "fighter_2_td",
    "fighter_1_sub",
    "fighter_2_sub",
    "round",
]

# errors='coerce' turns unparseable entries into NaN; downcast='unsigned'
# asks for the smallest unsigned integer dtype where that is possible.
df[count_columns] = df[count_columns].apply(
    pd.to_numeric, errors='coerce', downcast='unsigned'
)

<IPython.core.display.Javascript object>

In [5]:
# Inspect the dtypes of the converted columns only.
df.loc[
    :,
    [
        "fighter_1_kd",
        "fighter_2_kd",
        "fighter_1_str",
        "fighter_2_str",
        "fighter_1_td",
        "fighter_2_td",
        "fighter_1_sub",
        "fighter_2_sub",
        "round",
    ],
].dtypes


fighter_1_kd     float64
fighter_2_kd     float64
fighter_1_str    float64
fighter_2_str    float64
fighter_1_td     float64
fighter_2_td     float64
fighter_1_sub    float64
fighter_2_sub    float64
round              uint8
dtype: object

<IPython.core.display.Javascript object>

### Separate fights from events (keeping event name in fights dataframe)

In [6]:
# Events: keep the first row seen for each event_name and only the
# event-level columns.
first_row_per_event = ~df.duplicated(subset=['event_name'])
df_events = df.loc[first_row_per_event, ['event_name', 'event_date', 'event_location']]

# Fights: every row, without the event-level date and location
# (event_name stays as the link back to df_events).
df_fights = df.drop(columns=['event_date', 'event_location'])

<IPython.core.display.Javascript object>

### Cast event_date to datetime

In [7]:
# Parse the "Month DD, YYYY" strings held by df_events itself (instead of
# reading them from the fight-level frame and relying on index alignment),
# and rebind the result rather than assigning into a frame that was obtained
# by slicing another one.
df_events = df_events.assign(
    event_date=pd.to_datetime(df_events['event_date'], format='%B %d, %Y')
)

<IPython.core.display.Javascript object>

### Geocode event_location (USA locations only) to plot maps later

In [8]:
def geocode_city(city, geocode=None):
    """Return the ``(latitude, longitude)`` of a location string.

    Parameters
    ----------
    city : str
        Free-text location passed to the geocoder.
    geocode : callable, optional
        Function taking the location string and returning an object with
        ``latitude`` and ``longitude`` attributes, or ``None`` when nothing
        is found. Defaults to a new Nominatim client's ``geocode`` method;
        pass one in to reuse a single client across many calls.

    Returns
    -------
    tuple of float
        ``(latitude, longitude)``, or ``(nan, nan)`` when the geocoder has no
        match for ``city``.
    """
    if geocode is None:
        geocode = Nominatim(user_agent="MyApp").geocode
    geocoded = geocode(city)
    # No match: report missing coordinates instead of failing on attribute
    # access of None.
    if geocoded is None:
        return float("nan"), float("nan")
    return geocoded.latitude, geocoded.longitude


# Distinct event locations whose text contains "USA".
is_usa = df_events["event_location"].str.contains("USA")
usa_locations = df_events[is_usa]["event_location"].unique()
df_city = pd.DataFrame(data=usa_locations, columns=["event_location"])

# One geocoding request per distinct location; coordinates are collected in
# the same order as the rows of df_city.
# NOTE(review): requests are sent back to back with no throttling - check this
# against the geocoding service's usage policy.
coordinates = [geocode_city(city) for city in df_city["event_location"]]
lat = [pair[0] for pair in coordinates]
lon = [pair[1] for pair in coordinates]


<IPython.core.display.Javascript object>

In [9]:
# Attach the geocoded coordinates (same order as the rows of df_city).
df_city['latitude'], df_city['longitude'] = lat, lon
df_city.head()

Unnamed: 0,event_location,latitude,longitude
0,"Las Vegas, Nevada, USA",36.167256,-115.148516
1,"Houston, Texas, USA",29.758938,-95.367697
2,"Glendale, Arizona, USA",33.538686,-112.185994
3,"New York City, New York, USA",40.712728,-74.006015
4,"Denver, Colorado, USA",39.739236,-104.984862


<IPython.core.display.Javascript object>

### Fix inconsistent location entries

In [11]:
def fix_inconsistent_location(df):
    """Collapse known spelling variants of event locations into one name.

    Any ``event_location`` that contains one of the listed city names
    (case-insensitive) is replaced by that city's canonical
    ``"City, Country"`` string. All other values, including missing ones,
    are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``event_location`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, in the same order, with
        ``event_location`` normalized. The frame passed in is not modified.
    """
    # (city name to look for, canonical replacement). When a location matches
    # more than one entry, the entry listed first wins.
    canonical_locations = [
        ("Abu Dhabi", "Abu Dhabi, United Arab Emirates"),
        ("Sao Paulo", "Sao Paulo, Brazil"),
        ("Rio de Janeiro", "Rio de Janeiro, Brazil"),
        ("Berlin", "Berlin, Germany"),
        ("Saitama", "Saitama, Japan"),
    ]

    locations = df["event_location"]
    normalized = locations.copy()
    # Apply in reverse order so the first-listed match is written last and
    # therefore takes precedence.
    for city, canonical in reversed(canonical_locations):
        # na=False: a missing location never matches and stays missing.
        matches = locations.str.contains(city, case=False, na=False)
        normalized = normalized.mask(matches, canonical)

    # assign() returns a copy, so the caller's frame is left untouched.
    return df.assign(event_location=normalized)


<IPython.core.display.Javascript object>

In [12]:
# Rebind df_events to the frame returned by fix_inconsistent_location.
df_events = fix_inconsistent_location(df_events)

<IPython.core.display.Javascript object>

### Merge the events dataframe with df_city to add latitude and longitude for event_location

In [13]:
# Left join: every event is kept; events whose location is not in df_city get
# missing latitude/longitude.
events_with_coords = df_events.merge(df_city, how='left', on='event_location')

# Keep a single row per event name.
df_events_merged = events_with_coords.drop_duplicates(subset='event_name')

<IPython.core.display.Javascript object>

### Check null values

In [18]:
# Count the missing values in each column of the fights frame.
df_fights.isna().sum()

fighter_1           0
fighter_2           0
fighter_1_kd       21
fighter_2_kd       21
fighter_1_str      21
fighter_2_str      21
fighter_1_td       21
fighter_2_td       21
fighter_1_sub      21
fighter_2_sub      21
weigh_class         0
method              0
method_detail    3200
round               0
time                0
closure             0
is_main_event       0
event_name          0
dtype: int64

<IPython.core.display.Javascript object>

We can see that 3200 fights have no method_detail, and that 21 fights have no strike, knockdown, takedown or submission data. Let's check whether those fights should stay in the dataset.

In [28]:
# Rows where fighter_1's knockdown count is missing.
missing_kd = df_fights['fighter_1_kd'].isna()
df_fights[missing_kd]

Unnamed: 0,fighter_1,fighter_2,fighter_1_kd,fighter_2_kd,fighter_1_str,fighter_2_str,fighter_1_td,fighter_2_td,fighter_1_sub,fighter_2_sub,weigh_class,method,method_detail,round,time,closure,is_main_event,event_name
307,Marcus Bossett,Eldo Xavier Dias,,,,,,,,,Open Weight,KO/TKO,,1,4:55,win,False,UFC 4: Revenge of the Warriors
308,Joe Charles,Kevin Rosier,,,,,,,,,Open Weight,SUB,Armbar,1,0:14,win,False,UFC 4: Revenge of the Warriors
327,Anthony Macias,He-Man Gipson,,,,,,,,,Open Weight,KO/TKO,Headbutts,1,3:06,win,False,UFC 6: Clash of the Titans
328,Joel Sutton,Jack McGlaughlin,,,,,,,,,Open Weight,KO/TKO,Punches,1,2:01,win,False,UFC 6: Clash of the Titans
338,Onassis Parungao,Francesco Maturi,,,,,,,,,Open Weight,KO/TKO,,1,5:26,win,False,UFC 7: The Brawl in Buffalo
339,Joel Sutton,Geza Kalman,,,,,,,,,Open Weight,KO/TKO,,1,0:48,win,False,UFC 7: The Brawl in Buffalo
347,Mark Hall,Trent Jenkins,,,,,,,,,Open Weight,SUB,,1,5:29,win,False,UFC - Ultimate Ultimate '95
348,Joe Charles,Scott Bessac,,,,,,,,,Open Weight,SUB,Armbar,1,4:38,win,False,UFC - Ultimate Ultimate '95
357,Sam Adkins,Keith Mielke,,,,,,,,,Open Weight,KO/TKO,Punches,1,0:50,win,False,UFC 8: David vs Goliath
372,Sam Adkins,Felix Lee Mitchell,,,,,,,,,Open Weight,U-DEC,,2,3:00,win,False,UFC 10: The Tournament


<IPython.core.display.Javascript object>

Although we have no data on what happened **during** those fights, we do have data on how they ended, where they took place, etc. Therefore, I keep those 21 fights in the dataset.

### Save processed data to parquet files

In [19]:
# Write the events frame (with coordinates) to the processed-data directory.
events_target = path.data_processed_dir('events_processed.parquet')
df_events_merged.to_parquet(events_target)

# Write the fights frame next to it.
fights_target = path.data_processed_dir('fights_processed.parquet')
df_fights.to_parquet(fights_target)

<IPython.core.display.Javascript object>