# Initial Data Exploration and Processing 
1. Read dataset
2. Extract lightweight division rankings
3. Extract bantamweight division rankings 
4. Reshape the data into the format required by the visualization
5. Export rankings and list of fighters to files

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import random
import warnings
import datetime
warnings.simplefilter(action='ignore', category=FutureWarning)
import json

## Data

UFC rankings data was obtained from <https://www.kaggle.com/martj42/ufc-rankings/data> and <http://mma-stats.com/rankings/2013-02-05>.

In [2]:
# Load the complete rankings history; every later cell derives from this frame.
rankings_csv_path = "rankings_history.csv"
df = pd.read_csv(rankings_csv_path)

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,date,weightclass,fighter,rank
0,2013-02-04,Pound-for-Pound,Anderson Silva,1
1,2013-02-04,Pound-for-Pound,Jon Jones,2
2,2013-02-04,Pound-for-Pound,Georges St-Pierre,3
3,2013-02-04,Pound-for-Pound,Jose Aldo,4
4,2013-02-04,Pound-for-Pound,Benson Henderson,5


In [4]:
# Report how many distinct ranking dates the data contains.
print(f"There are {len(df['date'].unique())} ranking dates in the data.")

There are 228 ranking dates in the data.


In [5]:
# Label each date in order
# Maps every distinct date to its 0-based position in order of first appearance
# in df (unique() preserves appearance order; it does not sort).
date_values = {
    ranking_date: position
    for position, ranking_date in enumerate(df['date'].unique())
}

In [6]:
# Extract a sequence of dates
def _format_date(iso_date):
    """Turn a 'YYYY-MM-DD' string into e.g. 'February 04, 2013' (month name is locale-dependent)."""
    return datetime.datetime.strptime(iso_date, "%Y-%m-%d").strftime('%B %d, %Y')

# Tag every row of df with the ordinal of its ranking date; the per-division
# frames sliced from df later inherit this column.
df['sequence'] = df['date'].map(date_values)

# One row per ranking date. The frame is built in a single chain with .assign()
# so the new column is never written onto a filtered slice of df, which is what
# triggers pandas' SettingWithCopyWarning.
sequence_df = (
    df[['sequence', 'date']]
    .drop_duplicates(subset='date', keep='first')
    .assign(date_formatted=lambda dates: dates['date'].apply(_format_date))
)
sequence_df.to_csv("sequence.csv", index=False)

In [7]:
# List every distinct weight class present in the data.
df.loc[:, 'weightclass'].unique()

array(['Pound-for-Pound', 'Flyweight', 'Bantamweight', 'Featherweight',
       'Lightweight', 'Welterweight', 'Middleweight', 'Light Heavyweight',
       'Heavyweight', "Women's Bantamweight", "Women's Strawweight",
       "Women's Featherweight", "Women's Flyweight"], dtype=object)

In [8]:
# Preview the first five rows again, now including the 'sequence' column.
df.head(n=5)

Unnamed: 0,date,weightclass,fighter,rank,sequence
0,2013-02-04,Pound-for-Pound,Anderson Silva,1,0
1,2013-02-04,Pound-for-Pound,Jon Jones,2,0
2,2013-02-04,Pound-for-Pound,Georges St-Pierre,3,0
3,2013-02-04,Pound-for-Pound,Jose Aldo,4,0
4,2013-02-04,Pound-for-Pound,Benson Henderson,5,0


## Extract the rankings for the lightweight division

In [9]:
# Keep only the lightweight rows and renumber them from 0.
is_lightweight = df['weightclass'] == 'Lightweight'
lightweight_df = df[is_lightweight].reset_index(drop=True)

# len(unique()) is used rather than nunique() so a missing fighter name would
# still be counted as one entry.
print(f"Number of fighters: {len(lightweight_df['fighter'].unique())}")
print(f"Number of entries: {len(lightweight_df)}")

Number of fighters: 44
Number of entries: 3531


In [10]:
# Preview the first five lightweight rows.
lightweight_df.head(n=5)

Unnamed: 0,date,weightclass,fighter,rank,sequence
0,2013-02-04,Lightweight,Benson Henderson,0,0
1,2013-02-04,Lightweight,Gilbert Melendez,1,0
2,2013-02-04,Lightweight,Anthony Pettis,2,0
3,2013-02-04,Lightweight,Gray Maynard,3,0
4,2013-02-04,Lightweight,Nate Diaz,4,0


In [11]:
# Map each lightweight row's date to its ordinal, then reduce to one row per date.
# Note: this rebinds sequence_df to the lightweight-only date sequence.
lightweight_df['sequence'] = lightweight_df['date'].map(date_values)
sequence_df = (
    lightweight_df[['sequence', 'date']]
    .drop_duplicates(subset='date', keep='first')
)
# sequence_df.to_csv("lightweight-sequence.csv",index=False)

In [12]:
# Take top 10
# Rank 0 also passes the `<= 10` filter (the displayed data shows the first-listed
# fighter of each date at rank 0), so up to 11 rows per date are kept.
# .copy() makes the filtered frame independent of its parent, so adding a column
# below does not raise SettingWithCopyWarning.
lightweight_df = lightweight_df.loc[lightweight_df['rank'] <= 10].copy()

# Invert the ranking so a better rank gets a larger value:
# rank 1 -> 10, ..., rank 10 -> 1; any rank <= 0 -> 11.
lightweight_df['reverse_rank'] = np.where(
    lightweight_df['rank'] > 0,
    11 - lightweight_df['rank'],
    11,
)

In [93]:
# Summarise the filtered lightweight frame, then display it.
print(f"Number of fighters: {len(lightweight_df['fighter'].unique())}")
print(f"Number of entries: {len(lightweight_df)}")
lightweight_df

Number of fighters: 31
Number of entries: 2511


Unnamed: 0,date,weightclass,fighter,rank,sequence,reverse_rank
0,2013-02-04,Lightweight,Benson Henderson,0,0,11
1,2013-02-04,Lightweight,Gilbert Melendez,1,0,10
2,2013-02-04,Lightweight,Anthony Pettis,2,0,9
3,2013-02-04,Lightweight,Gray Maynard,3,0,8
4,2013-02-04,Lightweight,Nate Diaz,4,0,7
...,...,...,...,...,...,...
3521,2020-03-16,Lightweight,Donald Cerrone,6,227,5
3522,2020-03-16,Lightweight,Paul Felder,7,227,4
3523,2020-03-16,Lightweight,Charles Oliveira,8,227,3
3524,2020-03-16,Lightweight,Al Iaquinta,9,227,2


In [15]:
# Export (reverse) rankings over time in csv
# Wide table: one row per fighter, one column per date ordinal.
# NOTE(review): the fill value is the *string* "-1", not the integer -1, so the
# pivoted columns hold a mix of numbers and strings - confirm this is intended.
# pivot_table aggregates duplicate (fighter, sequence) pairs with its default
# aggfunc (mean).
new_lightweight_df = lightweight_df.pivot_table(
    values='reverse_rank',
    index='fighter',
    columns='sequence',
    fill_value="-1",
)
new_lightweight_df.to_csv("lightweight-rankings.csv", index=True)

In [16]:
# Write the distinct lightweight fighter names (order of first appearance) to csv.
lightweight_fighter_names = lightweight_df['fighter'].unique()
fighters_df = pd.DataFrame({"fighter": lightweight_fighter_names})
fighters_df.to_csv("lightweight-fighters.csv", index=False)

In [17]:
# Give every lightweight fighter the same default bar color.
fighters_df = fighters_df.assign(bar_color="#C5B0D5")

In [49]:
# lightweight_df.groupby(['rank']).agg(['count'])
# Dump every rank-1 lightweight row to a scratch file for inspection.
# Group by the column name itself rather than a one-element list: with a
# length-1 list grouper, pandas 2.2+ deprecates passing a scalar key to
# get_group() (a 1-tuple is expected), and that FutureWarning is silenced by the
# warnings filter in the imports cell.
lightweight_df.groupby('rank').get_group(1).to_csv('temp.csv')

In [None]:
# Read list of fighters and export to json
# fighters_df = pd.read_csv("UFC Fighters - lightweight-fighters.csv") 
# fighters_df[['fighter','bar_color']].to_json("lightweight-fighters.json", orient='records')

## Extract rankings for the bantamweight division

In [18]:
# Keep only the bantamweight rows and renumber them from 0.
is_bantamweight = df['weightclass'] == 'Bantamweight'
bantamweight_df = df[is_bantamweight].reset_index(drop=True)

# len(unique()) is used rather than nunique() so a missing fighter name is
# still counted as one entry.
print(f"Number of fighters: {len(bantamweight_df['fighter'].unique())}")
print(f"Number of entries: {len(bantamweight_df)}")

Number of fighters: 52
Number of entries: 3529


In [19]:
# Map each bantamweight row's date to its ordinal, then reduce to one row per date.
# Note: this rebinds sequence_df to the bantamweight-only date sequence.
bantamweight_df['sequence'] = bantamweight_df['date'].map(date_values)
sequence_df = (
    bantamweight_df[['sequence', 'date']]
    .drop_duplicates(subset='date', keep='first')
)
# sequence_df.to_csv("bantamweight-sequence.csv",index=False)

In [20]:
# Take top 10
# Rank 0 also passes the `<= 10` filter (the displayed data shows the first-listed
# fighter of each date at rank 0), so up to 11 rows per date are kept.
# .copy() makes the filtered frame independent of its parent, so adding a column
# below does not raise SettingWithCopyWarning.
bantamweight_df = bantamweight_df.loc[bantamweight_df['rank'] <= 10].copy()

# Invert the ranking so a better rank gets a larger value:
# rank 1 -> 10, ..., rank 10 -> 1; any rank <= 0 -> 11.
bantamweight_df['reverse_rank'] = np.where(
    bantamweight_df['rank'] > 0,
    11 - bantamweight_df['rank'],
    11,
)

In [21]:
# Summarise the filtered bantamweight frame, then display it.
print(f"Number of fighters: {len(bantamweight_df['fighter'].unique())}")
print(f"Number of entries: {len(bantamweight_df)}")
bantamweight_df

Number of fighters: 34
Number of entries: 2508


Unnamed: 0,date,weightclass,fighter,rank,sequence,reverse_rank
0,2013-02-04,Bantamweight,Dominick Cruz,0,0,11
1,2013-02-04,Bantamweight,Renan Barao,1,0,10
2,2013-02-04,Bantamweight,Michael McDonald,2,0,9
3,2013-02-04,Bantamweight,Urijah Faber,3,0,8
4,2013-02-04,Bantamweight,Eddie Wineland,4,0,7
...,...,...,...,...,...,...
3519,2020-03-16,Bantamweight,Jose Aldo,6,227,5
3520,2020-03-16,Bantamweight,Pedro Munhoz,7,227,4
3521,2020-03-16,Bantamweight,Jimmie Rivera,8,227,3
3522,2020-03-16,Bantamweight,Cody Garbrandt,9,227,2


In [22]:
# Check whether any ranking slot has no fighter recorded.
# From 2019-03-26 to 2019-05-13 there was no Bantamweight Champion.
fighter_missing = bantamweight_df['fighter'].isna()
bantamweight_df.loc[fighter_missing]

Unnamed: 0,date,weightclass,fighter,rank,sequence,reverse_rank
3145,2019-03-26,Bantamweight,,0,204,11
3161,2019-04-01,Bantamweight,,0,205,11
3177,2019-04-15,Bantamweight,,0,206,11
3193,2019-05-13,Bantamweight,,0,207,11


In [23]:
# Replace NaN fighter with "Vacant"
# Rebind the frame instead of calling fillna(..., inplace=True) on the selected
# column: that chained in-place call acts on a temporary object and, under
# pandas Copy-on-Write, leaves bantamweight_df unchanged. .assign() returns a
# new frame, so re-running this cell is also safe.
bantamweight_df = bantamweight_df.assign(
    fighter=bantamweight_df['fighter'].fillna("Vacant")
)

In [24]:
# Export (reverse) rankings over time in csv
# Wide table: one row per fighter, one column per date ordinal.
# NOTE(review): the fill value is the *string* "-1", not the integer -1, so the
# pivoted columns hold a mix of numbers and strings - confirm this is intended.
# pivot_table aggregates duplicate (fighter, sequence) pairs with its default
# aggfunc (mean).
new_bantamweight_df = bantamweight_df.pivot_table(
    values='reverse_rank',
    index='fighter',
    columns='sequence',
    fill_value="-1",
)
new_bantamweight_df.to_csv("bantamweight-rankings.csv", index=True)

In [25]:
# Write the distinct bantamweight fighter names (order of first appearance) to csv.
bantamweight_fighter_names = bantamweight_df['fighter'].unique()
fighters_df = pd.DataFrame({"fighter": bantamweight_fighter_names})
fighters_df.to_csv("bantamweight-fighters.csv", index=False)

In [26]:
# Give every bantamweight fighter the same default bar color.
fighters_df = fighters_df.assign(bar_color="#98DF8A")

In [None]:
# Read list of fighters and export to json
# fighters_df = pd.read_csv("UFC Fighters - bantamweight-fighters.csv") 
# fighters_df[['fighter','bar_color']].to_json("bantamweight-fighters.json", orient='records')