In [5]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import MinMaxScaler
#import io

# Step 2: Load the dataset locally (from a path or buffer)
df = pd.read_csv('ufc-master-raw.csv')

df.head()

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Alex Pereira,Jamahal Hill,-130.0,110.0,76.9231,110.0,2024-04-13,"Las Vegas, Nevada, USA",USA,Red,...,Punch,1.0,3:14,194.0,600.0,650.0,2000.0,1400.0,110.0,200.0
1,Zhang Weili,Yan Xiaonan,-455.0,350.0,21.978,350.0,2024-04-13,"Las Vegas, Nevada, USA",USA,Red,...,,5.0,5:00,1500.0,200.0,,400.0,,140.0,
2,Justin Gaethje,Max Holloway,-185.0,154.0,54.0541,154.0,2024-04-13,"Las Vegas, Nevada, USA",USA,Blue,...,Punch,5.0,4:59,1499.0,300.0,250.0,2200.0,1600.0,150.0,550.0
3,Charles Oliveira,Arman Tsarukyan,190.0,-218.0,190.0,45.8716,2024-04-13,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,900.0,330.0,450.0,700.0,550.0,130.0
4,Bo Nickal,Cody Brundage,-1600.0,900.0,6.25,900.0,2024-04-13,"Las Vegas, Nevada, USA",USA,Red,...,Rear Naked Choke,2.0,3:38,518.0,1000.0,3000.0,-135.0,3000.0,150.0,1400.0


In [6]:
# Fill missing values in betting odds with the median of each column
odds_columns = ['RedOdds', 'BlueOdds', 'RedExpectedValue', 'BlueExpectedValue', 'RSubOdds', 'BSubOdds', 'RKOOdds', 'BKOOdds', 'RedDecOdds', 'BlueDecOdds']

# Fill missing odds with the median
df[odds_columns] = df[odds_columns].fillna(df[odds_columns].median())

# Fill missing 'FinishDetails' with 'Unknown'
df['FinishDetails'] = df['FinishDetails'].fillna('Unknown')

# Verify the changes
df[odds_columns].isnull().sum(), df['FinishDetails'].isnull().sum()


(RedOdds              0
 BlueOdds             0
 RedExpectedValue     0
 BlueExpectedValue    0
 RSubOdds             0
 BSubOdds             0
 RKOOdds              0
 BKOOdds              0
 RedDecOdds           0
 BlueDecOdds          0
 dtype: int64,
 np.int64(0))

In [7]:
# Convert 'Date' column into datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# Verify the changes
df[['Date']].head()


Unnamed: 0,Date
0,2024-04-13
1,2024-04-13
2,2024-04-13
3,2024-04-13
4,2024-04-13


In [8]:
# Function to convert 'FinishRoundTime' (MM:SS) to seconds
def time_to_seconds(time_str):
    if pd.isna(time_str):
        return np.nan
    minutes, seconds = map(int, time_str.split(':'))
    return minutes * 60 + seconds

# Apply the conversion to the 'FinishRoundTime' column
df['FinishRoundTimeSecs'] = df['FinishRoundTime'].apply(time_to_seconds)

# Calculate the total fight duration in seconds
df['TotalFightDurationSecs'] = ((df['FinishRound'] - 1) * 300) + df['FinishRoundTimeSecs']

# Verify the results
df[['FinishRound', 'FinishRoundTime', 'FinishRoundTimeSecs', 'TotalFightDurationSecs']].head()


Unnamed: 0,FinishRound,FinishRoundTime,FinishRoundTimeSecs,TotalFightDurationSecs
0,1.0,3:14,194.0,194.0
1,5.0,5:00,300.0,1500.0
2,5.0,4:59,299.0,1499.0
3,3.0,5:00,300.0,900.0
4,2.0,3:38,218.0,518.0


In [9]:
# Initialize Min-Max Scaler
scaler = MinMaxScaler()

# Apply Min-Max scaling to the odds columns
df[odds_columns] = scaler.fit_transform(df[odds_columns])

# Verify the normalized values
df[odds_columns].head()


Unnamed: 0,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,RSubOdds,BSubOdds,RKOOdds,BKOOdds,RedDecOdds,BlueDecOdds
0,0.673874,0.524,0.093407,0.07871,0.470705,0.439105,0.204651,0.137931,0.366197,0.265625
1,0.556757,0.62,0.022049,0.264516,0.152929,0.372825,0.213953,0.231322,0.225352,0.171875
2,0.654054,0.5416,0.063707,0.112774,0.510427,0.472245,0.217054,0.238506,0.260563,0.140625
3,0.789189,0.3928,0.24026,0.029062,0.16286,0.323115,0.341085,0.117816,0.471831,0.165625
4,0.144144,0.84,0.001623,0.690323,0.046673,0.704225,0.217054,0.482759,0.507042,1.0


In [10]:
# Initialize win counters for red and blue fighters
df['RedWin'] = df['Winner'] == 'Red'
df['BlueWin'] = df['Winner'] == 'Blue'

# Calculate cumulative wins and win percentages for fighters over time
df['RedWinPercentage'] = df.groupby('RedFighter')['RedWin'].cumsum() / df.groupby('RedFighter').cumcount()
df['BlueWinPercentage'] = df.groupby('BlueFighter')['BlueWin'].cumsum() / df.groupby('BlueFighter').cumcount()

# Fill any NaN win percentages with 0 (for fighters' first match)
df['RedWinPercentage'] = df['RedWinPercentage'].fillna(0)
df['BlueWinPercentage'] = df['BlueWinPercentage'].fillna(0)

# Verify the win percentages
df[['RedFighter', 'BlueFighter', 'RedWinPercentage', 'BlueWinPercentage']].head()


Unnamed: 0,RedFighter,BlueFighter,RedWinPercentage,BlueWinPercentage
0,Alex Pereira,Jamahal Hill,inf,0.0
1,Zhang Weili,Yan Xiaonan,inf,0.0
2,Justin Gaethje,Max Holloway,0.0,inf
3,Charles Oliveira,Arman Tsarukyan,0.0,inf
4,Bo Nickal,Cody Brundage,inf,0.0


In [11]:
# Save the transformed dataframe to a CSV file
df.to_csv('ufc-master-transformed.csv', index=False)

# Confirm the file has been saved and verify the first few rows
df.head()


Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,RSubOdds,BSubOdds,RKOOdds,BKOOdds,FinishRoundTimeSecs,TotalFightDurationSecs,RedWin,BlueWin,RedWinPercentage,BlueWinPercentage
0,Alex Pereira,Jamahal Hill,0.673874,0.524,0.093407,0.07871,2024-04-13,"Las Vegas, Nevada, USA",USA,Red,...,0.470705,0.439105,0.204651,0.137931,194.0,194.0,True,False,inf,0.0
1,Zhang Weili,Yan Xiaonan,0.556757,0.62,0.022049,0.264516,2024-04-13,"Las Vegas, Nevada, USA",USA,Red,...,0.152929,0.372825,0.213953,0.231322,300.0,1500.0,True,False,inf,0.0
2,Justin Gaethje,Max Holloway,0.654054,0.5416,0.063707,0.112774,2024-04-13,"Las Vegas, Nevada, USA",USA,Blue,...,0.510427,0.472245,0.217054,0.238506,299.0,1499.0,False,True,0.0,inf
3,Charles Oliveira,Arman Tsarukyan,0.789189,0.3928,0.24026,0.029062,2024-04-13,"Las Vegas, Nevada, USA",USA,Blue,...,0.16286,0.323115,0.341085,0.117816,300.0,900.0,False,True,0.0,inf
4,Bo Nickal,Cody Brundage,0.144144,0.84,0.001623,0.690323,2024-04-13,"Las Vegas, Nevada, USA",USA,Red,...,0.046673,0.704225,0.217054,0.482759,218.0,518.0,True,False,inf,0.0
