In [18]:
import pymongo
import pandas as pd
import configparser
import numpy as np
from sklearn.model_selection import train_test_split

In [48]:
# Read the MongoDB connection string from config.ini ([database] section).
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list here: otherwise a missing
# config.ini only surfaces as a confusing NoSectionError on the next line.
config = configparser.ConfigParser()
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read")
db_connection_string = config.get('database', 'connection_string')
client = pymongo.MongoClient(db_connection_string)

db = client['ufcv2']
collection = db['fighters']

# Fetch every document of the collection (empty filter) and materialise the
# cursor into a list so pandas can build the DataFrame in one go.
cursor = collection.find({})
df = pd.DataFrame(list(cursor))
print(df.head())



                        _id        fighter_id               name height   
0  65bca6129cbb33544c215b51  0c277f3ff66b0208     Virgil Zwicker  6' 2"  \
1  65bca6129cbb33544c215b53  1291dd6b8ab4d952      George Zuniga  5' 9"   
2  65bca6129cbb33544c215b52  523af801b3429015       Allan Zuniga  5' 7"   
3  65bca6129cbb33544c215b56  e7bc606d269896aa  Osamu Tachihikari  6' 5"   
4  65bca6129cbb33544c215b57  94426bb170c88115       Sodiq Yusuff  5' 9"   

     weight reach           dob  n_win  n_loss  n_draw  sig_str_land_pM   
0  205 lbs.   74"  Jun 26, 1982     15       6       1             3.34  \
1  185 lbs.    --            --      3       1       0             7.64   
2  155 lbs.   70"  Apr 04, 1992     13       1       0             3.93   
3  300 lbs.    --            --      0       4       0             0.00   
4  145 lbs.   71"  May 19, 1993     13       3       0             5.72   

   sig_str_land_pct  sig_str_abs_pM  sig_str_def_pct  td_avg  td_land_pct   
0              0.48  

In [51]:
# Label-encode the stance so the model can consume it as an integer.
# NOTE(review): integer codes imply an ordering between stances; consider
# one-hot encoding if the downstream model is sensitive to that - confirm.
stance_mapping = {'Orthodox': 1, 'Southpaw': 2, 'Switch': 3, 'Open Stance': 4, 'Sideways': 5, 'Square': 6, 'Unknown': 0}

# Accept already-encoded values as well, so re-running this cell leaves the
# column unchanged instead of turning every code into NaN.
stance_lookup = {**stance_mapping, **{code: code for code in stance_mapping.values()}}

# Anything without an entry ('--', NaN, or a label not listed above) falls back
# to the 'Unknown' code; previously unlisted labels became NaN and silently
# turned the whole column into floats.
df['stance'] = (
    df['stance']
    .map(lambda stance: stance_lookup.get(stance, stance_mapping['Unknown']))
    .astype(int)
)

print(df.head())

                        _id        fighter_id               name height   
0  65bca6129cbb33544c215b51  0c277f3ff66b0208     Virgil Zwicker  6' 2"  \
1  65bca6129cbb33544c215b53  1291dd6b8ab4d952      George Zuniga  5' 9"   
2  65bca6129cbb33544c215b52  523af801b3429015       Allan Zuniga  5' 7"   
3  65bca6129cbb33544c215b56  e7bc606d269896aa  Osamu Tachihikari  6' 5"   
4  65bca6129cbb33544c215b57  94426bb170c88115       Sodiq Yusuff  5' 9"   

     weight reach           dob  n_win  n_loss  n_draw  sig_str_land_pM   
0  205 lbs.   74"  Jun 26, 1982     15       6       1             3.34  \
1  185 lbs.    --            --      3       1       0             7.64   
2  155 lbs.   70"  Apr 04, 1992     13       1       0             3.93   
3  300 lbs.    --            --      0       4       0             0.00   
4  145 lbs.   71"  May 19, 1993     13       3       0             5.72   

   sig_str_land_pct  sig_str_abs_pM  sig_str_def_pct  td_avg  td_land_pct   
0              0.48  

In [52]:
# Convert reach from strings such as '74"' into integers; missing reach -> -1.
#
# The column is cast to str before using the .str accessor: .str raises
# AttributeError on a non-string column (e.g. when this cell is re-run after
# reach is already int) and yields NaN for any non-string element in a mixed
# column. After the cast, '--', 'nan' and 'None' all fail numeric parsing and
# are coerced to NaN, while already-converted values ('74', '-1') parse back
# to themselves, so the cell is safe to re-run.
# NOTE(review): -1 is a sentinel for "unknown reach"; downstream models will
# see it as a real number unless it is masked or imputed - confirm intent.
df['reach'] = (
    pd.to_numeric(
        df['reach'].astype(str).str.replace('"', '', regex=False),
        errors='coerce',
    )
    .replace([np.inf, -np.inf], np.nan)  # treat non-finite values as missing
    .fillna(-1)
    .astype(int)
)

print(df.head())

                        _id        fighter_id               name height   
0  65bca6129cbb33544c215b51  0c277f3ff66b0208     Virgil Zwicker  6' 2"  \
1  65bca6129cbb33544c215b53  1291dd6b8ab4d952      George Zuniga  5' 9"   
2  65bca6129cbb33544c215b52  523af801b3429015       Allan Zuniga  5' 7"   
3  65bca6129cbb33544c215b56  e7bc606d269896aa  Osamu Tachihikari  6' 5"   
4  65bca6129cbb33544c215b57  94426bb170c88115       Sodiq Yusuff  5' 9"   

     weight  reach           dob  n_win  n_loss  n_draw  sig_str_land_pM   
0  205 lbs.     74  Jun 26, 1982     15       6       1             3.34  \
1  185 lbs.     -1            --      3       1       0             7.64   
2  155 lbs.     70  Apr 04, 1992     13       1       0             3.93   
3  300 lbs.     -1            --      0       4       0             0.00   
4  145 lbs.     71  May 19, 1993     13       3       0             5.72   

   sig_str_land_pct  sig_str_abs_pM  sig_str_def_pct  td_avg  td_land_pct   
0              

In [53]:
# Drop the columns that are not needed.
# errors='ignore' skips labels that are already absent, so re-running this
# cell does not raise KeyError once the columns have been removed.
# NOTE(review): the printed frame still contains `__v` (presumably a database
# version key rather than a feature) - confirm whether it should be dropped too.
columns_to_drop = ["_id", "fighter_id", "height", "weight", "dob"]
df = df.drop(columns=columns_to_drop, errors='ignore')

print(df.head())

                name  reach  n_win  n_loss  n_draw  sig_str_land_pM   
0     Virgil Zwicker     74     15       6       1             3.34  \
1      George Zuniga     -1      3       1       0             7.64   
2       Allan Zuniga     70     13       1       0             3.93   
3  Osamu Tachihikari     -1      0       4       0             0.00   
4       Sodiq Yusuff     71     13       3       0             5.72   

   sig_str_land_pct  sig_str_abs_pM  sig_str_def_pct  td_avg  td_land_pct   
0              0.48            4.87             0.39    1.31          0.3  \
1              0.38            5.45             0.37    0.00          0.0   
2              0.52            1.80             0.61    0.00          0.0   
3              0.00            0.00             0.00    0.00          0.0   
4              0.49            4.33             0.54    0.28          0.2   

   td_def_pct  sub_avg  __v  stance  
0        0.50      0.0    0       0  
1        1.00      0.0    0       