In [1]:
import requests
import csv
import numpy as np
import pandas as pd
from unidecode import unidecode
import time
from bs4 import BeautifulSoup

# Reading in the data frame

In [433]:
# df_f4 = pd.read_csv('df_f4')  # df_f4 contains last fight info


In [1624]:
df_f5 = pd.read_csv('df_f5') # df_f5 contains streak info

In [1625]:
df_f5['date'] = pd.to_datetime(df_f5['date'], errors='coerce')

In [1213]:
df_f5.to_csv('df_f5', index=False)

# Changing the label column from -1 and 1 to 0 and 1

In [1627]:
df_f5['win'] = df_f5['win'].where(df_f5['win'] == 1, 0)

# Setting up for ML. Splitting the data into train and test. Establishing predictors

In [1755]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

rf = RandomForestClassifier(n_estimators=50, min_samples_split = 10, random_state=1)

train, test, = train_test_split(df_f5, test_size = 0.25, random_state = 0)

predictors = ['fighter_ht', 'fighter_reach', 'opp_ht', 'opp_reach', 'wins', 'losses', 'draw',\
             'opp_wins', 'opp_losses', 'opp_draw', 'fighter_age', 'opp_age',\
              'w_streak', 'l_streak', 'opp_w_streak', 'opp_l_streak']

rf.fit(train[predictors], train['win'])
preds = rf.predict(test[predictors])

# Finding out the models accuracy on the test data

In [1756]:
acc = accuracy_score(test['win'],preds)

acc

0.58235054900366

# Visualizing predictions

In [1757]:
combined = pd.DataFrame(dict(actual=test['win'], prediction=preds))

pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,717,515
1,512,715


# Checking the other scores

In [1758]:
ps = precision_score(test['win'], preds)

f1 = f1_score(test['win'], preds)

recall =recall_score(test['win'], preds)

print('Precision Score:',ps, 'F1 Score:',f1, 'Recall Score:', recall )

Precision Score: 0.5813008130081301 F1 Score: 0.582010582010582 Recall Score: 0.582722086389568


# ML with boost rather than Random Forest

In [1759]:
from xgboost import XGBClassifier
from xgboost import plot_importance

xgb = XGBClassifier(objective='binary:logistic', random_state=0)

cv_params = {'max_depth': [4, 6],
              'min_child_weight': [3, 5],
              'learning_rate': [0.1, 0.2, 0.3],
              'n_estimators': [5,10,15],
              'subsample': [0.7],
              'colsample_bytree': [0.7]
              }

xgb_cv = GridSearchCV(xgb, cv_params,cv = 5,refit = 'f1', error_score='raise')

xgb_cv = xgb_cv.fit(train[predictors], train['win'])
xgb_cv

# Setting the predictions using predictors from RF model. Checking scores.

In [1760]:
y_pred = xgb_cv.predict(test[predictors])

# Checking the scores using the boost method on the test data.

In [1761]:
accuracy_score(test['win'],y_pred)

0.571370475803172

In [1733]:
precision_score(test['win'], y_pred)

0.5716652858326429

In [1734]:
recall_score(test['win'], y_pred)

0.5623471882640587

In [1735]:
f1_score(test['win'], y_pred)

0.5669679539852095

# Visualizing relationships

In [1736]:
combined2 = pd.DataFrame(dict(actual=test['win'], prediction=y_pred))

pd.crosstab(index=combined2['actual'], columns=combined2['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,715,517
1,537,690


# Because there are two rows for every fight, I am going to check the accuracy of the predictions when both rows give the same result. I am using the Random Forest Classifier becuase it returned better results initially.

In [1765]:
p = rf.predict(test[predictors])

pp = pd.DataFrame(dict(prediction=p), index=test.index)

ppp = pp.merge(test[['date', 'fighter', 'opponent', 'win']], left_index=True, right_index=True)

# Now we have a data frame with the predictions and the actual results.

In [1766]:
ppp

Unnamed: 0,prediction,date,fighter,opponent,win
4869,1,2022-12-03,Jonathan Pearce,Darren Elkins,1
9502,0,2021-12-04,Vince Morales,Louis Smolka,1
9835,0,2021-10-30,Zubaira Tukhugov,Ricardo Ramos,1
6533,1,2021-08-07,Melissa Gatto,Victoria Leonardo,1
9784,0,2016-12-10,Zach Makovsky,Dustin Ortiz,0
...,...,...,...,...,...
1515,0,2022-07-23,Charles Rosa,Nathaniel Wood,0
8260,0,2017-02-19,Ryan Janes,Gerald Meerschaert,0
6492,1,2021-01-16,Max Holloway,Calvin Kattar,1
5094,1,2023-03-04,Julian Marquez,Marc-Andre Barriault,0


# Selecting the rows where the model predicted a win for a fighter when the fighter was both 'figther' and 'opponent'

In [1767]:
merged = ppp.merge(ppp, left_on=['date', 'fighter'], right_on=['date', 'opponent'])

yy = merged[(merged.prediction_x==1) & (merged.prediction_y==0)]

# Totaling prediction == loss(0) and actual == loss(0)

In [1768]:
yy.prediction_y.sum(), yy.win_y.sum()

(0, 85)

In [1753]:
yy

Unnamed: 0,prediction_x,date,fighter_x,opponent_x,win_x,prediction_y,fighter_y,opponent_y,win_y
5,1,2011-12-10,Igor Pokrajac,Krzysztof Soszynski,1,0,Krzysztof Soszynski,Igor Pokrajac,0
10,1,2018-08-25,Bryan Barberena,Jake Ellenberger,1,0,Jake Ellenberger,Bryan Barberena,0
15,1,2015-01-18,Uriah Hall,Ron Stallings,1,0,Ron Stallings,Uriah Hall,0
17,1,2019-07-06,Ben Askren,Jorge Masvidal,0,0,Jorge Masvidal,Ben Askren,1
21,1,2018-07-06,Julian Marquez,Alessio Di Chirico,0,0,Alessio Di Chirico,Julian Marquez,1
...,...,...,...,...,...,...,...,...,...
589,1,2019-09-14,Michel Pereira,Tristan Connelly,0,0,Tristan Connelly,Michel Pereira,1
592,1,2012-12-14,Hector Lombard,Rousimar Palhares,1,0,Rousimar Palhares,Hector Lombard,0
593,1,2015-03-14,Anthony Pettis,Rafael Dos Anjos,0,0,Rafael Dos Anjos,Anthony Pettis,1
602,1,2017-03-04,Mirsad Bektic,Darren Elkins,0,0,Darren Elkins,Mirsad Bektic,1


# Correct 0 predictions over total outcomes == 61.5%

In [1754]:
x = 221-85
x/221

0.6153846153846154

# Finding the accuracy when the prediction == win(1)

In [1769]:
yy.prediction_x.sum(), yy.win_x.sum()

(221, 135)

# 61% for predictions of 1

In [1770]:
135/221

0.6108597285067874

# Selecting for matching results for both instances of the same event (the fight) has improved our model from 58 % accuracy to 61% accuracy

# Now I will create a df for the upcoming fights and make predictions. Fist I read in my csv for all fighters.

In [889]:
fighter_df = pd.read_csv('fighter_df')

# Cleaning fighter info df

In [898]:
fighter_df = fighter_df.drop(['Nickname', 'Stance', 'W', 'L', 'D', 'Belt'], axis=1)

fighter_df['fighter'] = fighter_df['First'] + ' ' + fighter_df['Last']

fighter_df = fighter_df.reset_index()

fighter_df = fighter_df.drop(['index'], axis=1)

fighter_df = fighter_df.drop(['First', 'Last'], axis=1)

# Requesting fight table for upcoming fight

In [1392]:
fights_url = "http://www.ufcstats.com/statistics/events/completed"

In [1395]:
data = requests.get(fights_url)

In [1396]:
soup = BeautifulSoup(data.text)
fight_table = soup.select("table.b-statistics__table-events")[0] 

links = fight_table.find_all('a')

links = [l.get("href") for l in links]

In [1397]:
upcoming = links[0] # selecting the upcoming fight

In [1398]:
data = requests.get(upcoming) # requesting each fight card individually
fights = pd.read_html(data.text)
card = fights[0]

# Seperating the names of the fighters into winner and loser, adding the date.

In [1466]:
soup = BeautifulSoup(data.text)
date = soup.select("li.b-list__box-list-item") # uses bs to find date, separate date, and convert
date1 = str(date[0])                           # date to date time.
date2 = date1.split('\n')
date = date2[-2].strip()
date = pd.to_datetime(date)
card['date'] = date
card[['w_fighter','l_fighter']] = card['Fighter'].str.split('  ', expand=True)

# Removing unwanted columns

In [1467]:
df = card.drop(['Fighter', 'Kd', 'Str', 'Td', 'Sub', 'Weight class', 'Method', 'Round', 'Time'], axis=1)

# Creating two data frames, making the winner and the losser the fighter and the opponent, respectively. Then I merge the two data frames together. This creates two rows per event with each fighter as 'fighter' and 'opponent'

In [1468]:
df2 = df.rename(columns={'w_fighter':'opponent', 'l_fighter':'fighter'})
df = df.rename(columns={'w_fighter':'fighter', 'l_fighter':'opponent'})
df3 = pd.concat([df.copy(),df2.copy()])

# Merging fighter info to 'fighter' and 'opponent', adding reach and height

In [1469]:
df3 = df3.merge(fighter_df, how='left', on='fighter')

In [1470]:
df3 = df3.merge(fighter_df, left_on='opponent', right_on='fighter')

# Cleaning data

In [1126]:
df3.columns

Index(['W/L', 'date', 'fighter_x', 'opponent', 'Ht._x', 'Wt._x', 'Reach_x',
       'Ht._y', 'Wt._y', 'Reach_y', 'fighter_y'],
      dtype='object')

In [1472]:
df3 = df3.drop(['fighter_y'], axis=1)

In [1473]:
df3 = df3.rename(columns={'W/L':'win', 'fighter_x':'fighter', 'Ht._x':'fighter_ht', 'Reach_x':'fighter_reach'\
                         ,'Ht._y':'opp_ht', 'Reach_y':'opp_reach'})

# Removing chicken scratches from feet and inches and creating numeric columns

In [1474]:
df3['opp_ht'] = df3['opp_ht'].map(lambda x: x.replace('\'', '').strip())
df3['opp_ht'] = df3['opp_ht'].map(lambda x: x.replace('\"', '').strip())

df3['fighter_ht'] = df3['fighter_ht'].map(lambda x: x.replace('\'', '').strip())
df3['fighter_ht'] = df3['fighter_ht'].map(lambda x: x.replace('\"', '').strip())

df3[['feet', 'inches']] = df3['opp_ht'].str.split(' ', expand=True)

df3[['ffeet', 'finches']] = df3['fighter_ht'].str.split(' ', expand=True)

df3['feet'] = pd.to_numeric(df3['feet'], errors='coerce').astype('Int64')

df3['ffeet'] = pd.to_numeric(df3['ffeet'], errors='coerce').astype('Int64')

df3['inches'] = pd.to_numeric(df3['inches'], errors='coerce').astype('Int64')

df3['finches'] = pd.to_numeric(df3['finches'], errors='coerce').astype('Int64')

df3['feet'] = df3['feet']*12

df3['ffeet'] = df3['ffeet']*12

df3['opp_ht'] = df3['feet']+df3['inches']

df3['fighter_ht'] = df3['ffeet']+df3['finches']

df3 = df3.drop(['feet', 'inches', 'ffeet', 'finches'], axis=1)

df3['fighter_reach'] = df3['fighter_reach'].map(lambda x: x.replace('\"', '').strip())
df3['opp_reach'] = df3['opp_reach'].map(lambda x: x.replace('\"', '').strip())

df3['fighter_reach'] = df3['fighter_reach'].map(lambda x: x.replace('.0', '').strip())
df3['opp_reach'] = df3['opp_reach'].map(lambda x: x.replace('.0', '').strip())

df3['fighter_reach'] = pd.to_numeric(df3['fighter_reach'], errors='coerce').astype('Int64')
df3['opp_reach'] = pd.to_numeric(df3['opp_reach'], errors='coerce').astype('Int64')

df3 = df3.drop(['Wt._x', 'Wt._y'], axis=1)

# Adding date of birth info for 'fighter' and 'opponent' 

In [1475]:
df3_ml = df3.copy()

df3_ml.to_csv('df3_ml', index=False)

age_df = pd.read_csv('age_df')

df3_ml = df3_ml.merge(age_df, on='fighter', how='left')

df3_ml = df3_ml.merge(age_df, left_on='opponent', right_on='fighter')

df3_ml = df3_ml.rename(columns={'dob_x':'fighter_dob', 'dob_y':'opp_dob','fighter_x':'fighter'})

df3_ml = df3_ml.drop(['fighter_y'], axis=1)

# Changing DOBs to datetime, subtracting from fight date to get age, replacing days with a blank so days can be converted to an int. Subtracting dob from date to get exact age on fight night.

In [1476]:
df3_ml['fighter_dob'] = pd.to_datetime(df3_ml['fighter_dob'], errors='coerce')
df3_ml['opp_dob'] = pd.to_datetime(df3_ml['opp_dob'], errors='coerce')
df3_ml['fighter_age'] = df3_ml['date'] - df3_ml['fighter_dob']
df3_ml['opp_age'] = df3_ml['date'] - df3_ml['opp_dob']
df3_ml[['fighter_age','opp_age']] = df3_ml[['fighter_age','opp_age']].astype(str)

df3_ml['fighter_age'] = df3_ml['fighter_age'].str.replace(' days', '')
df3_ml['opp_age'] = df3_ml['opp_age'].str.replace(' days', '')

In [1477]:
df3_ml = df3_ml.drop(['fighter_dob', 'opp_dob'], axis=1)

In [1412]:
df3_ml.to_csv('df3_ml', index=False)

In [1478]:
cf = df3_ml.copy()

# Adding NA columns so I can concatinated cf and df_f5. cf must be added to full df (df_f5) to calculate winning streak and losing streak.

In [1480]:
cf[['wins','losses','draw','opp_wins','opp_losses','opp_draw',\
    'w_streak', 'l_streak', 'opp_w_streak', 'opp_l_streak']] = None

In [1519]:
cf2 = pd.concat([df_f5, cf])

# Sorting values by ascending date so the streaks are shifted forward in time.

In [1391]:
df_f5.to_csv('df_f5', index=False)

In [1520]:
cf2 = cf2.sort_values(['fighter', 'date']).reset_index()

# dropping index row created by reseting the index.

cf2 = cf2.drop(['index'], axis=1)

# Changing the win == 0 to win == -1 so the streaks function works for the losing streak column.

In [1521]:
cf2['win'] = cf2['win'].where(cf2['win'] == 1, -1)



# Creating a function that finds the cumulative sum of recent wins and losses grouped by fighter.

In [1522]:
def streaks(df):
    s = df['win'].groupby([df['fighter'], (df['win']!=df['win'].shift()).cumsum()]).cumsum()
    return df.assign(w_streak=s.where(s>0, 0), l_streak=s.where(s<1, 0).abs())

cf2 = streaks(cf2)

In [1523]:
cf2[['w_streak', 'l_streak']] = cf2[['w_streak', 'l_streak']].groupby(cf2['fighter']).shift()

#cf2 = cf2.fillna(0)

In [600]:
pd.options.display.max_columns = None

# Resorting the values in descending order to backfill wins and losses.

In [1524]:
cf2 = cf2.sort_values(['fighter', 'date'], ascending=False).reset_index()

# dropping index row created by reseting the index.

cf2 = cf2.drop(['index'], axis=1)

# Filling in the wins and losses going into current week

In [1525]:
cf2['win'] = cf2['win'].where(cf2['win'] == 1, 0)


cf2['ww'] = cf2['wins'][cf2['win']==1].add(1)

cf2['ww'][cf2['win']==0] = cf2['wins']
cf2['ww'][cf2['win']==2] = cf2['wins']

cf2['ww'] = cf2.groupby('fighter')['ww'].fillna(method='bfill', limit=1)

cf2['wins'][cf2['wins'].isna()] = cf2['ww']

cf2['wl'] = cf2['losses'][cf2['win'] == 0].add(1)
cf2['wl'][cf2['win']==1] = cf2['losses']
cf2['wl'][cf2['win']==2] = cf2['losses']

cf2['wl'] = cf2.groupby('fighter')['wl'].fillna(method='bfill', limit=1)

cf2['losses'][cf2['losses'].isna()] = cf2['wl']

# Merging to get current winning and losing streak for 'opponent' column

In [1527]:
cf2 = cf2.merge(cf2, left_on=['opponent','date'], right_on=['fighter','date'])

# Dropping and renaming columns after merger.

In [1066]:
cf2.columns

Index(['date', 'fighter_x', 'opponent_x', 'fighter_ht_x', 'fighter_reach_x',
       'opp_ht_x', 'opp_reach_x', 'wins_x', 'losses_x', 'draw_x',
       'fighter_age_x', 'opp_age_x', 'w_streak_x', 'l_streak_x', 'win_x',
       'wins_y', 'losses_y', 'draw_y', 'w_streak_y', 'l_streak_y'],
      dtype='object')

In [1528]:
cf2 = cf2.drop(['opp_wins_x','opp_losses_x', 'opp_draw_x','opp_w_streak_x', 'opp_l_streak_x',\
               'ww_x', 'wl_x','fighter_y', 'opponent_y', 'fighter_ht_y','fighter_reach_y', 'opp_ht_y',\
                'opp_reach_y','opp_wins_y', 'opp_losses_y', 'opp_draw_y', 'fighter_age_y',\
                'opp_age_y','opp_w_streak_y','opp_l_streak_y', 'win_y', 'ww_y', 'wl_y'], axis=1)

In [1529]:
cf2 = cf2.rename(columns={'win_x':'win', 'fighter_x':'fighter', 'opponent_x':'opponent',\
                          'fighter_ht_x':'fighter_ht','fighter_reach_x':'fighter_reach', 'opp_ht_x':'opp_ht',\
                          'opp_reach_x':'opp_reach','wins_x':'wins', 'losses_x':'losses','draw_x':'draw',\
                          'fighter_age_x':'fighter_age','opp_age_x':'opp_age', 'w_streak_x':'w_streak',\
                          'l_streak_x':'l_streak','wins_y':'opp_wins', 'losses_y':'opp_losses',\
                          'draw_y':'opp_draw',\
                          'w_streak_y':'opp_w_streak', 'l_streak_y':'opp_l_streak'})

# Changing NA to 0 for fighters without a draw.

In [1530]:
cf2['draw'][cf2['draw'].isna()] = 0
cf2['opp_draw'][cf2['opp_draw'].isna()] = 0

In [1774]:
cf2[cf2['date']== '2023-11-11']

Unnamed: 0,date,fighter,opponent,fighter_ht,fighter_reach,opp_ht,opp_reach,wins,losses,draw,fighter_age,opp_age,win,w_streak,l_streak,opp_wins,opp_losses,opp_draw,opp_w_streak,opp_l_streak
294,2023-11-11,Viacheslav Borshchev,Nazim Sadykhov,71,69,70,69,7,3,0,11630,10771,0.0,1.0,0.0,9,1,0,2.0,0.0
503,2023-11-11,Tom Aspinall,Sergei Pavlovich,77,78,75,84,13,3,0,11171,11504,0.0,5.0,0.0,17,1,0,3.0,0.0
816,2023-11-11,Tabatha Ricci,Loopy Godinez,61,61,62,61,9,1,0,10490,11023,0.0,3.0,0.0,11,3,0,3.0,0.0
883,2023-11-11,Steve Erceg,Alessandro Costa,68,68,64,67,10,1,0,10334,10149,0.0,1.0,0.0,13,3,0,1.0,0.0
1122,2023-11-11,Sergei Pavlovich,Tom Aspinall,75,84,77,78,17,1,0,11504,11171,0.0,3.0,0.0,13,3,0,5.0,0.0
2061,2023-11-11,Pat Sabatini,Diego Lopes,68,70,71,72,17,4,0,12055,10543,0.0,0.0,1.0,22,6,0,1.0,0.0
2303,2023-11-11,Nazim Sadykhov,Viacheslav Borshchev,70,69,71,69,9,1,0,10771,11630,0.0,2.0,0.0,7,3,0,1.0,0.0
3012,2023-11-11,Matt Frevola,Benoit Saint Denis,69,71,71,73,10,3,0,12206,10190,0.0,1.0,0.0,12,1,0,4.0,0.0
3159,2023-11-11,Mark Madsen,Jared Gordon,68,72,69,68,12,1,0,14293,12849,0.0,0.0,1.0,19,6,0,0.0,1.0
3387,2023-11-11,Mackenzie Dern,Jessica Andrade,64,63,61,62,13,3,0,11189,11735,0.0,2.0,0.0,24,12,0,0.0,2.0


In [1608]:
cf2 = cf2.dropna(subset=['date', 'fighter', 'opponent', 'fighter_ht', 'fighter_reach',
       'opp_ht', 'opp_reach', 'wins', 'losses', 'draw', 'opp_wins',
       'opp_losses', 'opp_draw', 'fighter_age', 'opp_age', 'w_streak', 'l_streak',
       'opp_w_streak', 'opp_l_streak'])

# Changing the feature columns to integer data types

In [1609]:
cf2['fighter_age'] = cf2['fighter_age'].astype(int)

cf2['opp_age'] = cf2['opp_age'].astype(int)
cf2['wins'] = cf2['wins'].astype(int)
cf2['losses'] = cf2['losses'].astype(int)
cf2['draw'] = cf2['draw'].astype(int)
cf2['opp_wins'] = cf2['opp_wins'].astype(int)
cf2['opp_losses'] = cf2['opp_losses'].astype(int)
cf2['opp_draw'] = cf2['opp_draw'].astype(int)

# Creating a df for the upcoming event

In [1610]:
cf3 = cf2[cf2['date']=='2023-11-11']

# Using are model and selecting for rows with results that are in agreement, we get our predictions for the upcoming event.

In [1776]:
p = rf.predict(cf3[predictors])

pp = pd.DataFrame(dict(prediction=p), index=cf3.index)

ppp = pp.merge(cf3[['date', 'fighter', 'opponent', 'win']], left_index=True, right_index=True)

In [1777]:
merged = ppp.merge(ppp, left_on=['date', 'fighter'], right_on=['date', 'opponent'])

yy = merged[(merged.prediction_x==1) & (merged.prediction_y==0)]

In [1778]:
yy

Unnamed: 0,prediction_x,date,fighter_x,opponent_x,win_x,prediction_y,fighter_y,opponent_y,win_y
2,1,2023-11-11,Tabatha Ricci,Loopy Godinez,0.0,0,Loopy Godinez,Tabatha Ricci,0.0
4,1,2023-11-11,Sergei Pavlovich,Tom Aspinall,0.0,0,Tom Aspinall,Sergei Pavlovich,0.0
6,1,2023-11-11,Nazim Sadykhov,Viacheslav Borshchev,0.0,0,Viacheslav Borshchev,Nazim Sadykhov,0.0
9,1,2023-11-11,Mackenzie Dern,Jessica Andrade,0.0,0,Jessica Andrade,Mackenzie Dern,0.0
12,1,2023-11-11,John Castaneda,Kyung Ho Kang,0.0,0,Kyung Ho Kang,John Castaneda,0.0
13,1,2023-11-11,Jiri Prochazka,Alex Pereira,0.0,0,Alex Pereira,Jiri Prochazka,0.0
17,1,2023-11-11,Diego Lopes,Pat Sabatini,0.0,0,Pat Sabatini,Diego Lopes,0.0
18,1,2023-11-11,Dennis Buzukja,Jamall Emmers,0.0,0,Jamall Emmers,Dennis Buzukja,0.0
19,1,2023-11-11,Benoit Saint Denis,Matt Frevola,0.0,0,Matt Frevola,Benoit Saint Denis,0.0
