In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix




# Background

# Note to self, rewrite this as prose:

- Initial intituation on fighting weight cuts
- Fighters have cut drastic amounts of weight to get advantadge through rehydraiton. By cutting water weight before a fight, fighters can weigh in at their desired weight class then regain that weight before their fight through rapid rehydtation. Fighters have been known to regain up to 20 lbs between weigh in and fight night, with the highest ever gain from Geoff Neal, who gained 30.3 pounds at UFC 298 (https://www.espn.com/mma/story/_/id/39610394/seven-ufc-298-fighters-flagged-rehydration-issue)
- Huge advandtadge to cutting weight: you have a massive size advantadge on fight night compared to your opponent.
- However, weight cutting turns the sport of fighting into a weight regain contest. Organizations like One Championship have tried to combat this by measuring hydration levels, but in the UFC, there are no such measures.
- How much of an impact does weight cutting have on winning fights?
- Implications of the sport: weight cutting gives adantadge to fighters who may be less skilled but their bodies are naturally adapted to rapidly losing and gaining water weight
- Also a safety issue: fighters have died trying to cut too much water weight.
- Point of this study is to see the extent to which regaining water weight affects the odds of a fighter winning.

# Explanation of the Data and the Data Sources

# Hypothesis

In [2]:
stats_path = '/Users/caseymoser/Desktop/UFC Analysis/UFC/ufc_fight_stats.csv'

# data source: https://www.reddit.com/r/MMA/comments/evbnjd/released_offical_ufc_fight_night_weights/
weight_path = '/Users/caseymoser/Desktop/UFC Analysis/UFC/UFC Fight Night Weights.xlsx'

results_path = '/Users/caseymoser/Desktop/UFC Analysis/UFC/ufc_fight_results.csv'

stats_df = pd.read_csv(stats_path)

weight_df = pd.read_excel(weight_path)

results_df = pd.read_csv(results_path)


In [3]:
# Step 1: Split fighters into separate columns
results_df[['Fighter_1', 'Fighter_2']] = results_df['BOUT'].str.split(' vs. ', expand=True)

fighter1_df = results_df.copy()
fighter1_df['FIGHTER'] = fighter1_df['Fighter_1']
fighter1_df['RESULT'] = fighter1_df['OUTCOME'].str[0].map({'W': 'Win', 'L': 'Loss'})
fighter1_df = fighter1_df.drop(columns=['Fighter_1','Fighter_2'])


fighter2_df = results_df.copy()
fighter2_df['FIGHTER'] = fighter2_df['Fighter_2']
fighter2_df['RESULT'] = fighter2_df['OUTCOME'].str[2].map({'W': 'Win', 'L': 'Loss'})
fighter2_df = fighter2_df.drop(columns=['Fighter_1','Fighter_2'])


# Note to self, clean this code so that I first drop fighter_1 and fighter_2, then combine the data sets together/

# Combine both into a single dataframe
results_df_clean = pd.concat([fighter1_df, fighter2_df])

results_df_clean['UFC_EVENT'] = results_df_clean['EVENT'].str.extract(r'(UFC \d+)', expand=False)





In [4]:
weight_df['UFC_EVENT'] = weight_df['EVENT'].str.extract(r'(UFC \d+)', expand=False)
weight_df

weight_df['FIGHTER'] = weight_df['FIGHTER'].str.strip().str.lower()
weight_df['UFC_EVENT'] = weight_df['UFC_EVENT'].str.strip().str.upper()

results_df_clean['FIGHTER'] = results_df_clean['FIGHTER'].str.strip().str.lower()
results_df_clean['UFC_EVENT'] = results_df_clean['UFC_EVENT'].str.strip().str.upper()

merged_weight = pd.merge(
    results_df_clean,
    weight_df,
    on=['FIGHTER', 'UFC_EVENT'],
    how='left', 
    suffixes=('_result', '_weight')
)
results_df_clean

# Drop rows with any NaN values
merged_weight_clean = merged_weight.dropna(subset=['WEIGH IN WEIGHT (lbs)'])



In [5]:
#df.groupby('FIGHTER')['SIG.STR. %'].mean()

win_weight_df = merged_weight_clean

win_weight_df['RESULT'] = win_weight_df['RESULT'].map({'Win': 1, 'Loss': 0})


win_weight_df_clean = win_weight_df.dropna(subset=['RESULT', 'WEIGHT INCREASE (lbs)'])

# calculating percent regain (heavyweights more likely to just have higher raw numbers because they have more fat than lower weight classes)
win_weight_df_clean['PERCENT_REGAIN'] =win_weight_df_clean['WEIGHT INCREASE (lbs)']/win_weight_df_clean['WEIGH IN WEIGHT (lbs)']*100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  win_weight_df['RESULT'] = win_weight_df['RESULT'].map({'Win': 1, 'Loss': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  win_weight_df_clean['PERCENT_REGAIN'] =win_weight_df_clean['WEIGHT INCREASE (lbs)']/win_weight_df_clean['WEIGH IN WEIGHT (lbs)']*100


### Choice of Regression

Given that the dependent variable is binary (win or loss), I have chosen to fit a logistic model to predict the log-odds of winning fights based on percentage weight regained.

In [10]:
#Logit Model

X = win_weight_df_clean[['PERCENT_REGAIN']]


y = win_weight_df_clean['RESULT']

# Add a constant (intercept) to the independent variable
X = sm.add_constant(X)

# Fit the OLS model
model = sm.Logit(y, X)
result = model.fit()
print(result.summary())


math.exp(0.0432)

Optimization terminated successfully.
         Current function value: 0.684487
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                 RESULT   No. Observations:                  437
Model:                          Logit   Df Residuals:                      435
Method:                           MLE   Df Model:                            1
Date:                Tue, 29 Jul 2025   Pseudo R-squ.:                0.004868
Time:                        12:28:42   Log-Likelihood:                -299.12
converged:                       True   LL-Null:                       -300.58
Covariance Type:            nonrobust   LLR p-value:                   0.08714
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.2334      0.275     -0.849      0.396      -0.772       0.306
PERCENT_REGAIN   

NameError: name 'math' is not defined


In this first log-odds model, the coefficient on PERCENT_REGAIN is 0.0432, meaning for every 1 percent increase in weight, there is an associated 01.06 percentage point increase in the probability of winning the fight holding all other variables constant. This model is not statistically significant at alpha = 0.05, but it is significant at alpha =0.10.  This means that PERCENT_REGAIN has a statistically signficant impact on winning at a 90% condifidence interval. However, given that the R$^{2}$ value is only 0.007, less than 1% of the variation in fight outcome is explained by percentage weight regain. In this way, this model has insufficient predictive power to be useful for fight prediction.


In [31]:
# testing for causality on randomzied data in logistic model

from sklearn.model_selection import train_test_split

X = win_weight_df_clean[['PERCENT_REGAIN']]

y = win_weight_df_clean['RESULT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

print("Logistic Accuracy:", accuracy_score(y_test, y_pred_log))
print("Logistic AUC:", roc_auc_score(y_test, log_model.predict_proba(X_test)[:, 1]))

### Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("\nRF Accuracy:", accuracy_score(y_test, y_pred_rf))
print("RF AUC:", roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]))

Logistic Accuracy: 0.5568181818181818
Logistic AUC: 0.4728947368421053

RF Accuracy: 0.5227272727272727
RF AUC: 0.4723684210526315


In the logistic regression, the performance of the model further confirms that weight regain alone is insufficient to predict fight outcome. Since the AUC values is less than 0.5, the model is worse than random guessing at predicting fight outcomes. In this way, the amount of weight regained by a fighter does not have sufficient predictive power.

For both the OLS and logistic regression models, the biggest limitation to this analysis is the data set. Not all regions require fighters to publicize their fight night weight, meaning a lot of weight regains are not capture in my analysis. Having more post fight weigh in data could let us more accurately assess if the amount of weight regained does have predictive power on fight outcome.

In [35]:
# Model with squared term

win_weight_df_clean['PERCENT_REGAIN_SQ'] = win_weight_df_clean['PERCENT_REGAIN'] ** 2


X = win_weight_df_clean[['PERCENT_REGAIN', 'PERCENT_REGAIN_SQ']]
X = sm.add_constant(X)

y = win_weight_df_clean['RESULT']


# Fit logistic regression
logit_model = sm.Logit(y_clean, X_clean)
results = logit_model.fit()

# Print summary
print(results.summary())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  win_weight_df_clean['PERCENT_REGAIN_SQ'] = win_weight_df_clean['PERCENT_REGAIN'] ** 2


ValueError: zero-size array to reduction operation maximum which has no identity

In this third OLS model, the coefficients on PERCENT_REGAIN and PERCENT_REGAIN_SQ are both statsitically insignfnicant at a 95% confidence interval, implying that we fail to reject the null hypothesis that regaining weight has a downside at higher levels (in other words, the square term is used to see if there is a fall off in the benefit of regaining weight at higher percentage regains due to dehydration issues).

Again, the biggest limitation to this analysis is the data set. Not all regions require fighters to publicize their fight night weight, meaning a lot of weight regains are not capture in my analysis. Having more post fight weigh in data could let us more accurately assess if there is a limitation to the benefit of cutting and regaining weight.