In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd


In [5]:
df = pd.read_csv('/content/drive/MyDrive/UFC/masterMLpublic.csv')

Feature Extracting

In [6]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [11]:
df = df[df['method'] != "DRAW"] # Remove draws
df = df[df['division'] != "Women's Bantamweight"]
df = df[df['division'] != "Heavyweight"]
df = df[df['division'] != "Women's Flyweight"]
df = df.replace([np.inf, -np.inf], np.nan) # Replace inf with NaN
df = df.dropna() # Remove NaNs
df = df[df.columns.drop(list(df.filter(like='odds')))] # Remove odds columns

In [16]:
test_df = df.loc[(df['date'] > '2019-11-01')]
y_test = test_df['result']
X_test = test_df.filter(like='precomp')

train_df = df.drop(test_df.index)
y_train = train_df['result']
X_train = train_df.filter(like='precomp')

In [17]:
params = {'tree_method': 'gpu_hist',
          'objective': 'binary:logistic',
          'verbosity': 0, 'n_jobs': -1,
          'learning_rate': 0.006379731330665644,
          'min_child_weight': 5,
          'max_depth': 1,
          'subsample': 0.4329771439302427,
          'colsample_bytree': 0.28566614739884083,
          'gamma': 0.047745011818589665,
          'n_estimators': 158,
          'eta': 0.10543103310179618}

clf = xgb.XGBClassifier(**params) # Create the classifier
clf.fit(X_train, y_train) # Fit the model using the training data; this is the actual training of the model
ypred = clf.predict(X_test) # Predict the test fights using the test fighter's stats
accuracy = accuracy_score(y_test, ypred) # Check the accuracy
print(f'XGBoost test accuracy: {accuracy}') # Print the accuracy

XGBoost test accuracy: 0.6422338568935427


In [18]:
print_num = 100 # Print the top 100 most important stats
features_list = list()
columns = X_test.columns.tolist() # Get the names of the stats that are in the testing data
feats = {}  # a dict to hold feature_name: feature_importance
for feature, importance in zip(columns, clf.feature_importances_):
    feats[feature] = importance
q = [[k, v] for k, v in sorted(feats.items(), key=lambda item: item[1])]
for x in q[-print_num:]:
    print(x[0], x[1])
    features_list.append(x[0])
#print(columns)
print(features_list)

precomp_change_avg_distance_strikes_landed_per_min_differential_peak_vs_opp 0.0
precomp_change_avg_distance_strikes_landed_per_min_differential_valley_vs_opp 0.0
precomp_change_avg_distance_strikes_landed_per_min_differential_vs_peak_vs_opp 0.0
precomp_change_avg_distance_strikes_landed_per_min_differential_vs_valley_vs_opp 0.0
precomp_change_recent_avg_distance_strikes_landed_per_min_differential_peak_vs_opp 0.0
precomp_change_recent_avg_distance_strikes_landed_per_min_differential_valley_vs_opp 0.0
precomp_change_recent_avg_distance_strikes_landed_per_min_differential_vs_peak_vs_opp 0.0
precomp_change_recent_avg_distance_strikes_landed_per_min_differential_vs_valley_vs_opp 0.0
precomp_change_avg_distance_strikes_attempts_per_min_differential_peak_vs_opp 0.0
precomp_change_avg_distance_strikes_attempts_per_min_differential_valley_vs_opp 0.0
precomp_change_avg_distance_strikes_attempts_per_min_differential_vs_peak_vs_opp 0.0
precomp_change_avg_distance_strikes_attempts_per_min_differen

Set X to be the features I want to be used to predict whether a fighter one a fight or not. I am using median to fill in the values that were NaN. I used round(1) because there were a lot of significant figures in the values on the tables.

In [20]:
X = df[features_list]

"""
'height', 'reach', 'stance', 'recent_avg_distance_strikes_landed','recent_avg_distance_strikes_attempts', 'recent_avg_clinch_strikes_landed', 'recent_avg_clinch_strikes_attempts',
        'recent_avg_ground_strikes_landed', 'recent_avg_ground_strikes_attempts', 'recent_avg_knockdowns', 'recent_avg_sub_attempts', 'recent_avg_reversals', 'recent_avg_control', 'recent_avg_sig_strikes_landed', 'recent_avg_sig_strikes_attempts', 'recent_avg_total_strikes_landed', 'recent_avg_total_strikes_attempts',
        'recent_avg_head_strikes_landed', 'recent_avg_head_strikes_attempts', 'recent_avg_body_strikes_landed', 'recent_avg_body_strikes_attempts', 'recent_avg_leg_strikes_landed', 'recent_avg_leg_strikes_attempts',
        'recent_avg_takedowns_def', 'recent_avg_sig_strikes_def', 'recent_avg_total_strikes_def', 'recent_avg_head_strikes_def', 'recent_avg_body_strikes_def', 'recent_avg_distance_strikes_def',
        'recent_avg_clinch_strikes_def',
        'recent_avg_ground_strikes_acc', 'recent_avg_clinch_strikes_acc', 'recent_avg_distance_strikes_acc', 'recent_avg_leg_strikes_acc', 'recent_avg_body_strikes_acc', 'recent_avg_head_strikes_acc', 'recent_avg_total_strikes_acc',
        'recent_avg_sig_strikes_acc', 'recent_avg_sub_acc',
"""
        #'recent_avg_comp_time', 'recent_avg_title_fight_loss', 'recent_avg_title_fight_losses', 'recent_avg_num_fights', 'recent_avg_win_streak']]
#X.fillna(X.median(numeric_only=True).round(1), inplace=True)
X[features_list].fillna(0, inplace=True)
"""
X['recent_avg_ground_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_clinch_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_distance_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_leg_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_body_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_head_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_total_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_sig_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_sub_acc'].fillna(0, inplace=True)




X['height'].fillna(X['height'].median().round(1), inplace=True)
X['reach'].fillna(X['reach'].median().round(1), inplace=True)
#X['age'].fillna(X['age'].median().round(1), inplace=True)

X['recent_avg_distance_strikes_landed'].fillna(0, inplace=True)
X['recent_avg_distance_strikes_attempts'].fillna(0, inplace=True)
X['recent_avg_clinch_strikes_landed'].fillna(0, inplace=True)
X['recent_avg_clinch_strikes_attempts'].fillna(0, inplace=True)
X['recent_avg_ground_strikes_landed'].fillna(0, inplace=True)
X['recent_avg_ground_strikes_attempts'].fillna(0, inplace=True)

X['recent_avg_knockdowns'].fillna(0, inplace=True)
X['recent_avg_sub_attempts'].fillna(0, inplace=True)
X['recent_avg_reversals'].fillna(0, inplace=True)
X['recent_avg_control'].fillna(0, inplace=True)
#X['recent_avg_takedowns_acc'].fillna(0, inplace=True)
#X['recent_avg_takedowns_attempts'].fillna(0, inplace=True)

X['recent_avg_sig_strikes_landed'].fillna(0, inplace=True)
X['recent_avg_sig_strikes_attempts'].fillna(0, inplace=True)
X['recent_avg_total_strikes_landed'].fillna(0, inplace=True)
X['recent_avg_total_strikes_attempts'].fillna(0, inplace=True)
X['recent_avg_head_strikes_landed'].fillna(0, inplace=True)
X['recent_avg_head_strikes_attempts'].fillna(0, inplace=True)
X['recent_avg_body_strikes_landed'].fillna(0, inplace=True)
X['recent_avg_body_strikes_attempts'].fillna(0, inplace=True)
X['recent_avg_leg_strikes_landed'].fillna(0, inplace=True)
X['recent_avg_leg_strikes_attempts'].fillna(0, inplace=True)

X['recent_avg_ground_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_sig_strikes_def'].fillna(0, inplace=True)
X['recent_avg_total_strikes_def'].fillna(0, inplace=True)
X['recent_avg_head_strikes_def'].fillna(0, inplace=True)
X['recent_avg_body_strikes_def'].fillna(0, inplace=True)
#X['recent_avg_leg_strikes_def'].fillna(0, inplace=True)
X['recent_avg_distance_strikes_def'].fillna(0, inplace=True)
X['recent_avg_clinch_strikes_def'].fillna(0, inplace=True)
#X['recent_avg_ground_strikes_def'].fillna(0, inplace=True)
X['recent_avg_takedowns_def'].fillna(0, inplace=True)



X['recent_avg_ground_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_clinch_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_distance_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_leg_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_body_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_head_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_total_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_sig_strikes_acc'].fillna(0, inplace=True)
X['recent_avg_sub_acc'].fillna(0, inplace=True)


#X['recent_avg_reach_differential'].fillna(0, inplace=True)
"""

X

Unnamed: 0,precomp_change_avg_distance_strikes_landed_per_min_differential_peak_vs_opp,precomp_change_avg_distance_strikes_landed_per_min_differential_valley_vs_opp,precomp_change_avg_distance_strikes_landed_per_min_differential_vs_peak_vs_opp,precomp_change_avg_distance_strikes_landed_per_min_differential_vs_valley_vs_opp,precomp_change_recent_avg_distance_strikes_landed_per_min_differential_peak_vs_opp,precomp_change_recent_avg_distance_strikes_landed_per_min_differential_valley_vs_opp,precomp_change_recent_avg_distance_strikes_landed_per_min_differential_vs_peak_vs_opp,precomp_change_recent_avg_distance_strikes_landed_per_min_differential_vs_valley_vs_opp,precomp_change_avg_distance_strikes_attempts_per_min_differential_peak_vs_opp,precomp_change_avg_distance_strikes_attempts_per_min_differential_valley_vs_opp,...,precomp_recent_avg_head_strikes_absorbed_differential_valley_vs_opp,precomp_change_elo_differential,precomp_recent_avg_age_vs_opp,precomp_elo_vs_opp,precomp_avg_distance_strikes_absorbed_differential,precomp_change_avg_elo_differential,precomp_avg_head_strikes_absorbed_differential_vs_opp,precomp_elo_differential,precomp_avg_age_vs_opp,precomp_avg_ground_strikes_absorbed_peak_vs_opp
1614,0.000000,4.548691,0.303836,0.000000,10.686949,10.686949,0.000000,0.000000,5.337686,4.268028,...,-1.295473,-0.043480,3887.000000,38.869274,2.000000,-0.021740,-1.295473,1.030003,3887.000000,-2.000000
1616,0.000000,0.000000,-0.276974,-0.509071,0.000000,0.000000,-0.599020,-2.205350,-0.061964,0.000000,...,-0.242203,-0.018454,258.666667,-10.318008,0.502786,-0.023814,-0.118527,0.992501,370.400000,-1.750000
1617,0.000000,0.000000,0.276974,0.509071,0.000000,0.000000,0.599020,2.205350,0.061964,0.000000,...,0.242203,0.064515,-258.666667,10.318008,1.106952,0.009673,0.118527,1.007555,-370.400000,1.750000
1684,0.000000,-19.723403,-0.169900,0.000000,19.287708,19.287708,0.000000,0.000000,0.000000,-5.967395,...,-1.130952,-0.005309,1020.000000,-72.325768,0.166667,-0.002655,-1.130952,0.944031,1020.000000,7.000000
1685,0.000000,19.723403,0.169900,0.000000,-19.287708,-19.287708,0.000000,0.000000,0.000000,5.967395,...,1.130952,0.006218,-1020.000000,72.325768,1.250000,0.003109,1.130952,1.059287,-1020.000000,-7.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14124,-4.619460,0.000000,-0.020426,-1.252008,-8.476781,-8.476781,-0.976242,-0.917548,-1.002687,0.000000,...,0.178399,-0.090079,440.666667,26.761303,0.834273,0.002656,0.605211,1.020462,-560.969697,7.000000
14125,4.619460,0.000000,0.020426,1.252008,8.476781,8.476781,0.976242,0.917548,1.002687,0.000000,...,-0.178399,-0.039319,-440.666667,-26.761303,0.943910,-0.019660,-0.605211,0.979948,560.969697,-7.000000
14126,0.016569,0.024630,0.006874,0.019440,0.097983,0.020711,0.078253,0.606025,0.000863,0.014278,...,-0.530334,0.020215,-935.333333,23.906362,0.915214,-0.001487,-2.169799,1.018377,743.857143,-0.657895
14127,-0.016569,-0.024630,-0.006874,-0.019440,-0.097983,-0.020711,-0.078253,-0.606025,-0.000863,-0.014278,...,0.530334,0.006030,935.333333,-23.906362,0.907001,-0.001161,2.169799,0.981954,-743.857143,0.657895


Using one hot encoding to create 4 new rows that correspond to each stance possiblity.

In [10]:
"""
encoded_stance = pd.get_dummies(X['stance'], prefix = 'stance', drop_first = True)

X = pd.concat([X, encoded_stance], axis = 1)
X = X.drop('stance', axis = 1)
#X = X.drop(columns=['stance_Sideways'])
X
"""

"\nencoded_stance = pd.get_dummies(X['stance'], prefix = 'stance', drop_first = True)\n\nX = pd.concat([X, encoded_stance], axis = 1)\nX = X.drop('stance', axis = 1)\n#X = X.drop(columns=['stance_Sideways'])\nX\n"

In [21]:
nan_columns = X.columns[X.isna().any()].tolist()
print(nan_columns)

[]


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [23]:
y = df['result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

Feature Scaling

In [24]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.fit_transform(X_test)

In [25]:
# Initialize and train the logistic regression model
logreg_scaled = LogisticRegression(penalty = 'l2', C = 0.025, max_iter=10000)
logreg_scaled.fit(X_train_scaled, y_train)

# Predict on the scaled test set
y_pred_scaled = logreg_scaled.predict(X_test_scaled)

In [26]:
# Calculate accuracy
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)

# Get a detailed classification report
class_report_scaled = classification_report(y_test, y_pred_scaled)

print(accuracy_scaled)
print(class_report_scaled)

0.6281094527363185
              precision    recall  f1-score   support

           0       0.64      0.63      0.63       411
           1       0.62      0.63      0.62       393

    accuracy                           0.63       804
   macro avg       0.63      0.63      0.63       804
weighted avg       0.63      0.63      0.63       804



In [21]:
logreg = LogisticRegression(penalty = 'l2', C = 0.0083, max_iter = 10000)

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

accuracy, class_report

(0.6545454545454545,
 '              precision    recall  f1-score   support\n\n           0       0.66      0.64      0.65       473\n           1       0.65      0.66      0.66       462\n\n    accuracy                           0.65       935\n   macro avg       0.65      0.65      0.65       935\nweighted avg       0.65      0.65      0.65       935\n')

In [18]:
"""
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Lists to store results
results = []

# Iterate over regularization types and C values
for penalty in ['l1', 'l2']:
    for C in C_values:
        # Initialize and train the logistic regression model
        logreg_reg = LogisticRegression(penalty=penalty, C=C, solver='saga', max_iter=10000)
        logreg_reg.fit(X_train, y_train)

        # Predict on the test set
        y_pred_reg = logreg_reg.predict(X_test)

        # Calculate accuracy
        accuracy_reg = accuracy_score(y_test, y_pred_reg)

        # Store results
        results.append((penalty, C, accuracy_reg))

# Sort results by accuracy and display
sorted_results = sorted(results, key=lambda x: x[2], reverse=True)
sorted_results
"""

"\nC_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]\n\n# Lists to store results\nresults = []\n\n# Iterate over regularization types and C values\nfor penalty in ['l1', 'l2']:\n    for C in C_values:\n        # Initialize and train the logistic regression model\n        logreg_reg = LogisticRegression(penalty=penalty, C=C, solver='saga', max_iter=10000)\n        logreg_reg.fit(X_train, y_train)\n\n        # Predict on the test set\n        y_pred_reg = logreg_reg.predict(X_test)\n\n        # Calculate accuracy\n        accuracy_reg = accuracy_score(y_test, y_pred_reg)\n\n        # Store results\n        results.append((penalty, C, accuracy_reg))\n\n# Sort results by accuracy and display\nsorted_results = sorted(results, key=lambda x: x[2], reverse=True)\nsorted_results\n"

In [None]:
coefficents = logreg.coef_[0]

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficent': coefficents
})

feature_importance

* Original Result Accuracy: 0.53
* Add recent_avg_distance_strikes_landed: 0.543
* Add recent_avg_distance_strikes_attempts: 0.553
* Add 'recent_avg_clinch_strikes_landed', 'recent_avg_clinch_strikes_attempts', 'recent_avg_ground_strikes_landed', 'recent_avg_ground_strikes_attempts': 0.600
* Add 'recent_avg_knockdowns',
'recent_avg_sub_attempts', 'recent_avg_reversals', 'recent_avg_control', 'recent_avg_takedowns_landed', 'recent_avg_takedowns_attempts': 0.616
* Add 'recent_avg_sig_strikes_landed', 'recent_avg_sig_strikes_attempts', 'recent_avg_total_strikes_landed', 'recent_avg_total_strikes_attempts',
'recent_avg_head_strikes_landed', 'recent_avg_head_strikes_attempts', 'recent_avg_body_strikes_landed', 'recent_avg_body_strikes_attempts', 'recent_avg_leg_strikes_landed', 'recent_avg_leg_strikes_attempts', : 0.617

* Swapped 'recent_avg_takedowns_attempts' and 'recent_avg_takedowns_landed' with 'recent_avg_takedowns_attempts' : 0.620
- However when swap all of the other features who end with landed and attempt with accuracy the accuracy score goes lower
* Adding 'recent_avg_takedowns_def', 'recent_avg_sig_strikes_def', 'recent_avg_total_strikes_def', 'recent_avg_head_strikes_def', 'recent_avg_body_strikes_def', 'recent_avg_leg_strikes_def', 'recent_avg_distance_strikes_def', 'recent_avg_clinch_strikes_def', 'recent_avg_ground_strikes_def'
surprisingly lowers the accuracy score: 0.617

* Used the data with the larger masterMLpublic.csv file ahd got: 0.636

* Feature scaling made it slightly worse: 0.633
Regularization
[('l1', 1, 0.6259731068648267),
 ('l1', 10, 0.6259731068648267),
 ('l1', 100, 0.6259731068648267),
 ('l1', 1000, 0.6259731068648267),
 ('l2', 1, 0.6259731068648267),
 ('l2', 10, 0.6259731068648267),
 ('l2', 100, 0.6259731068648267),
 ('l2', 1000, 0.6259731068648267)]
* I decided to bit the bullet essentially add all possible featured that have to do with the stats accumulated during the fight such as ground, takedowns, strikes, leg kick, etc. totalling 45 features total and gave me a final score of 0.649 and used l2 regularization with 0.001 for the C value.

* After taking out Age, stance_Sideways, avg grouns strikes defended I got up to: 0.6493
* After taking out recent_avg_leg_strikes_def the model jumped to 0.6497, possibly due to Muay Thai fighters being able to take a lot of kicks due to their conditioning
* Taking out 'recent_avg_takedowns_acc' gave the same result
* used the XGboost tree to get the most important 100 features, from scaling with the l2 penalty and C = 0.0025 it got 0.662

After I did I achieved the 0.649 accuracy score I printed out the coefficents and got:

index,Feature,Coefficent
0,height,-0.011838588602234206
1,reach,0.02513690752054723
2,age,-0.00013369953669891495
3,recent_avg_distance_strikes_landed,-0.012351539438104948
4,recent_avg_distance_strikes_attempts,0.0041190302582423365
5,recent_avg_clinch_strikes_landed,0.027532567404839333
6,recent_avg_clinch_strikes_attempts,-0.026014725700526174
7,recent_avg_ground_strikes_landed,0.02004397680442856
8,recent_avg_ground_strikes_attempts,0.007196252119483971
9,recent_avg_knockdowns,0.051405815544197014
10,recent_avg_sub_attempts,0.06756937317873425
11,recent_avg_reversals,0.0013607525060938142
12,recent_avg_control,0.001944900921447198
13,recent_avg_takedowns_acc,0.002333657399607738
14,recent_avg_sig_strikes_landed,0.035225004771234796
15,recent_avg_sig_strikes_attempts,-0.014699443322789064
16,recent_avg_total_strikes_landed,0.03109682098073467
17,recent_avg_total_strikes_attempts,-0.02426746377260322
18,recent_avg_head_strikes_landed,-0.026536555494366083
19,recent_avg_head_strikes_attempts,0.026299921416166067
20,recent_avg_body_strikes_landed,0.031203255605844842
21,recent_avg_body_strikes_attempts,-0.022493230425627408
22,recent_avg_leg_strikes_landed,0.03055830465983687
23,recent_avg_leg_strikes_attempts,-0.018506134313419676
24,recent_avg_takedowns_def,0.00367645368724413
25,recent_avg_sig_strikes_def,0.010268201476279864
26,recent_avg_total_strikes_def,0.006725560624542145
27,recent_avg_head_strikes_def,0.02552589501558793
28,recent_avg_body_strikes_def,0.003299081082747945
29,recent_avg_leg_strikes_def,0.000357356994948846
30,recent_avg_distance_strikes_def,-0.018848230350060606
31,recent_avg_clinch_strikes_def,-0.003056480291725662
32,recent_avg_ground_strikes_def,-0.00035959609607769675
33,recent_avg_ground_strikes_acc,-0.010157023722371953
34,recent_avg_clinch_strikes_acc,-0.011688453858385952
35,recent_avg_distance_strikes_acc,-0.011615199776599987
36,recent_avg_leg_strikes_acc,-0.011431043828876605
37,recent_avg_body_strikes_acc,-0.007704309074677114
38,recent_avg_head_strikes_acc,0.022771530695088173
39,recent_avg_total_strikes_acc,-0.022945384642348843
40,recent_avg_sig_strikes_acc,0.02829723926140461
41,recent_avg_sub_acc,0.01911244604832737
42,stance_Orthodox,-0.005373658632131242
43,stance_Sideways,-7.015666218288243e-05
44,stance_Southpaw,0.013131370828203235
45,stance_Switch,-0.004004658399553213


Predictors with stongest victory correlatoin:
1. Submission Attempts(0.067)
- Possibly becasue judges score takedowns higher becasue they are rarer than signifigant strikes? More impactful when they go to the scorecard.
2. Average Knockdowns(0.051)
3. Avg Signifigant Strikes Landed(0.035)

Weakest Victory Correlation:
1. Sideways Stance(-0.0000701)
2. Age(-0.000134)
3. Average Ground Strikes Defended(0.000357)

recent_avg_leg_strikes_def, 	recent_avg_distance_strikes_attempts, recent_avg_control

### Next Steps
- Work more on feature extracting, look at the other dude's project and see what features he find are most predictive of the outcome
- PCA before logistic regression(plot 2 or 3 features)/ TSNE
- SMV, NN, XG Boost

