In [2]:
import os
import sqlite3

import pandas as pd

# Path to the SQLite database, relative to the kernel's working directory
# (the same "../../data" convention the other cells in this notebook use).
# Calling os.path.dirname on the quoted literal "__file__" always yields "",
# so it is omitted here; the resulting path is unchanged.
db_path = os.path.join("..", "..", "data", "ufc.db")

### Find bouts where both fighters have at least one prior UFC bout

In [4]:
# Query returning the ids of bouts in which neither fighter is on their first
# mapped bout.
#   cte1: fighter-history rows restricted to bouts that appear in bout_mapping
#         (presumably the set of UFC bouts -- confirm against the schema).
#   cte2: numbers each fighter's remaining bouts 1, 2, 3, ... following the
#         history table's "order" column.
#   cte3: attaches the red and the blue fighter's bout number to each mapped bout.
# The final WHERE keeps bouts where both numbers are > 1; rows whose LEFT JOIN
# found no match carry NULL and are dropped by the same comparison.
query = """
WITH cte1 AS (
    SELECT fighter_id, t1.'order', bout_id FROM ufcstats_fighter_histories t1
    INNER JOIN bout_mapping t2 ON t1.bout_id = t2.ufcstats_id
),
cte2 AS (
    SELECT fighter_id, bout_id, ROW_NUMBER() OVER (PARTITION BY fighter_id ORDER BY t1.'order') AS ufc_order FROM cte1 t1
),
cte3 AS (
    SELECT
        id,
        t3.ufc_order AS red_ufc_order,
        t4.ufc_order AS blue_ufc_order
    FROM ufcstats_bouts t1
    INNER JOIN bout_mapping t2 ON t1.id = t2.ufcstats_id
    LEFT JOIN cte2 t3 ON t1.id = t3.bout_id AND t1.red_fighter_id = t3.fighter_id
    LEFT JOIN cte2 t4 ON t1.id = t4.bout_id AND t1.blue_fighter_id = t4.fighter_id
)
SELECT id FROM cte3 WHERE
    red_ufc_order > 1 AND blue_ufc_order > 1
"""

# A sqlite3 connection used as a context manager only commits or rolls back the
# transaction; it does not close the connection. Close it explicitly so the
# database file handle is released even if the query raises.
conn = sqlite3.connect(db_path)
try:
    query_res = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Plain list of bout ids, used by later cells to filter the feature table.
exp_level_bout_ids = query_res["id"].tolist()

### Keep only men's bouts where both fighters have at least one prior UFC bout

In [3]:
# Load the xz-compressed feature table.
# NOTE: unpickling can execute arbitrary code, so only load trusted artifacts.
features_df = pd.read_pickle(
    "../../data/features.pkl.xz",
    compression="xz",
)
# Bare expression so the notebook renders the frame.
features_df

Unnamed: 0,id,avg_knockdowns_scored_diff,cumulative_knockdowns_scored_diff,avg_knockdowns_scored_per_second_diff,cumulative_knockdowns_scored_per_second_diff,avg_knockdowns_scored_per_strike_landed_diff,cumulative_knockdowns_scored_per_strike_landed_diff,avg_knockdowns_scored_per_strike_attempted_diff,cumulative_knockdowns_scored_per_strike_attempted_diff,avg_knockdowns_scored_per_significant_strike_landed_diff,...,avg_opp_event_attendance_change_diff,avg_opp_event_attendance_change_diff_diff,avg_opp_avg_event_attendance_change_diff,avg_opp_avg_event_attendance_change_diff_diff,avg_opp_avg_event_occupancy_pct_diff,avg_opp_avg_event_occupancy_pct_diff_diff,avg_opp_event_occupancy_pct_change_diff,avg_opp_event_occupancy_pct_change_diff_diff,avg_opp_avg_event_occupancy_pct_change_diff,avg_opp_avg_event_occupancy_pct_change_diff_diff
0,be38ed9ccfe2ee03,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-3611.000000,6580.500000,206.916667,7790.250000,-0.001338,-0.316345,-0.167808,0.596338,-0.008228,0.201749
1,eb1b371dfc37fcdb,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,219bd976b8ca745d,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,5791.250000,0.000000,2043.861111,-10063.500000,-0.097682,-0.516345,-0.102990,-0.649139,0.073671,-0.509056
3,af178adff964d854,0.200000,1,0.000313,0.000430,0.002247,0.004386,0.001639,0.003115,0.004651,...,-5569.000000,0.000000,0.000000,0.000000,-0.237066,0.000000,0.000000,0.000000,0.000000,0.000000
4,920194911d727a38,0.000000,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,-4205.666667,0.000000,-0.112834,0.000000,-0.346601,0.000000,0.080951,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7071,5238f6470d0557fb,0.250000,4,0.001888,0.001181,0.015227,0.010821,0.006283,0.003462,0.015179,...,-4734.666667,11243.000000,722.184292,-1511.853896,-0.202461,-0.584005,0.000000,0.000000,0.007550,0.173006
7072,7b1bc4ff776f12c1,-0.085714,-3,-0.001594,-0.000143,-0.004453,0.000881,-0.003147,0.000704,0.006262,...,-18745.000000,0.000000,82.594036,4943.723440,0.039104,0.251747,-0.224510,0.421001,0.035729,0.306498
7073,1a635a5e4551e7d5,-0.142857,-1,-0.001952,-0.000514,-0.023014,-0.007278,-0.010774,-0.004009,-0.028357,...,-8795.333333,-1100.500000,-3269.043056,20082.425000,0.167921,-0.258702,-0.320822,0.585316,-0.161427,0.579691
7074,7521015554088962,0.175000,7,0.000810,0.000208,0.010541,0.003426,0.005460,0.001763,0.010358,...,3649.085714,-4911.654135,1349.485978,1308.150105,0.138067,0.222074,-0.297118,0.372520,0.084494,0.042074


In [8]:
# Keep rows whose bout id is in exp_level_bout_ids and whose is_female flag is 0,
# then renumber the index from 0.
features_case_study = (
    features_df
    .loc[lambda df: df["id"].isin(exp_level_bout_ids) & df["is_female"].eq(0)]
    .copy()
    .reset_index(drop=True)
)
# Bare expression so the notebook renders the filtered frame.
features_case_study

Unnamed: 0,id,avg_knockdowns_scored_diff,cumulative_knockdowns_scored_diff,avg_knockdowns_scored_per_second_diff,cumulative_knockdowns_scored_per_second_diff,avg_knockdowns_scored_per_strike_landed_diff,cumulative_knockdowns_scored_per_strike_landed_diff,avg_knockdowns_scored_per_strike_attempted_diff,cumulative_knockdowns_scored_per_strike_attempted_diff,avg_knockdowns_scored_per_significant_strike_landed_diff,...,avg_opp_event_attendance_change_diff,avg_opp_event_attendance_change_diff_diff,avg_opp_avg_event_attendance_change_diff,avg_opp_avg_event_attendance_change_diff_diff,avg_opp_avg_event_occupancy_pct_diff,avg_opp_avg_event_occupancy_pct_diff_diff,avg_opp_event_occupancy_pct_change_diff,avg_opp_event_occupancy_pct_change_diff_diff,avg_opp_avg_event_occupancy_pct_change_diff,avg_opp_avg_event_occupancy_pct_change_diff_diff
0,be38ed9ccfe2ee03,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-3611.000000,6580.500000,206.916667,7790.250000,-0.001338,-0.316345,-0.167808,0.596338,-0.008228,0.201749
1,219bd976b8ca745d,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,5791.250000,0.000000,2043.861111,-10063.500000,-0.097682,-0.516345,-0.102990,-0.649139,0.073671,-0.509056
2,af178adff964d854,0.200000,1,0.000313,0.000430,0.002247,0.004386,0.001639,0.003115,0.004651,...,-5569.000000,0.000000,0.000000,0.000000,-0.237066,0.000000,0.000000,0.000000,0.000000,0.000000
3,96da5813683649f5,-0.200000,-1,-0.000222,-0.000269,-0.002817,-0.006024,-0.001600,-0.003846,-0.007692,...,1661.466667,-1201.000000,23.208333,5164.013889,-0.216629,0.107524,0.202652,-0.021123,0.002736,0.031342
4,df3b75809b7fe252,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,-0.310681,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4834,5238f6470d0557fb,0.250000,4,0.001888,0.001181,0.015227,0.010821,0.006283,0.003462,0.015179,...,-4734.666667,11243.000000,722.184292,-1511.853896,-0.202461,-0.584005,0.000000,0.000000,0.007550,0.173006
4835,7b1bc4ff776f12c1,-0.085714,-3,-0.001594,-0.000143,-0.004453,0.000881,-0.003147,0.000704,0.006262,...,-18745.000000,0.000000,82.594036,4943.723440,0.039104,0.251747,-0.224510,0.421001,0.035729,0.306498
4836,1a635a5e4551e7d5,-0.142857,-1,-0.001952,-0.000514,-0.023014,-0.007278,-0.010774,-0.004009,-0.028357,...,-8795.333333,-1100.500000,-3269.043056,20082.425000,0.167921,-0.258702,-0.320822,0.585316,-0.161427,0.579691
4837,7521015554088962,0.175000,7,0.000810,0.000208,0.010541,0.003426,0.005460,0.001763,0.010358,...,3649.085714,-4911.654135,1349.485978,1308.150105,0.138067,0.222074,-0.297118,0.372520,0.084494,0.042074


In [9]:
# Persist the filtered feature table as an xz-compressed pickle.
features_case_study.to_pickle(
    "../../data/features_case_study.pkl.xz",
    compression="xz",
)

In [11]:
# Load the backtest odds and keep only rows whose bout_id appears in the
# case-study feature table, renumbering the index from 0.
backtest_odds_df = pd.read_csv("../../data/backtesting/backtest_odds.csv")
backtest_odds_case_study = (
    backtest_odds_df[backtest_odds_df["bout_id"].isin(features_case_study["id"])]
    .copy()
    .reset_index(drop=True)
)
# Bare expression so the notebook renders the filtered odds.
backtest_odds_case_study

Unnamed: 0,bout_id,event_id,date,sportsbook,red_odds,blue_odds,red_win
0,c1356395d6b055d7,46effbd1135423c5,2017-01-15,Bovada,-110,-120,0.0
1,ae803440d778a12b,46effbd1135423c5,2017-01-15,Bovada,190,-240,0.0
2,3f7684492c9df05e,46effbd1135423c5,2017-01-15,Bovada,-160,130,1.0
3,8156479490877d08,46effbd1135423c5,2017-01-15,Bovada,-170,140,1.0
4,1a81573425c585fb,46effbd1135423c5,2017-01-15,Bovada,-145,115,1.0
...,...,...,...,...,...,...,...
2617,5238f6470d0557fb,72c9c2eadfc3277e,2024-12-14,Bovada,215,-260,0.0
2618,7b1bc4ff776f12c1,72c9c2eadfc3277e,2024-12-14,Bovada,-320,265,0.0
2619,1a635a5e4551e7d5,72c9c2eadfc3277e,2024-12-14,Bovada,-420,320,1.0
2620,7521015554088962,72c9c2eadfc3277e,2024-12-14,Bovada,175,-205,1.0


In [12]:
# Write the filtered odds to CSV without the positional index column.
backtest_odds_case_study.to_csv(
    "../../data/backtesting/backtest_odds_case_study.csv",
    index=False,
)