In [1]:
import random

import pandas as pd
import numpy as np
import math
from scipy import stats as scistats

import json
import array as arr
from datetime import datetime, date

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [2]:
dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d')
df_stat_data = pd.read_csv('data/df_stat_data_10_25.csv')
df_fight_data = pd.read_csv('data/df_fight_data_10_25.csv',
                            parse_dates=['date'], date_parser=dateparse)

with open('data/fighter_hist.json') as json_file:
    fighter_dict = json.load(json_file)
    
with open('data/fighter_name_to_id.json') as json_file:
    name_dict = json.load(json_file)

elo b1:

    - First we calculate an elo based on winning and losing (win_loss_elo),
          very similar to chess
      
    - for each fight stat type (sig str, clinch, td, etc) we calculate
          an additional elo (ex. clinch_elo)
      
    - clinch_elo increases based on the win_loss_elo of the opponent fighter
         -being able to outland an opponent in the clinch is rewarded 
          based on the opponent's general win_loss_record.

Results: same as older methods

elo c1:
- we include regular win loss elo from older methods
- sig str = head + body + leg = distance + clinch + ground
- keep track of sig str elo
- for each fight
    - calculate league head-body-leg ratio, distance-clinch-ground ratio
        - avg and stdev
    - if someone lands more sig strikes:
        - update their sig str elo
        - update head, body, leg, distance, clinch, ground elos based on standardization with league avg
    - if someone lands less sig strikes:
        - update their sig str elo
        - update head, body, leg, distance, clinch, ground elos based on standardization with league avg

elo e1
- calculate sig str elo normally
- additionally keep track of mean, sd percent distributions of head/body/leg, distance/clinch/ground
- calculate standardized fighter distributions based on mean, sd

In [3]:
example_df = pd.DataFrame( \
              {'elo_version': ['a1', 'a1', 'a2', 'b1', 'b1', 'b1'],
               'fighter_id': ['324', '21', '154', '3213', '43', '1213'],
               'previous_card_date': ['2021-10-08', '2021-09-08', '2021-08-08',
                                 '2021-08-08', '2021-08-08', '2021-07-08'],
               'previous_fight_id': ['664', '654', '633', '633', '633', '632'],
               'elo_type': ['win_loss', 'win_loss', 'win_loss', 'sig_str', 'sig_str', 'distance'],
               'elo_value': [100, 150, 150, 400, 430, 445] ,
               'opp_prev_value': [20, 20, 20 , 20, 20, 20], 
               
               })

example_df

Unnamed: 0,elo_version,fighter_id,previous_card_date,previous_fight_id,elo_type,elo_value,opp_prev_value
0,a1,324,2021-10-08,664,win_loss,100,20
1,a1,21,2021-09-08,654,win_loss,150,20
2,a2,154,2021-08-08,633,win_loss,150,20
3,b1,3213,2021-08-08,633,sig_str,400,20
4,b1,43,2021-08-08,633,sig_str,430,20
5,b1,1213,2021-07-08,632,distance,445,20


In [4]:
df_stat_data[(df_stat_data.Stat == 'sig str') &
             (df_stat_data.Round == 0)].head()

Unnamed: 0,Fighter,Landed,Out Of,Round,Seconds,Stat,fight_id
80,0,41,80,0,628,sig str,5291
81,1,43,65,0,628,sig str,5291
132,0,4,7,0,154,sig str,5290
133,1,13,16,0,154,sig str,5290
184,0,1,2,0,44,sig str,5289


In [5]:
example_df.sort_values(by='previous_card_date', ascending=True)

Unnamed: 0,elo_version,fighter_id,previous_card_date,previous_fight_id,elo_type,elo_value,opp_prev_value
5,b1,1213,2021-07-08,632,distance,445,20
2,a2,154,2021-08-08,633,win_loss,150,20
3,b1,3213,2021-08-08,633,sig_str,400,20
4,b1,43,2021-08-08,633,sig_str,430,20
1,a1,21,2021-09-08,654,win_loss,150,20
0,a1,324,2021-10-08,664,win_loss,100,20


In [6]:
df_stat_data.head()

Unnamed: 0,Fighter,Landed,Out Of,Round,Seconds,Stat,fight_id
0,0,0,-1,1,300,kd,5291
1,1,0,-1,1,300,kd,5291
2,0,15,29,1,300,sig str,5291
3,1,13,18,1,300,sig str,5291
4,0,15,29,1,300,total str,5291


In [7]:
def gen_new_mean_sd(entry, new_val):
    mean = entry['mean']
    cnt = entry['cnt']
    sd = entry['sd']

    if (cnt == 0):
        new_mean = new_val
        new_sd = 0

    else:
        new_mean = (mean * cnt + new_val) / (cnt + 1)

        new_sd_numer = ((cnt) * (sd ** 2)) + \
            ((new_val - mean) * (new_val - new_mean))
        
        new_sd = math.sqrt(new_sd_numer / (cnt + 1))
    

    return {'mean': new_mean, 'sd': new_sd, 'cnt': cnt + 1}

In [105]:
np.array([3]).std()
np.array([3, 5]).std()

(((2) * (0 ** 1)) + \
            ((5 - 3) * (5 - 4))) / 2

1.0

In [8]:
def gen_fighter_elo_increments(fighter_0_elo, fighter_1_elo, winner, divby=400, k_val=40):
    
    prob_fighter_0 = 1.0 / (1 + 1.0 * math.pow(10, 1.0 *
                                               (fighter_1_elo - fighter_0_elo) /
                                               divby))
    prob_fighter_1 = 1 - prob_fighter_0
    
    if winner == 0:
        return k_val * (1 - prob_fighter_0), \
               k_val * (0 - prob_fighter_1) 
    
    return k_val * (0 - prob_fighter_0), \
           k_val * (1 - prob_fighter_1)

In [9]:
def melt_fight_data(df_fight_data):
    df_fight_data = df_fight_data.copy()
    df_fight_data = df_fight_data[['fighter_0', 'fighter_1', 'winner', 'date', 'fight_id']]
    returned = pd.melt(df_fight_data, id_vars=['winner', 'date', 'fight_id'])
    returned.rename(columns={'variable': 'fighter', 'value': 'fighter_id'},
        inplace=True)
    returned['fighter'] = returned['fighter']\
        .apply(lambda x: 0 if x == 'fighter_0' else 1)

    to_merge = df_fight_data[['fight_id', 'fighter_0', 'fighter_1']]
    returned = returned.merge(to_merge, on='fight_id')
    returned['opponent_id'] = returned\
        .apply(lambda x: x['fighter_1'] if x['fighter'] == 0 else x['fighter_0'],
            axis=1)

    return returned.drop(columns=['fighter_0', 'fighter_1'])
melt_fight_data(df_fight_data)\
    .sort_values(by=['fight_id', 'fighter'], ascending=True).head()

Unnamed: 0,winner,date,fight_id,fighter,fighter_id,opponent_id
0,0,1998-10-16,0,0,1899,1900
1,0,1998-10-16,0,1,1900,1899
2,0,1998-10-16,1,0,1898,1601
3,0,1998-10-16,1,1,1601,1898
4,0,1998-10-16,2,0,1813,1819


In [10]:
df_stat_data.head()

Unnamed: 0,Fighter,Landed,Out Of,Round,Seconds,Stat,fight_id
0,0,0,-1,1,300,kd,5291
1,1,0,-1,1,300,kd,5291
2,0,15,29,1,300,sig str,5291
3,1,13,18,1,300,sig str,5291
4,0,15,29,1,300,total str,5291


In [None]:
def gen_elo_e1(df_fight_data, df_stat_data, fighter_dict, name_dict, divby=400, k_val=40):

    df_fights = melt_fight_data(df_fight_data)\
        .sort_values(by=['fight_id', 'fighter'], ascending=True)

    list_returned = []
    set_fighters_observed = set()
    ranges = ('distance', 'clinch', 'ground')
    locations = ('head', 'body', 'leg')
    stat_types = ('sig str',) + ranges + locations

    stat_type_mean_sd_counts = {s: 
        {'mean': -1, 'sd': -1, 'cnt': 0} for s in locations + ranges}

    # Filter to remove round-by-round stat data
    df_stats = df_stat_data[(df_stat_data.Round == 0) &
            df_stat_data.Stat.isin(stat_types)]\
        [['Fighter', 'Landed', 'Stat', 'Seconds', 'fight_id']] \
        .rename(columns={'Fighter': 'fighter'})\
        .set_index('fight_id')

    current_elo_values = {fighter_id: {'sig str': 0, 'win_loss': 0}
         for fighter_id in fighter_dict.keys()}

    current_strike_dstrbs = {fighter_id: {k: 0 for k in stat_types}
        for fighter_id in fighter_dict.keys()}
    

    # handle issue of figher_elo update affecting the calculation of opponent's elo
    temp_elo_store = {}

    #iterate through fight_ids
    for fight_entry in df_fights.to_dict('records'):
        
        fight_stats = df_stats.loc[fight_entry['fight_id']]

        # get fighter ids
        fighter_id = str(fight_entry['fighter_id'])
        opp_id = str(fight_entry['opponent_id'])

        # determine dictionaries to be appended to returned

        to_append = {'fighter_id': fighter_id,
             'card_date': fight_entry['date'],
             'fight_id': fight_entry['fight_id'], 
             'fighter' : fight_entry['fighter'],
             'winner': fight_entry['winner']}

        to_append.update({stat_type: current_elo_values[fighter_id][stat_type]
            for stat_type in ('sig str', 'win_loss')})

        # for each range and location striking stat, add a column
        # containing percentage of total sig strikes
        to_append.update({
            
            # divide strikes landed of current stat type by sig str
            stat_type: current_strike_dstrbs[fighter_id][stat_type] /
                    current_strike_dstrbs[fighter_id]['sig str'] 

                # if denom is 0, assign to 1/3
                if current_strike_dstrbs[fighter_id]['sig str'] != 0 else 1/3

            # iterate through ranges and locations
            for stat_type in ranges + locations
        })

        list_returned.append(to_append)

        # Now we focus on updating elo values appropriately

        fighter_wl_elo = current_elo_values[fighter_id]['win_loss']
        opp_wl_elo = current_elo_values[opp_id]['win_loss']

        fighter_ss_elo = current_elo_values[fighter_id]['sig str']
        opp_ss_elo = current_elo_values[opp_id]['sig str']

        fighter_observed = fighter_id in set_fighters_observed
        opp_observed = opp_id in set_fighters_observed
        
        if fight_entry['fighter'] == 1:
            set_fighters_observed.add(fighter_id)
            set_fighters_observed.add(opp_id)

        obs_info = 'normal'
        
        if (not fighter_observed) and opp_observed:
            fighter_wl_elo = opp_wl_elo
            fighter_ss_elo = opp_ss_elo
            obs_info = 'fighter_new'

        if fighter_observed and (not opp_observed):
            opp_wl_elo = fighter_wl_elo
            opp_ss_elo = fighter_ss_elo
            obs_info = 'fighter_new'
            
        if (not fighter_observed) and (not opp_observed):
            fighter_wl_elo = 100
            fighter_ss_elo = 100
            obs_info = 'both_new'

        
        fighter_wl_inc, _ = \
            gen_fighter_elo_increments(fighter_wl_elo, opp_wl_elo,
                 int(fight_entry['winner'] != fight_entry['fighter']))
        
        fighter_ss_inc, _ = \
            gen_fighter_elo_increments(fighter_ss_elo, opp_ss_elo,
                 int(fight_entry['winner'] != fight_entry['fighter']))
        
        new_fighter_wl_elo = fighter_wl_elo + fighter_wl_inc
        new_fighter_ss_elo = fighter_ss_elo + fighter_ss_inc

        for stat_type in stat_types:
            num_landed = fight_stats[fight_stats.Stat == stat_type]['Landed']\
                .list()[0]
            current_strike_dstrbs['fighter_id'][stat_type] = \
                current_strike_dstrbs['fighter_id'][stat_type] + num_landed
            

        
        

        

In [8]:
def gen_elo_c2(df_fight_data, df_stat_data, fighter_dict, name_dict, divby=400, k_val=40):
    
    df_fights = melt_fight_data(df_fight_data)\
        .sort_values(by=['fight_id', 'fighter'], ascending=True)


    list_win_loss = []
    set_fighters_observed = set()

    ranges = ('distance', 'clinch', 'ground')
    locations = ('head', 'body', 'leg')
    stat_types = ('sig str',) + ranges + locations

    range_vals = {r: {'mean': -1, 'sd': -1, 'cnt': 0} for r in ranges}
    loc_vals = {l: {'mean': -1, 'sd': -1, 'cnt': 0} for l in locations}
    
    # Filter to remove round-by-round stat data
    df_stats = df_stat_data[(df_stat_data.Round == 0) &
            df_stat_data.Stat.isin(stat_types)]\
        [['Fighter', 'Landed', 'Stat', 'Seconds', 'fight_id']] \
        .rename(columns={'Fighter': 'fighter'})\
        .set_index('fight_id')
    
    current_elo_values = {f_id: 
                             {elo_type: 0 
                                for elo_type in stat_types + ('win_loss',)}
                                    for f_id in fighter_dict.keys()}

    set_fighters_observed = set()

    # handle issue of figher_elo update affecting the calculation of opponent's elo
    temp_elo_store = {}

    #iterate through fight_ids
    for fight_entry in df_fights.to_dict('records'):
        
        fight_stats = df_stats.loc[fight_entry['fight_id']]

        # get fighter ids
        fighter_id = str(fight_entry['fighter_id'])
        opp_id = str(fight_entry['opponent_id'])

        # determine dictionaries to be appended to returned

        to_append = {'fighter_id': fighter_id,
             'card_date': fight_entry['date'],
             'fight_id': fight_entry['fight_id'], 
             'fighter' : fight_entry['fighter'],
             'winner': fight_entry['winner']}

        to_append.update({stat_type: current_elo_values[fighter_id][stat_type]
            for stat_type in stat_types + ('win_loss',)})

        # append to returned
        list_win_loss.append(to_append)
        
        # Now we focus on updating elo values appropriately

        fighter_wl_elo = current_elo_values[fighter_id]['win_loss']
        opp_wl_elo = current_elo_values[opp_id]['win_loss']

        fighter_ss_elo = current_elo_values[fighter_id]['sig str']
        opp_ss_elo = current_elo_values[opp_id]['sig str']

        fighter_wl_inc, _ = \
            gen_fighter_elo_increments(fighter_wl_elo, opp_wl_elo,
                 int(fight_entry['winner'] != fight_entry['fighter']))
        
        fighter_ss_inc, _ = \
            gen_fighter_elo_increments(fighter_ss_elo, opp_ss_elo,
                 int(fight_entry['winner'] != fight_entry['fighter']))
                  
        # take the increment from sig str elos, and divide it based on r_data
        #  get the stat values from df_stats

        fighter_observed = fighter_id in set_fighters_observed
        opp_observed = opp_id in set_fighters_observed
        
        if fight_entry['fighter'] == 1:
            set_fighters_observed.add(fighter_id)
            set_fighters_observed.add(opp_id)

        obs_info = 'normal'
        
        if (not fighter_observed) and opp_observed:
            fighter_wl_elo = opp_wl_elo
            fighter_ss_elo = opp_ss_elo
            obs_info = 'current_new'
            
        if (not fighter_observed) and (not opp_observed):
            fighter_wl_elo = 0
            fighter_ss_elo = 0
            obs_info = 'both_new'

        def get_stat_landed(stat_name):
            return list(fight_stats[(fight_stats.fighter ==  0) &
            (fight_stats.Stat == stat_name)]['Landed'])[0]

        stat_elo_update = {'win_loss': fighter_wl_elo + fighter_wl_inc,
                'sig str': fighter_ss_elo + fighter_ss_inc}

        for piece_stats, piece_vals in ((ranges, range_vals), (locations, loc_vals)):

            range_landed = {r: get_stat_landed(r) for r in piece_stats}
            sum_landed = sum(list(range_landed.values()))
            range_landed_perc = {r: val/(sum_landed if sum_landed != 0 else 1)
                 for r, val in range_landed.items()}
            range_standardized = {}
            

            for stat_name, perc_landed in range_landed_perc.items():
                new_stat_entry = gen_new_mean_sd(piece_vals[stat_name], perc_landed)
                piece_vals[stat_name] = new_stat_entry
                range_standardized[stat_name] = (perc_landed - new_stat_entry['mean']) / \
                    (new_stat_entry['sd'] if new_stat_entry['sd'] != 0 else 1)

            range_standardized = {k: scistats.norm.cdf(v) * 2 / 3
                for k, v in range_standardized.items()}

            for stat_name, inc_perc in range_standardized.items():

                initial_elo = current_elo_values[fighter_id][stat_name] if obs_info == 'normal' \
                    else current_elo_values[opp_id]['sig str'] / 3 if obs_info == 'current_new' \
                        else 0

                stat_elo_update[stat_name] = initial_elo + \
                    (fighter_ss_inc * inc_perc if fight_entry['winner'] == fight_entry['fighter']
                        else fighter_ss_inc * (2/3 - inc_perc))

        if fight_entry['fighter'] == 0:
            temp_elo_store.update(stat_elo_update)
        else:
            current_elo_values[opp_id] = temp_elo_store.copy()
            temp_elo_store.clear()
            current_elo_values[fighter_id] = stat_elo_update.copy()
    
    df_returned = pd.DataFrame.from_records(list_win_loss)
    return df_returned

        
df_c2 = gen_elo_c2(df_fight_data, df_stat_data, fighter_dict, name_dict, divby=400, k_val=40)

In [10]:
df_c2

Unnamed: 0,body,card_date,clinch,distance,fight_id,fighter,fighter_id,ground,head,leg,sig str,win_loss,winner
0,0.000000,1998-10-16,0.000000,0.000000,0,0,1899,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0.000000,1998-10-16,0.000000,0.000000,0,1,1900,0.000000,0.000000,0.000000,0.000000,0.000000,0
2,0.000000,1998-10-16,0.000000,0.000000,1,0,1898,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0.000000,1998-10-16,0.000000,0.000000,1,1,1601,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,0.000000,1998-10-16,0.000000,0.000000,2,0,1813,0.000000,0.000000,0.000000,0.000000,0.000000,0
5,0.000000,1998-10-16,0.000000,0.000000,2,1,1819,0.000000,0.000000,0.000000,0.000000,0.000000,0
6,0.000000,1998-10-16,0.000000,0.000000,3,0,1818,0.000000,0.000000,0.000000,0.000000,0.000000,0
7,0.000000,1998-10-16,0.000000,0.000000,3,1,1893,0.000000,0.000000,0.000000,0.000000,0.000000,0
8,0.000000,1998-10-16,0.000000,0.000000,4,0,1785,0.000000,0.000000,0.000000,0.000000,0.000000,0
9,0.000000,1998-10-16,0.000000,0.000000,4,1,1784,0.000000,0.000000,0.000000,0.000000,0.000000,0


In [149]:
def gen_elo_d1(df_fight_data, df_stat_data, fighter_dict, name_dict, divby=400, k_val=40):
    
    df_fights = melt_fight_data(df_fight_data)\
        .sort_values(by=['fight_id', 'fighter'], ascending=True)


    list_win_loss = []
    set_fighters_observed = set()
    
    
    # Filter to remove round-by-round stat data
    df_stats = df_stat_data[(df_stat_data.Round == 0) &
            df_stat_data.Stat.isin(stat_types)]\
        [['Fighter', 'Landed', 'Stat', 'Seconds', 'fight_id']] \
        .rename(columns={'Fighter': 'fighter'})\
        .set_index('fight_id')

    stat_vals = {s: {'mean': -1, 'sd': -1, 'cnt': 0} for s in stat_types}
    
    current_elo_values = {f_id: 
                             {elo_type: 0 
                                for elo_type in stat_types + ('win_loss',)}
                                    for f_id in fighter_dict.keys()}

    set_fighters_observed = set()

    # handle issue of figher_elo update affecting the calculation of opponent's elo
    temp_elo_store = {}

    #iterate through fight_ids
    for fight_entry in df_fights.to_dict('records'):
        
        fight_stats = df_stats.loc[fight_entry['fight_id']]

        # get fighter ids
        fighter_id = str(fight_entry['fighter_id'])
        opp_id = str(fight_entry['opponent_id'])

        # determine dictionaries to be appended to returned

        to_append = {'fighter_id': fighter_id,
             'card_date': fight_entry['date'],
             'fight_id': fight_entry['fight_id'], 
             'fighter' : fight_entry['fighter'],
             'winner': fight_entry['winner']}

        to_append.update({stat_type: current_elo_values[fighter_id][stat_type]
            for stat_type in stat_types + ('win_loss',)})

        # append to returned
        list_win_loss.append(to_append)
        
        # Now we focus on updating elo values appropriately

        fighter_wl_elo = current_elo_values[fighter_id]['win_loss']
        opp_wl_elo = current_elo_values[opp_id]['win_loss']

        fighter_ss_elo = current_elo_values[fighter_id]['sig str']
        opp_ss_elo = current_elo_values[opp_id]['sig str']

        fighter_wl_inc, _ = \
            gen_fighter_elo_increments(fighter_wl_elo, opp_wl_elo,
                 int(fight_entry['winner'] != fight_entry['fighter']))
        
        fighter_ss_inc, _ = \
            gen_fighter_elo_increments(fighter_ss_elo, opp_ss_elo,
                 int(fight_entry['winner'] != fight_entry['fighter']))
                  
        # take the increment from sig str elos, and divide it based on r_data
        #  get the stat values from df_stats

        fighter_observed = fighter_id in set_fighters_observed
        opp_observed = opp_id in set_fighters_observed
        
        if fight_entry['fighter'] == 1:
            set_fighters_observed.add(fighter_id)
            set_fighters_observed.add(opp_id)

        obs_info = 'normal'
        
        if (not fighter_observed) and opp_observed:
            fighter_wl_elo = opp_wl_elo
            fighter_ss_elo = opp_ss_elo
            obs_info = 'current_new'
            
        if (not fighter_observed) and (not opp_observed):
            fighter_wl_elo = 0
            fighter_ss_elo = 0
            obs_info = 'both_new'

        def get_stat_landed(stat_name):
            return list(fight_stats[(fight_stats.fighter ==  0) &
            (fight_stats.Stat == stat_name)]['Landed'])[0]

        stat_elo_update = {'win_loss': fighter_wl_elo + fighter_wl_inc,
                'sig str': fighter_ss_elo + fighter_ss_inc}

        for piece_stats, piece_vals in ((ranges, range_vals), (locations, loc_vals)):

            range_landed = {r: get_stat_landed(r) for r in piece_stats}
            sum_landed = sum(list(range_landed.values()))
            range_landed_perc = {r: val/(sum_landed if sum_landed != 0 else 1)
                 for r, val in range_landed.items()}
            range_standardized = {}
            

            for stat_name, perc_landed in range_landed_perc.items():
                new_stat_entry = gen_new_mean_sd(piece_vals[stat_name], perc_landed)
                piece_vals[stat_name] = new_stat_entry
                range_standardized[stat_name] = (perc_landed - new_stat_entry['mean']) / \
                    (new_stat_entry['sd'] if new_stat_entry['sd'] != 0 else 1)

            range_standardized = {k: scistats.norm.cdf(v) * 2 / 3
                for k, v in range_standardized.items()}

            for stat_name, inc_perc in range_standardized.items():

                initial_elo = current_elo_values[fighter_id][stat_name] if obs_info == 'normal' \
                    else current_elo_values[opp_id]['sig str'] / 3 if obs_info == 'current_new' \
                        else 0

                stat_elo_update[stat_name] = initial_elo + \
                    (fighter_ss_inc * inc_perc if fight_entry['winner'] == fight_entry['fighter']
                        else fighter_ss_inc * (2/3 - inc_perc))

        if fight_entry['fighter'] == 0:
            temp_elo_store.update(stat_elo_update)
        else:
            current_elo_values[opp_id] = temp_elo_store.copy()
            temp_elo_store.clear()
            current_elo_values[fighter_id] = stat_elo_update.copy()
    
    df_returned = pd.DataFrame.from_records(list_win_loss)
    return df_returned

        
df_c2 = gen_elo_c2(df_fight_data, df_stat_data, fighter_dict, name_dict, divby=400, k_val=40)

In [178]:
fighter_dict['217']['fighter_names']

['Jose Aldo']

In [147]:
name_dict['Jon Jones']

118

In [144]:
df_fight_data[df_fight_data['fight_id'] == 781]

Unnamed: 0,fight_name,fighter_0,fighter_1,winner,method,round_end,date,fight_id
781,Cheick Kongo v Dan Evensen 2008-08-09,1288,1610,0,KO/TKO,1,2008-08-09,781


In [177]:
df_c2[['sig str', 'head', 'body', 'leg', 'distance', 'clinch', 'ground']].corr()

Unnamed: 0,sig str,head,body,leg,distance,clinch,ground
sig str,1.0,0.860417,0.886607,0.862603,0.863655,0.822378,0.767943
head,0.860417,1.0,0.614618,0.550523,0.684595,0.672404,0.786267
body,0.886607,0.614618,1.0,0.793036,0.774079,0.81367,0.62456
leg,0.862603,0.550523,0.793036,1.0,0.835708,0.707697,0.550624
distance,0.863655,0.684595,0.774079,0.835708,1.0,0.563043,0.385771
clinch,0.822378,0.672404,0.81367,0.707697,0.563043,1.0,0.678148
ground,0.767943,0.786267,0.62456,0.550624,0.385771,0.678148,1.0


In [175]:
df_c2.sort_values(by='sig str', ascending=False)
# df_c2[df_c2.fighter_id == '118'].sort_values(by='card_date', ascending=True)


Unnamed: 0,body,card_date,clinch,distance,fight_id,fighter,fighter_id,ground,head,leg,sig str,win_loss,winner
4280,107.944353,2013-07-06,101.624144,111.521190,2140,0,552,94.545458,105.349672,98.052396,315.586953,315.586953,1
8255,111.085013,2017-11-04,54.323331,132.526838,4127,1,789,108.038307,101.946152,104.688055,313.618166,313.618166,1
3788,105.163204,2012-10-13,96.970039,110.649001,1894,0,552,92.352720,101.896949,97.083282,308.288996,308.288996,0
10464,129.535883,2020-02-08,99.682172,123.914636,5232,0,118,75.641292,55.776088,140.100293,305.380549,305.380549,0
4584,104.645439,2013-11-16,49.887286,124.606252,2292,0,789,106.280552,99.755618,97.471105,299.342579,299.342579,0
3622,100.416923,2012-07-07,93.273813,109.462162,1811,0,552,86.696926,97.622907,95.723132,298.047527,298.047527,0
10042,98.672663,2019-08-17,113.130005,91.664167,5021,0,450,93.670229,108.368163,84.300242,293.942811,293.942811,1
9898,123.606012,2019-07-06,96.992166,116.550441,4949,0,118,73.829745,55.502071,131.650963,292.498910,292.498910,0
4068,101.700582,2013-03-16,48.018125,121.514177,2034,0,789,103.613765,97.108337,95.094597,291.571505,291.571505,0
10088,97.633074,2019-09-07,77.062757,103.090641,5044,0,421,96.699604,104.670044,83.115764,287.657126,287.657126,0


In [143]:
df_c2[df_c2.fighter_id == '1610'].sort_values(by='card_date', ascending=True)


Unnamed: 0,body,card_date,clinch,distance,fight_id,fighter,fighter_id,ground,head,leg,sig str,win_loss,winner
1563,0.0,2008-08-09,0.0,0.0,781,1,1610,0.0,0.0,0.0,0.0,0.0,0
1715,127.865821,2008-12-27,125.017068,129.888957,857,1,1610,122.495808,134.691537,114.960536,377.908124,377.908124,0


In [7]:
def gen_elo_c1(df_fight_data, df_stat_data, fighter_dict, name_dict, divby=400, k_val=40):
    

    stat_types = ('sig str') + ranges + locations
    current_elo_values = {f_id: 
                             {elo_type: 0 
                                for elo_type in stat_types + ['win_loss']}
                                    for f_id in fighter_dict.keys()}

    list_win_loss = []
    set_fighters_observed = set()

    # sort fights by date
    df_fights = df_fight_data \
        .sort_values(by='fight_id', ascending=True) \
        .set_index('fight_id')
    
    # Filter to remove round-by-round stat data
    df_stats = df_stat_data[(df_stat_data.Round == 0) &
            df_stat_data.Stat.isin(stat_types)]\
        [['Fighter', 'Landed', 'Stat', 'Seconds', 'fight_id']] \
        .rename(columns={'Fighter': 'fighter'})\
        .set_index('fight_id')

    ranges = ('distance', 'clinch', 'ground')
    locations = ('head', 'body', 'leg')

    range_vals = {r: {'mean': -1, 'sd': -1, 'cnt': 0} for r in ranges}
    loc_vals = {l: {'mean': -1, 'sd': -1, 'cnt': 0} for l in locations}

    set_fighters_observed = set()
    
    #iterate through fight_ids
    for fight_id in df_fights.index:
        
        #
        fight_stats = df_stats.loc[fight_id]
        
        # dictionary containing data about the fight
        fight_entry = dict(df_fight_data.loc[fight_id])

        # get fighter ids
        fighter_0_id = str(fight_entry['fighter_0'])
        fighter_1_id = str(fight_entry['fighter_1'])

        # determine dictionaries to be appended to returned

        to_append_0 = {'fighter_id': fighter_0_id,
             'card_date': fight_entry['date'],
             'fight_id': fight_entry['fight_id'], 
             'fighter' : 0,
             'winner': fight_entry['winner']}

        to_append_0.update({current_elo_values[fighter_0_id][stat_type]
            for stat_type in stat_types})

        to_append_1 = {'fighter_id': fighter_1_id,
             'card_date': fight_entry['date'],
             'fight_id': fight_entry['fight_id'], 
             'fighter' : 1,
             'winner': fight_entry['winner']}

        to_append_1.update({current_elo_values[fighter_1_id][stat_type]
            for stat_type in stat_types})

        # append to returned
        list_win_loss.append(to_append_0)
        list_win_loss.append(to_append_1)
        
        # Now we focus on updating elo values appropriately

        fighter_0_wl_elo = current_elo_values[fighter_0_id]['win_loss']
        fighter_1_wl_elo = current_elo_values[fighter_1_id]['win_loss']

        fighter_0_ss_elo = current_elo_values[fighter_0_id]['sig str']
        fighter_1_ss_elo = current_elo_values[fighter_1_id]['sig str']

        fighter_0_ss_inc, fighter_1_ss_inc = \
            gen_fighter_elo_increments(fighter_0_ss_elo, fighter_1_ss_elo,
                 fight_entry['winner'])
                  
        # take the increment from sig str elos, and divide it based on r_data
        #  get the stat values from df_stats

        def get_stat_landed(stat_name):
            return list(fight_stats[(fight_stats.Fighter ==  0) &
            (fight_stats.Stat == stat_name)])[0]

        range_landed = {r: get_stat_landed(r) for r in ranges}
        sum_landed = sum(list(range_landed.values()))
        range_landed_perc = {r: val/sum_landed for r, val in range_landed.items()}
        range_standardized = {}

        for stat_name, perc_landed in range_landed_perc.items():
            new_stat_entry = gen_new_mean_sd(range_vals[stat_name], perc_landed)
            range_vals[stat_name] = new_stat_entry
            range_standardized[stat_name] = (perc_landed - new_stat_entry['mean']) / \
                new_stat_entry['sd']
            
        
        
            

        
        
            

In [90]:
('hello',) + ('hi',)

('hello', 'hi')

In [47]:
df_fight_data.head()

Unnamed: 0,fight_name,fighter_0,fighter_1,winner,method,round_end,date,fight_id
0,Tulio Palhares v Adriano Santos 1998-10-16,1899,1900,0,KO/TKO,1,1998-10-16,0
1,Ebenezer Fontes Braga v Jeremy Horn 1998-10-16,1898,1601,0,Submission,1,1998-10-16,1
2,Tsuyoshi Kohsaka v Pete Williams 1998-10-16,1813,1819,0,Decision,2,1998-10-16,2
3,Pat Miletich v Mikey Burnett 1998-10-16,1818,1893,0,Decision,3,1998-10-16,3
4,Pedro Rizzo v David Abbott 1998-10-16,1785,1784,0,KO/TKO,1,1998-10-16,4


In [45]:
np.array(list({'a': 1, 'b': 2}.values())).mean()

1.5

In [9]:
def gen_elo_b1(df_fight_data, df_stat_data, fighter_dict, name_dict, divby=400, k_val=40):
    
#     First we create a dictionary to store the current values of the elos
#     its current structure is as follows:

#         {fighter_id: 
#             {elo_type: elo_value} (contains all zeroes to begin)
#         }

    # list of different stat types
    stat_types = list(df_stat_data.Stat.unique())

    current_elo_values = {f_id: 
                             {elo_type: 0 
                                for elo_type in stat_types + ['win_loss']}
                                    for f_id in fighter_dict.keys()}
    
#     Next we create a dataframe containing only only the win_loss_elo data
#     It has the same structure as the example df except for it doesn't have the elotype column,
#     and the elo value column is replaced with the column name of the stat type.
    
    # empty list to begin
    list_win_loss = []
    
    set_fighters_observed = set()
    
    # sort fights by date
    df_fights = df_fight_data \
        .sort_values(by='fight_id', ascending=True) \
        .set_index('fight_id')
    
    # Filter to remove round-by-round stat data
    df_stats = df_stat_data[df_stat_data.Round == 0][['Fighter', 'Landed', 'Stat', 'fight_id']] \
        .rename(columns={'Fighter': 'fighter'})\
        .set_index('fight_id')
    
    #iterate through fight_ids
    for fight_id in df_fights.index:
        
        #
        fight_stats = df_stats.loc[fight_id]
        
        # dictionary containing data about the fight
        fight_entry = dict(df_fight_data.loc[fight_id])
        
        # get fighter ids
        fighter_0_id = str(fight_entry['fighter_0'])
        fighter_1_id = str(fight_entry['fighter_1'])

        # get current fighter wl elos
        fighter_0_wl_elo = current_elo_values[fighter_0_id]['win_loss']
        fighter_1_wl_elo = current_elo_values[fighter_1_id]['win_loss']
        
        fighter_0_observed = fighter_0_id in set_fighters_observed
        fighter_1_observed = fighter_1_id in set_fighters_observed
        
        set_fighters_observed.add(fighter_0_id)
        set_fighters_observed.add(fighter_1_id)
        
        if (not fighter_0_observed) and fighter_1_observed:
            fighter_0_wl_elo = fighter_1_wl_elo
            
        if fighter_0_observed and (not fighter_1_observed):
            fighter_1_wl_elo = fighter_0_wl_elo
        
        # get new win_loss elo values
        fighter_0_wl_inc, fighter_1_wl_inc = \
            gen_fighter_elo_increments(fighter_0_wl_elo, fighter_1_wl_elo, fight_entry['winner'])
        
        fighter_0_wl_elo_new = fighter_0_wl_inc + fighter_0_wl_elo
        fighter_1_wl_elo_new = fighter_1_wl_inc + fighter_1_wl_elo
        
        # get dictionary of other new elo values
        dict_elos_0, dict_elos_1 = {'old': {}, 'new': {}}, {'old': {}, 'new': {}}
        for stat_type in stat_types:
            
            fighter_0_stat_elo = current_elo_values[fighter_0_id][stat_type]
            fighter_1_stat_elo = current_elo_values[fighter_1_id][stat_type]
            
            dict_elos_0['old'][stat_type] = fighter_0_stat_elo
            dict_elos_1['old'][stat_type] = fighter_1_stat_elo  
            
            relevant_stats = fight_stats[(fight_stats['Stat'] == stat_type)]
            fighter_0_landed = list(relevant_stats[relevant_stats['fighter'] == 0]['Landed'])[0]
            fighter_1_landed = list(relevant_stats[relevant_stats['fighter'] == 1]['Landed'])[0]
            
            if ((fight_entry['winner'] == 0) and (fighter_0_landed > fighter_1_landed)) or \
                ((fight_entry['winner'] == 1) and (fighter_0_landed < fighter_1_landed)):
                
                fighter_0_stat_elo = fighter_0_stat_elo + fighter_0_wl_inc
                fighter_1_stat_elo = fighter_1_stat_elo + fighter_1_wl_inc

                
            dict_elos_0['new'][stat_type] = fighter_0_stat_elo
            dict_elos_1['new'][stat_type] = fighter_1_stat_elo     
        
        to_append_0 = {'fighter_id': fighter_0_id,
             'card_date': fight_entry['date'],
             'fight_id': fight_entry['fight_id'], 
             'fighter' : 0,
             'winner': fight_entry['winner'],
             'win_loss': current_elo_values[fighter_0_id]['win_loss']}
        
        to_append_0.update(dict_elos_0['old'])
        
        to_append_1 = {'fighter_id': fighter_1_id,
             'card_date': fight_entry['date'],
             'fight_id': fight_entry['fight_id'], 
             'fighter': 1,
             'winner': fight_entry['winner'],
             'win_loss': current_elo_values[fighter_1_id]['win_loss']}
    
        to_append_1.update(dict_elos_1['old'])
    
        # append relevant data to future dataframe
        list_win_loss.append(to_append_0)
        list_win_loss.append(to_append_1)
        
        # update current_elo_values
        current_elo_values[fighter_0_id]['win_loss'] = fighter_0_wl_elo_new
        current_elo_values[fighter_1_id]['win_loss'] = fighter_1_wl_elo_new
        
        for stat_type in stat_types:
            current_elo_values[fighter_0_id][stat_type] = dict_elos_0['new'][stat_type]
            current_elo_values[fighter_1_id][stat_type] = dict_elos_1['new'][stat_type]
                       
    
    # create a dataframe from list of dictionaries list_win_loss
    # this dataframe will be eventually returned
    df_returned = pd.DataFrame.from_records(list_win_loss)
    return df_returned




In [10]:
# Must fix: assign elo to be the elo of their first fighter

In [11]:
df_elo1 = gen_elo_b1(df_fight_data, df_stat_data, fighter_dict, name_dict)
df_elo1.head()

Unnamed: 0,body,card_date,clinch,distance,fight_id,fighter,fighter_id,ground,head,kd,leg,pass,rev,sig str,sub att,td,total str,win_loss,winner
0,0.0,1998-10-16,0.0,0.0,0,0,1899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,1998-10-16,0.0,0.0,0,1,1900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,1998-10-16,0.0,0.0,1,0,1898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,1998-10-16,0.0,0.0,1,1,1601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,1998-10-16,0.0,0.0,2,0,1813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [12]:
name_dict['Max Holloway']

212

In [13]:
df_elo1.sort_values(by='head', ascending=False).head()

Unnamed: 0,body,card_date,clinch,distance,fight_id,fighter,fighter_id,ground,head,kd,leg,pass,rev,sig str,sub att,td,total str,win_loss,winner
8255,196.439569,2017-11-04,121.885807,294.84647,4127,1,789,266.093323,280.678131,51.404855,197.199465,177.117709,31.25785,294.84647,127.096464,284.175805,254.903956,318.037107,1
4584,182.27123,2013-11-16,121.885807,280.678131,2292,0,789,266.093323,280.678131,51.404855,183.031127,177.117709,31.25785,280.678131,112.928126,270.007467,254.903956,303.868769,0
8254,94.20432,2017-11-04,77.51533,299.020784,4127,0,776,61.457907,273.444432,-6.782807,132.563652,-31.65543,36.372945,199.843124,-11.803378,40.018165,199.843124,201.380945,1
4068,174.592811,2013-03-16,121.885807,272.999712,2034,0,789,258.414904,272.999712,51.404855,175.352708,169.43929,31.25785,272.999712,112.928126,262.329048,247.225537,296.19035,0
10464,274.497652,2020-02-08,201.938141,269.64271,5232,0,118,259.126662,270.43042,73.813442,248.297268,157.263324,0.0,325.043246,92.943136,276.773976,289.126986,306.394717,0


In [14]:
df_fight_data.head()

Unnamed: 0,fight_name,fighter_0,fighter_1,winner,method,round_end,date,fight_id
0,Tulio Palhares v Adriano Santos 1998-10-16,1899,1900,0,KO/TKO,1,1998-10-16,0
1,Ebenezer Fontes Braga v Jeremy Horn 1998-10-16,1898,1601,0,Submission,1,1998-10-16,1
2,Tsuyoshi Kohsaka v Pete Williams 1998-10-16,1813,1819,0,Decision,2,1998-10-16,2
3,Pat Miletich v Mikey Burnett 1998-10-16,1818,1893,0,Decision,3,1998-10-16,3
4,Pedro Rizzo v David Abbott 1998-10-16,1785,1784,0,KO/TKO,1,1998-10-16,4


In [15]:
df_fight_data.columns

Index(['fight_name', 'fighter_0', 'fighter_1', 'winner', 'method', 'round_end',
       'date', 'fight_id'],
      dtype='object')

In [41]:
df_fight_data.head().loc[0]['fighter_0']

1899

In [42]:
df_elo1.head().loc[0]['fighter_id']

'1899'

In [36]:
df_elo1[df_elo1.fighter == 1].head()

Unnamed: 0,body,card_date,clinch,distance,fight_id,fighter,fighter_id,ground,head,kd,leg,pass,rev,sig str,sub att,td,total str,win_loss,winner
1,0.0,1998-10-16,0.0,0.0,0,1,1900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,1998-10-16,0.0,0.0,1,1,1601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,0.0,1998-10-16,0.0,0.0,2,1,1819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.0,1998-10-16,0.0,0.0,3,1,1893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,0.0,1998-10-16,0.0,0.0,4,1,1784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [180]:
def run_model(df_elo, df_fight_data, columns, divby=400, random_state=0):

    df_elo = df_elo.copy()
    df_elo = df_elo[['fight_id', 'fighter', 'fighter_id', 'win_loss'] + columns]
    df_fight_data = df_fight_data.copy()
    df_fight_data['fighter_0'] = df_fight_data['fighter_0'].astype(str)
    df_fight_data['fighter_1'] = df_fight_data['fighter_1'].astype(str)
    
    df_fight_data = df_fight_data[['fighter_0', 'fighter_1', 'fight_id', 'winner']]

    df_elo_f0 = df_elo[df_elo['fighter'] == 0]\
        .drop(columns=['fighter'])\
        .rename(columns={'fighter_id': 'fighter_0'})

    df_elo_f1 = df_elo[df_elo['fighter'] == 1]\
        .drop(columns=['fighter'])\
        .rename(columns={'fighter_id': 'fighter_1'})
    
    df_fight_data = df_fight_data.merge(df_elo_f0, on=['fight_id', 'fighter_0'], how='inner')
    df_fight_data.rename(columns={c: c + '_0' for c in columns + ['win_loss']},
        inplace=True)
    
    df_fight_data = df_fight_data.merge(df_elo_f1, on=['fight_id', 'fighter_1'], how='inner')
    df_fight_data.rename(columns={c: c + '_1' for c in columns + ['win_loss']},
        inplace=True)
        
    for col in columns + ['win_loss']:
        df_fight_data[col] = df_fight_data[col + '_1'] - df_fight_data[col + '_0']
        df_fight_data[col] = df_fight_data[col].apply(lambda x: \
            1.0 / (1 + 1.0 * math.pow(10, x / divby)))
        df_fight_data.drop(columns = [col + '_0', col + '_1'], inplace=True)

    df_fight_data = df_fight_data[columns + ['win_loss', 'winner']]

    for i in df_fight_data.index:
        if random.randint(0, 1) == 0:
            df_fight_data.loc[i] = [1-x for x in df_fight_data.loc[i]]

    x = df_fight_data[filter(lambda x: x != 'winner', df_fight_data.columns)]
    y = df_fight_data[['winner']]
    x_train, x_test, y_train, y_test =\
         train_test_split(x, y, test_size=0.25, random_state=random_state)
    model = LogisticRegression().fit(x_train, y_train)
    print(model.score(x_test, y_test))
    return model, df_fight_data

moda, testa = run_model(df_c2, df_fight_data, ['head', 'body', 'leg', 'distance', 'clinch', 'ground'])

0.5744520030234316


  y = column_or_1d(y, warn=True)


In [86]:
mod, test = run_model(df_elo1, df_fight_data, ['sig str'])

0.581254724111867


  y = column_or_1d(y, warn=True)


In [181]:
{testa.columns[i]: moda.coef_[0][i] for i in range(len(testa.columns) - 1)}

{'head': 0.09358926429035991,
 'body': -0.26853299952304466,
 'leg': -0.7724240092447986,
 'distance': -0.16628331808916783,
 'clinch': 0.11988132459690594,
 'ground': -0.6172972102264404,
 'win_loss': -2.1698236096115777}

In [78]:
moda.coef_[0][1]

-1.2559663209889567

In [18]:
df_fight_data[df_fight_data.date > datetime(2020, 1, 1)]

Unnamed: 0,fight_name,fighter_0,fighter_1,winner,method,round_end,date,fight_id
5199,Sabina Mazo v JJ Aldrich 2020-01-18,182,183,0,Decision,3,2020-01-18,5199
5200,Aleksa Camur v Justin Ledet 2020-01-18,180,181,0,Decision,3,2020-01-18,5200
5201,Drew Dober v Nasrat Haqparast 2020-01-18,178,179,0,KO/TKO,1,2020-01-18,5201
5202,Tim Elliott v Askar Askarov 2020-01-18,176,177,1,Decision,3,2020-01-18,5202
5203,Andre Fili v Sodiq Yusuff 2020-01-18,174,175,1,Decision,3,2020-01-18,5203
5204,Roxanne Modafferi v Maycee Barber 2020-01-18,172,173,0,Decision,3,2020-01-18,5204
5205,Anthony Pettis v Diego Ferreira 2020-01-18,170,171,1,Submission,2,2020-01-18,5205
5206,Brian Kelleher v Ode Osbourne 2020-01-18,168,169,0,Submission,1,2020-01-18,5206
5207,Aleksei Oleinik v Maurice Greene 2020-01-18,166,167,0,Submission,2,2020-01-18,5207
5208,Holly Holm v Raquel Pennington 2020-01-18,164,165,0,Decision,3,2020-01-18,5208


In [57]:
testa

NameError: name 'testa' is not defined

In [16]:
stat_types = list(df_stat_data.Stat.unique())


In [None]:

    
def gen_relevant(self, names):
        returned = {}
        v2 = self.stat_data[(self.stat_data.Stat.isin(names)) &
                           (self.stat_data.Round == 0)]
        f1_stats = v2[v2.Fighter == 0]
        f2_stats = v2[v2.Fighter == 1]
        
        for fight in self.fight_data.index:
            returned[fight] = {'fighter_0': self.fight_data['fighter_0'][fight],
                               'fighter_1': self.fight_data['fighter_1'][fight],
                               'winner': self.fight_data['winner'][fight],
                               'date': self.fight_data['date'][fight],
                               'landed_0': {name: f1_stats[f1_stats.Stat==name]['Landed'][fight] for name in names},
                               'landed_1': {name: f2_stats[f2_stats.Stat==name]['Landed'][fight] for name in names}}
        return returned

In [5]:
df_fight_data.head()

Unnamed: 0,fight_name,fighter_0,fighter_1,winner,method,round_end,date,fight_id
0,Tulio Palhares v Adriano Santos 1998-10-16,1899,1900,0,KO/TKO,1,1998-10-16,0
1,Ebenezer Fontes Braga v Jeremy Horn 1998-10-16,1898,1601,0,Submission,1,1998-10-16,1
2,Tsuyoshi Kohsaka v Pete Williams 1998-10-16,1813,1819,0,Decision,2,1998-10-16,2
3,Pat Miletich v Mikey Burnett 1998-10-16,1818,1893,0,Decision,3,1998-10-16,3
4,Pedro Rizzo v David Abbott 1998-10-16,1785,1784,0,KO/TKO,1,1998-10-16,4
