In [97]:
import random

import pandas as pd
import numpy as np
import math

import json
import array as arr
from datetime import datetime, date

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [147]:
with open('data/fighter_elos.json') as json_file:
    elo_dict = json.load(json_file)
    
with open('data/fighter_elos_ignore_loser_10_29.json') as json_file:
    elo_dict_winner = json.load(json_file)
    
# dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d')
df_stat_data = pd.read_csv('data/df_stat_data_10_25.csv')
df_fight_data = pd.read_csv('data/df_fight_data_10_25.csv')

with open('data/fighter_hist.json') as json_file:
    fighter_dict = json.load(json_file)
    
with open('data/fighter_name_to_id.json') as json_file:
    name_dict = json.load(json_file)

In [118]:
%run ./elo_generation.ipynb

In [119]:
# elo_win_loss = Elo(fighter_dict, df_fight_data, df_stat_data, name_dict, ['clinch'])

In [120]:
# elo_ignore_loser = Elo(fighter_dict = fighter_dict,
#                        df_fight_data = df_fight_data,
#                        df_stat_data = df_stat_data,
#                        name_dict = name_dict,
#                        stat_names = list(df_stat_data.Stat.unique()),
#                        ignore_winner=False)

In [146]:
# elo_ignore_loser.to_json('data/fighter_elos_ignore_loser_10_29.json')

In [197]:
def gen_train_test(elo_dict, fighter_dict, df_fight_data, stat_names, random_state=0):
    df_elo = gen_df_elo(df_fight_data, elo_dict, stat_names)
    df_elo = filter_columns(df_elo)
    df_elo = randomize_result(df_elo)
    df_x, df_y = gen_x_y(df_elo)
    df_x = adjust_range(df_x)
    return train_test_split(df_x, df_y, test_size=0.25, random_state=random_state)
    
def gen_df_elo(df_fight_data, elodict, stat_names, divby=400, check_win=False):
    returned = df_fight_data.copy()
    for stat_name in stat_names:
        def to_apply(row):
            f0_elo = elodict[str(row['fighter_0'])][row['date']][stat_name]
            f1_elo = elodict[str(row['fighter_1'])][row['date']][stat_name]
            
            return Elo.prob(f0_elo, f1_elo, divby)[1]
            
        returned['elo_'+stat_name] = returned.apply(to_apply, axis=1)
    return returned

def filter_columns(df_elo):
    return df_elo[list(filter(lambda col: (col == 'winner') or ('elo' in col),
                             df_elo.columns))]

def randomize_result(df_elo):
    df_elo = df_elo.copy()
    for i in df_elo.index:
        if random.randint(0, 1) == 0:
            df_elo.loc[i] = [1-x for x in df_elo.loc[i]]
    return df_elo

def gen_x_y(df_elo):
    x = df_elo[list(filter(lambda col: col != 'winner', df_elo.columns))]
    y = df_elo[['winner']]
    return x, y

def adjust_range(df_x):
    df_x = df_x.copy()
    for col in df_x.columns:
        df_x[col] = df_x[col] - .5
    return df_x

In [209]:
name_dict['Khabib Nurmagomedov']

421

In [214]:
elo_ignore_loser.elodict[421]

{'curr': {'win_loss': 305.9153506946926,
  'kd': 149.9920560333268,
  'sig str': 288.21753204412295,
  'total str': 285.71619767411585,
  'td': 259.74500219290866,
  'sub att': 192.2316035656752,
  'pass': 261.0686925642812,
  'rev': 120.42425096822464,
  'head': 285.58617871515435,
  'body': 154.53395106106157,
  'leg': 198.26175057987086,
  'distance': 208.64438067706027,
  'clinch': 224.63400535281517,
  'ground': 279.2530036901879},
 Timestamp('2012-01-20 00:00:00'): {'kd': 120.42425096822464,
  'sig str': 120.42425096822464,
  'total str': 120.42425096822464,
  'td': 120.42425096822464,
  'sub att': 120.42425096822464,
  'pass': 120.42425096822464,
  'rev': 120.42425096822464,
  'head': 120.42425096822464,
  'body': 120.42425096822464,
  'leg': 120.42425096822464,
  'distance': 120.42425096822464,
  'clinch': 120.42425096822464,
  'ground': 120.42425096822464,
  'win_loss': 120.42425096822464},
 Timestamp('2012-07-07 00:00:00'): {'kd': 140.11142258530714,
  'sig str': 140.35252671

In [212]:
[a['sig str'] for a in elo_dict_winner['421'].values()]

[288.21753204412295,
 120.42425096822464,
 140.3525267159697,
 140.3525267159697,
 155.93316981006265,
 167.97272596997425,
 178.99244969451124,
 194.15927632532578,
 215.14851102511938,
 228.6986705627376,
 242.44812473102584,
 255.81002387715873,
 271.2268571883861]

In [265]:
x_train, x_test, y_train, y_test = \
    gen_train_test(elo_dict_winner, fighter_dict, df_fight_data,
                   ['clinch'],
                   random_state=3)

In [266]:
def get_accuracy(x_train, x_test, y_train, y_test,
                 model=LogisticRegression(solver='lbfgs')):
    mod = model.fit(x_train, y_train)
    return mod, mod.score(x_test, y_test)

model, score = get_accuracy(x_train, x_test, y_train, y_test)
score

  y = column_or_1d(y, warn=True)


0.5336356764928194

In [264]:
get_accuracy(x_train, x_train, y_train, y_train)[1]

  y = column_or_1d(y, warn=True)


0.54421768707483

In [236]:
[print(a) for a in ['sig str', 'td', 'head', 'distance', 'clinch',
                    'ground']]

sig str
td
head
distance
clinch
ground
win_loss


[None, None, None, None, None, None, None]

In [258]:
relevant_cols

['sig str', 'total str', 'td', 'pass', 'head', 'distance', 'clinch', 'ground']

In [252]:
x_train[['elo_'+c for c in relevant_cols] + ['elo_win_loss']].corr()

Unnamed: 0,elo_sig str,elo_total str,elo_td,elo_pass,elo_head,elo_distance,elo_clinch,elo_ground,elo_win_loss
elo_sig str,1.0,0.908121,0.575702,0.594071,0.90179,0.823276,0.757454,0.792742,0.859649
elo_total str,0.908121,1.0,0.617166,0.643003,0.855016,0.737328,0.73432,0.783471,0.841151
elo_td,0.575702,0.617166,1.0,0.859386,0.585807,0.377623,0.533615,0.705827,0.678286
elo_pass,0.594071,0.643003,0.859386,1.0,0.595649,0.389858,0.529827,0.721796,0.695329
elo_head,0.90179,0.855016,0.585807,0.595649,1.0,0.745472,0.711989,0.810832,0.840643
elo_distance,0.823276,0.737328,0.377623,0.389858,0.745472,1.0,0.664339,0.561978,0.703317
elo_clinch,0.757454,0.73432,0.533615,0.529827,0.711989,0.664339,1.0,0.662712,0.692935
elo_ground,0.792742,0.783471,0.705827,0.721796,0.810832,0.561978,0.662712,1.0,0.789731
elo_win_loss,0.859649,0.841151,0.678286,0.695329,0.840643,0.703317,0.692935,0.789731,1.0


In [267]:
list(model.coef_[0])

[1.1416526090048817]

In [204]:
_, stest = get_accuracy(x_train, x_train, y_train, y_train)
stest

  y = column_or_1d(y, warn=True)


0.580750818846057

In [113]:
predicted_train = list(model.predict(x_train))
correct = 0
for i in range(len(predicted_train)):
    if predicted_train[i] == list(y_train.winner)[i]:
        correct += 1
        
print(correct/len(predicted_train))

0.5813822284908322


In [102]:
model_nb, score_nb = get_accuracy(x_train, x_test, y_train, y_test,
                                  model=GaussianNB())
score_nb

  y = column_or_1d(y, warn=True)


0.5500858614768174

In [219]:
def get_relevant_cols(x_train, model):
    cols = list(x_train.columns)
    coefs = list(model.coef_[0])
    cols_or_none = [cols[i][4:] if coefs[i] > 0 else None for i in range(len(coefs))]
    return list(filter(lambda x: not x is None, cols_or_none))

relevant_cols = get_relevant_cols(x_train, model)
relevant_cols

['sig str', 'total str', 'td', 'pass', 'head', 'distance', 'clinch', 'ground']

In [171]:
model_v2, score_2 = get_accuracy(*gen_train_test(elo_dict_winner, fighter_dict, df_fight_data,
#                                                  list(df_stat_data.Stat.unique()) + ['win_loss'],
                                                 ['win_loss', 'head', 'td', 'sig str'],
                                                 random_state=5))

score_2

  y = column_or_1d(y, warn=True)


0.5603892386949055

In [67]:
[a if a==0 else None for a in [0,1,2]]

[0, None, None]

In [95]:
x_train.columns

Index(['elo_kd', 'elo_sig str', 'elo_total str', 'elo_td', 'elo_sub att',
       'elo_pass', 'elo_rev', 'elo_head', 'elo_body', 'elo_leg',
       'elo_distance', 'elo_clinch', 'elo_ground', 'elo_win_loss'],
      dtype='object')

In [169]:
{relevant_cols[i]: list(model_v2.coef_[0])[i] for i in range(len(relevant_cols))}

{'sig str': 2.092784028353675,
 'total str': 0.4152194696347669,
 'td': 0.19291105105542655,
 'pass': -0.44891031504484497,
 'head': 1.6597123287713962,
 'distance': -0.32163704238556823,
 'clinch': -0.8852864261542588,
 'ground': -0.27346492352883095,
 'win_loss': 0.18592028403341884}

In [172]:
list(model_v2.coef_[0])

[-0.4592955621157757,
 1.6843825603098663,
 0.10392205456768783,
 1.7306195963331343]

In [205]:
model.coef_

array([[-0.77882573,  1.87713765,  0.70653025,  0.85726798, -1.02963589,
         0.39239353, -2.85534916,  1.57252398, -0.34622638, -0.90063324,
         1.02886282,  0.48301515,  0.59788978, -0.11711049]])

In [5]:
df_fight_data

Unnamed: 0,fight_name,fighter_0,fighter_1,winner,method,round_end,date,fight_id
0,Tulio Palhares v Adriano Santos 1998-10-16,1899,1900,0,KO/TKO,1,1998-10-16,0
1,Ebenezer Fontes Braga v Jeremy Horn 1998-10-16,1898,1601,0,Submission,1,1998-10-16,1
2,Tsuyoshi Kohsaka v Pete Williams 1998-10-16,1813,1819,0,Decision,2,1998-10-16,2
3,Pat Miletich v Mikey Burnett 1998-10-16,1818,1893,0,Decision,3,1998-10-16,3
4,Pedro Rizzo v David Abbott 1998-10-16,1785,1784,0,KO/TKO,1,1998-10-16,4
5,Vitor Belfort v Wanderlei Silva 1998-10-16,734,1306,0,KO/TKO,1,1998-10-16,5
6,Frank Shamrock v John Lober 1998-10-16,1870,1897,0,Submission,1,1998-10-16,6
7,Laverne Clark v Frank Caracci 1999-01-08,1844,1896,0,KO/TKO,1,1999-01-08,7
8,Evan Tanner v Darrel Gholar 1999-01-08,1634,1895,0,Submission,1,1999-01-08,8
9,Mikey Burnett v Townsend Saunders 1999-01-08,1893,1894,0,Decision,2,1999-01-08,9
