In [1]:
from abc import ABC, abstractmethod
import math
import random

import pandas as pd
import random
import sklearn
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

import shap
import array as arr

In [2]:
# Load the pre-pickled inputs that every later cell reads as module-level globals.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
#stat_data = pd.read_pickle("No Event.pkl")
# Statistics table indexed by 'Fight'; later cells filter it on its 'Stat', 'Fighter' and 'Round' columns.
stat_data = pd.read_pickle("Pickled big stats.pkl").set_index('Fight')
# Fight table; later cells read its 'Date', '0', '1' and '2' columns by fight label.
fight_data = pd.read_pickle('pickled fights.pkl')
# Fighter table; later cells read its 'Fights' column by fighter label.
fighter_data = pd.read_pickle('pickled fighter stats.pkl')
# Index labels of fight_data as a plain list (iterated by FightFilter.ref_dict).
list_fights = fight_data.index.to_list()

In [3]:
class FightFilter(ABC):
    """Base class for strategies that pick which past fights describe a fighter.

    Subclasses implement ``get_list``. On construction the filter caches, in
    ``self.ref``, the result of ``ref_dict()``.
    """

    def __init__(self):
        self.ref = self.ref_dict()

    @abstractmethod
    def get_list(self, date, fighter):
        """Return the fights of ``fighter`` that are relevant as of ``date``."""

    def for_current(self, fighter1, fighter2):
        """Return ``(fights1, fights2)`` for an upcoming bout.

        ``get_list`` is queried with the sentinel date ``'99999999999'`` for
        both fighters.
        """
        sentinel = '99999999999'
        history1 = self.get_list(sentinel, fighter1)
        history2 = self.get_list(sentinel, fighter2)
        return history1, history2

    def ref_dict(self):
        """Map each fight in ``list_fights`` to ``(fights1, fights2)``.

        The two lists come from ``get_list`` called with the fight's own date
        for the fighters stored in columns ``'0'`` and ``'1'`` of
        ``fight_data``. Fights where either list is empty are left out.
        """
        lookup = {}
        for fight in list_fights:
            when = fight_data.Date[fight]
            prior1 = self.get_list(when, fight_data['0'][fight])
            prior2 = self.get_list(when, fight_data['1'][fight])

            # Skip bouts where either side has nothing to go on.
            if len(prior1) == 0 or len(prior2) == 0:
                continue
            lookup[fight] = prior1, prior2

        return lookup

In [4]:
class BasicFilter(FightFilter):
    """Filter that keeps every fight of a fighter dated strictly before ``date``."""

    def get_list(self, date, fighter):
        """Return the entries of ``fighter_data.Fights[fighter]`` whose
        ``fight_data.Date`` is less than ``date``, in their stored order."""
        return [
            bout
            for bout in fighter_data.Fights[fighter]
            if fight_data.Date[bout] < date
        ]

In [5]:
class HandleFighterValues(ABC):
    """Base class for strategies that combine two fighters' stat values into one feature.

    ``handle`` is abstract, matching ``FightFilter.get_list`` and the ``Stat``
    methods: a subclass that forgets to override it now fails at
    instantiation instead of silently producing ``None`` features.
    """

    @abstractmethod
    def handle(self, f1_stat, f2_stat):
        """Combine fighter 1's and fighter 2's value of a stat into a single value."""

In [6]:
class SubtractValues(HandleFighterValues):
    """Handler that combines two stat values by subtraction."""

    def handle(self, f1_stat, f2_stat):
        """Return fighter 1's value minus fighter 2's value."""
        difference = f1_stat - f2_stat
        return difference

In [7]:
class Stat(ABC):
    """Interface for a per-fighter statistic that can be turned into a feature column."""

    @abstractmethod
    def get_value(self, fighter, date, relevant_fights):
        """Return the statistic for ``fighter`` given ``date`` and ``relevant_fights``."""

    @abstractmethod
    def to_string(self):
        """Return the name used for this statistic's column."""

In [8]:
def change_target(a):
    """Recode a result value: 2 becomes -1, 1 stays 1, anything else becomes 0."""
    if a == 2:
        return -1
    return 1 if a == 1 else 0
def randomize_result(df):
    """Return a copy of ``df`` with every row multiplied by +100 or -100 at random.

    One ``random.randint(0, 1)`` draw is made per row, in row order; a draw of
    1 flips the sign of the whole row. The result has the same columns as
    ``df`` and a fresh 0..n-1 index.

    Rows are addressed by position, so the function also works for frames
    whose index is not 0..n-1 (the earlier ``df[col][i]`` label lookup raised
    ``KeyError`` there).
    """
    factors = [-100 if random.randint(0, 1) == 1 else 100 for _ in range(len(df))]
    # Explicit dtype keeps an empty frame from producing an object-typed multiplier.
    scale = pd.Series(factors, dtype='int64')
    # Dropping the index aligns rows with ``scale`` purely by position.
    return df.reset_index(drop=True).mul(scale, axis=0)

In [9]:
class BuildFrame:
    """Builds the feature matrix ``X`` and target ``y`` from a nested spec.

    ``input_dict`` maps ``filter -> {handler -> [stat, ...]}``. For every
    (filter, handler, stat) combination one feature column is produced, named
    by ``stat.to_string()``.
    """

    def __init__(self, input_dict):
        self.input_dict = input_dict
        self.X, self.y = self.build()

    def make_row(self, fighter1, fighter2):
        """Return a one-row DataFrame of features for a bout between the two fighters.

        Each filter supplies the fight lists via ``for_current``; every stat is
        queried with the date ``'curr'``. Columns are taken from ``self.X``.
        """
        values = []
        for ff, handlers in self.input_dict.items():
            fights1, fights2 = ff.for_current(fighter1, fighter2)

            for handler, stats in handlers.items():
                for stat in stats:
                    values.append(handler.handle(stat.get_value(fighter1, 'curr', fights1),
                                                 stat.get_value(fighter2, 'curr', fights2)))
        return pd.DataFrame([values], columns=self.X.columns)

    def build(self):
        """Return ``(X, y)`` for all usable fights.

        Every fight contributes two rows: the handled value with the recoded
        result, and both negated (the same bout seen from the other side).

        Only fights present in every filter's ``ref_dict()`` are used, in the
        first filter's order, so all columns and the target always have the
        same length. With a single filter this is exactly that filter's set of
        fights. An empty ``input_dict`` yields an empty ``X`` and ``y``.
        """
        refs = {ff: ff.ref_dict() for ff in self.input_dict}

        fights = []
        if refs:
            first, *rest = refs.values()
            fights = [fight for fight in first
                      if all(fight in other for other in rest)]

        columns = {}
        for ff, handlers in self.input_dict.items():
            fight_ref = refs[ff]

            for handler, stats in handlers.items():
                for stat in stats:
                    col = []
                    for fight in fights:
                        fighter1 = fight_data['0'][fight]
                        fighter2 = fight_data['1'][fight]
                        date = fight_data['Date'][fight]
                        fights1, fights2 = fight_ref[fight]
                        value = handler.handle(stat.get_value(fighter1, date, fights1),
                                               stat.get_value(fighter2, date, fights2))
                        col.append(value)
                        col.append(-value)
                    columns[stat.to_string()] = col

        target = []
        for fight in fights:
            val = change_target(fight_data['2'][fight])
            target.append(val)
            target.append(-val)

        # Assemble once instead of inserting column by column.
        frame = pd.DataFrame({**columns, 'y': target})
        return frame.loc[:, frame.columns != 'y'], frame['y']

In [10]:
class BasicStriking(Stat):
    """Ratio statistic: sum of one column over sum of another across a fighter's fights.

    The rows used are those of ``stat_data`` matching the given stat name,
    fighter value and round, selected once at construction.
    """

    def __init__(self, name, fighter, rnd, num, denom):
        self.frame = stat_data[(stat_data.Stat == name) &
                               (stat_data.Fighter == fighter) &
                               (stat_data.Round == rnd)]
        self.name = name
        self.fighter = fighter
        self.rnd = rnd
        self.numerator = num
        self.denominator = denom

    def get_value(self, fighter, date, relevant_fights):
        """Return sum(numerator) / sum(denominator) over ``relevant_fights``.

        ``fighter`` and ``date`` are not used. When the denominator total is
        zero — in particular when ``relevant_fights`` is empty, which
        ``FightFilter.for_current`` can produce — 0.0 is returned instead of
        raising ``ZeroDivisionError``.
        """
        numer_col = self.frame[self.numerator]
        denom_col = self.frame[self.denominator]
        sum_n = 0
        sum_d = 0
        for fight in relevant_fights:
            sum_n += numer_col[fight]
            sum_d += denom_col[fight]
        if sum_d == 0:
            # Nothing recorded to divide by: report a zero rate.
            return 0.0
        return sum_n / sum_d

    def to_string(self):
        """Return the column name ``basic_<name>_<fighter>_<rnd>_<num>_per_<denom>``."""
        return "basic_{}_{}_{}_{}_per_{}".format(
            self.name, self.fighter, self.rnd, self.numerator, self.denominator)

In [11]:
class Elo_v1(Stat):
    """Elo-style rating computed from who landed more of a given stat in each fight.

    ``elodict`` maps fighter -> {date: rating before that date's fight, ...,
    'curr': latest rating}. All ratings are generated at construction time.
    """

    def __init__(self, name, k_val, divby, mult):
        self.name = name
        self.k_val = k_val
        self.divby = divby
        self.mult = mult
        self.rstart = 0
        self.elodict = {}
        self.relevants = self.gen_relevant(name)
        self.gen_data()

    def get_value(self, fighter, date, relevant_fights):
        """Return the rating stored for ``fighter`` under key ``date``
        (``relevant_fights`` is not used)."""
        history = self.elodict[fighter]
        return history[date]

    def to_string(self):
        """Return the column name, ``ELO_`` followed by the stat name."""
        return "".join(("ELO_", self.name))

    def gen_relevant(self, name):
        """Collect, per fight, both fighters, the date and each side's 'Landed'
        value for stat ``name`` in the rows where ``Round == 0``."""
        totals = stat_data[(stat_data.Stat == name) &
                           (stat_data.Round == 0)]
        side1 = totals[totals.Fighter == 1]
        side2 = totals[totals.Fighter == 2]

        per_fight = {}
        for fight in fight_data.index:
            record = {}
            record['fighter1'] = fight_data['0'][fight]
            record['fighter2'] = fight_data['1'][fight]
            record['date'] = fight_data['Date'][fight]
            record['landed1'] = side1['Landed'][fight]
            record['landed2'] = side2['Landed'][fight]
            per_fight[fight] = record
        return per_fight

    def get_elo(self, fighter):
        """Return the fighter's latest rating."""
        return self.elodict[fighter]['curr']

    def add_fighter(self, fighter, elo):
        """Start (or reset) a fighter's history with ``elo`` as the current rating."""
        self.elodict[fighter] = {'curr': elo}

    def prob(self, r1, r2, divby):
        """Return ``(p1, p2)`` with ``p2 = 1 / (1 + 10 ** ((r1 - r2) / divby))``
        and ``p1 = 1 - p2``."""
        second = 1.0 / (1 + 1.0 * math.pow(10, 1.0 * (r1 - r2) / divby))
        first = 1 - second
        return first, second

    def update_dict(self, entry):
        """Apply one fight to the ratings.

        The side with strictly more landed is scored 1, the other 0 (a tie
        scores fighter 2 as 1). The step is ``abs(diff) ** mult * k_val`` times
        (score - expected). The pre-fight rating is stored under the fight's
        date and the new rating under ``'curr'``.
        """
        name1, name2 = entry['fighter1'], entry['fighter2']
        before1 = self.get_elo(name1)
        before2 = self.get_elo(name2)
        p1, p2 = self.prob(before1, before2, self.divby)

        if entry['landed1'] > entry['landed2']:
            score1, score2 = 1, 0
        else:
            score1, score2 = 0, 1

        margin = abs(entry['landed1'] - entry['landed2'])
        step = math.pow(margin, self.mult) * self.k_val
        after1 = before1 + step * (score1 - p1)
        after2 = before2 + step * (score2 - p2)

        self.elodict[name1].update({entry['date']: before1, 'curr': after1})
        self.elodict[name2].update({entry['date']: before2, 'curr': after2})

    def gen_data(self):
        """Process every fight in ``fight_data.index`` order.

        A fighter not yet rated who meets a rated opponent starts at that
        opponent's current rating; when neither is rated both start at
        ``rstart``. Then ``update_dict`` is applied.
        """
        for fight in fight_data.index:
            entry = self.relevants[fight]
            name1, name2 = entry['fighter1'], entry['fighter2']
            known1 = name1 in self.elodict
            known2 = name2 in self.elodict

            if known1 and not known2:
                self.add_fighter(name2, self.get_elo(name1))
            elif known2 and not known1:
                self.add_fighter(name1, self.get_elo(name2))
            elif not known1 and not known2:
                self.add_fighter(name1, self.rstart)
                self.add_fighter(name2, self.rstart)

            self.update_dict(entry)

    def display_sorted(self):
        """Return ``{fighter: current rating}`` ordered from highest to lowest rating."""
        ranked = sorted(self.elodict.items(),
                        key=lambda pair: pair[1]['curr'],
                        reverse=True)
        return dict((fighter, history['curr']) for fighter, history in ranked)

    def percent_wins(self):
        """Return the fraction of fights where the higher pre-fight rating matches the result.

        A fight counts as correct when fighter '0' was rated higher and the
        result is 1, or fighter '1' was rated higher and the result is 2;
        everything else counts as incorrect.
        """
        hits = 0
        total = 0
        for fight in fight_data.index:
            date = fight_data.Date[fight]
            r1 = self.elodict[fight_data['0'][fight]][date]
            r2 = self.elodict[fight_data['1'][fight]][date]
            result = fight_data['2'][fight]
            total += 1
            if (r1 > r2 and result == 1) or (r1 < r2 and result == 2):
                hits += 1
        return hits / total
            

In [12]:
class WinLossElo(Elo_v1):
    """Elo rating driven by the recorded fight result instead of a landed-stat margin."""

    def gen_relevant(self, name):
        """Collect, per fight, both fighters, the date and the value of column
        ``'2'`` as ``'winner'`` (``name`` is not used)."""
        per_fight = {}
        for fight in fight_data.index:
            record = {}
            record['fighter1'] = fight_data['0'][fight]
            record['fighter2'] = fight_data['1'][fight]
            record['date'] = fight_data['Date'][fight]
            record['winner'] = fight_data['2'][fight]
            per_fight[fight] = record
        return per_fight

    def update_dict(self, entry):
        """Apply one fight: winner 1 scores (1, 0), winner 2 scores (0, 1), any
        other value scores (0.5, 0.5); each rating moves by
        ``k_val * (score - expected)``. The pre-fight rating is stored under
        the fight's date and the new rating under ``'curr'``."""
        name1, name2 = entry['fighter1'], entry['fighter2']
        before1 = self.get_elo(name1)
        before2 = self.get_elo(name2)
        p1, p2 = self.prob(before1, before2, self.divby)
        outcome = entry['winner']

        if outcome == 1:
            score1, score2 = 1, 0
        elif outcome == 2:
            score1, score2 = 0, 1
        else:
            score1, score2 = 0.5, 0.5

        after1 = before1 + self.k_val * (score1 - p1)
        after2 = before2 + self.k_val * (score2 - p2)

        self.elodict[name1].update({entry['date']: before1, 'curr': after1})
        self.elodict[name2].update({entry['date']: before2, 'curr': after2})

In [13]:
# Candidate classifiers keyed by display name; evaluate_models() iterates this dict.
# NOTE(review): the values are shared instances — MLAnalysis calls fit() on the
# object it is given, so evaluating mutates these estimators in place.
estimators = {"k-Nearest Neighbor": KNeighborsClassifier(), 
          #"Support Vector Machine" : SVC(gamma=0.002), 
          "Gaussian Naive Bayes" : GaussianNB(), 
          "Decision Tree" : DecisionTreeClassifier(),
          }

In [14]:
class MLAnalysis():
    """Fits a classifier on a BuildFrame's data and explains single predictions with SHAP.

    Parameters
    ----------
    build_frame : object exposing ``X``, ``y`` and ``make_row(fighter1, fighter2)``.
    classifier : estimator with ``fit``, ``predict`` and ``predict_proba``;
        it is fitted in place on the training split.
    background_size : int or None, optional
        When given and smaller than the training split, the KernelExplainer
        background is reduced to that many rows with ``shap.sample``. The
        default ``None`` keeps the full training split as background, which is
        the case shap warns can be slow.
    """

    def __init__(self, build_frame, classifier, background_size=None):
        self.build_frame = build_frame
        self.X, self.y = build_frame.X, build_frame.y
        # Fixed random_state keeps the train/test split reproducible.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, random_state=0)

        self.classifier = classifier
        self.classifier.fit(self.X_train, self.y_train)

        background = self.X_train
        if background_size is not None and background_size < len(background):
            background = shap.sample(background, background_size)
        self.explainer = shap.KernelExplainer(self.classifier.predict_proba, background, link="logit")

    def predict(self, fighter1, fighter2):
        """Return ``[predicted labels, shap values]`` for the bout's feature row."""
        row = self.build_frame.make_row(fighter1, fighter2)
        return [self.classifier.predict(row),
                self.explainer.shap_values(pd.DataFrame(row), nsamples=100)]

    def predict_proba(self, fighter1, fighter2):
        """Return the classifier's ``predict_proba`` output for the bout's feature row."""
        row = self.build_frame.make_row(fighter1, fighter2)
        return self.classifier.predict_proba(row)

    def interpret(self, fighter1, fighter2):
        """Return the text produced by the module-level ``interpret`` function,
        followed by a line with the largest predicted class probability."""
        a = interpret(self, fighter1, fighter2)
        return a + '\nProbability: ' + str(max(self.predict_proba(fighter1, fighter2)[0]))

In [15]:
def interpret(ml_analysis, fighter1, fighter2):
    """Explain the predicted outcome of a bout in words.

    Runs ``ml_analysis.predict``, takes the SHAP values belonging to the
    predicted class, picks up to three most negative and up to three most
    positive features, and passes them to ``reading``.

    The SHAP block for the predicted class is located through
    ``classifier.classes_`` rather than a fixed ``{-1: 0, 1: 1}`` table: as
    soon as the training target also contains 0 the classes are ordered
    ``[-1, 0, 1]``, so index 1 would be class 0 and a predicted 0 had no
    entry at all.
    """
    result, shaps = ml_analysis.predict(fighter1, fighter2)
    result = result[0]

    # NOTE(review): assumes shap_values returned one matrix per class, in
    # predict_proba column order (the list form) — confirm for the installed shap.
    class_index = list(ml_analysis.classifier.classes_).index(result)
    winshap = shaps[class_index]

    shap_dict = pd.DataFrame(winshap, columns=ml_analysis.build_frame.X.columns).to_dict(orient='list')
    # Ascending by the (single) row's SHAP value for each feature.
    ordered = sorted(shap_dict.items(), key=lambda item: item[1][0])
    frame_row = ml_analysis.build_frame.make_row(fighter1, fighter2)

    neg_shaps = [item for item in ordered[:3] if item[1][0] < 0]
    pos_shaps = [item for item in ordered[::-1][:3] if item[1][0] > 0]
    return reading(ml_analysis, pos_shaps, neg_shaps, result, frame_row, fighter1, fighter2)

In [16]:
def reading(ml_analysis, pos_shaps, neg_shaps, result, frame_row, fighter1, fighter2):
    """Render the prediction and its top positive SHAP features as text.

    Parameters
    ----------
    ml_analysis, neg_shaps : accepted for interface compatibility; not used.
    pos_shaps : iterable of ``(feature name, values)`` pairs.
    result : predicted label; 1 names ``fighter1``, -1 names ``fighter2``.
        Any other label is reported as "No winner predicted." instead of
        being attributed to ``fighter2``.
    frame_row : one-row DataFrame holding the bout's feature values.

    Returns
    -------
    str : a header line followed by one "<feature> <wording>" line per entry
        of ``pos_shaps``. A feature "helped winner" when its value in
        ``frame_row`` has the same sign as ``result``, otherwise it
        "hurt loser".
    """
    if result == 1:
        returned = fighter1 + " won.\n"
    elif result == -1:
        returned = fighter2 + " won.\n"
    else:
        returned = "No winner predicted.\n"

    for item in pos_shaps:
        name = item[0]
        row_val = frame_row[name][0]
        if result != 1 and result != -1:
            # Winner/loser wording has no meaning without a predicted winner.
            wording = "supported prediction"
        elif (result == -1 and row_val < 0) or (result == 1 and row_val > 0):
            wording = "helped winner"
        else:
            wording = "hurt loser"
        returned += "%s %s\n" % (name, wording)

    return returned
                

In [17]:
def evaluate_models(build_frame):
    """Fit every entry of ``estimators`` on ``build_frame`` via ``MLAnalysis``
    and print its name followed by its score on the test split."""
    for label, estimator in estimators.items():
        analysis = MLAnalysis(build_frame, estimator)
        print(label)
        print(analysis.classifier.score(analysis.X_test, analysis.y_test))

In [18]:
# One 'Landed' per 'Seconds' stat for each stat name and for each of the two
# Fighter values (1 then 2), all taken from Round 0.
basics = [
    BasicStriking(stat_name, corner, 0, 'Landed', 'Seconds')
    for stat_name in ['sig str', 'kd', 'td','sub att', 'pass', 'rev','distance','clinch', 'ground', 'head','body','leg']
    for corner in (1, 2)
]

In [19]:
# Stat names that each get their own Elo_v1 rating (k_val=50, divby=400, mult=0.5).
elo_stats = ['td', 'distance', 'ground', 'kd', 'total str', 'sig str', 'leg', 'head', 'pass', 'sub att', 'body', 'clinch']
basics2 = [Elo_v1(stat_name, 50, 400, 0.5) for stat_name in elo_stats]

In [20]:
# Feature frames built from fighter-1-minus-fighter-2 differences of the Elo stats.
# build_elo1 uses the per-stat Elo ratings only.
build_elo1 = BuildFrame({BasicFilter(): {SubtractValues(): basics2}})
# build_elo2 adds a win/loss Elo column (named "ELO_W/L") on top of the same Elo_v1 instances.
# NOTE(review): WinLossElo's overridden update_dict never reads `mult`, so the 56 passed here has no effect.
build_elo2 = BuildFrame({BasicFilter(): {SubtractValues(): basics2 + [WinLossElo('W/L', 100, 400, 56)]}})

In [21]:
# Feature frame built from differences of the BasicStriking rate stats.
build5 = BuildFrame({BasicFilter(): {SubtractValues(): basics}})

In [22]:
# Fit a fresh Gaussian Naive Bayes model on each Elo frame. MLAnalysis also
# builds a shap KernelExplainer over the whole training split, which is what
# triggers the background-size warnings printed below.
ml_elo = MLAnalysis(build_elo1, GaussianNB())
ml_elo2 = MLAnalysis(build_elo2, GaussianNB())

Using 5685 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
Using 5685 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [23]:
# Score of the model that includes the win/loss Elo column, on its held-out test split.
ml_elo2.classifier.score(ml_elo2.X_test, ml_elo2.y_test)

0.5883905013192612

In [24]:
# Example: textual explanation of the Elo-only model's prediction for one bout.
print(ml_elo.interpret('Israel Adesanya', 'Justin Gaethje'))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!



Israel Adesanya won.
ELO_sig str helped winner
ELO_head helped winner
ELO_distance helped winner

Probability: 0.9573617807289166
