In [68]:
import json
import pickle
import random
from collections import defaultdict
from datetime import datetime
from itertools import chain
from typing import Iterable

import keras
import numpy
import pandas
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, StratifiedKFold
from statsmodels.formula import api as formula_api

In [69]:
# Define cross-validation and evaluation strategies.

# Metric shared by every model evaluation in this notebook.
scoring = 'accuracy'

# Shuffled, stratified 10-fold splitter; the fixed seed keeps the folds reproducible.
cv = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [70]:
# Parse battle logs.

def parse_heroes(heroes: Iterable[dict], sign: int, result: dict):
    """
    Accumulates the signed attributes of every hero into ``result``.

    For each hero, its ``level``, ``color`` and ``star`` values are multiplied
    by ``sign`` and added to ``result`` under the keys ``level_<id>``,
    ``color_<id>`` and ``star_<id>``.

    :param heroes: hero dictionaries; each must provide ``id``, ``level``, ``color`` and ``star``.
    :param sign: multiplier applied to every attribute value.
    :param result: mapping updated in place; it must tolerate ``+=`` on missing
        keys (the caller in this notebook passes a ``defaultdict(int)``).
    """
    for hero in heroes:
        hero_id = hero['id']
        for attribute in ('level', 'color', 'star'):
            feature_name = f'{attribute}_{hero_id}'
            result[feature_name] += sign * hero[attribute]


def parse_battle(line: str) -> dict:
    """
    Converts one JSON battle log line into a flat feature row.

    The first team is read from ``attackers`` (falling back to ``player`` when
    that key is missing or falsy) and counted with a positive sign; the second
    team is read from ``defenders`` (falling back to ``enemies``) and counted
    with a negative sign.

    :param line: a single JSON document describing one battle.
    :return: a dictionary with the ``win`` flag followed by the accumulated hero features.
    """
    battle = json.loads(line)
    features = defaultdict(int)

    # (preferred key, fallback key, sign) for each side of the battle.
    sides = (
        ('attackers', 'player', +1),
        ('defenders', 'enemies', -1),
    )
    for preferred_key, fallback_key, sign in sides:
        heroes = battle.get(preferred_key) or battle[fallback_key]
        parse_heroes(heroes, sign, features)

    # Keep `win` as the first key, exactly like the literal-with-unpacking form.
    row = {'win': battle['win']}
    row.update(features)
    return row

In [71]:
# Load the logs into a data frame.

def invert_column(series: pandas.Series):
    """
    Produces the column of the "opposite" battle.

    The ``win`` column is flipped element-wise; every other column is negated.
    """
    if series.name == 'win':
        # Element-wise comparison on purpose: `not series` would not work on a Series.
        return series == False
    return -series

# Read every log file up front. The context manager closes each handle explicitly;
# the bare `open(...)` calls used previously relied on garbage collection for that.
lines = []
for log_path in ('battles.jsonl', 'battles-lilia.jsonl'):
    with open(log_path) as log_file:
        # Skip blank lines (e.g. a trailing empty line), which `json.loads` would reject.
        lines.extend(line for line in log_file if line.strip())

numpy.random.seed(42)  # seeds NumPy's global state, which `sample` below draws from
battles = pandas.DataFrame([parse_battle(line) for line in lines]).fillna(value=0)
# Reassign instead of `inplace=True` so every step yields a new, clearly named frame.
battles = battles.drop_duplicates()  # because the files may contain duplicate battles
battles = pandas.concat((battles, battles.apply(invert_column)))  # add inverted battles
battles = battles.sample(frac=1)  # shuffle
battles.head()

Unnamed: 0,color_1,color_10,color_11,color_12,color_13,color_14,color_15,color_17,color_18,color_19,...,star_38,star_39,star_4,star_40,star_5,star_6,star_7,star_8,star_9,win
159,0.0,7.0,0.0,-8.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,False
125,-0.0,-7.0,-0.0,-0.0,-7.0,-0.0,7.0,-0.0,7.0,-0.0,...,-0.0,-0.0,2.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,True
206,0.0,7.0,0.0,-9.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,-3.0,0.0,False
173,-0.0,-7.0,-0.0,7.0,-7.0,-0.0,7.0,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.0,-0.0,-0.0,False
237,-0.0,-7.0,-7.0,9.0,3.0,-0.0,9.0,-0.0,-0.0,-0.0,...,-0.0,-0.0,3.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,True


In [72]:
# Show per-column summary statistics (count, mean, std, min, quartiles, max).
battles.describe()

Unnamed: 0,color_1,color_10,color_11,color_12,color_13,color_14,color_15,color_17,color_18,color_19,...,star_36,star_38,star_39,star_4,star_40,star_5,star_6,star_7,star_8,star_9
count,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,...,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,2.695351,4.863777,2.703308,3.698029,5.150249,1.473716,4.817431,1.235899,4.037415,1.301734,...,0.396885,0.38467,0.365584,1.904648,0.154487,0.293119,0.860149,1.649474,1.118301,0.378415
min,-9.0,-7.0,-8.0,-9.0,-7.0,-9.0,-11.0,-7.0,-9.0,-8.0,...,-3.0,-3.0,-2.0,-4.0,-2.0,-3.0,-3.0,-3.0,-4.0,-2.0
25%,0.0,-6.0,0.0,-0.0,-7.0,-0.0,-5.0,-0.0,-1.0,0.0,...,-0.0,0.0,-0.0,-2.0,0.0,0.0,0.0,-1.0,0.0,-0.0
50%,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,6.0,0.0,0.0,7.0,-0.0,5.0,0.0,1.0,0.0,...,0.0,0.0,0.0,2.0,-0.0,-0.0,0.0,1.0,-0.0,0.0
max,9.0,7.0,8.0,9.0,7.0,9.0,11.0,7.0,9.0,8.0,...,3.0,3.0,2.0,4.0,2.0,3.0,3.0,3.0,4.0,2.0


In [73]:
# Split the dataframe.

# Target: the battle outcome.
y = battles['win']
# Features: every remaining column.
x = battles.drop('win', axis='columns')

In [74]:
# Analyse the model.

# https://github.com/statsmodels/statsmodels/issues/3931#issuecomment-343810211
# Compatibility shim taken from the issue linked above: expose `chi2.sf` under the
# `stats.chisqprob` name. Presumably required by the statsmodels summary in the
# TODO below — confirm before removing.
stats.chisqprob = stats.chi2.sf
# A rank below the number of columns means some feature columns are linearly dependent.
print(f'Shape: {x.shape}. Rank: {numpy.linalg.matrix_rank(x.values)}.')
# TODO: formula_api.Logit(y, x).fit().summary()

Shape: (420, 108). Rank: 104.


In [75]:
# Train, adjust hyper-parameters and evaluate.

# No intercept term: presumably because every battle in `battles` is paired with its
# inverse, which makes the data symmetric around zero — TODO confirm intent.
classifier = LogisticRegression(max_iter=1000, fit_intercept=False)
# 1000 log-spaced candidates for `C` between 1e-4 and 1e2.
param_grid = {'C': numpy.logspace(-4, 2, num=1000)}

numpy.random.seed(42)
# Exhaustive search over `param_grid`, evaluated with the shared `cv` and `scoring`
# settings, using 4 parallel jobs.
grid_search = GridSearchCV(classifier, param_grid, cv=cv, scoring=scoring, n_jobs=4).fit(x, y)
best_estimator = grid_search.best_estimator_

print(f'Best score: {grid_search.best_score_}')
print(f'Best params: {grid_search.best_params_}')
print(f'Classes: {best_estimator.classes_}')

# NOTE(review): this re-scores the selected estimator with the same `cv` splitter and
# data that were used to pick `C`, so it is not an independent estimate of
# generalisation performance.
scores = cross_val_score(best_estimator, x, y, scoring=scoring, cv=cv)
print(f'CV score: {scores.mean()} (std: {scores.std()})')

Best score: 0.9666666666666667
Best params: {'C': 92.03731996618231}
Classes: [False  True]
CV score: 0.9666666666666666 (std: 0.035634832254989916)


In [76]:
# Re-train the best model on the entire dataset.

# NOTE(review): no `refit` argument is passed where the grid search is created, and
# `GridSearchCV` refits the best estimator on the full data by default, so this call
# likely repeats that fit. As the last expression it also displays the fitted estimator.
best_estimator.fit(x, y)

LogisticRegression(C=92.03731996618231, class_weight=None, dual=False,
          fit_intercept=False, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [77]:
# Save the model.

# Render the source of the generated module first. It embeds the feature names and
# the pickled estimator as Python literals (a list repr and a bytes repr).
model_source = f'''
"""
Arena battle prediction model.
Auto-generated on {datetime.now().replace(microsecond=0)}.
X shape: {x.shape}.
"""

import pickle

FEATURE_NAMES = {list(x.columns)}

MODEL = pickle.loads({pickle.dumps(best_estimator)})
'''.strip()

# Write through a context manager so the file is flushed and closed as soon as the
# cell finishes, instead of whenever the anonymous file object is garbage-collected.
with open('model.py', 'wt') as model_file:
    print(model_source, file=model_file)

In [78]:
# Display the feature importances.

# Pair every feature name with its coefficient in the fitted linear model.
feature_importances = pandas.DataFrame({
    'Feature': x.columns,
    'Importance': best_estimator.coef_[0],
})

# Show the ten features with the largest coefficients.
(
    feature_importances
    .set_index('Feature')
    .sort_values('Importance', ascending=False)
    .head(n=10)
)

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
color_34,15.03397
star_18,9.626666
star_4,9.484464
star_1,9.19183
color_13,8.419173
color_17,7.619978
star_13,7.245966
star_17,7.088063
color_39,6.842805
star_7,6.648345


In [79]:
# Display prediction examples.

# Predicted class for every battle, aligned with the original index.
win_predicted = pandas.Series(
    best_estimator.predict(x),
    index=battles.index,
    name='win_predicted',
)
# Probability of the second class (column 1 of `predict_proba`).
win_probability = pandas.Series(
    best_estimator.predict_proba(x)[:, 1],
    index=battles.index,
    name='win_probability',
)

# Put the predictions in front of the original columns.
result = pandas.concat((win_predicted, win_probability, battles), axis=1)
# Render probabilities with two decimals for display.
result['win_probability'] = result['win_probability'].apply('{:.2f}'.format)
result.head()

Unnamed: 0,win_predicted,win_probability,color_1,color_10,color_11,color_12,color_13,color_14,color_15,color_17,...,star_38,star_39,star_4,star_40,star_5,star_6,star_7,star_8,star_9,win
159,False,0.04,0.0,7.0,0.0,-8.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,False
125,True,0.97,-0.0,-7.0,-0.0,-0.0,-7.0,-0.0,7.0,-0.0,...,-0.0,-0.0,2.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,True
206,False,0.0,0.0,7.0,0.0,-9.0,7.0,0.0,0.0,0.0,...,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,-3.0,0.0,False
173,False,0.01,-0.0,-7.0,-0.0,7.0,-7.0,-0.0,7.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.0,-0.0,-0.0,False
237,True,1.0,-0.0,-7.0,-7.0,9.0,3.0,-0.0,9.0,-0.0,...,-0.0,-0.0,3.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,True


In [80]:
# Experimental NN model.
# The goal is to capture interactions between heroes (e.g. Йорген (Jorgen) is good in conjunction with Исмаил (Ismail)).

def create_model() -> keras.Model:
    """
    Builds and compiles the experimental feed-forward binary classifier.

    The network has one hidden layer of 1000 sigmoid units, 10% dropout and a
    single sigmoid output unit. The input width is read from the module-level
    feature frame ``x`` when the function is called.

    :return: the model compiled with binary cross-entropy loss, the RMSprop
        optimizer and the accuracy metric.
    """
    model = Sequential()
    model.add(Dense(1000, input_dim=x.columns.size, activation='sigmoid'))
    model.add(Dropout(rate=0.1))
    # No `input_dim` on the output layer: its input is the 1000-unit hidden layer,
    # not the raw features, and only the first layer of a Sequential model
    # declares the input shape.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

# Wrap the Keras model builder so it can be used with the scikit-learn helpers below.
keras_classifier = KerasClassifier(build_fn=create_model, epochs=100, verbose=0)

# NOTE(review): this seeds NumPy only; the Keras backend may keep its own random
# state, so the score may still vary between runs — TODO confirm.
numpy.random.seed(42)
# No `scoring` argument is passed here, unlike the logistic-regression evaluation.
scores = cross_val_score(keras_classifier, x, y, cv=cv)

print(f'Score: {scores.mean()} (std: {scores.std()})')

Score: 0.9714285705770764 (std: 0.020756661766139033)
