In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
url = 'https://docs.google.com/spreadsheets/d/1HcdISgCl3s4RpWkJa8m-G1JjfKzd8qf2WY2Xcw32D7U/pub?gid=1371955398&single=true&output=csv'

In [3]:
import pandas
import re
from enum import Enum

class Character(Enum):
   
    def __ge__(self, other):
        if self.__class__ is other.__class__:
            return self.value >= other.value
        return NotImplemented
    def __gt__(self, other):
        if self.__class__ is other.__class__:
            return self.value > other.value
        return NotImplemented
    def __le__(self, other):
        if self.__class__ is other.__class__:
            return self.value <= other.value
        return NotImplemented
    def __lt__(self, other):
        if self.__class__ is other.__class__:
            return self.value < other.value
        return NotImplemented

    Argagarg = 'argagarg'
    BBB = 'bbb'
    DeGrey = 'degrey'
    Geiger = 'geiger'
    Gloria = 'gloria'
    Grave = 'grave'
    Gwen = 'gwen'
    Jaina = 'jaina'
    Lum = 'lum'
    Menelker = 'menelker'
    Midori = 'midori'
    Onimaru = 'onimaru'
    Persephone = 'persephone'
    Quince = 'quince'
    Rook = 'rook'
    Setsuki = 'setsuki'
    Troq = 'troq'
    Valerie = 'valerie'
    Vendetta = 'vendetta'
    Zane = 'zane'
    
    def __str__(self):
        return self.name
    
for char in Character:
    locals()[char.name] = char
    
character_category = pandas.api.types.CategoricalDtype(Character, ordered=True)
    
historical_record = pandas.read_csv(url)
historical_record.columns = [re.sub('\W+', '_', col.lower()).strip('_') for col in historical_record.columns]

historical_record = historical_record[
    ~historical_record.character_1.isin(['Squall', 'Kefka', 'Ultimicia']) &
    ~historical_record.character_2.isin(['Squall', 'Kefka', 'Ultimicia'])
]

historical_record.character_2.replace(to_replace=['variable'], value=['Gloria'], inplace=True)

names = pandas.DataFrame({'name': historical_record.player_1.append(historical_record.player_2)})
names['lower'] = names.apply(lambda r: r['name'].lower(), axis=1)
name_map = names.groupby('lower').first()

historical_record['match_date'] = pandas.to_datetime(historical_record.match_date, infer_datetime_format=True)
historical_record.format_restricted.replace(to_replace=['.', 'Restricted'], value=[False, True], inplace=True)
historical_record.format_team.replace(to_replace=['.', 'Team'], value=[False, True], inplace=True)
historical_record.char_select_random.replace(to_replace=['.', 'Random'], value=[False, True], inplace=True)
historical_record.char_select_locked.replace(to_replace=['.', 'Locked'], value=[False, True], inplace=True)
historical_record.set_length_non_ft3_ft4.replace(to_replace=['.', 'non-FT3/FT4'], value=[False, True], inplace=True)
historical_record.set_win_1 = historical_record.set_win_1.fillna(0)
historical_record.set_win_2 = historical_record.set_win_2.fillna(0)
historical_record.wins_1 = historical_record.wins_1.fillna(0)
historical_record.wins_2 = historical_record.wins_2.fillna(0)
historical_record.character_1 = historical_record.character_1.apply(lambda n: Character(n.lower()))
historical_record.character_2 = historical_record.character_2.apply(lambda n: Character(n.lower()))
historical_record.player_1 = historical_record.player_1.apply(lambda n: name_map.loc[n.lower()])
historical_record.player_2 = historical_record.player_2.apply(lambda n: name_map.loc[n.lower()])

player_category = pandas.api.types.CategoricalDtype(
    sorted(historical_record.player_1.append(historical_record.player_2).unique()),
    ordered=True
)

tournament_category = pandas.api.types.CategoricalDtype(
    sorted(historical_record.tournament_name.unique()),
    ordered=True
)

historical_record = historical_record.astype({
    'tournament_name': tournament_category,
    'set_win_1': 'int8',
    'player_1': player_category,
    'character_1': character_category,
    'wins_1': 'int8',
    'wins_2': 'int8',
    'character_2': character_category,
    'player_2': player_category,
    'set_win_2': 'int8',
})


In [4]:
historical_record.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12136 entries, 0 to 12137
Data columns (total 15 columns):
tournament_name           12136 non-null category
match_date                12136 non-null datetime64[ns]
char_select_random        12136 non-null bool
char_select_locked        12136 non-null bool
set_length_non_ft3_ft4    12136 non-null bool
format_team               12136 non-null bool
format_restricted         12136 non-null bool
set_win_1                 12136 non-null int8
player_1                  12136 non-null category
character_1               12136 non-null category
wins_1                    12136 non-null int8
wins_2                    12136 non-null int8
character_2               12136 non-null category
player_2                  12136 non-null category
set_win_2                 12136 non-null int8
dtypes: bool(5), category(5), datetime64[ns](1), int8(4)
memory usage: 468.8 KB


In [5]:
historical_record.head()

Unnamed: 0,tournament_name,match_date,char_select_random,char_select_locked,set_length_non_ft3_ft4,format_team,format_restricted,set_win_1,player_1,character_1,wins_1,wins_2,character_2,player_2,set_win_2
0,IYL Season 1,2014-08-16,False,False,False,False,False,0,Akawashi,Gwen,2,1,Argagarg,Bob199,1
1,IYL Season 1,2014-08-16,False,False,False,False,False,0,Akawashi,Gwen,0,2,Lum,Bob199,0
2,IYL Season 1,2014-08-17,False,False,False,False,False,0,CKR,Setsuki,2,1,Zane,Bob199,1
3,IYL Season 1,2014-08-17,False,False,False,False,False,0,CKR,Setsuki,0,2,Grave,Bob199,0
4,IYL Season 1,2014-08-17,False,False,False,False,False,1,Akawashi,Troq,1,0,Argagarg,CKR,0


In [6]:
import numpy as np
p1_wins = historical_record.loc[np.repeat(historical_record.index.values, historical_record.wins_1.astype(int))].reset_index(drop=True)
p2_wins = historical_record.loc[np.repeat(historical_record.index.values, historical_record.wins_2.astype(int))].reset_index(drop=True)

p1_wins['win'] = 1
p2_wins['win'] = 0

games = pandas.concat([p1_wins, p2_wins]).reset_index(drop=True)[['tournament_name', 'match_date', 'player_1', 'character_1', 'win', 'character_2', 'player_2']]

backwards_mus = games.character_1 > games.character_2
games[backwards_mus] = games[backwards_mus].rename(columns={
    'player_1': 'player_2',
    'player_2': 'player_1',
    'character_1': 'character_2',
    'character_2': 'character_1',
})
games.loc[backwards_mus, ['win']] = 1 - games[backwards_mus].win

mirror_mus_to_flip = list(games[games.character_1 == games.character_2].iloc[::2].index.values)
games.iloc[mirror_mus_to_flip] = games.iloc[mirror_mus_to_flip].rename(columns={
    'player_1': 'player_2',
    'player_2': 'player_1',
    'character_1': 'character_2',
    'character_2': 'character_1',
})
games.iloc[mirror_mus_to_flip, games.columns.get_loc('win')] = 1 - games.iloc[mirror_mus_to_flip].win

games = games.astype({'win': 'int8'})

In [7]:
games.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21385 entries, 0 to 21384
Data columns (total 7 columns):
tournament_name    21385 non-null category
match_date         21385 non-null datetime64[ns]
player_1           21385 non-null category
character_1        21385 non-null category
win                21385 non-null int8
character_2        21385 non-null category
player_2           21385 non-null category
dtypes: category(5), datetime64[ns](1), int8(1)
memory usage: 432.8 KB


In [8]:
games[games.character_1 == games.character_2].win.sum()

636

In [9]:
games.head()

Unnamed: 0,tournament_name,match_date,player_1,character_1,win,character_2,player_2
0,IYL Season 1,2014-08-16,Bob199,Argagarg,0,Gwen,Akawashi
1,IYL Season 1,2014-08-16,Bob199,Argagarg,0,Gwen,Akawashi
2,IYL Season 1,2014-08-17,CKR,Setsuki,1,Zane,Bob199
3,IYL Season 1,2014-08-17,CKR,Setsuki,1,Zane,Bob199
4,IYL Season 1,2014-08-17,CKR,Argagarg,0,Troq,Akawashi


In [10]:
import pystan
import pickle
import hashlib

stan_code = """
data {
    int<lower=0> NPT; // Number of player/tournaments
    int<lower=0> NG; // Number of games
    int<lower=0> NMG; // Number of mirror-games
    int<lower=0> NM; // Number of non-mirror matchups
    int<lower=0> NMM; // Number of mirror matchups
    
    int<lower=0, upper=NPT> prev_tournament[NPT]; // Previous tournament for player/tournament
    
    int<lower=0, upper=1> win[NG]; // Did player 1 win game
    int<lower=1, upper=NPT> pt1[NG]; // Player/tournament 1 in game
    int<lower=1, upper=NPT> pt2[NG]; // Player/tournament 2 in game
    int<lower=1, upper=NM> mup[NG]; // Matchup in game
    
    int<lower=0, upper=1> m_win[NMG]; // Did player 1 win mirror-match
    int<lower=1, upper=NPT> m_pt1[NMG]; // Player/tournament 1 in mirror-match
    int<lower=1, upper=NPT> m_pt2[NMG]; // Player/tournament 2 in mirror-match
    int<lower=1, upper=NM> m_mup[NMG]; // Matchup in game
    
}
parameters {
    vector[NPT] skill_adjust; // Skill change before player/tournament
    vector[NM] mu; // Matchup value
    vector<lower=0>[NM] muv; // Matchup skill multiplier
    vector<lower=0>[NMM] mmv; // Mirror matchup skill multiplier
}
transformed parameters {
    vector[NPT] skill;
    
    for (t in 1:NPT) {
        if (prev_tournament[t] == 0)
            skill[t] = skill_adjust[t];
        else
            skill[t] = skill[prev_tournament[t]] + skill_adjust[t];
    }
    
}
model {
    skill_adjust ~ std_normal();
    mu ~ normal(0, 0.5);
    mmv ~ std_normal();
    muv ~ std_normal();
    
    win ~ bernoulli_logit(muv[mup] .* (skill[pt1] - skill[pt2]) + mu[mup]);
    m_win ~ bernoulli_logit(mmv[m_mup] .* (skill[m_pt1] - skill[m_pt2]));
}
"""

model_hash = hashlib.md5(stan_code.encode('utf-8')).hexdigest()
print(f"Model Hash: {model_hash}")

model_filename = f"{model_hash}.model"

try:
    with open(model_filename, 'rb') as model_file:
        model = pickle.load(model_file)
except IOError:
    with open(f"{model_hash}.stan", "w") as code_file:
        code_file.write(stan_code)
    model = pystan.StanModel(model_code=stan_code)
    with open(model_filename, 'wb') as model_file:
        pickle.dump(model, model_file)

Model Hash: 165eac82d0ff9a5eedae03d5326dc6fa


In [11]:
player_index = dict(zip(sorted(games.player_1.append(games.player_2).unique()), range(1, 1000)))
mu_index = dict(zip(((c1, c2) for c1 in Character for c2 in Character if c1 < c2), range(1, 1000)))
mirror_index = dict(zip(Character, range(1, 1000)))

p1_games = games[['player_1', 'tournament_name', 'match_date']].rename(columns={'player_1': 'player'})
p2_games = games[['player_2', 'tournament_name', 'match_date']].rename(columns={'player_2': 'player'})

player_tournament_dates = (
    p1_games
    .append(p2_games)
    .groupby(['player', 'tournament_name'])
    .match_date
    .quantile(0.5)
    .reset_index()
    .sort_values(['match_date', 'player'])
    .reset_index(drop=True)
)

player_tournament_index = dict(player_tournament_dates.apply(lambda r: ((r.player, r.tournament_name), r.name + 1), axis=1).values)

for player in player_tournament_dates.player.unique():
    player_tournament_dates.loc[player_tournament_dates.player == player, 'previous'] = (
        [-1] + list(player_tournament_dates.loc[player_tournament_dates.player == player].index.values)[:-1]
    )


ordered_tournaments = games.groupby('tournament_name').match_date.quantile(0.5).reset_index().sort_values('match_date').tournament_name
tournament_index = dict(zip(ordered_tournaments, range(1, 1000)))

non_mirror_games = games[games.character_1 != games.character_2]
mirror_games = games[games.character_1 == games.character_2]

stan_data = {
    'NPT': len(player_tournament_index),
    'NG': len(non_mirror_games),
    'NMG': len(mirror_games),
    'NM': len(mu_index),
    'NMM': len(mirror_index),
    'win': non_mirror_games.win,
    'pt1': non_mirror_games.apply(lambda r: player_tournament_index[(r.player_1, r.tournament_name)], axis=1),
    'pt2': non_mirror_games.apply(lambda r: player_tournament_index[(r.player_2, r.tournament_name)], axis=1),
    'mup': non_mirror_games.apply(lambda r: mu_index[(r.character_1, r.character_2)], axis=1),
    'm_win': mirror_games.win,
    'm_pt1': mirror_games.apply(lambda r: player_tournament_index[(r.player_1, r.tournament_name)], axis=1),
    'm_pt2': mirror_games.apply(lambda r: player_tournament_index[(r.player_2, r.tournament_name)], axis=1),
    'm_mup': mirror_games.apply(lambda r: mirror_index[r.character_1], axis=1),
    'prev_tournament': player_tournament_dates.previous.astype(int).apply(lambda x: x+1)
}

del(p1_games)
del(p2_games)
del(player_tournament_dates)
del(non_mirror_games)
del(mirror_games)

In [12]:
# data_hash = hashlib.md5(pickle.dumps(stan_data)).hexdigest()
data_hash = 'f99dc79a2a1d584d25f833bdc5e4c656'
print(f"Data Hash: {data_hash}")

Data Hash: f99dc79a2a1d584d25f833bdc5e4c656


In [13]:
results_filename = f'{model_hash}-{data_hash}.csv'

try:
    fit_results = pandas.DataFrame.from_csv(results_filename)
except FileNotFoundError:

    fit = model.sampling(
        data=stan_data,
        iter=1000,
        chains=4,
        n_jobs=1,
    )
    fit_results = fit.to_dataframe()

    fit_results.to_csv(results_filename)

  after removing the cwd from sys.path.


In [14]:
fit_results.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Columns: 5134 entries, chain to lp__
dtypes: float64(5128), int64(6)
memory usage: 78.4 MB


In [15]:
matchups = fit_results[[col for col in fit_results.columns if col.startswith('mu[')]].rename(
    columns={'mu[{}]'.format(ix): "{.value}-{.value}".format(c1, c2) for ((c1, c2), ix) in mu_index.items()}
).unstack().rename('win_rate').reset_index()
matchups['c1'] = matchups.level_0.apply(lambda x: Character(x.split('-')[0])).astype(character_category)
matchups['c2'] = matchups.level_0.apply(lambda x: Character(x.split('-')[1])).astype(character_category)
matchups['win_rate'] = pandas.to_numeric(matchups['win_rate'])
del(matchups['level_0'])
matchups = matchups.rename(columns={'level_1': 'sample'})

In [16]:
matchups.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380000 entries, 0 to 379999
Data columns (total 4 columns):
sample      380000 non-null int64
win_rate    380000 non-null float64
c1          380000 non-null category
c2          380000 non-null category
dtypes: category(2), float64(1), int64(1)
memory usage: 6.5 MB


In [17]:
flipped = matchups[matchups.c1 != matchups.c2].rename(columns={'c1': 'c2', 'c2': 'c1'})
flipped['win_rate'] = -flipped['win_rate']

matchups = matchups.append(flipped)
del(flipped)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [18]:
matchups.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 760000 entries, 0 to 379999
Data columns (total 4 columns):
c1          760000 non-null category
c2          760000 non-null category
sample      760000 non-null int64
win_rate    760000 non-null float64
dtypes: category(2), float64(1), int64(1)
memory usage: 18.8 MB


In [19]:
import math

matchups['win_rate'] = pandas.to_numeric(10*(matchups['win_rate'].rpow(math.e)) / (1 + matchups['win_rate'].rpow(math.e)))

median_rates = pandas.to_numeric(matchups.groupby(['c1', 'c2']).win_rate.median().rename('median_win_rate'))
text_color = median_rates.reset_index().median_win_rate.apply(lambda x: 'white' if x > 6 or x < 4 else 'black')

matchups = matchups.join(median_rates, on=['c1', 'c2'])

In [20]:
matchups.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 760000 entries, 0 to 379999
Data columns (total 5 columns):
c1                 760000 non-null category
c2                 760000 non-null category
sample             760000 non-null int64
win_rate           760000 non-null float64
median_win_rate    760000 non-null float64
dtypes: category(2), float64(2), int64(1)
memory usage: 24.6 MB


In [22]:
from IPython.core.display import display
from plotnine import *

matchup_chart = (
    ggplot(matchups, aes(x='0', y='win_rate', fill='median_win_rate'))
    + geom_violin()
    + geom_text(
        data=median_rates.reset_index(),
        mapping=aes(label='median_win_rate', y='median_win_rate', x=0, size=9),
        color=text_color,
        format_string='{:.2}',
    )
    + facet_grid('c1 ~ c2')
    + coord_flip()
    + theme(figure_size=(25, 15))
    + scale_fill_gradient2(midpoint=5)
)

filename = f'yomi-matchup-estimates-{model_hash}-{data_hash}.png'
matchup_chart.save(filename)

  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  return not cbook.iterable(value) and (cbook.is_numlike(value) or
  return not cbook.iterable(value) and (cbook.is_numlike(value) or
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat(lst, axis=axis, ignore_index=True)


In [21]:
matchups.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 760000 entries, 0 to 379999
Data columns (total 5 columns):
c1                 760000 non-null category
c2                 760000 non-null category
sample             760000 non-null int64
win_rate           760000 non-null float64
median_win_rate    760000 non-null float64
dtypes: category(2), float64(2), int64(1)
memory usage: 24.6 MB


In [22]:
std_devs = fit_results[[
    col for col in fit_results.columns
    if col.startswith('muv[') or col.startswith('mmv[')
]].rename(
    columns={'muv[{}]'.format(ix): "{.value}-{.value}".format(c1, c2) for ((c1, c2), ix) in mu_index.items()}
).rename(
    columns={'mmv[{}]'.format(ix): "{.value}-{.value}".format(c1, c1) for (c1, ix) in mirror_index.items()}
).unstack().rename('std_dev').reset_index()
std_devs['c1'] = std_devs.level_0.apply(lambda x: Character(x.split('-')[0])).astype(character_category)
std_devs['c2'] = std_devs.level_0.apply(lambda x: Character(x.split('-')[1])).astype(character_category)
del(std_devs['level_0'])
std_devs = std_devs.rename(columns={'level_1': 'sample'})

In [23]:
std_devs.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420000 entries, 0 to 419999
Data columns (total 4 columns):
sample     420000 non-null int64
std_dev    420000 non-null float64
c1         420000 non-null category
c2         420000 non-null category
dtypes: category(2), float64(1), int64(1)
memory usage: 7.2 MB


In [24]:
flipped = std_devs[std_devs.c1 != std_devs.c2].rename(columns={'c1': 'c2', 'c2': 'c1'})

std_devs = std_devs.append(flipped)

In [25]:
std_devs.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800000 entries, 0 to 379999
Data columns (total 4 columns):
c1         800000 non-null category
c2         800000 non-null category
sample     800000 non-null int64
std_dev    800000 non-null float64
dtypes: category(2), float64(1), int64(1)
memory usage: 19.8 MB


In [26]:
import math
#std_devs['std_dev'] = 10*(std_devs['std_dev'].rpow(math.e)) / (1 + std_devs['std_dev'].rpow(math.e))-5

median_rates = pandas.DataFrame(std_devs.groupby(['c1', 'c2']).std_dev.median().rename('median_std_dev'))
text_color = median_rates.reset_index().median_std_dev.apply(lambda x: 'white' if x < 1 else 'black')

std_devs = std_devs.join(median_rates, on=['c1', 'c2'])

In [27]:
std_devs.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800000 entries, 0 to 379999
Data columns (total 5 columns):
c1                800000 non-null category
c2                800000 non-null category
sample            800000 non-null int64
std_dev           800000 non-null float64
median_std_dev    800000 non-null float64
dtypes: category(2), float64(2), int64(1)
memory usage: 25.9 MB


In [30]:
from plotnine import *
from IPython.core.display import display

std_dev_chart = (
    ggplot(std_devs, aes(x='0', y='std_dev', fill='median_std_dev'))
    + geom_violin()
    + geom_text(
        data=median_rates.reset_index(),
        mapping=aes(label='median_std_dev', x=.4, y=2, size=9),
        #color=text_color,
        format_string='median={:.2}',
    )
    + facet_grid('c1 ~ c2')
    + coord_flip()
    + theme(figure_size=(25, 15))
    + scale_fill_gradient2(midpoint=1)
)

filename = f'yomi-std-dev-estimates-{model_hash}-{data_hash}.png'
std_dev_chart.save(filename)

  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  return not cbook.iterable(value) and (cbook.is_numlike(value) or
  return not cbook.iterable(value) and (cbook.is_numlike(value) or
  return not cbook.iterable(value) and (cbook.is_numlike(value) or


In [31]:
(
    ggplot(std_devs.sample(frac=0.01), aes(x='0', y='std_dev', fill='median_std_dev'))
    + geom_violin()
    + facet_wrap('c1')
    + coord_flip()
    + theme(figure_size=(5, 5))
    + scale_fill_gradient2(midpoint=1)
).save(f'skill-effects-character-{model_hash}-{data_hash}.png')

  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))


In [33]:
from IPython.core.display import display

reverse_player_tournament_index = {ix: (player, tournament) for ((player, tournament), ix) in player_tournament_index.items()}

player_tournament_skill = fit_results[[col for col in fit_results.columns if col.startswith('skill[')]].unstack().rename('skill').reset_index()
player_tournament_skill['player'] = player_tournament_skill.level_0.apply(
    lambda x: reverse_player_tournament_index[int(x[6:-1])][0]
).astype(player_category)
player_tournament_skill['tournament'] = player_tournament_skill.level_0.apply(
    lambda x: reverse_player_tournament_index[int(x[6:-1])][1]
).astype(tournament_category)
#     columns={'skill[{}]'.format(ix): player for (player, ix) in player_index.items()}
# )
del(player_tournament_skill['level_0'])
player_tournament_skill = player_tournament_skill.rename(columns={'level_1': 'sample'})
tournament_list = games.groupby('tournament_name').match_date.quantile(0.5).sort_values().index.tolist()

player_tournament_skill['tournament'] = player_tournament_skill['tournament'].cat.reorder_categories(tournament_list, ordered=True )


def render_player(player):
    player_skill = player_tournament_skill[player_tournament_skill.player == player]
    player_chart = (
        ggplot(player_skill, aes(x='tournament', y='skill'))
        + geom_violin()
        + theme(
            figure_size=(player_skill.tournament.nunique()*.2, 2),
            axis_text_x=element_text(rotation=90),
        )
        + labs(title=player)
    )
    filename = f'{player}-skill-{model_hash}-{data_hash}.png'
    player_chart.save(filename)
    
def render_players(*players):
    for player in players:
        render_player(player)


In [34]:
player_tournament_skill.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4724000 entries, 0 to 4723999
Data columns (total 4 columns):
sample        int64
skill         float64
player        category
tournament    category
dtypes: category(2), float64(1), int64(1)
memory usage: 90.1 MB


In [34]:
render_players(
    'vengefulpickle',
    'mysticjuicer',
    'cpat',
    'CKR',
    'Bomber678',
    'snoc',
    'thehug0naut',
    'Fluffiness',
    'Hobusu',
)

  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))
  from_inches(height, units), units))
  warn('Filename: {}'.format(filename))


In [35]:
iyl6 = player_tournament_skill[player_tournament_skill.tournament=='IYL Season 6'].copy()

iyl_player_list = iyl6.groupby('player').skill.median().sort_values().index.tolist()
iyl6['player'] = iyl6['player'].astype('category').cat.reorder_categories(iyl_player_list, ordered=True)

plot = (
    ggplot(iyl6, aes(x='player', y='skill'))
    + geom_violin()
    + theme(
        figure_size=(10, 10),
    )
    + coord_flip()
)
plot.save(f'iyl6-{model_hash}-{data_hash}.png')

NameError: name 'ggplot' is not defined

In [38]:
num_samples = player_tournament_skill['sample'].nunique()
skill_subsample = pandas.cut(
    player_tournament_skill.set_index(['tournament', 'player']).skill,
    bins=[x-0.5 for x in range(-9, 9)],
    precision=0,
).rename('skill').reset_index().groupby(
    ['tournament', 'player', 'skill']
).size().rename('skill_fraction').reset_index(['skill'])

skill_subsample['skill_fraction'] /= num_samples

game_skill = games.join(
    skill_subsample.rename(columns={'skill': 'skill_1', 'skill_fraction': 'skill_fraction_1'}),
    on=('tournament_name', 'player_1')
).join(
    skill_subsample.rename(columns={'skill': 'skill_2', 'skill_fraction': 'skill_fraction_2'}),
    on=('tournament_name', 'player_2')
)

flipped = game_skill[game_skill.character_1 < game_skill.character_2]
flipped = flipped.rename(columns={
    'character_1': 'character_2',
    'character_2': 'character_1',
    'skill_1': 'skill_2',
    'skill_2': 'skill_1',
    'skill_fraction_1': 'skill_fraction_2',
    'skill_fraction_2': 'skill_fraction_1',
})
flipped['win'] = 1-flipped.win
game_skill = game_skill.append(flipped)

game_skill['skill_fraction'] = game_skill.skill_fraction_1 * game_skill.skill_fraction_2
game_skill['win_fraction'] = game_skill.win * game_skill.skill_fraction



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [39]:
game_skill.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2407502 entries, 0 to 21384
Data columns (total 13 columns):
character_1         category
character_2         category
match_date          datetime64[ns]
player_1            category
player_2            category
skill_1             category
skill_2             category
skill_fraction_1    float64
skill_fraction_2    float64
tournament_name     object
win                 int8
skill_fraction      float64
win_fraction        float64
dtypes: category(6), datetime64[ns](1), float64(4), int8(1), object(1)
memory usage: 300.3 MB


In [40]:
game_skill.groupby(['character_1', 'character_2', 'skill_1', 'skill_2']).sum().dropna().sort_values(by='win_fraction', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,skill_fraction_1,skill_fraction_2,win,skill_fraction,win_fraction
character_1,character_2,skill_1,skill_2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Troq,Zane,"(0.5, 1.5]","(-0.5, 0.5]",64.2015,55.6815,184.0,13.461586,9.588087
Troq,Zane,"(0.5, 1.5]","(0.5, 1.5]",65.5495,64.553,195.0,14.77309,9.448076
Troq,Setsuki,"(2.5, 3.5]","(3.5, 4.5]",57.843,51.1005,126.0,15.660598,8.671501
Quince,Setsuki,"(1.5, 2.5]","(0.5, 1.5]",56.674,40.53,100.0,13.945864,8.620961
Gloria,Argagarg,"(1.5, 2.5]","(0.5, 1.5]",45.649,50.186,105.0,12.538451,8.598522


In [45]:
import itertools
from plotnine import *

df = game_skill.groupby(
    ['character_1', 'character_2', 'skill_1', 'skill_2']
).sum().rename(columns={'skill_fraction': 'games_played', 'win_fraction': 'p1_wins'}).reset_index().dropna()
df['p1_win_prob'] = df.p1_wins / df.games_played
max_count = df['games_played'].max()
df['played_color'] = df.games_played.apply(lambda p: p > max_count*.8)
df['prob_color'] = df.p1_win_prob.apply(lambda p: not (.2 < p < .8))

df = df[df.games_played >= 0.05]

character_subsets = [list(Character)[x:x+4] for x in range(0, len(Character), 4)]

show_legends = False

for ((y, c1_subset), (x, c2_subset)) in itertools.product(enumerate(character_subsets), enumerate(character_subsets)):
    subset_selector = df.character_1.isin(c1_subset) & df.character_2.isin(c2_subset)
    
    tile_plot = (
        ggplot(
            df[subset_selector],
            aes(y='factor(skill_1)', x='factor(skill_2)')
        )
        + facet_grid('character_1 ~ character_2', labeller='label_both')
        + scale_color_grey(start=0, end=1, limits=[False, True])
        + labs(y='p1 skill', x='p2 skill')
    )
    
    (
        tile_plot
        + geom_tile(aes(fill='games_played'), show_legend=show_legends)
        + geom_text(
            aes(label='games_played', color='played_color'),
            size=7,
            format_string='{:.1f}',
            show_legend=show_legends,
        )
        + scale_fill_distiller(type='seq', limits=[0, max_count], palette='YlOrRd')
        + theme(
            figure_size=(15, 15),
            axis_text_x=element_text(rotation=90)
        )
    ).save(f'games-played-skill-{model_hash}-{data_hash}-{x}-{y}.png', limitsize=False, verbose=False)
    
    (
        tile_plot
        + geom_tile(aes(fill='p1_win_prob'), show_legend=show_legends)
        + geom_text(
            aes(label='p1_win_prob', color='prob_color'),
            size=7,
            format_string='{:.0%}',
            show_legend=show_legends,
        )
        + scale_fill_distiller(type='div', limits=[0, 1], palette='RdBu')
        + theme(
            figure_size=(15, 15),
            axis_text_x=element_text(rotation=90)
        )
    ).save(f'win-rate-skill-{model_hash}-{data_hash}-{x}-{y}.png', limitsize=False, verbose=False)


In [46]:
from PIL import Image

num_tiles = len(Character) // 4

for prefix in ('win-rate-skill', 'games-played-skill'):
    tiles = {
        (x, y): Image.open(f'{prefix}-{model_hash}-{data_hash}-{x}-{y}.png')
        for x in range(num_tiles)
        for y in range(num_tiles)
    }
    (tile_width, tile_height) = tiles[(0, 0)].size
    dest_image = Image.new(tiles[(0, 0)].mode, (num_tiles * tile_width, num_tiles * tile_height))
    for ((x, y), tile) in tiles.items():
        dest_image.paste(tile, (x * tile_width, y * tile_height))
    dest_image.save(f'{prefix}-{model_hash}-{data_hash}.png')