In [1]:
from itertools import combinations
import numpy as np
import pandas as pd
import seaborn as sns
from string import ascii_uppercase
import time as time

%matplotlib inline

Create a dictionary of all possible 1 to 6-tile leaves

In [2]:
tilebag = ['A']*9+['B']*2+['C']*2+['D']*4+['E']*12+\
          ['F']*2+['G']*3+['H']*2+['I']*9+['J']*1+\
          ['K']*1+['L']*4+['M']*2+['N']*6+['O']*8+\
          ['P']*2+['Q']*1+['R']*6+['S']*4+['T']*6+\
          ['U']*4+['V']*2+['W']*2+['X']*1+['Y']*2+\
          ['Z']*1+['?']*2
            
tiles = [x for x in ascii_uppercase] + ['?']
# leaves_2 = sorted(list(set(list(combinations(tilebag,2)))))
# leaves_2 = [x[0]+x[1] for x in leaves_2]

In [3]:
t0 = time.time()

maximum_superleave_length = 5
leaves = {i:sorted(list(set(list(combinations(tilebag,i))))) for i in 
          range(1,maximum_superleave_length+1)}

for i in range(1,maximum_superleave_length+1):
    leaves[i] = [''.join(leave) for leave in leaves[i]]
    
t1 = time.time()
print('Calculated superleaves up to length {} in {} seconds'.format(
    maximum_superleave_length,t1-t0))

Calculated superleaves up to length 5 in 19.196850061416626 seconds


The bottom creates the full set of leaves for all lengths from 1-5 (6 breaks on my local machine)

In [4]:
for i in range(1,maximum_superleave_length+1):
    print(i,len(leaves[i]))

1 27
2 373
3 3509
4 25254
5 148150


In [5]:
column_dict = {
    0:'rack',
    1:'score',
    2:'tiles_remaining'
}
df = pd.read_csv('log_games.csv', header=None, keep_default_na=False)
df.rename(columns=column_dict,inplace=True)

In [6]:
tile_limit = 1
df = df.loc[df['tiles_remaining']>=tile_limit]

Alphabetize rack

In [7]:
df['rack'] = df['rack'].apply(lambda x: ''.join(sorted(x)))

In [8]:
df_dict = {'': df}

In [9]:
tb = time.time()

for multiple in range(1,maximum_superleave_length+1):
    t0 = time.time()

    # iterate through all 27 tiles
    for c in leaves[1]:
        if multiple*c in leaves[multiple]:
            condition = df_dict[(multiple-1)*c]['rack'].apply(lambda x: multiple*c in x)
            df_dict[multiple*c] = df_dict[(multiple-1)*c].loc[condition]
            df[multiple*c] = condition
            df[multiple*c].fillna(False, inplace=True)
            
    t1 = time.time()
    print('Added columns for all duplicates up to length {} in {} seconds'.format(multiple,t1-t0))
    
te = time.time()
print('Added all necessary columns in {} seconds'.format(te-tb))

Added columns for all duplicates up to length 1 in 143.9655900001526 seconds
Added columns for all duplicates up to length 2 in 92.13934993743896 seconds
Added columns for all duplicates up to length 3 in 31.657102823257446 seconds
Added columns for all duplicates up to length 4 in 19.807263135910034 seconds
Added columns for all duplicates up to length 5 in 10.858633995056152 seconds
Added all necessary columns in 298.42888283729553 seconds


In [10]:
# following variable indicates what length of superleave we try to calculate up to.
# WARNING: might break for now if you try to do too long.
ev_calculator_max_length = 4

In [11]:
all_leaves = []

for i in range(1,ev_calculator_max_length+1):
    all_leaves += leaves[i]
    
df_dict = {leave: pd.DataFrame() for leave in all_leaves}
df_dict[''] = df

ev_df = pd.DataFrame(columns=['mean','std','count','ev','synergy'],
                     index=all_leaves)

To find all of the racks corresponding to a particular leave, we have added columns to the dataframe of plays df marking each letter (A, B, C...) and also for duplicates (AA, BB, CC...) and triplicates where possible (AAA, DDD, EEE...).

If the letters in a given leave are all different, we can look for rows by using df['A']&df['B']. However, if there are duplicates involved, we have to look for df['AA']. The following function gives the correct dataframe columns to be looked up.

In [None]:
def get_columns(leave):
    letters=list(set(leave))
    tags = []
    
    for l in letters:
        tags += [sum([l==letter for letter in leave])*l]
    
    return tags

In [None]:
for leave_length in range(1,ev_calculator_max_length+1):
    print(leave_length)
    t0 = time.time()
    
    for leave in leaves[leave_length]:
        condition = df_dict[leave[:-1]][get_columns(leave)].all(axis=1)
        df_dict[leave] = df_dict[leave[:-1]].loc[condition]
        ev_df.loc[leave]['mean'] = df_dict[leave]['score'].mean()
        ev_df.loc[leave]['std'] = df_dict[leave]['score'].std()
        ev_df.loc[leave]['count'] = len(df_dict[leave])
        
    t1 = time.time()
    print('Calculated mean, std and count in {} seconds'.format(t1-t0))

1
Calculated mean, std and count in 33.53370523452759 seconds
2
Calculated mean, std and count in 151.88216018676758 seconds
3
Calculated mean, std and count in 346.1307852268219 seconds
4


In [None]:
ev_df['pct'] = 100*ev_df['count']/len(df)
ev_df['ev'] = ev_df['mean']-df['score'].mean()

Calculate leave "synergy", in other words the difference between the EV of the rack and what we'd expect just from adding the individual values of the tiles

In [None]:
for leave_length in range(2,ev_calculator_max_length+1):
    for leave in leaves[leave_length]:
        ev_df.loc[leave]['synergy'] = ev_df.loc[leave]['ev']-\
                                      sum([ev_df.loc[c]['ev'] for c in leave])

In [None]:
ev_df

In [None]:
ev_df.to_csv('leave_values_011219_v7.csv')

In [24]:
ev_df.sort_values('synergy')

Unnamed: 0,mean,std,count,ev,synergy,pct
SSS,34.1852,18.3993,7731,-1.32534,-20.2978,0.0418657
YY?,40.4391,14.7511,3696,4.92856,-16.923,0.020015
KX?,46.8817,12.6115,3517,11.3712,-14.5166,0.0190456
XZ?,52.5693,14.4442,3218,17.0587,-14.4254,0.0174264
WW?,38.2988,15.0233,4927,2.78819,-14.1912,0.0266812
RRR,24.2387,11.3792,151484,-11.2719,-14.1155,0.820331
SSZ,41.5167,16.6089,10356,6.00614,-13.5018,0.0560809
SSX,39.9536,15.7869,10506,4.44298,-13.4212,0.0568931
HH?,46.4122,20.0918,3763,10.9016,-13.3813,0.0203778
JX?,47.499,12.065,3625,11.9885,-13.3302,0.0196305
