In [1]:
from itertools import combinations
import numpy as np
import pandas as pd
import seaborn as sns
from string import ascii_uppercase
import time as time

%matplotlib inline

Create a dictionary of all possible leaves containing from 1 up to `maximum_superleave_length` tiles (currently 5)

In [2]:
# Tile distribution as (tile, copies) pairs, listed in bag order; the copies
# add up to 100 tiles. '?' is presumably the blank tile -- TODO confirm.
tile_distribution = [
    ('A', 9), ('B', 2), ('C', 2), ('D', 4), ('E', 12),
    ('F', 2), ('G', 3), ('H', 2), ('I', 9), ('J', 1),
    ('K', 1), ('L', 4), ('M', 2), ('N', 6), ('O', 8),
    ('P', 2), ('Q', 1), ('R', 6), ('S', 4), ('T', 6),
    ('U', 4), ('V', 2), ('W', 2), ('X', 1), ('Y', 2),
    ('Z', 1), ('?', 2),
]

# Expand to one list entry per physical tile: ['A', 'A', ..., '?', '?'].
tilebag = [tile for tile, copies in tile_distribution for _ in range(copies)]

In [3]:
t0 = time.time()

# Longest leave to enumerate. The number of tile combinations grows very
# quickly with this value, so raise it with care.
maximum_superleave_length = 5

# leaves[n] is the sorted list of distinct n-tile leaves, each stored as a string.
# Every combination is joined and de-duplicated as it is generated, so the full
# list of (mostly duplicate) combinations is never held in memory at once.
# Sorting equal-length strings gives the same order as sorting the tuples of
# single characters they were built from.
leaves = {
    length: sorted({''.join(combo) for combo in combinations(tilebag, length)})
    for length in range(1, maximum_superleave_length+1)
}

t1 = time.time()
print('Calculated superleaves up to length {} in {} seconds'.format(
    maximum_superleave_length,t1-t0))

Calculated superleaves up to length 5 in 25.13836693763733 seconds


The cell above creates the full set of leaves for all lengths from 1 to 5 (length 6 breaks on my local machine). The cell below prints the number of distinct leaves of each length.

In [4]:
# Print the number of distinct leaves for every length that was generated.
# The bound follows maximum_superleave_length so this cell stays in sync with
# the keys that actually exist in `leaves`.
for length in range(1, maximum_superleave_length+1):
    print(length, len(leaves[length]))

1 27
2 373
3 3509
4 25254
5 148150


In [7]:
# Load the logged plays. keep_default_na=False stops pandas from converting
# its default NA markers (e.g. "NA", "NaN", "null", empty string) into NaN --
# presumably so that tile strings such as a rack/leave of "NA" or an empty
# leave stay as text. TODO confirm intent.
df = pd.read_csv('../log_games.csv', keep_default_na=False)

In [8]:
# Show the first rows of the loaded plays as the cell output.
df.head()

Unnamed: 0,playerID,gameID,turn,rack,play,score,totalscore,leave,equity,tilesremaining
0,0,43ead121-14b8-47f6-a15f-2f60226a3465,0,OICIUAR,8D CURIO,20,20,AI,19.987,86
1,0,23dc26da-9618-460b-9caa-e9540791b021,0,SNFOITN,8D FONTS,24,24,IN,26.321,86
2,1,43ead121-14b8-47f6-a15f-2f60226a3465,1,YAINONE,D4 YONI.,20,20,AEN,25.056,81
3,0,6bbe7bee-007e-4c20-b2bf-2aaa917ebc59,0,IRYDLBR,8H DRILY,26,26,BR,26.304,86
4,1,23dc26da-9618-460b-9caa-e9540791b021,1,DLYFNTU,F6 FU.NY,27,27,DLT,25.561,81


In [9]:
# Keep only the rows whose 'tilesremaining' value is at least `tile_limit`.
tile_limit = 1
enough_tiles_left = df['tilesremaining'] >= tile_limit
df = df.loc[enough_tiles_left]

In [10]:
# memory_usage='deep' asks pandas to measure the actual contents of the
# object (string) columns instead of estimating from the column dtypes.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30587368 entries, 0 to 36820050
Data columns (total 10 columns):
playerID          int64
gameID            object
turn              int64
rack              object
play              object
score             int64
totalscore        int64
leave             object
equity            float64
tilesremaining    int64
dtypes: float64(1), int64(5), object(4)
memory usage: 9.6 GB


Alphabetize rack

In [11]:
# Sort the tiles inside every rack string. Afterwards repeated tiles sit next
# to each other, so a substring test such as 'AA' in rack detects a pair.
df['rack'] = df['rack'].map(lambda tiles: ''.join(sorted(tiles)))

Create a boolean flag column for each tile marking whether a rack contains it, and further flag columns for multiples of the same tile (AA, AAA, ...)

In [12]:
# Add one boolean column per tile multiplicity ('A', 'AA', 'AAA', ...) that is
# True when the rack holds at least that many copies of the tile.
# This relies on the rack strings having been sorted beforehand, so that
# repeated tiles are adjacent and a plain substring test is sufficient.
# The upper bound follows maximum_superleave_length because leaves[multiple]
# only exists for lengths that were generated.
for multiple in range(1, maximum_superleave_length+1):
    t0 = time.time()

    # iterate through all 27 tiles
    for c in leaves[1]:
        tag = multiple*c

        # Skip multiplicities the bag cannot supply (e.g. there is only one 'J').
        if tag in leaves[multiple]:
            # regex=False: match literally -- '?' is a regex metacharacter.
            df[tag] = df['rack'].str.contains(tag, regex=False)

    t1 = time.time()
    print('Added columns for all duplicates up to length {} in {} seconds'.format(multiple,t1-t0))

Added columns for all duplicates up to length 1 in 192.12264442443848 seconds
Added columns for all duplicates up to length 2 in 179.32395005226135 seconds
Added columns for all duplicates up to length 3 in 100.38379287719727 seconds
Added columns for all duplicates up to length 4 in 92.12169122695923 seconds
Added columns for all duplicates up to length 5 in 59.88667702674866 seconds


In [13]:
# Longest leave for which statistics are calculated in the cells below.
# It is used as an upper bound when indexing `leaves`, so it must not exceed
# maximum_superleave_length. Run time grows steeply with this value.
# WARNING: might break for now if you try to do too long.
ev_calculator_max_length = 3

In [14]:
# Flatten the per-length leave lists into a single list, shortest leaves first.
all_leaves = [leave
              for length in range(1, ev_calculator_max_length+1)
              for leave in leaves[length]]

# One row per leave; every statistic starts out empty and is filled in later.
stat_columns = ['mean','std','count','ev','synergy']
ev_df = pd.DataFrame(index=all_leaves, columns=stat_columns)

To find all of the racks corresponding to a particular leave, we have added columns to the dataframe of plays df marking each letter (A, B, C...) and also for duplicates (AA, BB, CC...) and triplicates where possible (AAA, DDD, EEE...).

If the letters in a given leave are all different, we can look for rows by using df['A']&df['B']. However, if there are duplicates involved, we have to look for df['AA']. The following function gives the correct dataframe columns to be looked up.

In [15]:
def get_columns(leave):
    """Return the flag-column names that together identify racks containing `leave`.

    Each distinct tile in `leave` contributes one tag: the tile repeated as many
    times as it occurs in the leave (e.g. 'AAB' -> ['AA', 'B']).

    Parameters
    ----------
    leave : str
        The tiles of the leave, in any order.

    Returns
    -------
    list of str
        One tag per distinct tile, ordered by tile so that the result is the
        same on every run (iterating a bare set of strings is not).
    """
    return [leave.count(tile)*tile for tile in sorted(set(leave))]

In [16]:
# For every leave, collect the scores of all plays whose rack contains the
# leave's tiles and record their mean, standard deviation and count.
for leave_length in range(1,ev_calculator_max_length+1):
    print(leave_length)
    t0 = time.time()

    for leave in leaves[leave_length]:
        # True for rows where every required tile/multiplicity flag is set.
        condition = df[get_columns(leave)].all(axis=1)
        # Filter the scores once and reuse them for all three statistics.
        scores = df.loc[condition, 'score']
        # Assign with a single .loc[row, column] call: the chained form
        # ev_df.loc[leave]['mean'] = ... writes to an intermediate object and
        # is not guaranteed to update ev_df.
        ev_df.loc[leave, 'mean'] = scores.mean()
        ev_df.loc[leave, 'std'] = scores.std()
        ev_df.loc[leave, 'count'] = len(scores)

    t1 = time.time()
    print('Calculated mean, std and count in {} seconds'.format(t1-t0))

1
Calculated mean, std and count in 191.87206315994263 seconds
2
Calculated mean, std and count in 1046.7881920337677 seconds
3
Calculated mean, std and count in 6749.346915721893 seconds


In [17]:
# Share of all plays (in percent) whose rack contains each leave.
total_plays = len(df)
ev_df['pct'] = 100*ev_df['count']/total_plays

# EV of a leave: its mean score relative to the mean score over all plays.
overall_mean_score = df['score'].mean()
ev_df['ev'] = ev_df['mean']-overall_mean_score

Calculate leave "synergy", in other words the difference between the EV of the leave and what we'd expect just from adding up the individual EVs of its tiles

In [18]:
# Synergy of a multi-tile leave: its EV minus the sum of the single-tile EVs
# of the tiles it is made of. Single tiles are skipped and keep an empty value.
for leave_length in range(2,ev_calculator_max_length+1):
    for leave in leaves[leave_length]:
        tile_ev_sum = sum(ev_df.loc[tile, 'ev'] for tile in leave)
        # Single .loc[row, column] assignment; the chained form
        # ev_df.loc[leave]['synergy'] = ... is not guaranteed to update ev_df.
        ev_df.loc[leave, 'synergy'] = ev_df.loc[leave, 'ev'] - tile_ev_sum

In [19]:
# Show the per-leave statistics table as the cell output.
ev_df

Unnamed: 0,mean,std,count,ev,synergy,pct
?,54.3015,29.9093,4472557,15.7352,,14.6222
A,39.6563,22.0263,15022710,1.09008,,49.1141
B,36.9365,19.5086,3694257,-1.62968,,12.0777
C,39.2795,22.1449,4241973,0.713264,,13.8684
D,39.284,21.8473,7642419,0.717803,,24.9855
E,40.4349,22.9035,19032075,1.86867,,62.222
F,36.7207,17.8844,3511947,-1.84554,,11.4817
G,36.6384,20.3258,5983143,-1.92778,,19.5608
H,39.6928,19.8293,3571368,1.12657,,11.676
I,39.0347,22.3068,15849900,0.468522,,51.8185


In [20]:
# Write the table to a CSV file in the current working directory; the index
# (the leave strings) is written as the first column.
ev_df.to_csv('leave_values_010919_v4.csv')

In [21]:
# Ascending sort, so the leaves with the most negative synergy are listed first.
ev_df.sort_values('synergy')

Unnamed: 0,mean,std,count,ev,synergy,pct
S??,48.6499,40.9945,50725,10.0837,-26.0376,0.165836
RRR,23.7867,15.0874,125396,-14.7796,-19.1992,0.40996
YY?,34.0438,17.4827,7845,-4.52237,-18.5062,0.0256478
NNN,23.5414,14.4115,129230,-15.0248,-17.5533,0.422495
III,22.7846,12.6703,479673,-15.7816,-17.1871,1.56821
J??,51.4327,33.5236,5258,12.8665,-16.7268,0.0171901
VV?,30.2133,22.449,13476,-8.35296,-16.5545,0.0440574
SSS,36.1413,21.2141,24160,-2.42487,-16.3775,0.0789869
LLL,22.6121,13.3829,24136,-15.9541,-16.2902,0.0789084
WW?,32.2015,19.9369,8406,-6.3647,-16.2632,0.0274819
