In [1]:
from itertools import combinations
import numpy as np
import pandas as pd
import seaborn as sns
from string import ascii_uppercase
import time as time

%matplotlib inline

Create a dictionary of all possible 1 to 6-tile leaves

In [13]:
tilebag = ['A']*9+['B']*2+['C']*2+['D']*4+['E']*12+\
          ['F']*2+['G']*3+['H']*2+['I']*9+['J']*1+\
          ['K']*1+['L']*4+['M']*2+['N']*6+['O']*8+\
          ['P']*2+['Q']*1+['R']*6+['S']*4+['T']*6+\
          ['U']*4+['V']*2+['W']*2+['X']*1+['Y']*2+\
          ['Z']*1+['?']*2
            
tiles = [x for x in ascii_uppercase] + ['?']
# leaves_2 = sorted(list(set(list(combinations(tilebag,2)))))
# leaves_2 = [x[0]+x[1] for x in leaves_2]

In [3]:
t0 = time.time()

maximum_superleave_length = 5
leaves = {i:sorted(list(set(list(combinations(tilebag,i))))) for i in 
          range(1,maximum_superleave_length+1)}

for i in range(1,maximum_superleave_length+1):
    leaves[i] = [''.join(leave) for leave in leaves[i]]
    
t1 = time.time()
print('Calculated superleaves up to length {} in {} seconds'.format(
    maximum_superleave_length,t1-t0))

Calculated superleaves up to length 5 in 19.917773008346558 seconds


The bottom creates the full set of leaves for all lengths from 1-5 (6 breaks on my local machine)

In [4]:
for i in range(1,6):
    print(i,len(leaves[i]))

1 27
2 373
3 3509
4 25254
5 148150


In [5]:
column_dict = {
    0:'rack',
    1:'score',
    2:'tiles_remaining'
}
df = pd.read_csv('log_games.csv', header=None, keep_default_na=False)
df.rename(columns=column_dict,inplace=True)

In [6]:
tile_limit = 7
df = df.loc[df['tiles_remaining']>=tile_limit]

Alphabetize rack

In [7]:
df['rack'] = df['rack'].apply(lambda x: ''.join(sorted(x)))

Create flag of whether each letter is contained in a rack, and also check for multiples

In [7]:
for multiple in range(1,6):
    t0 = time.time()

    # iterate through all 27 tiles
    for c in leaves[1]:

        if multiple*c in leaves[multiple]:
            df[multiple*c] = df['rack'].apply(lambda x: multiple*c in x)
            
    t1 = time.time()
    print('Added columns for all duplicates up to length {} in {} seconds'.format(multiple,t1-t0))

Added columns for all duplicates up to length 1 in 124.51392412185669 seconds
Added columns for all duplicates up to length 2 in 119.02785897254944 seconds
Added columns for all duplicates up to length 3 in 64.65550684928894 seconds
Added columns for all duplicates up to length 4 in 56.814465045928955 seconds
Added columns for all duplicates up to length 5 in 36.3196918964386 seconds


In [44]:
# following variable indicates what length of superleave we try to calculate up to.
# WARNING: might break for now if you try to do too long.
ev_calculator_max_length = 3

In [57]:
all_leaves = []

for i in range(1,ev_calculator_max_length+1):
    all_leaves += leaves[i]
    
df_dict = {leave: pd.DataFrame() for leave in all_leaves}
df_dict[''] = df

ev_df = pd.DataFrame(columns=['mean','std','count','ev','synergy'],
                     index=all_leaves)

To find all of the racks corresponding to a particular leave, we have added columns to the dataframe of plays df marking each letter (A, B, C...) and also for duplicates (AA, BB, CC...) and triplicates where possible (AAA, DDD, EEE...).

If the letters in a given leave are all different, we can look for rows by using df['A']&df['B']. However, if there are duplicates involved, we have to look for df['AA']. The following function gives the correct dataframe columns to be looked up.

In [58]:
def get_columns(leave):
    letters=list(set(leave))
    tags = []
    
    for l in letters:
        tags += [sum([l==letter for letter in leave])*l]
    
    return tags

In [67]:
for leave_length in range(1,ev_calculator_max_length+1):
    print(leave_length)
    t0 = time.time()
    
    for leave in leaves[leave_length][:20]:
        t2 = time.time()
        condition = df_dict[leave[:-1]][get_columns(leave)].all(axis=1)
        df_dict[leave] = df_dict[leave[:-1]].loc[condition]
        t4 = time.time()
        ev_df.loc[leave]['mean'] = df_dict[leave]['score'].mean()
        t5 = time.time()
        ev_df.loc[leave]['std'] = df_dict[leave]['score'].std()
        t6 = time.time()
        ev_df.loc[leave]['count'] = len(df_dict[leave])
        t3 = time.time()
        print('Calculated EV of leave {} in {} seconds'.format(
              leave,t3-t2))
        # print(t4-t2)
        # print(t5-t4)
        # print(t6-t5)
        # print(t3-t6)
        
    t1 = time.time()
    print('Calculated mean, std and count in {} seconds'.format(t1-t0))

1
Calculated EV of leave ? in 0.49558520317077637 seconds
Calculated EV of leave A in 1.8147618770599365 seconds
Calculated EV of leave B in 0.7003812789916992 seconds
Calculated EV of leave C in 0.8366820812225342 seconds
Calculated EV of leave D in 1.094351053237915 seconds
Calculated EV of leave E in 2.4343879222869873 seconds
Calculated EV of leave F in 0.6291160583496094 seconds
Calculated EV of leave G in 1.065903902053833 seconds
Calculated EV of leave H in 0.6193957328796387 seconds
Calculated EV of leave I in 2.3928537368774414 seconds
Calculated EV of leave J in 0.46109604835510254 seconds
Calculated EV of leave K in 0.43598270416259766 seconds
Calculated EV of leave L in 1.3575332164764404 seconds
Calculated EV of leave M in 0.6677212715148926 seconds
Calculated EV of leave N in 1.656102180480957 seconds
Calculated EV of leave O in 1.9810559749603271 seconds
Calculated EV of leave P in 0.6283071041107178 seconds
Calculated EV of leave Q in 0.5599801540374756 seconds
Calculat

In [70]:
for leave_length in range(1,ev_calculator_max_length+1):
    print(leave_length)
    t0 = time.time()
    
    for leave in leaves[leave_length][:20]:
        t2 = time.time()
        condition = df[get_columns(leave)].all(axis=1)
        t4 = time.time()
        ev_df.loc[leave]['mean'] = df.loc[condition]['score'].mean()
        t5 = time.time()
        ev_df.loc[leave]['std'] = df.loc[condition]['score'].std()
        t6 = time.time()
        ev_df.loc[leave]['count'] = len(df.loc[condition])
        t3 = time.time()
        print('Calculated EV of leave {} in {} seconds'.format(
              leave,t3-t2))
        # print(t4-t2)
        # print(t5-t4)
        # print(t6-t5)
        # print(t3-t6)
        
    t1 = time.time()
    print('Calculated mean, std and count in {} seconds'.format(t1-t0))

1
Calculated EV of leave ? in 1.176253080368042 seconds
Calculated EV of leave A in 4.231508016586304 seconds
Calculated EV of leave B in 1.5274300575256348 seconds
Calculated EV of leave C in 1.5289928913116455 seconds
Calculated EV of leave D in 2.3533878326416016 seconds
Calculated EV of leave E in 5.428741216659546 seconds
Calculated EV of leave F in 1.473625898361206 seconds
Calculated EV of leave G in 2.177222967147827 seconds
Calculated EV of leave H in 1.331714153289795 seconds
Calculated EV of leave I in 4.9467267990112305 seconds
Calculated EV of leave J in 0.9408981800079346 seconds
Calculated EV of leave K in 0.8464927673339844 seconds
Calculated EV of leave L in 2.6979668140411377 seconds
Calculated EV of leave M in 1.4689033031463623 seconds
Calculated EV of leave N in 3.5230090618133545 seconds
Calculated EV of leave O in 4.104795932769775 seconds
Calculated EV of leave P in 1.470768928527832 seconds
Calculated EV of leave Q in 1.0160701274871826 seconds
Calculated EV of

In [15]:
ev_df['pct'] = 100*ev_df['count']/len(df)
ev_df['ev'] = ev_df['mean']-df['score'].mean()

Calculate leave "synergy", in other words the difference between the EV of the rack and what we'd expect just from adding the individual values of the tiles

In [16]:
for leave_length in range(2,ev_calculator_max_length+1):
    for leave in leaves[leave_length]:
        ev_df.loc[leave]['synergy'] = ev_df.loc[leave]['ev']-\
                                      sum([ev_df.loc[c]['ev'] for c in leave])

In [17]:
ev_df

Unnamed: 0,mean,std,count,ev,synergy,pct
?,54.9192,25.1511,1707622,19.4086,,9.24729
A,36.8361,18.8678,8316804,1.32554,,45.038
B,35.1204,16.7445,2355819,-0.390212,,12.7575
C,36.3152,18.7269,2399229,0.804599,,12.9925
D,36.6047,18.7607,4293197,1.09413,,23.249
E,37.4298,19.654,10163946,1.91925,,55.0408
F,35.5469,15.4998,2155441,0.0363451,,11.6724
G,34.4616,17.4353,4033589,-1.04899,,21.8431
H,37.9477,16.7506,1952294,2.43714,,10.5723
I,35.5348,18.8481,9795770,0.0242382,,53.047


In [18]:
ev_df.to_csv('leave_values_010619_v3.csv')

In [24]:
ev_df.sort_values('synergy')

Unnamed: 0,mean,std,count,ev,synergy,pct
SSS,34.1852,18.3993,7731,-1.32534,-20.2978,0.0418657
YY?,40.4391,14.7511,3696,4.92856,-16.923,0.020015
KX?,46.8817,12.6115,3517,11.3712,-14.5166,0.0190456
XZ?,52.5693,14.4442,3218,17.0587,-14.4254,0.0174264
WW?,38.2988,15.0233,4927,2.78819,-14.1912,0.0266812
RRR,24.2387,11.3792,151484,-11.2719,-14.1155,0.820331
SSZ,41.5167,16.6089,10356,6.00614,-13.5018,0.0560809
SSX,39.9536,15.7869,10506,4.44298,-13.4212,0.0568931
HH?,46.4122,20.0918,3763,10.9016,-13.3813,0.0203778
JX?,47.499,12.065,3625,11.9885,-13.3302,0.0196305
