# Get Kaggle Kernel Results
Experiments run on Kaggle:
- Letter frequency
- GYX scores
- Letter frequency + word popularity
- GYX scores + word popularity

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

from kaggle.api.kaggle_api_extended import KaggleApi

sns.set()

## Connect to Kaggle

In [2]:
api = KaggleApi()
api.authenticate()

## Get Kernels

In [3]:
words = ['arles', 'arose', 'dares', 'lares', 'lores', 'nares',
        'raile', 'raise', 'rales', 'rates', 'reais', 'roate',
        'soare', 'stare', 'tales', 'tares', 'tores']

algos = ['lf', 'gyx', 'lf-pop',
         'gyx-pop'
        ]

In [4]:
for algo in algos:
    for word in words:
        kernel_ref = f'chrischow/wordlebot-{algo}-{word}'
        download_path = f'../kernel_output/{algo}/'
        if algo == 'gyx':
            algo_correct = 'expected_gyx'
        elif algo == 'lf-pop':
            algo_correct = 'lf'
        elif algo == 'gyx-pop':
            algo_correct = 'expected_gyx'
        else:
            algo_correct = algo
        filename = f"wordlebot-{algo_correct}-{word}.csv"
        print(f'Checking {algo}-{word}...', flush=True, end='')
        if not filename in os.listdir(download_path):
            try:
                kernel_status = api.kernels_status(kernel_ref)['status']
            except:
                kernel_status = 'No kernel'
                continue
            if kernel_status == 'complete':
                api.kernels_output(kernel_ref, path=download_path)
                print('Downloaded output.')
            else:
                print('Kernel still running. Output not downloaded.')
        else:
            print('Output already downloaded.')

Checking lf-arles...Output already downloaded.
Checking lf-arose...Output already downloaded.
Checking lf-dares...Output already downloaded.
Checking lf-lares...Output already downloaded.
Checking lf-lores...Output already downloaded.
Checking lf-nares...Output already downloaded.
Checking lf-raile...Output already downloaded.
Checking lf-raise...Output already downloaded.
Checking lf-rales...Output already downloaded.
Checking lf-rates...Output already downloaded.
Checking lf-reais...Output already downloaded.
Checking lf-roate...Output already downloaded.
Checking lf-soare...Output already downloaded.
Checking lf-stare...Output already downloaded.
Checking lf-tales...Output already downloaded.
Checking lf-tares...Output already downloaded.
Checking lf-tores...Output already downloaded.
Checking gyx-arles...Output already downloaded.
Checking gyx-arose...Output already downloaded.
Checking gyx-dares...Output already downloaded.
Checking gyx-lares...Output already downloaded.
Checking 

## Get Data

In [32]:
df = pd.DataFrame()

for algo in algos:
    print(f'Loading {algo}...')
    for word in words:
        download_path = f'../kernel_output/{algo}/'
        if algo == 'gyx':
            algo_correct = 'expected_gyx'
        elif algo == 'lf-pop':
            algo_correct = 'lf'
        elif algo == 'gyx-pop':
            algo_correct = 'expected_gyx'
        else:
            algo_correct = algo
        filename = f"wordlebot-{algo_correct}-{word}.csv"
        
        if filename in os.listdir(download_path):
            temp_df = pd.read_csv(f'{download_path}{filename}')
            with open(f"{download_path}wordlebot-{algo}-{word}.log") as f:
                temp_log = f.readlines()
            runtime = pd.DataFrame(eval(''.join(temp_log))).time.iloc[-1]
            temp_results = pd.DataFrame([{
                'Seed Word': word,
                'Ranking Algorithm': algo,
                # 'Runtime': runtime,
                'Mean Steps': temp_df.steps.mean(),
                'Success Rate': temp_df.steps.le(6).mean(),
                '3-Steps or Less': temp_df.steps.le(3).mean(),
                'Worst Case': temp_df.steps.max()
            }])
            
            df = df.append(temp_results)

print('Loading ncands...')
for word in words:
    data_path = '../results'
    filename = f'wordlebot_{word}.csv'
    if filename in os.listdir(data_path):
        temp_df = pd.read_csv(f'{data_path}/{filename}')
        temp_results = pd.DataFrame([{
            'Seed Word': word,
            'Ranking Algorithm': 'ncands',
            # 'Runtime': runtime,
            'Mean Steps': temp_df.steps.mean(),
            'Success Rate': temp_df.steps.le(6).mean(),
            '3-Steps or Less': temp_df.steps.le(3).mean(),
            'Worst Case': temp_df.steps.max()
        }])
        df = df.append(temp_results)

Loading lf...
Loading gyx...
Loading lf-pop...
Loading gyx-pop...
Loading ncands...


## Compute Ranking

In [33]:
df['steps_rank'] = df['Mean Steps'].rank()
df['success_rank'] = df['Success Rate'].rank(ascending=False)
df['threestep_rank'] = df['3-Steps or Less'].rank(ascending=False)
df['avg_rank'] = df[['steps_rank', 'success_rank', 'threestep_rank']].mean(axis=1)

In [34]:
df.sort_values('avg_rank').head(20)

Unnamed: 0,Seed Word,Ranking Algorithm,Mean Steps,Success Rate,3-Steps or Less,Worst Case,steps_rank,success_rank,threestep_rank,avg_rank
0,tales,ncands,3.601728,0.997408,0.467387,8,1.0,2.0,5.0,2.666667
0,raile,ncands,3.606911,0.995248,0.48121,8,2.0,10.5,1.0,4.5
0,tores,ncands,3.617711,0.996976,0.462203,8,6.0,4.0,6.0,5.333333
0,tares,ncands,3.612095,0.99784,0.455724,8,5.0,1.0,11.0,5.666667
0,stare,ncands,3.611231,0.994816,0.471274,8,3.0,12.5,3.0,6.166667
0,roate,ncands,3.611663,0.994384,0.476458,8,4.0,15.0,2.0,7.0
0,lares,ncands,3.62635,0.996544,0.456156,7,8.0,7.0,9.5,8.166667
0,arles,ncands,3.634125,0.996976,0.451404,8,11.0,4.0,12.0,9.0
0,nares,ncands,3.632397,0.99568,0.456156,8,10.0,9.0,9.5,9.5
0,soare,ncands,3.623758,0.993952,0.469978,8,7.0,18.0,4.0,9.666667


In [39]:
# Check remaining words
df.pivot_table(index='Seed Word', values='Mean Steps', columns=['Ranking Algorithm'], aggfunc='count')

Ranking Algorithm,gyx,gyx-pop,lf,lf-pop,ncands
Seed Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arles,1,1,1,1,1
arose,1,1,1,1,1
dares,1,1,1,1,1
lares,1,1,1,1,1
lores,1,1,1,1,1
nares,1,1,1,1,1
raile,1,1,1,1,1
raise,1,1,1,1,1
rales,1,1,1,1,1
rates,1,1,1,1,1


In [40]:
import statsmodels.api as sm
X = df[['Seed Word', 'Ranking Algorithm']].copy()
X.loc[X['Seed Word'].eq('stare'), 'Seed Word'] = '0stare'
X = pd.get_dummies(X, drop_first=True)
X['const'] = 1.0
y1 = df['Mean Steps']
y2 = df['Success Rate']
y3 = df['3-Steps or Less']

lm1 = sm.OLS(y1, X)
lm2 = sm.OLS(y2, X)
lm3 = sm.OLS(y3, X)

res1 = lm1.fit()
res2 = lm2.fit()
res3 = lm3.fit()

In [41]:
res1.summary()

0,1,2,3
Dep. Variable:,Mean Steps,R-squared:,0.977
Model:,OLS,Adj. R-squared:,0.97
Method:,Least Squares,F-statistic:,137.6
Date:,"Mon, 07 Feb 2022",Prob (F-statistic):,7.279999999999999e-45
Time:,17:54:23,Log-Likelihood:,222.74
No. Observations:,85,AIC:,-403.5
Df Residuals:,64,BIC:,-352.2
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Seed Word_arles,0.0505,0.013,3.931,0.000,0.025,0.076
Seed Word_arose,0.0315,0.013,2.457,0.017,0.006,0.057
Seed Word_dares,0.0977,0.013,7.613,0.000,0.072,0.123
Seed Word_lares,0.0569,0.013,4.436,0.000,0.031,0.083
Seed Word_lores,0.0963,0.013,7.505,0.000,0.071,0.122
Seed Word_nares,0.0766,0.013,5.971,0.000,0.051,0.102
Seed Word_raile,0.0454,0.013,3.541,0.001,0.020,0.071
Seed Word_raise,0.0398,0.013,3.103,0.003,0.014,0.065
Seed Word_rales,0.0669,0.013,5.210,0.000,0.041,0.093

0,1,2,3
Omnibus:,26.681,Durbin-Watson:,1.251
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5.188
Skew:,-0.069,Prob(JB):,0.0747
Kurtosis:,1.798,Cond. No.,19.3


In [42]:
res2.summary()

0,1,2,3
Dep. Variable:,Success Rate,R-squared:,0.93
Model:,OLS,Adj. R-squared:,0.908
Method:,Least Squares,F-statistic:,42.4
Date:,"Mon, 07 Feb 2022",Prob (F-statistic):,2.2200000000000002e-29
Time:,17:54:42,Log-Likelihood:,431.1
No. Observations:,85,AIC:,-820.2
Df Residuals:,64,BIC:,-768.9
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Seed Word_arles,-0.0003,0.001,-0.312,0.756,-0.003,0.002
Seed Word_arose,-0.0035,0.001,-3.203,0.002,-0.006,-0.001
Seed Word_dares,-0.0024,0.001,-2.187,0.032,-0.005,-0.000
Seed Word_lares,-0.0011,0.001,-1.015,0.314,-0.003,0.001
Seed Word_lores,-0.0028,0.001,-2.500,0.015,-0.005,-0.001
Seed Word_nares,-0.0021,0.001,-1.875,0.065,-0.004,0.000
Seed Word_raile,-0.0048,0.001,-4.374,0.000,-0.007,-0.003
Seed Word_raise,-0.0034,0.001,-3.046,0.003,-0.006,-0.001
Seed Word_rales,-0.0007,0.001,-0.625,0.534,-0.003,0.002

0,1,2,3
Omnibus:,0.551,Durbin-Watson:,1.915
Prob(Omnibus):,0.759,Jarque-Bera (JB):,0.669
Skew:,-0.044,Prob(JB):,0.716
Kurtosis:,2.574,Cond. No.,19.3


In [43]:
res3.summary()

0,1,2,3
Dep. Variable:,3-Steps or Less,R-squared:,0.964
Model:,OLS,Adj. R-squared:,0.952
Method:,Least Squares,F-statistic:,84.79
Date:,"Mon, 07 Feb 2022",Prob (F-statistic):,2.22e-38
Time:,17:55:01,Log-Likelihood:,291.35
No. Observations:,85,AIC:,-540.7
Df Residuals:,64,BIC:,-489.4
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Seed Word_arles,-0.0256,0.006,-4.467,0.000,-0.037,-0.014
Seed Word_arose,-0.0055,0.006,-0.966,0.338,-0.017,0.006
Seed Word_dares,-0.0461,0.006,-8.058,0.000,-0.058,-0.035
Seed Word_lares,-0.0269,0.006,-4.693,0.000,-0.038,-0.015
Seed Word_lores,-0.0418,0.006,-7.303,0.000,-0.053,-0.030
Seed Word_nares,-0.0339,0.006,-5.915,0.000,-0.045,-0.022
Seed Word_raile,-0.0056,0.006,-0.981,0.330,-0.017,0.006
Seed Word_raise,-0.0098,0.006,-1.720,0.090,-0.021,0.002
Seed Word_rales,-0.0320,0.006,-5.583,0.000,-0.043,-0.021

0,1,2,3
Omnibus:,9.978,Durbin-Watson:,1.676
Prob(Omnibus):,0.007,Jarque-Bera (JB):,3.571
Skew:,0.138,Prob(JB):,0.168
Kurtosis:,2.035,Cond. No.,19.3


In [10]:
df.groupby('Seed Word')['Ranking Algorithm'].count()

Seed Word
arles    2
arose    2
dares    2
lares    2
lores    2
nares    2
raile    2
raise    2
rales    2
rates    2
reais    2
roate    2
soare    2
stare    2
tales    2
tares    2
tores    2
Name: Ranking Algorithm, dtype: int64

## All Data

In [15]:
if 'Unnamed: 0' in all_data.columns:
    all_data = all_data.drop('Unnamed: 0', axis=1)
all_data['words'] = all_data.words.apply(eval)
all_data['word'] = all_data.words.apply(lambda x: x[0])
all_data['solution'] = all_data.words.apply(lambda x: x[-1])

In [23]:
unsolved = all_data.groupby(['solution', 'method']).steps.mean().reset_index()
unsolved = unsolved.loc[unsolved.steps.gt(6)].sort_values('steps', ascending=False)

In [26]:
pd.options.display.max_rows=100
unsolved.sort_values('solution')

Unnamed: 0,solution,method,steps
292,baker,gyx,6.117647
317,baste,lf,6.941176
316,baste,gyx,7.235294
318,batch,gyx,7.294118
460,bluer,gyx,6.117647
514,boxer,gyx,7.823529
515,boxer,lf,7.764706
579,brown,lf,6.411765
578,brown,gyx,6.294118
624,buyer,gyx,6.529412


In [38]:
import wordlebot

from wordlebot import Wordle

In [39]:
wordle_candidates, wordle_answers = wordlebot.load_data('data')
wordle = wordle_candidates.loc[
    wordle_candidates.word.apply(lambda x: len(x)==len(set(x)))
].append(wordle_answers).reset_index(drop=True)

word_popularity = wordlebot.load_word_popularity('data')

In [43]:
game = wordlebot.Wordle(wordle, wordle_answers)

In [44]:
game.guess('soare', 'gxggx')

SOARE --> GXGGX: 11 solutions remaining.


In [45]:
game.optimise('ncands')

  0%|          | 0/11 [00:00<?, ?it/s]

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=-2)]: Done 121 out of 121 | elapsed:    0.1s finished


Unnamed: 0,word,ncands_max,ncands_mean,nbuckets,bucket_entropy,ncands_max_rank,ncands_mean_rank,bucket_entropy_rank,avg_rank
0,stark,6,3.909091,5,1.294545,1.5,1.0,1.0,1.25
1,shark,6,4.090909,4,1.168518,1.5,2.0,2.0,1.75
2,smart,7,4.818182,5,1.159589,4.0,3.0,3.0,3.5
3,sharp,7,5.0,4,1.033562,4.0,4.5,4.5,4.25
4,spark,7,5.0,4,1.033562,4.0,4.5,4.5,4.25
5,start,8,6.090909,4,0.885574,6.5,6.0,6.0,6.25
6,shard,8,6.272727,3,0.759547,6.5,7.0,7.0,6.75
7,scarf,9,7.545455,3,0.600166,9.0,9.0,9.0,9.0
8,scary,9,7.545455,3,0.600166,9.0,9.0,9.0,9.0
9,swarm,9,7.545455,3,0.600166,9.0,9.0,9.0,9.0


In [46]:
game.guess('stark', 'gxggx')

STARK --> GXGGX: 6 solutions remaining.


In [47]:
game.optimise('ncands')

  0%|          | 0/6 [00:00<?, ?it/s]

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 out of  36 | elapsed:    0.1s finished


Unnamed: 0,word,ncands_max,ncands_mean,nbuckets,bucket_entropy,ncands_max_rank,ncands_mean_rank,bucket_entropy_rank,avg_rank
0,scarf,4,3.0,3,0.867563,2.5,2.5,2.5,2.5
1,scary,4,3.0,3,0.867563,2.5,2.5,2.5,2.5
2,shard,4,3.0,3,0.867563,2.5,2.5,2.5,2.5
3,sharp,4,3.0,3,0.867563,2.5,2.5,2.5,2.5
4,snarl,5,4.333333,2,0.450561,5.5,5.5,5.5,5.5
5,swarm,5,4.333333,2,0.450561,5.5,5.5,5.5,5.5


In [48]:
game.guess('scarf', 'gxggx')
game.optimise('ncands')

SCARF --> GXGGX: 4 solutions remaining.


  0%|          | 0/4 [00:00<?, ?it/s]

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=-2)]: Done  16 out of  16 | elapsed:    0.1s finished


Unnamed: 0,word,ncands_max,ncands_mean,nbuckets,bucket_entropy,ncands_max_rank,ncands_mean_rank,bucket_entropy_rank,avg_rank
0,shard,2,1.5,3,1.039721,1.5,1.5,1.5,1.5
1,sharp,2,1.5,3,1.039721,1.5,1.5,1.5,1.5
2,snarl,3,2.5,2,0.562335,3.5,3.5,3.5,3.5
3,swarm,3,2.5,2,0.562335,3.5,3.5,3.5,3.5
