In [1]:
import itertools
import urllib
import numpy as np

In [2]:
# Alternative word list (currently unused):
# words_url = 'https://github.com/dwyl/english-words/blob/master/words_alpha.txt?raw=true'
# Word list actually fetched below: dictionary.csv from the zeisler/scrabble repository.
words_url = 'https://github.com/zeisler/scrabble/blob/master/db/dictionary.csv?raw=true'

In [3]:
# `import urllib` on its own does not guarantee that the `request` submodule
# is loaded, so on a fresh kernel `urllib.request` can raise AttributeError.
# Import the submodule explicitly before using it.
import urllib.request

# Open the remote word list; the response object is iterated line by line
# (as bytes) when the lines are decoded.
file = urllib.request.urlopen(words_url)

In [4]:
# Decode every raw byte line of the response into a UTF-8 string.
decoded_lines = [raw_line.decode('utf-8') for raw_line in file]

In [5]:
decoded_lines[:10]

['aa\r\n',
 'aah\r\n',
 'aahed\r\n',
 'aahing\r\n',
 'aahs\r\n',
 'aal\r\n',
 'aalii\r\n',
 'aaliis\r\n',
 'aals\r\n',
 'aardvark\r\n']

In [6]:
# Remove the trailing '\r\n' (and any other surrounding whitespace) from each line.
decoded_lines = list(map(str.strip, decoded_lines))

In [7]:
# Keep only the entries that are exactly five characters long.
five_letter_words = list(filter(lambda word: len(word) == 5, decoded_lines))

In [8]:
len(five_letter_words)

8636

## Which letters are most common?

In [9]:
# Concatenate all five-letter words into one string so letters can be counted globally.
big_word_string = ''.join(five_letter_words)

In [10]:
import pandas as pd

In [11]:
# One Series entry per character, so value_counts() can tally letter occurrences.
word_series = pd.Series(list(big_word_string))

In [12]:
# value_counts() sorts by descending count, so the first ten index labels
# are the ten most frequent letters.
most_common_letters = word_series.value_counts().head(10).index.values

In [13]:
most_common_letters

array(['s', 'e', 'a', 'o', 'r', 'i', 'l', 't', 'n', 'd'], dtype=object)

In [14]:
def made_of_common_letters(word, most_common_letters=most_common_letters):
    """Return True when every character of `word` occurs in `most_common_letters`.

    An empty `word` yields True, since there is no character that could fail
    the check.
    """
    for letter in word:
        if letter not in most_common_letters:
            return False
    return True

In [15]:
# Restrict the word list to words spelled entirely with the most common letters.
five_letter_words_common = list(filter(made_of_common_letters, five_letter_words))

In [16]:
# wordstack = '\n'.join(five_letter_words_common)

# with open('wordstack.txt', 'w') as f:
#     f.write(wordstack)

In [17]:
# Split each word into characters and transpose, so that row i holds the
# i-th letter of every word.
letters_by_placement = np.transpose(
    np.array([list(word) for word in five_letter_words_common])
)

In [21]:
# first_placement

In [19]:
# Relative frequency of each letter at each of the five word positions
# (row i of `letters_by_placement` holds the i-th letter of every word).
(
    first_placement,
    second_placement,
    third_placement,
    fourth_placement,
    fifth_placement,
) = (
    pd.Series(letters_by_placement[position]).value_counts(normalize=True)
    for position in range(5)
)

In [20]:
dict(first_placement)

{'s': 0.19413092550790068,
 't': 0.1636568848758465,
 'r': 0.11851015801354402,
 'd': 0.11286681715575621,
 'a': 0.10948081264108352,
 'l': 0.10496613995485328,
 'n': 0.06433408577878104,
 'e': 0.046275395033860044,
 'o': 0.04401805869074492,
 'i': 0.0417607223476298}

In [22]:
# Per-position frequency Series in positional order (index 0 = first letter).
placements = [first_placement, second_placement, third_placement, fourth_placement, fifth_placement]

In [23]:
# One {letter: relative frequency} mapping per word position.
weighted_dicts = list(map(dict, placements))

In [24]:
def flip_dict(d):
    """Return a new dict that maps each value of `d` back to its key.

    When several keys share the same value, the key that comes last in
    iteration order wins. The values of `d` must be hashable.
    """
    flipped = {}
    for key, value in d.items():
        flipped[value] = key
    return flipped

In [25]:
# For each position, map every letter to its frequency rank
# (0 = most frequent, because value_counts() sorts in descending order).
placement_dicts = [
    flip_dict(dict(enumerate(placement.index))) for placement in placements
]

In [26]:
def freq_encode(word):
    '''
    Converts a word into a numerical encoding based on character
    positioning and frequency.

    For each of the five positions, the letter at that position is looked up
    in the matching entry of `weighted_dicts`, and the five values are summed.
    A letter that was never observed at a given position contributes 0.0
    instead of making the sum fail on a missing (None) entry.

    Parameters
    ----------
    word : str
        A five-letter word made only of letters in `most_common_letters`.

    Returns
    -------
    The sum of the per-position weights of the word's letters.

    Raises
    ------
    AssertionError
        If `word` is not five characters long or contains a letter outside
        the common-letter set.
    '''
    assert len(word) == 5
    assert made_of_common_letters(word)

    return sum(
        position_weights.get(letter, 0.0)
        for letter, position_weights in zip(word, weighted_dicts)
    )

In [27]:
freq_encode('adorn')

0.34198645598194133

In [28]:
freq_encode('tores')

1.1388261851015802

In [29]:
freq_encode('nidal')

0.4785553047404063

In [30]:
# Keep only the words whose five letters are all distinct.
words_list = list(
    filter(lambda word: len(set(word)) == 5, five_letter_words_common)
)

In [31]:
# One row per candidate word, held in a single 'word' column.
word_df = pd.DataFrame(words_list, columns=['word'])

In [32]:
# Score every word with freq_encode (sum of its positional letter frequencies).
word_df['encoding'] = word_df['word'].map(freq_encode)

In [33]:
word_df.sort_values(by='encoding', ascending=False).head(15)

Unnamed: 0,word,encoding
403,tores,1.138826
360,tares,1.129797
396,toles,1.123025
401,tones,1.123025
356,tales,1.113995
387,tires,1.110609
284,rotes,1.10158
84,dotes,1.095937
385,tines,1.094808
382,tiles,1.094808


In [34]:
from itertools import permutations

In [35]:
def get_excl_letters(word, most_common_letters=most_common_letters):
    """Return the letters of `most_common_letters` that do not occur in `word`.

    The result is a list that keeps the order of `most_common_letters`.
    """
    excluded = []
    for letter in most_common_letters:
        if letter in word:
            continue
        excluded.append(letter)
    return excluded

In [36]:
get_excl_letters('tiles')

['a', 'o', 'r', 'n', 'd']

In [37]:
def generate_permutations(letter_list):
    """Return every ordering of `letter_list`, each joined into one string.

    The strings are returned in the order produced by
    `itertools.permutations`; an empty input yields `['']`.
    """
    return list(map(''.join, permutations(letter_list)))

In [38]:
def get_paired_words(word, most_common_letters=most_common_letters):
    """Return the dictionary words that use exactly the common letters missing from `word`.

    Parameters
    ----------
    word : str
        The word whose letters are excluded from the letter pool.
    most_common_letters : iterable of str
        The letter pool; defaults to the notebook-level `most_common_letters`.

    Returns
    -------
    list of str
        Every permutation of the excluded letters that appears in
        `five_letter_words`, in permutation order. Empty when there is none.
    """
    # Forward the caller's letter pool; it used to be accepted but ignored.
    excl_letters = get_excl_letters(word, most_common_letters)
    # A set gives constant-time membership tests instead of scanning the whole
    # word list once per permutation.
    known_words = set(five_letter_words)
    return [
        perm for perm in generate_permutations(excl_letters) if perm in known_words
    ]

In [39]:
def get_highest_scored_paired_word(paired_words, encoding_func=freq_encode):
    """Return the candidate word with the highest score under `encoding_func`.

    Parameters
    ----------
    paired_words : sequence of str
        Candidate words to choose from.
    encoding_func : callable
        Maps a word to a comparable score; defaults to `freq_encode`.

    Returns
    -------
    str or None
        None when `paired_words` is empty. A lone candidate is returned
        without being scored. Otherwise the highest-scoring candidate is
        returned; on ties the earliest one wins.
    """
    if len(paired_words) == 0:
        return None
    if len(paired_words) == 1:
        # A single candidate wins by default, so it is not scored.
        return paired_words[0]
    # max() keeps the first maximal element. Unlike a running maximum seeded
    # at 0, it still picks a winner when every score is zero or negative.
    return max(paired_words, key=encoding_func)

In [50]:
get_excl_letters('aegis')

['o', 'r', 'l', 't', 'n', 'd']

In [49]:
get_paired_words('aegis')

['adorn', 'radon']

In [213]:
get_highest_scored_paired_word(get_paired_words('tiles'))

'radon'

In [219]:
def get_complement(word):
    """Return the best-scoring word built from the common letters not in `word`.

    Candidates come from `get_paired_words(word)` and the winner is picked by
    `get_highest_scored_paired_word`; the result is None when there are no
    candidates.
    """
    return get_highest_scored_paired_word(get_paired_words(word))

In [221]:
# Find the best-scoring complement for every word; the entry is missing (None)
# where no dictionary word can be built from the remaining letters.
word_df['complement'] = word_df['word'].map(get_complement)

In [223]:
word_df['complement'].isnull().sum()

124

In [229]:
# Drop rows with any missing value (words without a complement) and copy,
# so that adding columns below does not touch word_df.
complements = word_df.dropna().copy()

In [231]:
complements.head(3)

Unnamed: 0,word,encoding,complement
1,adits,0.727991,loner
2,adore,0.414221,lints
3,adorn,0.341986,tiles


In [232]:
# Score each complement with the same positional-frequency encoding as the word itself.
complements['complement_encoding'] = complements['complement'].map(freq_encode)

In [234]:
# Combine the scores of each word/complement pair in two ways: the sum and
# the product.
complements['encoding_sum'] = complements['encoding'].add(
    complements['complement_encoding']
)
complements['encoding_product'] = complements['encoding'].mul(
    complements['complement_encoding']
)

In [238]:
complements.sort_values(by='encoding_sum', ascending=False).head(15)

Unnamed: 0,word,encoding,complement,complement_encoding,encoding_sum,encoding_product
134,laris,0.892777,toned,0.825056,1.717833,0.736591
399,toned,0.825056,laris,0.892777,1.717833,0.736591
240,ranis,0.890519,toled,0.825056,1.715576,0.734729
395,toled,0.825056,ranis,0.890519,1.715576,0.734729
396,toles,1.123025,ranid,0.592551,1.715576,0.665449
239,ranid,0.592551,toles,1.123025,1.715576,0.665449
163,liras,0.889391,toned,0.825056,1.714447,0.733797
157,lined,0.738149,toras,0.976298,1.714447,0.720653
402,toras,0.976298,lined,0.738149,1.714447,0.720653
234,rails,0.880361,toned,0.825056,1.705418,0.726348


In [239]:
complements.sort_values(by='encoding_product', ascending=False).head(15)

Unnamed: 0,word,encoding,complement,complement_encoding,encoding_sum,encoding_product
134,laris,0.892777,toned,0.825056,1.717833,0.736591
399,toned,0.825056,laris,0.892777,1.717833,0.736591
395,toled,0.825056,ranis,0.890519,1.715576,0.734729
240,ranis,0.890519,toled,0.825056,1.715576,0.734729
163,liras,0.889391,toned,0.825056,1.714447,0.733797
234,rails,0.880361,toned,0.825056,1.705418,0.726348
402,toras,0.976298,lined,0.738149,1.714447,0.720653
157,lined,0.738149,toras,0.976298,1.714447,0.720653
359,tared,0.831828,loins,0.865688,1.697517,0.720104
172,loins,0.865688,tared,0.831828,1.697517,0.720104


In [182]:
get_highest_scored_paired_word(get_paired_words('tiles'))

'radon'

In [180]:
get_paired_words('tiles')

['adorn', 'radon']