# Load 5-letter words and count unique characters

- - -

Using Owen Yin's Wordle list from [here](https://medium.com/@owenyin/here-lies-wordle-2021-2027-full-answer-list-52017ee99e86).

In [363]:
import pandas as pd

# using Owen Yin's Wordle list from here: 
# https://medium.com/@owenyin/here-lies-wordle-2021-2027-full-answer-list-52017ee99e86
df = pd.read_csv('nyt.txt', header=None, names=['word'])

# Filter out rows where words are not exactly 5 characters long. This should not
# be necessary as the word list should already be limited to words that are only
# 5 characters long.
df = df[df['word'].str.len()==5]

# calculate the number of unique characters in each word
df['unique'] = df['word'].apply(lambda x: pd.Series(list(x)).unique().size)

print(F"number of words = {df.size}")

number of words = 4630


# Analyze characters of each word

- - -

Create dataframe that splits each word into its letters

In [364]:
# split each word into 5 columns in a new dataframe in order to analyze
# each character position separately
df_split = df['word'].apply(lambda x: pd.Series(list(x)))
df_split.columns = ['char1','char2','char3','char4','char5']
df_split.head()

Unnamed: 0,char1,char2,char3,char4,char5
0,C,I,G,A,R
1,R,E,B,U,T
2,S,I,S,S,Y
3,H,U,M,P,H
4,A,W,A,K,E


### Show most common letters for each position

In [365]:
df_common = pd.DataFrame()

# go column by column and retrieve a list of the most common characters
# using value_counts(), which automatically sorts in descending order
for colName, colVal in df_split.items():
    popular = colVal.value_counts().head().index.tolist()
    df_common[colName] = popular

# rename the dataframe indices for readability
idx = {0:'first',1:'second',2:'third',3:'fourth',4:'fifth'}
df_common.rename(index=idx,inplace=True)
df_common.head()

Unnamed: 0,char1,char2,char3,char4,char5
first,S,A,A,E,E
second,C,O,I,N,Y
third,B,R,O,S,T
fourth,T,E,E,A,R
fifth,P,I,U,L,L


### Show most popular letters overall

In [366]:
popularity = pd.Series([],dtype=int)
for col in list(df_split):
    popularity = popularity.add(df_split[col].value_counts(), fill_value=0)

popularity.sort_values(ascending=False).head()

E    1233.0
A     979.0
R     899.0
O     754.0
T     729.0
dtype: float64

# Calculate scores

- - -

### Calculate scores based on character position

In [367]:
"""
Calculate a score based on the rank of each character position. For the given
WORD, award POINTS if its characters match the corresponding location in LETTERS.
"""
def calc_positional_score(word, letters, points):
    total = 0
    for idx, val in enumerate(list(word)):
        if val == letters[idx]:
            total += points
    return total


df['positional score'] = 0

# Step through the first 3 rows of the most popular letters for each character position
# and award points for words whose characters match those popular characters.
for idy in range(3):
    letters = df_common.iloc[idy].tolist()
    df['positional score'] += df['word'].apply(
        lambda x: calc_positional_score(x, letters, 5-idy)
    )

df.sort_values(by=['positional score'], ascending=False).head(10)


Unnamed: 0,word,unique,positional score
1729,CRANE,5,21
481,SAINT,5,21
261,BRINE,5,19
140,COAST,5,19
2243,CRONE,5,19
2159,CRONY,5,18
2053,CAGEY,5,18
2270,SHINE,5,18
587,SWINE,5,18
1934,BOAST,5,18


### Calculate scores based on character popularity

In [368]:
df['popularity score'] = df['word'].apply(lambda word: sum(popularity[y] for y in word))
df.sort_values(by=['popularity score'], ascending=False).head(10)

Unnamed: 0,word,unique,positional score,popularity score
620,EERIE,3,5,5269.0
737,EATER,4,10,5073.0
1522,ERASE,4,16,5013.0
1857,RARER,3,10,4909.0
1565,ELATE,4,10,4893.0
271,TEASE,4,13,4843.0
1420,EASEL,4,10,4833.0
1603,LEASE,4,13,4833.0
2103,TEPEE,3,10,4795.0
1284,ESTER,4,5,4763.0


In [369]:
df[df['unique']==5].sort_values(by=['popularity score'], ascending=False).head(10)

Unnamed: 0,word,unique,positional score,popularity score
873,LATER,5,10,4559.0
1563,ALERT,5,3,4559.0
1508,ALTER,5,5,4559.0
1252,AROSE,5,14,4534.0
872,IRATE,5,13,4511.0
915,STARE,5,15,4509.0
1668,RAISE,5,17,4451.0
560,ARISE,5,15,4451.0
1278,LEARN,5,5,4405.0
1346,RENAL,5,0,4405.0


# Words with the most unique vowels or consonants

- - -

In [370]:
# returns a point for each unique vowel in a word
def score_vowels(val):
    total = 0
    for letter in 'AEIOUY':
        if letter in val:
            total += 1
    return total

df['vowels'] = df['word'].apply(lambda x: score_vowels(x))
df.sort_values(by=['vowels','popularity score','positional score'], ascending=False).head(10)


Unnamed: 0,word,unique,positional score,popularity score,vowels
1589,AUDIO,5,0,3264.0,4
352,BAYOU,5,8,2906.0,4
1252,AROSE,5,14,4534.0,3
872,IRATE,5,13,4511.0,3
1668,RAISE,5,17,4451.0,3
560,ARISE,5,15,4451.0,3
1658,AISLE,5,5,4271.0,3
369,ATONE,5,12,4270.0,3
1808,TEARY,5,9,4265.0,3
112,ALONE,5,12,4260.0,3


In [371]:
# returns a point for each consonant in the word
def score_consonants(val):
    total = 0
    for letter in val:
        if letter in 'BCDFGHJKLMNPQRSTVWXZ':
            total += 1
    return total

df['consonants'] = df['word'].apply(lambda x: score_consonants(x))
df.sort_values(by=['consonants','popularity score','positional score'], ascending=False).head(10)

Unnamed: 0,word,unique,positional score,popularity score,vowels,consonants
1297,STERN,5,5,4105.0,1,4
1128,CREST,5,13,4007.0,1,4
84,START,4,13,4005.0,1,4
736,CRESS,4,10,3947.0,1,4
1798,DRESS,4,6,3863.0,1,4
533,SNARL,5,10,3841.0,1,4
2235,PRESS,4,6,3837.0,1,4
1859,TREND,5,7,3829.0,1,4
194,TROLL,4,6,3820.0,1,4
1940,STALL,4,10,3815.0,1,4


# Searching for patterns

- - -

In [373]:
pattern = 'EAROT'

df['pattern'] = df['word'].apply(lambda word: sum(1 for letter in pattern if letter in word))
df.sort_values(['pattern','popularity score','positional score'], ascending=False).head()

Unnamed: 0,word,unique,positional score,popularity score,vowels,consonants,pattern
737,EATER,4,10,5073.0,2,2,4
1782,TERRA,4,0,4739.0,2,3,4
443,TREAT,4,6,4569.0,2,3,4
873,LATER,5,10,4559.0,2,3,4
1508,ALTER,5,5,4559.0,2,3,4
