# Load 5-letter words and count unique characters

- - -

The word list is Owen Yin's full Wordle answer list, available [here](https://medium.com/@owenyin/here-lies-wordle-2021-2027-full-answer-list-52017ee99e86).

In [None]:
import string
import pandas as pd

# Word list: Owen Yin's Wordle list, from
# https://medium.com/@owenyin/here-lies-wordle-2021-2027-full-answer-list-52017ee99e86
df = pd.read_csv('nyt.txt', header=None, names=['word'])

# Keep only rows whose word is exactly 5 characters long. This should be a
# no-op because the list is expected to contain only 5-letter words. The
# explicit copy gives an independent frame, so adding columns below does not
# trigger pandas' SettingWithCopyWarning.
df = df[df['word'].str.len() == 5].copy()

# number of distinct characters in each word
df['unique'] = df['word'].apply(lambda word: len(set(word)))

# len(df) is the row count; df.size would count every cell (rows x columns)
# and therefore over-report the number of words once 'unique' is added.
print(f"number of words = {len(df)}")

# Analyze characters of each word

- - -

Create a DataFrame that splits each word into its individual letters, one column per letter position.

In [None]:
# Break every word into its letters, one column per character position, so
# that each position can be analysed on its own. Rows keep the index of df.
df_split = pd.DataFrame(
    df['word'].map(list).tolist(),
    index=df.index,
    columns=['char1', 'char2', 'char3', 'char4', 'char5'],
)
df_split.head()

### Show most common letters for each position

In [None]:
# For every character position keep the most frequent letters. value_counts()
# sorts by descending frequency, so head() yields the top five in rank order.
df_common = pd.DataFrame({
    position: letters.value_counts().head().index.tolist()
    for position, letters in df_split.items()
})

# label the rows by rank for readability
idx = dict(enumerate(['first', 'second', 'third', 'fourth', 'fifth']))
df_common = df_common.rename(index=idx)
df_common.head()

### Show most popular letters overall

In [None]:
# Tally how often each letter occurs across all character positions. The
# running total starts empty; fill_value=0 treats a letter missing on one side
# of the addition as zero instead of producing NaN.
popularity = pd.Series([], dtype=int)
for position in df_split.columns:
    position_counts = df_split[position].value_counts()
    popularity = popularity.add(position_counts, fill_value=0)

popularity.sort_values(ascending=False).head()

# Calculate scores

- - -

### Calculate scores based on character position

In [None]:
"""
Calculate a score based on the rank of each character position. For the given
WORD, award POINTS if its characters match the corresponding location in LETTERS.
"""
def calc_positional_score(word, letters, points):
    total = 0
    for idx, val in enumerate(list(word)):
        if val == letters[idx]:
            total += points
    return total


# every word starts without positional points
df['positional score'] = 0

# Use the three highest-ranked rows of df_common: a letter matching the top
# row at its position earns 5 points, the second row 4 and the third row 3.
for rank, award in enumerate((5, 4, 3)):
    ranked_letters = df_common.iloc[rank].tolist()
    df['positional score'] = df['positional score'] + df['word'].map(
        lambda w, targets=ranked_letters, pts=award: calc_positional_score(w, targets, pts)
    )

df.sort_values(by=['positional score'], ascending=False).head(10)


### Calculate scores based on character popularity

In [None]:
# A word's popularity score is the sum of the overall counts of its letters;
# a repeated letter contributes once per occurrence.
df['popularity score'] = [
    sum(popularity[letter] for letter in word) for word in df['word']
]
df.sort_values(by=['popularity score'], ascending=False).head(10)

In [None]:
# keep only the words made of five distinct letters, i.e. drop any word that
# repeats a letter, then rank what is left by popularity score
df.loc[df['unique'].eq(5)].sort_values(
    by=['popularity score'],
    ascending=False,
).head(10)

### Calculate scores based on a hybrid approach

In [None]:
# Build a table of letter counts: one row per letter, one column per character
# position. The first frame has no columns and only carries the alphabet as
# its index, so every uppercase letter gets a row in the outer-joined result.
df_hybrid = pd.concat(
    [pd.DataFrame(index=list(string.ascii_uppercase))]
    + [
        df_split[position].value_counts().to_frame()
        for position in ['char1', 'char2', 'char3', 'char4', 'char5']
    ],
    axis=1,
)

# Letters that never occur at a position come out of the concat as NaN, which
# also forces the column to float; fill with zero and convert back to int.
df_hybrid = df_hybrid.fillna(0).astype(int)
df_hybrid.columns = ['char1', 'char2', 'char3', 'char4', 'char5']
df_hybrid


In [None]:
def calc_hybrid_score(word):
    """Return the hybrid score of WORD using the counts stored in df_hybrid.

    Each letter contributes its count in the column for the position it
    occupies in WORD, plus its count in each of the five position columns.
    """
    position_columns = ['char1', 'char2', 'char3', 'char4', 'char5']
    score = 0
    for position, letter in enumerate(word):
        # count of this letter at the position it occupies in the word
        in_place = df_hybrid.iloc[:, position][letter]
        # count of this letter summed over every position column
        anywhere = sum(df_hybrid[name][letter] for name in position_columns)
        score += in_place + anywhere
    return score

# Score every word, then show the best words with five distinct letters.
# Ties on the hybrid score are ordered by popularity, then positional score.
df['hybrid score'] = df['word'].map(calc_hybrid_score)
df.loc[df['unique'] == 5].sort_values(
    by=['hybrid score', 'popularity score', 'positional score'],
    ascending=False,
).head(10)

# Words with the most unique vowels or consonants

- - -

In [None]:
def score_vowels(val):
    """Return how many of the letters A, E, I, O, U and Y occur in VAL.

    Each of the six letters counts at most once, however often it appears.
    """
    return sum(1 for vowel in 'AEIOUY' if vowel in val)

# count the distinct vowels of every word and list the words with the most,
# breaking ties by popularity score and then positional score
df['vowels'] = df['word'].map(score_vowels)
df.sort_values(
    by=['vowels', 'popularity score', 'positional score'],
    ascending=False,
).head(10)


In [None]:
def score_consonants(val):
    """Count the consonant characters in VAL.

    Every occurrence counts, so a repeated consonant is scored each time it
    appears. Y is not in the consonant list.
    """
    consonants = 'BCDFGHJKLMNPQRSTVWXZ'
    return sum(1 for letter in val if letter in consonants)

# count the consonants of every word and list the words with the most,
# breaking ties by popularity score and then positional score
df['consonants'] = df['word'].map(score_consonants)
df.sort_values(
    by=['consonants', 'popularity score', 'positional score'],
    ascending=False,
).head(10)

# Searching for patterns

- - -

In [None]:
# letters to search for; a word earns one point for each pattern letter that
# it contains
pattern = 'EAROT'

df['pattern'] = [
    len([letter for letter in pattern if letter in word])
    for word in df['word']
]
df.sort_values(
    ['pattern', 'popularity score', 'positional score'],
    ascending=False,
).head()