In [1]:
import pandas as pd
import numpy as np

In [6]:
# Read the per-language letter frequency table and preview the first rows.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index
# that was written into the CSV; index_col=0 would absorb it - confirm first.
letter_frequencies = pd.read_csv(filepath_or_buffer='letter frequencies.csv')
letter_frequencies.head()

Unnamed: 0,Letter,English,French,German,Spanish,Portuguese,Italian,Turkish,Swedish,Polish,Dutch,Danish,Icelandic,Finnish,Czech,Hungarian
0,a,8.17%,7.64%,6.52%,11.53%,14.63%,11.75%,11.92%,9.38%,8.97%,7.49%,6.03%,10.11%,12.22%,8.42%,8.89%
1,b,1.49%,0.90%,1.89%,2.22%,1.04%,0.93%,2.84%,1.54%,1.48%,1.58%,2.00%,1.04%,0.28%,0.82%,1.94%
2,c,2.78%,3.26%,2.73%,4.02%,3.88%,4.50%,0.96%,1.49%,3.99%,1.24%,0.57%,0,0.28%,0.74%,0.65%
3,d,4.25%,3.67%,5.08%,5.01%,4.99%,3.74%,4.71%,4.70%,3.29%,5.93%,5.86%,1.58%,1.04%,3.48%,1.92%
4,e,12.70%,14.72%,16.40%,12.18%,12.57%,11.79%,8.91%,10.15%,7.92%,18.91%,15.45%,6.42%,7.97%,7.56%,11.60%


In [8]:
# Remove the "%" symbol from every string cell so the values can be parsed.
letter_frequencies = letter_frequencies.replace('%', '', regex=True)


def _to_numeric_or_unchanged(column):
    """Return `column` parsed as numbers, or untouched if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Non-numeric columns (e.g. the letters themselves) are kept as they are.
        return column


# Convert the table to numeric values. `errors='ignore'` is deprecated in
# pandas >= 2.2, so the parse failure is caught explicitly instead; the result
# is the same: columns that parse become numeric, the rest stay unchanged.
letter_frequencies = letter_frequencies.apply(_to_numeric_or_unchanged)
letter_frequencies.head()

Unnamed: 0,Letter,English,French,German,Spanish,Portuguese,Italian,Turkish,Swedish,Polish,Dutch,Danish,Icelandic,Finnish,Czech,Hungarian
0,a,8.17,7.64,6.52,11.53,14.63,11.75,11.92,9.38,8.97,7.49,6.03,10.11,12.22,8.42,8.89
1,b,1.49,0.9,1.89,2.22,1.04,0.93,2.84,1.54,1.48,1.58,2.0,1.04,0.28,0.82,1.94
2,c,2.78,3.26,2.73,4.02,3.88,4.5,0.96,1.49,3.99,1.24,0.57,0.0,0.28,0.74,0.65
3,d,4.25,3.67,5.08,5.01,4.99,3.74,4.71,4.7,3.29,5.93,5.86,1.58,1.04,3.48,1.92
4,e,12.7,14.72,16.4,12.18,12.57,11.79,8.91,10.15,7.92,18.91,15.45,6.42,7.97,7.56,11.6


In [9]:
# Order the rows from the highest to the lowest value in the 'English' column,
# then preview the top ten.
letter_frequencies = letter_frequencies.sort_values('English', ascending=False)
letter_frequencies.head(n=10)

Unnamed: 0,Letter,English,French,German,Spanish,Portuguese,Italian,Turkish,Swedish,Polish,Dutch,Danish,Icelandic,Finnish,Czech,Hungarian
4,e,12.7,14.72,16.4,12.18,12.57,11.79,8.91,10.15,7.92,18.91,15.45,6.42,7.97,7.56,11.6
19,t,9.06,7.24,6.15,4.63,4.34,5.62,3.31,7.69,3.97,6.79,6.86,4.95,8.75,5.73,6.96
0,a,8.17,7.64,6.52,11.53,14.63,11.75,11.92,9.38,8.97,7.49,6.03,10.11,12.22,8.42,8.89
14,o,7.51,5.8,2.59,8.68,9.74,9.83,2.48,4.48,7.59,6.06,4.64,2.17,5.61,6.7,3.65
8,i,6.97,7.53,6.55,6.25,6.19,10.14,8.6,5.82,8.29,6.5,6.0,7.58,10.82,6.07,4.25
13,n,6.75,7.1,9.78,6.71,4.45,6.88,7.49,8.54,5.6,10.03,7.24,7.71,8.83,6.47,6.82
18,s,6.33,7.95,7.27,7.98,6.81,4.98,3.01,6.59,4.26,3.73,5.81,5.63,7.86,5.21,6.99
7,h,6.09,0.94,4.58,1.97,1.28,0.14,1.21,2.09,1.07,2.38,1.62,1.87,1.85,1.36,1.26
17,r,5.99,6.69,7.0,6.87,6.53,6.37,6.72,8.43,4.57,6.41,8.96,8.58,2.87,4.8,2.65
3,d,4.25,3.67,5.08,5.01,4.99,3.74,4.71,4.7,3.29,5.93,5.86,1.58,1.04,3.48,1.92


In [31]:
import json
from collections import Counter

# Load the words from the JSON file. The encoding is given explicitly so the
# file is decoded as UTF-8 regardless of the platform's locale default.
with open('words.json', encoding='utf-8') as file:
    words = json.load(file)

# Default filters used by letter_frequency: only words with exactly this many
# characters ...
WORD_LENGTH = 8
# ... that start with this prefix are counted.
PREFIX = "a"


In [34]:

def letter_frequency(words, length=WORD_LENGTH, prefix=PREFIX):
    """Return each letter's percentage share among the matching words.

    A word matches when it has exactly ``length`` characters and, unless
    ``prefix`` is None, starts with ``prefix``.

    Returns:
        A dict mapping letter -> percentage of all letters in the matching
        words, ordered from the most to the least frequent letter. The dict
        is empty when no word matches.
    """

    def matches(candidate):
        # Length is checked first; the prefix test is skipped when prefix is None.
        if len(candidate) != length:
            return False
        return prefix is None or candidate.startswith(prefix)

    # Concatenate the matching words and tally every character.
    letters = "".join(candidate for candidate in words if matches(candidate))
    counts = Counter(letters)

    # Denominator for the percentage conversion.
    total = sum(counts.values())

    # Express each tally as a percentage of all counted letters.
    shares = [(letter, (count / total) * 100) for letter, count in counts.items()]

    # Highest share first; the sort is stable, so ties keep first-seen order.
    shares.sort(key=lambda pair: pair[1], reverse=True)

    return dict(shares)


def normalize_frequencies(frequencies):
    """Min-max scale letter frequencies onto a 0-100 score.

    Args:
        frequencies: Mapping of letter -> numeric frequency.

    Returns:
        A new dict with the same keys in the same order. The largest input
        maps to 100.0, the smallest to 0.0, and values in between are scaled
        linearly and rounded to two decimals.

        Degenerate inputs are handled instead of raising:
        * an empty mapping returns an empty dict;
        * when every value is identical (including a single entry) there is
          no range to scale over, so every letter is tied for the top and
          scores 100.0.
    """
    # Nothing to scale; max()/min() would raise ValueError on an empty mapping.
    if not frequencies:
        return {}

    max_freq = max(frequencies.values())
    min_freq = min(frequencies.values())
    spread = max_freq - min_freq

    # A zero spread would divide by zero below.
    if spread == 0:
        return {letter: 100.0 for letter in frequencies}

    # Normalize the frequencies
    return {
        letter: round(((value - min_freq) / spread) * 100, 2)
        for letter, value in frequencies.items()
    }


# Percentage share of each letter among the words selected by the defaults.
freq = letter_frequency(words)
print(freq)

# The same letters rescaled by normalize_frequencies.
letter_scores = normalize_frequencies(freq)
print(letter_scores)

{'e': 10.472209188243825, 'i': 9.538028409333277, 'a': 8.198609392995778, 'n': 7.639807191912299, 's': 7.499040225227146, 'r': 7.494774559570021, 't': 7.4691805656272665, 'o': 6.09563622403276, 'l': 5.848227615919464, 'c': 4.40216695815382, 'u': 3.5788934863285418, 'd': 3.5618308237000385, 'm': 3.2888282216439877, 'g': 2.755620014503263, 'p': 2.4954144094185895, 'h': 2.0432538497632557, 'b': 1.9195495457066076, 'y': 1.7574542507358273, 'f': 0.9256494475962974, 'v': 0.8190078061681526, 'k': 0.5886618606833596, 'z': 0.48628588491234054, 'x': 0.3839099091413215, 'w': 0.33272192125581196, 'q': 0.2346116111419187, 'j': 0.17062662628503178}
{'e': 100.0, 'i': 90.93, 'a': 77.93, 'n': 72.51, 's': 71.14, 'r': 71.1, 't': 70.85, 'o': 57.52, 'l': 55.11, 'c': 41.08, 'u': 33.08, 'd': 32.92, 'm': 30.27, 'g': 25.09, 'p': 22.57, 'h': 18.18, 'b': 16.98, 'y': 15.4, 'f': 7.33, 'v': 6.29, 'k': 4.06, 'z': 3.06, 'x': 2.07, 'w': 1.57, 'q': 0.62, 'j': 0.0}
