# Loading the corpus

In [49]:
import os

# Read the source corpus: one entry per line, with trailing whitespace
# (including the line terminator) stripped from every entry.
with open('corpuses/coed.txt', encoding='utf-8') as corpus_file:
    words = list(map(str.rstrip, corpus_file))

print(f'{len(words)} words loaded.')

75754 words loaded.


# Filtering the corpus 

In [50]:
# Reduce the raw corpus to playable words: lowercase ASCII letters only,
# a bounded length, and none of the entries listed in the exclusion files.
import string

MIN_WORD_LENGTH = 3  # Words shorter than this are dropped
MAX_WORD_LENGTH = 9  # Words longer than this are dropped


def _load_word_set(path):
    """Return the lines of the text file at `path` as a set.

    Trailing whitespace (including the line terminator) is stripped from
    every line. The file is read as UTF-8.
    """
    with open(path, encoding='utf-8') as f:
        return {line.rstrip() for line in f}


valid_letters = set(string.ascii_lowercase)

# Single pass over the corpus. Requiring every character to be a lowercase
# ASCII letter already rejects capitalised words, so no separate
# "first letter is lowercase" filter is needed.
words = {
    word
    for word in words
    if MIN_WORD_LENGTH <= len(word) <= MAX_WORD_LENGTH
    and valid_letters.issuperset(word)
}

# Word lists to subtract from the corpus (one word per line in each file).
adverbs_with_ly = _load_word_set('corpuses/coed_adverbs_with_ly.txt')
plurals = _load_word_set('corpuses/coed_plurals.txt')
tenses_and_participles = _load_word_set('corpuses/coed_tenses_and_participles.txt')
abbreviations = _load_word_set('corpuses/coed_abbreviations.txt')
profanities = _load_word_set('corpuses/google_profane_words.txt')

words = words.difference(
    adverbs_with_ly,
    plurals,
    tenses_and_participles,
    abbreviations,
    profanities,
)

# Back to a list, in alphabetical order, for the cells that follow.
words = sorted(words)

print(f"{len(words)} valid words")

32706 valid words


# Save as a new corpus 

In [51]:
# Persist the filtered corpus in alphabetical order, one word per line
# (no newline is written after the final word).
corpus_text = '\n'.join(sorted(words))
with open('corpuses/glypoon.txt', 'w+', encoding='utf-8') as corpus_file:
    corpus_file.write(corpus_text)

# Group words by length

In [8]:
import pandas 

# One row per corpus word, plus the number of characters in it.
words_df = pandas.DataFrame(words, columns=['word'])
# Vectorised string length on the column, instead of calling a Python
# lambda once per row through DataFrame.apply(axis=1).
words_df['length'] = words_df['word'].str.len()

words_df.head()

Unnamed: 0,word,length
0,aardvark,8
1,aardwolf,8
2,aargh,5
3,abaca,5
4,aback,5


In [10]:
# Collect the words of each length into a single list: one row per length.
df_group_by_length = (
    words_df
    .groupby(by='length')['word']
    .apply(list)
    .reset_index(name='words')
)

# Size of each list, computed element-wise on the 'words' column rather than
# with a row-wise DataFrame.apply(axis=1).
df_group_by_length['count'] = df_group_by_length['words'].map(len)

df_group_by_length

Unnamed: 0,length,words,count
0,3,"[ace, act, add, ado, aft, aga, age, ago, aha, ...",766
1,4,"[abed, abet, able, abut, acer, ache, achy, aci...",2627
2,5,"[aargh, abaca, aback, abaft, abase, abash, aba...",4233
3,6,"[abacus, abatis, abbacy, abbess, abdabs, abduc...",5896
4,7,"[abalone, abandon, abashed, abaxial, abdomen, ...",6585
5,8,"[aardvark, aardwolf, abattoir, abbatial, abdic...",6631
6,9,"[abandoned, abdominal, abhorrent, ablutions, a...",6039


# Select a random word of length K

In [47]:
import random 

K = 8  # Length of the word to draw

# Single .loc lookup (row mask + column label) instead of chained indexing.
words_of_length_k_rows = df_group_by_length.loc[df_group_by_length['length'] == K, 'words']
if words_of_length_k_rows.empty:
    # Fail with an explicit message rather than a bare IndexError when no
    # word of the requested length survived the filtering.
    raise ValueError(f'No words of length {K} in the corpus.')

words_with_length_k = words_of_length_k_rows.iloc[0]
chosen_word = random.choice(words_with_length_k)

print(chosen_word)

fanlight


# Find the answer words for each candidate center letter

In [48]:
import random
from collections import Counter

MIN_ANSWER_LENGTH = 4  # Minimum answer length
MIN_NUM_ANSWERS = 20  # Minimum number of answers
MAX_NUM_ANSWERS = 35  # Maximum number of answers

# Letter budget of the full word: an answer may use each letter at most as
# many times as it occurs in `chosen_word`. Built once, not once per word.
available_letters = Counter(chosen_word)

# Words long enough and spellable from the letter budget. This test does not
# depend on the center letter, so it is done a single time. Subtracting
# Counters keeps only positive counts, so an empty result means the word
# needs nothing beyond the budget.
candidate_answers = [
    word
    for word in words
    if len(word) >= MIN_ANSWER_LENGTH and not Counter(word) - available_letters
]

# For each possible letter to use as the 'center letter', keep the candidates
# that contain it. dict.fromkeys() visits every distinct letter once, in
# order of first appearance, so repeated letters are not scanned twice.
answers_by_keyword = {
    keyword: [word for word in candidate_answers if keyword in word]
    for keyword in dict.fromkeys(chosen_word)
}

# A center letter is usable when its number of answers is within bounds.
solutions = [
    (chosen_word, keyword, sorted(answers))
    for keyword, answers in answers_by_keyword.items()
    if MIN_NUM_ANSWERS <= len(answers) <= MAX_NUM_ANSWERS
]

print(f'{len(solutions)} possible solution(s) found.')

# Always (re)bind `solution`, so a value left over from an earlier run of
# this cell can never be picked up by later cells when nothing was found.
solution = random.choice(solutions) if solutions else None
if solution is not None:
    print(f'Full word: {solution[0]}')
    print(f'Center letter: {solution[1]}')
    print(f'{len(solution[2])} answers: {", ".join(solution[2])}')

5 possible solution(s) found.
Full word: fanlight
Center letter: f
24 answers: fail, fain, faint, faith, fang, fanlight, fatling, fiat, fight, filth, final, fitna, flag, flan, flat, flight, fling, flint, flit, gift, haft, half, lift, naif


# Export as a JSON file

In [347]:
import json
import random

OUTPUT_FILE_NAME = 'answers.json'

# Refuse to export when the search produced nothing; otherwise a `solution`
# left over from an earlier run could be written out by mistake.
if not solutions:
    raise RuntimeError('No solution available to export; choose another word and re-run the search.')

# `solution` is a (full word, center letter, sorted answers) tuple.
full_word, center_letter, solution_answers = solution

# Shuffle the letters of the full word, then move one occurrence of the
# center letter to the front of the list.
letters = list(full_word)
random.shuffle(letters)
letters.remove(center_letter)
letters.insert(0, center_letter)

json_solution = {
    'letters': letters,
    'answers': solution_answers
}

# Write-only mode and an explicit encoding, matching the other files
# written and read in this notebook.
with open(OUTPUT_FILE_NAME, 'w', encoding='utf-8') as solution_file:
    json.dump(json_solution, solution_file, indent=4)