In [71]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from thefuzz import fuzz
from tqdm import tqdm
import pandas as pd
import time

import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import brown

nltk.download('brown')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

pd.options.mode.chained_assignment = None  # default='warn'

[nltk_data] Downloading package brown to /Users/eharrison/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eharrison/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/eharrison/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/eharrison/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
nytcrossword_raw = pd.read_csv("nytcrosswords.csv", encoding="latin-1")
nytcrossword_raw

Unnamed: 0,Date,Word,Clue
0,10/31/2021,PAT,"Action done while saying ""Good dog"""
1,10/31/2021,RASCALS,Mischief-makers
2,10/31/2021,PEN,It might click for a writer
3,10/31/2021,SEP,Fall mo.
4,10/31/2021,ECO,Kind to Mother Nature
...,...,...,...
781568,11/21/1993,NAT,Actor Pendleton
781569,11/21/1993,SHRED,Bit
781570,11/21/1993,NEA,Teachers' org.
781571,11/21/1993,BEG,Petition


In [4]:
# Remove null values in data frame
nytcrossword = nytcrossword_raw.dropna()
# Add new column giving info on the length of the answer
nytcrossword["Word_Length"] = nytcrossword.apply(lambda x: len(x["Word"]), axis=1)
# Add new column with info on what day of the week the clue was given
nytcrossword["Date"] = pd.to_datetime(nytcrossword["Date"])
nytcrossword["Day_of_Week"] = nytcrossword["Date"].dt.day_name()

nytcrossword

Unnamed: 0,Date,Word,Clue,Word_Length,Day_of_Week
0,2021-10-31,PAT,"Action done while saying ""Good dog""",3,Sunday
1,2021-10-31,RASCALS,Mischief-makers,7,Sunday
2,2021-10-31,PEN,It might click for a writer,3,Sunday
3,2021-10-31,SEP,Fall mo.,3,Sunday
4,2021-10-31,ECO,Kind to Mother Nature,3,Sunday
...,...,...,...,...,...
781568,1993-11-21,NAT,Actor Pendleton,3,Sunday
781569,1993-11-21,SHRED,Bit,5,Sunday
781570,1993-11-21,NEA,Teachers' org.,3,Sunday
781571,1993-11-21,BEG,Petition,3,Sunday


In [5]:
# Reduce clues to just the key words. Removing "stopwords" that add little to the clue
# TODO: Find more stopwords (or keep certain words) that are more useful for crosswords than speech
stopword = stopwords.words("english")
nytcrossword["Condensed_Clue"] = nytcrossword.apply(lambda row: ' '.join([i for i in row["Clue"].split() 
                                                                        if i not in stopword]), axis=1)
nytcrossword

Unnamed: 0,Date,Word,Clue,Word_Length,Day_of_Week,Condensed_Clue
0,2021-10-31,PAT,"Action done while saying ""Good dog""",3,Sunday,"Action done saying ""Good dog"""
1,2021-10-31,RASCALS,Mischief-makers,7,Sunday,Mischief-makers
2,2021-10-31,PEN,It might click for a writer,3,Sunday,It might click writer
3,2021-10-31,SEP,Fall mo.,3,Sunday,Fall mo.
4,2021-10-31,ECO,Kind to Mother Nature,3,Sunday,Kind Mother Nature
...,...,...,...,...,...,...
781568,1993-11-21,NAT,Actor Pendleton,3,Sunday,Actor Pendleton
781569,1993-11-21,SHRED,Bit,5,Sunday,Bit
781570,1993-11-21,NEA,Teachers' org.,3,Sunday,Teachers' org.
781571,1993-11-21,BEG,Petition,3,Sunday,Petition


In [28]:
# Add information about the clue's tense. This can be very important as 
# the tense of the clue usually matches the tense of the answer
def get_common_suffixes():
    suffix_fdist = nltk.FreqDist()
    for word in tqdm(brown.words()):
        word = str(word).lower()
        # Take a look at the last three characters in the word
        suffix_fdist[word[-1:]] += 1
        suffix_fdist[word[-2:]] += 1
        suffix_fdist[word[-3:]] += 1
    common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
    return common_suffixes

common_suffixes = get_common_suffixes()
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

def train_tense_model():
    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n,g) in tqdm(tagged_words)]

    test_size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[test_size:], featuresets[:test_size]
    print("Training dataset...")
    start = time.time()
    classifier = nltk.DecisionTreeClassifier.train(train_set)
    end = time.time()
    print("Completed training! Time:", (end - start) / 1000, "s")

    return classifier, test_set

tense_classifier, test_set = train_tense_model()
nltk.classify.accuracy(tense_classifier, test_set)

100%|██████████| 1161192/1161192 [00:02<00:00, 512963.31it/s]


In [61]:
# tokenize the clue to get the individual words
def pos_tag(text):
    tagged_text = []
    for word in text:
        tagged_text += [[word, tense_classifier.classify(pos_features(word))]]
    return tagged_text

def determine_tense(clue):
    text = word_tokenize(clue)
    tagged = pos_tag(text)

    tense = {}
    tense['future'] = [word for word in tagged if word[1] == "MD"]
    tense['past'] = [word for word in tagged if word[1] in ["VBP", "VBZ", "VBG"]]
    tense['present'] = [word for word in tagged if word[1] in ["VBD", "VBN"]]

    return max(tense.keys(), key=lambda x: len(tense[x]))

nytcrossword["Tense"] = nytcrossword.apply(lambda row: determine_tense(row["Clue"]), axis=1)

In [64]:
nytcrossword[nytcrossword["Tense"] == "past"]

Unnamed: 0,Date,Word,Clue,Word_Length,Day_of_Week,Condensed_Clue,Tense
0,2021-10-31,PAT,"Action done while saying ""Good dog""",3,Sunday,"Action done saying ""Good dog""",past
6,2021-10-31,WAGE,Living ___,4,Sunday,Living ___,past
36,2021-10-31,NSA,Code-cracking grp.,3,Sunday,Code-cracking grp.,past
105,2021-10-31,LATE,Running behind,4,Sunday,Running behind,past
119,2021-10-31,ICEBLUE,Piercing eye hue,7,Sunday,Piercing eye hue,past
...,...,...,...,...,...,...,...
781505,1993-11-21,SQUEEZE,Run-scoring bunt,7,Sunday,Run-scoring bunt,past
781544,1993-11-21,SHRINKINGVIOLETS,Self-effacing people,16,Sunday,Self-effacing people,past
781551,1993-11-21,RAN,"Kurosawa's ""King Lear""",3,Sunday,"Kurosawa's ""King Lear""",past
781560,1993-11-21,ASPENS,Fluttering trees,6,Sunday,Fluttering trees,past


In [74]:
# Fuzzy search for the clue! Use the % matching as a new column
phrase = input("Enter your clue:")
nytcrossword["Percent_Matching"] = nytcrossword.apply(lambda x: fuzz.ratio(x["Clue"], phrase), axis=1)

percent_error = 0
closest_match = max(nytcrossword["Percent_Matching"])
nytcrossword[nytcrossword["Percent_Matching"] >= closest_match - percent_error]

Unnamed: 0,Date,Word,Clue,Word_Length,Day_of_Week,Condensed_Clue,Tense,Percent_Matching
16256,2021-04-03,THICKSET,Squat,8,Saturday,Squat,future,100
72075,2019-04-06,NADA,Squat,4,Saturday,Squat,future,100
230844,2013-07-26,NADA,Squat,4,Friday,Squat,future,100
232363,2013-07-06,NADA,Squat,4,Saturday,Squat,future,100
307801,2010-10-23,NONE,Squat,4,Saturday,Squat,future,100
331746,2009-12-13,NADA,Squat,4,Sunday,Squat,future,100
