# Initial Data Exploration

Data taken from Project Gutenberg.

## Load Data

In [None]:
!ls ..

In [None]:
# Load the whole dictionary text into memory as a list of lines; each entry
# keeps its line terminator.
with open("../data/dictionary.txt", encoding="utf-8") as f:
    lines = list(f)

# Show how many lines were read.
len(lines)

## Kill header and footer

In [None]:
# Hard-coded indices into `lines` bounding the part of the file to keep.
# Presumably they mark where the Project Gutenberg header ends and the footer
# begins for this specific file — TODO confirm if the input file changes.
STARTLINE = 27  # index of the first line kept
ENDLINE = 973904  # index one past the last line kept (slice end)

In [None]:
# Keep lines STARTLINE .. ENDLINE-1 only (the slice end is exclusive).
raw_data = lines[STARTLINE:ENDLINE]

## Parsing Functions

In [None]:
import itertools
import collections

# we want to basically iterate through the lines in the following manner:
# read a word (uppercase line)
# read until you find a real line
# collect lines until blank line (this is a definition statement)
# read until line

def canonical_lines(raw_data):
    """Collapse raw dictionary lines into headwords and one-line paragraphs.

    Every input line is stripped of surrounding whitespace.  A line for which
    ``str.isupper()`` is true is treated as a headword and yielded on its
    own.  Consecutive other non-blank lines are joined with single spaces
    and yielded as one paragraph as soon as a blank line, a headword line,
    or the end of the input is reached.

    Args:
        raw_data: Iterable of text lines (trailing newlines are fine).

    Yields:
        str: Headword lines and joined paragraphs, in source order.
    """
    pending = []  # stripped lines of the paragraph being accumulated
    for row in raw_data:
        row = row.strip()
        if row and not row.isupper():
            pending.append(row)
            continue
        # A blank line or a headword ends the current paragraph.  Flushing
        # before a headword keeps the text that precedes it from being
        # emitted after (and so attributed to) the new entry.
        if pending:
            yield " ".join(pending)
            pending = []
        if row:
            yield row
    # Input that does not end with a blank line would otherwise lose its
    # final paragraph.
    if pending:
        yield " ".join(pending)

def _single_word_entries(headword, defs):
    """Yield ``(variant, defs)`` for each one-word variant in *headword*.

    *headword* may list several spellings separated by ``;``.  Variants made
    of more than one whitespace-separated token, and empty fragments, are
    skipped.
    """
    for variant in headword.split(";"):
        if len(variant.split()) == 1:
            # Hand out a copy so variants of one entry never share a list.
            yield variant.strip(), list(defs)


def get_pairs(data):
    """Pair each single-word headword with its short definitions.

    Args:
        data: Iterable of canonical lines.  A line for which
            ``str.isupper()`` is true starts a new entry; a line starting
            with ``defn:`` (case-insensitive) contributes definitions to the
            current entry.  All other lines are ignored.

    Yields:
        tuple[str, list[str]]: ``(headword, definitions)`` for every one-word
        variant of each entry, with the headword in its original case.  The
        list may be empty.  Only the text of a ``defn:`` line up to its
        first ``.`` is used; it is split on ``;``, fragments starting with
        ``"See "`` and empty fragments are dropped, and the rest are
        stripped and lowercased.
    """
    word = ""
    defs = []
    for row in data:
        if row.isupper():
            # A new headword closes the previous entry (nothing is emitted
            # while `word` is still the empty initial value).
            yield from _single_word_entries(word, defs)
            word = row
            defs = []
        elif row.lower().startswith("defn:"):
            first_sentence = row[5:].split(".")[0]
            for d in first_sentence.split(";"):
                d = d.strip()
                # Skip blanks (e.g. from a trailing ';') and cross-references.
                if d and not d.startswith("See "):
                    defs.append(d.lower())
    # The last entry has no following headword to trigger its emission.
    yield from _single_word_entries(word, defs)

def defns(raw_data, filter_func=lambda x: True):
    """Build a mapping from lowercased word to its list of definitions.

    The raw lines are run through ``canonical_lines`` and ``get_pairs``; the
    resulting ``(word, definitions)`` pairs are filtered with *filter_func*,
    and pairs with no definitions are ignored.  Definitions for words that
    differ only by case are merged into one list.

    Args:
        raw_data: Iterable of raw text lines.
        filter_func: Predicate called with each ``(word, definitions)`` pair;
            only pairs for which it returns a truthy value are kept.

    Returns:
        collections.defaultdict: ``{word: [definition, ...]}``; looking up a
        missing key creates an empty list for it.
    """
    index = collections.defaultdict(list)
    pairs = get_pairs(canonical_lines(raw_data))
    for headword, meanings in filter(filter_func, pairs):
        if not meanings:
            continue
        index[headword.lower()].extend(meanings)
    return index

## Parse the data into a dictionary of `{word: [def1, def2, ...]}`

In [None]:
# Build {word: [definitions]}.  The filter receives (word, definitions) pairs
# and keeps only those whose word is purely alphanumeric, which drops entries
# containing hyphens, apostrophes and similar characters.
words = defns(raw_data, lambda x: x[0].isalnum())

In [None]:
# Number of distinct (lowercased) words in the parsed dictionary.
len(words)

In [None]:
# Spot-check a single entry.  `words` is a defaultdict, so looking up a word
# that is absent returns (and inserts) an empty list instead of raising.
words['the']

## Set up a training dataset of (definition, word) pairs

In [None]:
# Flatten the dictionary into one (definition, word) tuple per definition.
data = [
    (meaning, headword)
    for headword, meanings in words.items()
    for meaning in meanings
]

In [None]:
# Total number of (definition, word) pairs.
len(data)

In [None]:
# Peek at the first (definition, word) pair.
data[0]