In [2]:
# Install Dependencies
import sys

!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install spacy
!{sys.executable}  -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
# Load Raw Jeopardy! Questions and Answers
import json

# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's default text encoding.
with open('../assets/jeopardy.json', 'r', encoding='utf-8') as f:
    raw = json.load(f)

#### We want to determine which questions will be answered with a single PERSON. In this notebook, we will:
0. Divide corpus into training/validation and test sets.
1. Select all pairs from the training set that are a single PERSON according to the built-in spaCy model.
2. From these selections, apply high-precision labeling functions to yield positive training examples
3. Select all pairs from the training set that are not a single PERSON as negative training examples
4. Format and save the output of these functions as training data

##### Step 0. 64-16-20 training-validation-test split (20% held out for test, then 20% of the remainder for validation).

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the corpus as the test set; the fixed seed makes the split
# repeatable.
raw_train, raw_test = train_test_split(
    raw,
    test_size=0.2,
    random_state=1,
)
# Carve the validation set out of what remains: 20% of the remaining 80% is
# 16% of the full corpus, which leaves 64% for training.
raw_train, raw_val = train_test_split(
    raw_train,
    test_size=0.2,
    random_state=1,
)

##### Step 1. Select questions that have answers tagged by spaCy as "PERSON".

In [5]:
import spacy

# Load the en_core_web_sm spaCy pipeline; the entity labels it assigns to each
# answer are what the PERSON filter in this cell inspects.
nlp = spacy.load('en_core_web_sm')

def answer_filter_fn(tuple_):
    """Return True when the answer doc has exactly one entity and it is a PERSON.

    tuple_ is an (answer_doc, context) pair as produced by
    nlp.pipe(..., as_tuples=True); the context element is not inspected.
    """
    answer_doc, _context = tuple_
    entities = answer_doc.ents
    if len(entities) != 1:
        return False
    return entities[0].label_ == 'PERSON'

# Print first 10 examples
raw_train_answers = map(lambda item: item['answer'], raw_train)
# Each element is an (answer_doc, original_item) pair; keep only the pairs whose
# answer passes the single-PERSON filter.
person_pairs = filter(answer_filter_fn, nlp.pipe(zip(raw_train_answers, raw_train), as_tuples=True))
for shown, (_, context) in enumerate(person_pairs, start=1):
    print(f"Question: {context['question']}")
    print(f"Answer: {context['answer']}" + "\n")
    # Count from 1 and stop right after the 10th print. A zero-based
    # `index > 9` check placed after the prints would let an 11th example out.
    if shown >= 10:
        break

Question: 'On Jan. 20, 1977 he became the first veep to reside in the Admiral's House at the U.S. Naval Observatory'
Answer: Walter Mondale

Question: 'Before becoming attorney general of the U.S., she was state attorney for Dade County, Florida'
Answer: Janet Reno

Question: 'In a '77 film Diane Keaton was looking for him'
Answer: Mr. Goodbar

Question: 'He's known for one-man shows like "Mambo Mouth" & for film roles like Chi Chi in "To Wong Foo..."'
Answer: John Leguizamo

Question: 'After playing QB Paul Crewe in 1974's "The Longest Yard", he played coach Nate Scarborough in the 2005 remake'
Answer: Burt Reynolds

Question: 'A Brantford, Ontario native, this "Great One" has rewritten the NHL record books in his stellar career'
Answer: Wayne Gretzky

Question: 'Fittingly, Congressman Brad Sherman represents this L.A. suburb, home of the Galleria of "Valley Girl" fame'
Answer: Sherman Oaks

Question: 'He painted "Irises" & "Pink Roses" as well as "Sunflowers"'
Answer: Vincent Van Gog

##### Step 2. Select questions that have answers tagged by spaCy as "PERSON" and have a matched phrase.

In [7]:
import spacy
from spacy.tokens import Doc, Span
from spacy.matcher import Matcher

# Two separately loaded pipelines: custom components are added to the
# questions pipeline further down in this cell, while the answers pipeline is
# used as loaded.
nlp_questions = spacy.load('en_core_web_sm')
nlp_answers = spacy.load('en_core_web_sm')


# Add Personal Words Component
with open('../assets/personal_words.txt', 'r') as f:
    OCCUPATIONS = [line.strip() for line in f]
with open('../assets/others.txt', 'r') as f:
    OTHERS = [line.strip() for line in f]
PERSONALS = OCCUPATIONS + OTHERS

# One two-token pattern per word: "this" followed by the word.
# LOWER is compared against the lowercased token text, so an entry containing
# capital letters could never match as written -- lowercase it for the pattern.
# Blank lines in the word lists are skipped instead of becoming empty patterns.
matcher_personal = Matcher(nlp_questions.vocab)
matcher_personal.add("this_phrase", [
    [{"LOWER": "this"}, {"LOWER": personal.lower()}]
    for personal in PERSONALS
    if personal
])

def personal_phrase(doc):
    """Pipeline component: label every matcher_personal hit as PERSONAL_PHRASE.

    Each match becomes a Span that is appended to the entities already on
    doc; the same doc object is returned.
    """
    tagged = []
    for _match_id, begin, stop in matcher_personal(doc):
        tagged.append(Span(doc, begin, stop, label="PERSONAL_PHRASE"))
    doc.ents = [*doc.ents, *tagged]
    return doc

# Insert the "this <word>" tagger into the questions pipeline ahead of "ner".
nlp_questions.add_pipe(personal_phrase, before="ner")


# Add Personal Pronouns Component
# Single-token pattern: "he"/"she", tagged PRP, with dependency label nsubj.
_subject_pronoun = {"TAG": "PRP", "DEP": "nsubj", "LOWER": {"IN": ["he", "she"]}}
matcher_pronoun = Matcher(nlp_questions.vocab)
matcher_pronoun.add("person_phrase", [[_subject_pronoun]])

def pronoun_phrase(doc):
    """Pipeline component: label every matcher_pronoun hit as PRONOUN_PHRASE.

    Each match becomes a Span that is appended to the entities already on
    doc; the same doc object is returned.
    """
    tagged = []
    for _match_id, begin, stop in matcher_pronoun(doc):
        tagged.append(Span(doc, begin, stop, label="PRONOUN_PHRASE"))
    doc.ents = [*doc.ents, *tagged]
    return doc

# Insert the he/she tagger into the questions pipeline, positioned before "ner".
nlp_questions.add_pipe(pronoun_phrase, before="ner")

def text_filter_fn(tuple_):
    """Decide whether an (answer_doc, question_doc) pair is a positive example.

    True only when the answer has exactly one entity, that entity is a PERSON
    spanning every token of the answer, and the question carries at least one
    PERSONAL_PHRASE or PRONOUN_PHRASE entity. False otherwise.
    """
    answer_doc, question_doc = tuple_
    answer_ents = answer_doc.ents
    if len(answer_ents) != 1:
        return False
    sole = answer_ents[0]
    # The entity must be a PERSON and cover the whole answer.
    if sole.label_ != 'PERSON' or len(sole) != len(answer_doc):
        return False
    return any(
        ent.label_ in ("PERSONAL_PHRASE", "PRONOUN_PHRASE")
        for ent in question_doc.ents
    )
        
# Store positive examples
raw_train_questions = (item['question'] for item in raw_train)
raw_train_answers = (item['answer'] for item in raw_train)
# Answers go through the unmodified pipeline and questions through the one with
# the phrase components; zip pairs them back up in corpus order.
parsed_pairs = zip(nlp_answers.pipe(raw_train_answers), nlp_questions.pipe(raw_train_questions))
positives = [
    {'answer': str(answer), 'question': str(question)}
    for answer, question in filter(text_filter_fn, parsed_pairs)
]

##### Step 3. Select negatives whose answer is not a single "PERSON" entity.

In [8]:
import spacy

# Load the en_core_web_sm pipeline used to tag the answers in this step.
# This rebinds `nlp`, which the Step 1 cell also assigns.
nlp = spacy.load('en_core_web_sm')

def answer_filter_fn(tuple_):
    """Return True unless the answer doc has exactly one entity labelled PERSON.

    tuple_ is an (answer_doc, context) pair; the context element is not
    inspected.

    NOTE: this reuses the name of the Step 1 filter with the opposite sense, so
    whichever cell ran last decides what `answer_filter_fn` means.
    """
    answer_doc, _context = tuple_
    entities = answer_doc.ents
    is_single_person = len(entities) == 1 and entities[0].label_ == 'PERSON'
    return not is_single_person

# Store negative examples
raw_train_answers = (item['answer'] for item in raw_train)
# Each element is an (answer_doc, original_item) pair; the filter keeps the
# pairs whose answer is not a single PERSON entity.
tagged_pairs = nlp.pipe(zip(raw_train_answers, raw_train), as_tuples=True)
negatives = [
    {'question': context['question'], 'answer': context['answer']}
    for _, context in filter(answer_filter_fn, tagged_pairs)
]

##### Step 4. Save positives, negatives, and the held-out test set.

In [9]:
import pickle

# Write each artifact to its own pickle file, in the same order as before.
# NOTE(review): raw_val from the Step 0 split is not written here -- confirm
# that is intended.
for path, payload in (
    ('../data/negatives.pkl', negatives),
    ('../data/positives.pkl', positives),
    ('../data/raw_test.pkl', raw_test),
):
    with open(path, 'wb') as f:
        pickle.dump(payload, f)