In [26]:
import gzip
from typing import Union

import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib
import json


In [27]:
def load_examples() -> list[dict[str, Union[str, list[str]]]]:
    """Quick and dirty parser to turn examples.txt into a machine-readable form.

    Returns
    -------
    list[dict[str, Union[str, list[str]]]]
        list of examples. Each examples is of the form 
            {
                "question": QUESTION, 
                "context": CONTEXT or "", 
                "choices": [CHOICE1, CHOICE2, ...]
            }
    """

    with open("../data/raw/examples.txt") as fp:
        data = fp.read()

    examples = data.split("#")[1:]

    parsed = []

    for e in examples:
        parts = e.split("\n")

        question = next((p.removeprefix("Question:").strip() for p in parts if p.startswith("Question: ")), "")
        context = next((p.removeprefix("Context:").strip() for p in parts if p.startswith("Context: ")), "")
        choices = [p.split(")")[1].strip() for p in parts if p.startswith("(")]

        parsed.append({"question": question, "context": context, "choices": choices})

    return parsed
    

In [30]:
examples = load_examples()

In [31]:
import spacy

In [59]:
nlp = spacy.load('en_core_web_sm')
stopwords = set(nlp.Defaults.stop_words).union({",", ".", "?", ":", ";"})

In [57]:
type(stopwords)

set

In [37]:
import utils as U

[nltk_data] Downloading package wordnet to /home/felix/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/felix/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [62]:
def extract_terms(input: str) -> set[str]:
    doc = nlp(input)

    tokens = [*doc, *doc.noun_chunks]

    token_texts = set(s for t in tokens if (s := U.normalize_input(t.text)) not in stopwords)

    return token_texts

In [63]:
import itertools

In [64]:
def extract_terms_from_example(example: dict) -> tuple[list[str], list[str]]:
    
    question_context = set(itertools.chain(extract_terms(example["question"]), extract_terms(example["context"])))
    
    choices = set(itertools.chain.from_iterable(extract_terms(c) for c in example["choices"]))

    return question_context, choices

In [65]:
extract_terms_from_example(examples[0])

({'a drawstring bag',
  'bag',
  'baggage',
  'checked',
  'drawstring',
  'heading',
  'the only baggage',
  'the woman',
  'wa',
  'woman'},
 {'airport',
  'garbage',
  'jewelry',
  'jewelry store',
  'military',
  'safe',
  'store'})

In [47]:
extract_terms("The only baggage the woman checked was a drawstring bag, where was she heading with it?")

['baggage',
 'woman',
 'checked',
 'wa',
 'drawstring',
 'bag',
 ',',
 'wa',
 'heading',
 '?',
 'the only baggage',
 'the woman',
 'a drawstring bag']

In [41]:
examples[0]

{'question': 'The only baggage the woman checked was a drawstring bag, where was she heading with it?',
 'context': '',
 'choices': ['garbage can', 'military', 'jewelry store', 'safe', 'airport']}