In [1873]:
import itertools
import os
import re
from pprint import pprint

import spacy
from nltk import Tree
from pymongo import MongoClient
from spacy import displacy
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.pipeline import SpanRuler, EntityRuler
from spacy.tokens import Doc, Span, Token

In [1874]:
def letter_generator():
    """Yield the uppercase letters A through Z in order, wrapping back to A forever."""
    for letter in itertools.cycle("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
        yield letter

# Single shared iterator: every `next(letters)` in the constitute_* components
# advances it, so placeholder letters depend on how many matches were processed
# since this cell was last run.
letters = letter_generator()

In [1875]:
# Connect to MongoDB.
# The connection string is taken from the MONGODB_URI environment variable so
# that real credentials do not have to be committed with the notebook. The
# previous local-development URI is kept as the fallback so existing setups keep
# working unchanged.
client = MongoClient(os.environ.get("MONGODB_URI", "mongodb://root:password@localhost:27017/"))
catalog = client.get_database("catalog")

In [1876]:
# Load every subject-code document, longest title first. replace_subject_code
# walks this list in order, so longer titles are substituted before shorter ones.
subject_codes_docs = sorted(
    catalog.get_collection("subject_codes").find(),
    key=lambda entry: len(entry["title"]),
    reverse=True,
)
# title -> code lookup, and the list of codes in the same order as above.
subject_codes_map = {entry["title"]: entry["code"] for entry in subject_codes_docs}
subject_codes = [entry["code"] for entry in subject_codes_docs]

In [1877]:
# Base patterns
# None of these patterns is anchored; callers that use `re.match` therefore only
# require a match at the start of the text, not over the whole text.
subject_code_regex = r"([A-Z]{3,4})"  # ART, MATH
# Alternatives are tried left to right: "NN-N", then "NNN.N"/"NNN.NN", then 2-3 digits.
course_number_regex = r"(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})"  # 101, 30-1, 599.45
# Subject code, one space, course number (two capture groups).
# Not referenced by any other cell visible in this notebook.
course_code_regex = rf"{subject_code_regex} {course_number_regex}"

In [1878]:
def replace_subject_code(sentence: str, loose: bool=False):
    """Replace full subject titles in ``sentence`` with their subject codes.

    Titles and codes come from ``subject_codes_docs``, processed in that list's
    order (the notebook sorts it longest title first).

    Args:
        sentence: Requisite text, e.g. "Mathematics 267 or 277".
        loose: When False (default) a title is only replaced when it is
            followed by a space and a course number; the number is kept.
            When True every occurrence of the title is replaced.

    Returns:
        The sentence with the substitutions applied.
    """
    for subject_code in subject_codes_docs:
        # Titles are literal text: escape them so characters such as "(", ")"
        # or "+" cannot change the pattern or shift the capture-group numbering.
        title = re.escape(subject_code["title"])
        code = subject_code["code"]

        if loose:
            # Function replacement: the code is inserted verbatim, with no
            # backslash/group-reference processing.
            sentence = re.sub(title, lambda _match: code, sentence)
        else:
            # Group 1 is the course number captured by course_number_regex.
            sentence = re.sub(
                rf"{title} {course_number_regex}",
                lambda match: f"{code} {match.group(1)}",
                sentence,
            )
    return sentence

In [1879]:
def get_replacement_letter():
    """Endlessly yield A, B, ..., Z, A, ... (same sequence as letter_generator)."""
    yield from itertools.cycle("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Not referenced by any other cell visible in this notebook; the constitute_*
# components draw from `letters` instead.
replacement_letters = get_replacement_letter()

In [1880]:
# Custom attributes used by the pipeline components below.
# `force=True` lets this cell be re-run without an "already registered" error.
# token._.course_code: subject code + number string written by fix_ent_head.
Token.set_extension("course_code", default=None, force=True)
# doc._.replacements: (placeholder text, Span) tuples collected by the constitute_* components.
# NOTE(review): list defaults here rely on spaCy handing each Doc its own copy - confirm.
Doc.set_extension("replacements", default=[], force=True)
# doc._.json_logics: (Span, dict) tuples appended by the matcher callbacks.
Doc.set_extension("json_logics", default=[], force=True)

In [1881]:
# Four independent pipelines built from the same model with the "ner" component
# excluded: `nlp` is the plain one used inside components to re-parse rewritten
# text; the other three get custom components added in later cells.
nlp, expand_nlp, constituency_nlp, structure_nlp = (
    spacy.load("en_core_web_sm", exclude=["ner"]) for _ in range(4)
)

In [1882]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark sentence starts only on the token that follows each ";".

    Every token is first cleared of its sentence-start flag; then the token
    right after each semicolon (if any) is flagged. Returns the same doc.
    """
    for tok in doc:
        tok.is_sent_start = False

    # Stop one short of the end: the last token has no successor to flag.
    for position in range(len(doc) - 1):
        if doc[position].text == ";":
            doc[position + 1].is_sent_start = True

    return doc

# nlp.add_pipe("set_custom_boundaries", before="parser")

In [1883]:
@Language.component("fix_ent_head")
def fix_ent_head(doc: Doc):
    """Re-attach course-number tokens to the subject-code token above them.

    For each NUM token whose text starts with a course-number pattern
    (``re.match`` anchors at the start only), the closest ancestor whose text is
    a known subject code becomes the token's head, and ``token._.course_code``
    is set to the subject code immediately followed by the number.
    Returns the same doc.
    """
    for sentence in doc.sents:
        for tok in sentence:
            if tok.pos_ != "NUM" or re.match(course_number_regex, tok.text) is None:
                continue

            # First ancestor (in the order spaCy yields them) that is a subject code.
            subject_tok = next(
                (anc for anc in tok.ancestors if anc.text in subject_codes),
                None,
            )

            if subject_tok:
                tok.head = subject_tok
                tok._.course_code = f"{subject_tok.text}{tok.text}"

    return doc

In [1884]:
@Language.component("expand_course_code")
def expand_course_code(doc: Doc):
    """Rewrite the text so every course number carries its own subject code.

    Subject-code tokens are dropped from the output; each course-number token
    is instead prefixed with the first subject code found by checking its head
    and then the tokens to its left, nearest first. All other tokens are copied
    through. The rebuilt text is parsed with ``nlp`` and returned as a new doc
    with its entities cleared.
    """
    pieces = []

    for tok in doc:
        if tok.text in subject_codes:
            # Re-emitted below, right in front of each number it applies to.
            continue

        if tok.pos_ == "NUM" and re.match(course_number_regex, tok.text) is not None:
            candidates = [tok.head, *reversed(list(doc[: tok.i]))]
            prefix = next(
                (cand.text_with_ws for cand in candidates if cand.text in subject_codes),
                "",
            )
            pieces.append(prefix)

        pieces.append(tok.text_with_ws)

    expanded = nlp("".join(pieces))
    expanded.ents = []
    return expanded


In [1885]:
# Stand-alone entity ruler (not added to a pipeline); the detect_entity
# component below calls it explicitly.
entity_ruler = EntityRuler(nlp)
patterns = [
    # COURSE: a token matching the subject-code regex followed by a token
    # matching the course-number regex, e.g. "MATH 267".
    {
        "label": "COURSE",
        "pattern": [
            {"TEXT": {"REGEX": subject_code_regex}},
            {"TEXT": {"REGEX": course_number_regex}},
        ],
    },
    # REQUISITE: the placeholder written by the constitute_* components,
    # i.e. the token "RQ" followed by a token matching [A-Z].
    {
        "label": "REQUISITE",
        "pattern": [
            {"TEXT": "RQ"},
            {"TEXT": {"REGEX": "[A-Z]"}},
        ],
    },
]
# Reset before adding so re-running the cell does not accumulate patterns.
entity_ruler.clear()
entity_ruler.add_patterns(patterns)

@Language.component("detect_entity")
def detect_entity(doc: Doc):
    """Overwrite ``doc.ents`` with whatever the shared entity ruler matches."""
    doc.ents = entity_ruler.match(doc)
    return doc

In [1886]:
@Language.component("merge_entity_spans")
def merge_entity_spans(doc: Doc):
    """Collapse each entity span of ``doc`` into a single token.

    The merged token keeps the entity label as its entity type and is tagged
    as a proper noun. Returns the same doc.
    """
    with doc.retokenize() as retok:
        for entity in doc.ents:
            if entity.label_ is None:
                continue
            merged_attrs = {
                "ENT_TYPE": entity.label_,
                "ENT_IOB": "B",
                "ENT_IOE": "E",
                "ENT_IOR": "",
                "pos": "PROPN",
            }
            retok.merge(entity, attrs=merged_attrs)
    return doc


In [1887]:
# Stage 1 pipeline: fix heads, expand shorthand course numbers, then detect
# and merge COURSE/REQUISITE entities. Components run in the order added.
# Re-running this cell without reloading `expand_nlp` adds the same names again.
expand_nlp.add_pipe("fix_ent_head")
expand_nlp.add_pipe("expand_course_code")
expand_nlp.add_pipe("detect_entity")
expand_nlp.add_pipe("merge_entity_spans")

# Displayed as the cell output to confirm the final component order.
expand_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'fix_ent_head',
 'expand_course_code',
 'detect_entity',
 'merge_entity_spans']

In [1888]:
def get_repeating_array(head: list, repeat_elemnts: list, repeat_times: int, tail: list):
    """Return ``head``, then ``repeat_elemnts`` repeated ``repeat_times`` times, then ``tail``.

    The result is a new list; the elements themselves are not copied.
    """
    middle = []
    for _ in range(repeat_times):
        middle.extend(repeat_elemnts)
    return head + middle + tail

def get_dynamic_patterns(head: list, repeat_tokens: list, repeat_range: range, tail: list):
    """Build one pattern per repeat count in ``repeat_range``.

    Each pattern is ``head`` + ``repeat_tokens`` repeated that many times +
    ``tail``; patterns are returned in the order of ``repeat_range``.
    """
    return [
        get_repeating_array(head, repeat_tokens, count, tail)
        for count in repeat_range
    ]

In [1889]:
def find_replacement(key: str, replacements: "list[tuple[str, Span]]"):
    """Look up the span that a placeholder such as "RQ A" stands for.

    Args:
        key: Placeholder text; surrounding whitespace and one trailing period
            are ignored.
        replacements: ``(placeholder, span)`` pairs to search, in order.

    Returns:
        The span of the first pair whose placeholder equals the normalised
        key, or None when either argument is empty or nothing matches.
    """
    if not replacements or not key:
        return None

    # Strip whitespace first so a period followed by spaces ("RQ A. ") is still
    # recognised as trailing, then drop that single period.
    key = key.strip()
    if key.endswith("."):
        key = key[:-1].strip()

    for _key, span in replacements:
        if key == _key:
            return span

    return None

def find_json_logic(span: Span, json_logic: list[tuple[Span, dict]]):
    """Return the logic dict recorded for a span with the same text as ``span``.

    Pairs are searched in order and compared by ``.text`` only. Returns None
    when either argument is falsy or no pair matches.
    """
    if not json_logic or not span:
        return

    wanted = span.text
    return next(
        (logic for candidate, logic in json_logic if wanted == candidate.text),
        None,
    )

In [1890]:
# Matcher for requisite phrases ("3 units from ...", "one of ...", "consent of ...");
# patterns and their callbacks are registered on it below. Built on `nlp`'s vocab.
requisite_pattern_matcher = Matcher(nlp.vocab)

### X Units of
def x_units_of(matcher, doc, i, matches):
    """Matcher callback for "<N> units from <courses>".

    Reads the unit count from the first token of the matched span, collects the
    COURSE entities inside it, and appends ``(span, logic)`` to
    ``doc._.json_logics``.
    """
    _, first, last = matches[i]
    matched = doc[first:last]

    required_units = int(matched[0].text)
    course_options = []
    for ent in matched.ents:
        if ent.label_ == "COURSE":
            course_options.append({"course": ent.text})

    requirement = {"units": {"required": required_units, "from": course_options}}
    doc._.json_logics.append((matched, requirement))

# One pattern per list length. Head: a digit token, a token with lemma "unit",
# one or more adpositions. Repeated 1-19 times: a COURSE followed by one or two
# "or"/"," tokens. Tail: the final COURSE. So 2 to 20 courses are covered.
x_units_of_patterns = get_dynamic_patterns(
    [
        {"IS_DIGIT": True},
        {"LEMMA": "unit"},
        {"POS": "ADP", "OP": "+"},
    ],
    [
        {"ENT_TYPE": "COURSE"},
        {"TEXT": {"IN": ["or", ","]}, "OP": "{1,2}"},
    ],
    range(1, 20),
    [
        {"ENT_TYPE": "COURSE"},
    ],
)

# x_units_of is invoked for each match and records the JSON logic on the doc.
requisite_pattern_matcher.add("X units of", x_units_of_patterns, greedy="LONGEST", on_match=x_units_of)
### X Units of


### X of
def x_of(matcher, doc: Doc, i, matches):
    """Matcher callback for "<number word> of <courses>".

    The lemma of the first token is mapped to a count ("one", "two", "three");
    any other lemma counts as 1. Appends ``(span, logic)`` to
    ``doc._.json_logics`` with the COURSE entities found in the span.
    """
    _, first, last = matches[i]
    matched = doc[first:last]

    word_to_count = {"one": 1, "two": 2, "three": 3}
    required_count = word_to_count.get(matched[0].lemma_, 1)

    course_options = [
        {"course": ent.text} for ent in matched.ents if ent.label_ == "COURSE"
    ]

    requirement = {"courses": {"required": required_count, "from": course_options}}
    doc._.json_logics.append((matched, requirement))

# Head: a NUM token and one or more adpositions. Repeated 1-19 times: a COURSE
# followed by exactly one "or"/"," token. Tail: the final COURSE.
x_of_patterns = get_dynamic_patterns(
    [
        {"POS": "NUM"},
        {"POS": "ADP", "OP": "+"},
    ],
    [
        {"ENT_TYPE": "COURSE"},
        {"TEXT": {"IN": ["or", ","]}},
    ],
    range(1, 20),
    [
        {"ENT_TYPE": "COURSE"},
    ],
)

# x_of is invoked for each match and records the JSON logic on the doc.
requisite_pattern_matcher.add("X of", x_of_patterns, greedy="LONGEST", on_match=x_of)
### X of


### Consent of
def consent_of(matcher, doc: Doc, i, matches):
    """Matcher callback for "consent of <someone>".

    Everything after the first two tokens of the span is taken as the party
    giving consent (one trailing period removed) and stored as
    ``{"consent": ...}`` in ``doc._.json_logics``.
    """
    _, first, last = matches[i]
    matched = doc[first:last]

    grantor = matched[2:].text.strip()
    grantor = grantor[:-1] if grantor.endswith(".") else grantor

    doc._.json_logics.append((matched, {"consent": grantor}))

# "consent" lemma, one or more adpositions, any number of tokens matching the
# regex [A-Za-z, ], and a closing token that does not start a sentence.
# The consent_of callback records the JSON logic for each match.
requisite_pattern_matcher.add(
    "Consent of",
    [
        [
            {"LEMMA": "consent"},
            {"POS": "ADP", "OP": "+"},
            {"TEXT": {"REGEX": "[A-Za-z, ]"}, "OP": "*"},
            {"IS_SENT_START": False},
        ]
    ],
    greedy="LONGEST",
    on_match=consent_of,
)

def admission_of(matcher, doc: Doc, i, matches):
    """Matcher callback for "admission to <program>".

    Everything after the first two tokens of the span is taken as the target
    of the admission (one trailing period removed) and stored as
    ``{"admission": ...}`` in ``doc._.json_logics``.
    """
    _, first, last = matches[i]
    matched = doc[first:last]

    target = matched[2:].text.strip()
    target = target[:-1] if target.endswith(".") else target

    doc._.json_logics.append((matched, {"admission": target}))

# Head: "admission" lemma and one or more adpositions. Repeated 1-199 times:
# any token that is not a COURSE entity. Tail: a token that is neither a
# conjunction/separator lemma nor a COURSE/REQUISITE entity.
# This generates 199 patterns, the longest over 200 tokens.
admission_of_patterns = get_dynamic_patterns(
    [
        {"LEMMA": "admission"},
        {"POS": "ADP", "OP": "+"},
    ],
    [
        {"ENT_TYPE": {"NOT_IN": ["COURSE"]}},
    ],
    range(1, 200),
    [
        {"LEMMA": {"NOT_IN": ["and", "or", ",", ";"]}, "ENT_TYPE": {"NOT_IN": ["COURSE", "REQUISITE"]}},
    ],
)

# admission_of is invoked for each match and records the JSON logic on the doc.
requisite_pattern_matcher.add(
    "Admission to",
    admission_of_patterns,
    greedy="LONGEST",
    on_match=admission_of,
)
### Consent of


### Both A and B
def both_a_and_b(matcher, doc, i, matches):
    """Matcher callback for "both A and B".

    Tokens 1 and 3 of the span are the two courses; records
    ``{"and": [course A, course B]}`` in ``doc._.json_logics``.
    """
    _, first, last = matches[i]
    matched = doc[first:last]
    operands = [{"course": matched[position].text} for position in (1, 3)]
    doc._.json_logics.append((matched, {"and": operands}))

# Exactly four tokens: "both", a COURSE, "and", a COURSE.
# both_a_and_b relies on these fixed positions (indices 1 and 3).
requisite_pattern_matcher.add(
    "Both A and B",
    [
        [
            {"LEMMA": "both"},
            {"ENT_TYPE": "COURSE"},
            {"LEMMA": "and"},
            {"ENT_TYPE": "COURSE"},
        ]
    ],
    greedy="LONGEST",
    on_match=both_a_and_b,
)
### Both A and B


### Either A or B
def either_a_or_b(matcher, doc, i, matches):
    """Matcher callback for "either A or B".

    Tokens 1 and 3 of the span are the two courses; records
    ``{"or": [course A, course B]}`` in ``doc._.json_logics``.
    """
    _, first, last = matches[i]
    matched = doc[first:last]
    operands = [{"course": matched[position].text} for position in (1, 3)]
    doc._.json_logics.append((matched, {"or": operands}))

# Exactly four tokens: "either", a COURSE, "or", a COURSE.
# either_a_or_b relies on these fixed positions (indices 1 and 3).
requisite_pattern_matcher.add(
    "Either A or B",
    [
        [
            {"LEMMA": "either"},
            {"ENT_TYPE": "COURSE"},
            {"LEMMA": "or"},
            {"ENT_TYPE": "COURSE"},
        ]
    ],
    greedy="LONGEST",
    on_match=either_a_or_b,
)
### Either A or B

@Language.component("constitute_requisite")
def constitute_requisite(doc: Doc):
    """Replace each matched requisite phrase with an "RQ <letter>" placeholder.

    Runs ``requisite_pattern_matcher`` (whose callbacks fill
    ``doc._.json_logics``), then substitutes the matched spans in the text,
    longest first. A letter is drawn from ``letters`` for every match, even when
    the span text is no longer present and nothing is replaced. The rewritten
    text is re-parsed with ``nlp``; the new doc carries the accumulated
    replacements and the same json_logics list.
    """
    text = doc.text
    found = requisite_pattern_matcher(doc)
    swaps = []

    # Longest spans first (token count = end - start).
    ordered = sorted(found, key=lambda m: m[2] - m[1], reverse=True)

    for _, first, last in ordered:
        placeholder = f"RQ {next(letters)}"
        matched = doc[first:last]
        # Literal replacement of every occurrence of the span text.
        updated = text.replace(matched.text, placeholder)

        if updated == text:
            continue

        text = updated
        swaps.append((placeholder, matched))

    rebuilt = nlp(text)
    rebuilt._.replacements = doc._.replacements + swaps
    rebuilt._.json_logics = doc._.json_logics
    return rebuilt

In [1891]:
# Stage 2 pipeline: one round of requisite-phrase replacement followed by
# entity detection and merging. range(1, 2) yields only t = 1; the second
# add_pipe argument is the instance name, suffixed with t.
for t in range(1, 2):
    constituency_nlp.add_pipe("constitute_requisite", f"constitute_requisite_{t}")
    constituency_nlp.add_pipe("detect_entity", f"detect_entity_{t}")
    constituency_nlp.add_pipe("merge_entity_spans", f"merge_entity_spans_{t}")

# Displayed as the cell output to confirm the final component order.
constituency_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'constitute_requisite_1',
 'detect_entity_1',
 'merge_entity_spans_1']

In [1892]:
# Matcher for comma/"and"/"or" separated lists of COURSE and REQUISITE
# entities; patterns are registered below. Built on `nlp`'s vocab.
structure_minor_matcher = Matcher(nlp.vocab)

### A, B, C, ..., and D
def and_list(matcher, doc, i, matches):
    """Matcher callback that combines the entities of a span with "and".

    COURSE entities become ``{"course": text}``; REQUISITE placeholders are
    resolved through ``doc._.replacements`` and ``doc._.json_logics`` and
    skipped when no logic is found. A single operand is stored as-is, otherwise
    the operands are wrapped in ``{"and": [...]}``. The result is appended to
    ``doc._.json_logics`` together with the span.
    """
    _, first, last = matches[i]
    matched = doc[first:last]

    operands = []
    for ent in matched.ents:
        kind = ent.label_
        if kind == "COURSE":
            operands.append({"course": ent.text})
        elif kind == "REQUISITE":
            original_span = find_replacement(ent.text, doc._.replacements)
            resolved = find_json_logic(original_span, doc._.json_logics)
            if resolved:
                operands.append(resolved)

    combined = operands[0] if len(operands) == 1 else {"and": operands}
    doc._.json_logics.append((matched, combined))

# Head: one COURSE/REQUISITE. Repeated 0-19 times: "and"/"," plus another
# COURSE/REQUISITE. Tail: "and"/"," plus a final COURSE/REQUISITE and an
# optional trailing ",". A list separated only by commas therefore also matches.
and_list_patterns = get_dynamic_patterns(
    [
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
    ],
    [
        {"TEXT": {"IN": ["and", ","]}},
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
    ],
    range(0, 20),
    [
        {"TEXT": {"IN": ["and", ","]}},
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
        {"TEXT": {"IN": [","]}, "OP": "?"},
    ],
)

# and_list is invoked for each match and records the JSON logic on the doc.
structure_minor_matcher.add("A, B, C, ..., and D", and_list_patterns, greedy="LONGEST", on_match=and_list)
### A, B, C, ..., and D


### A, B, C, ..., or D
def or_list(matcher, doc, i, matches):
    """Matcher callback that combines the entities of a span with "or".

    COURSE entities become ``{"course": text}``; REQUISITE placeholders are
    resolved through ``doc._.replacements`` and ``doc._.json_logics`` and
    skipped when no logic is found. A single operand is stored as-is, otherwise
    the operands are wrapped in ``{"or": [...]}``. The result is appended to
    ``doc._.json_logics`` together with the span.
    """
    _, first, last = matches[i]
    matched = doc[first:last]

    operands = []
    for ent in matched.ents:
        kind = ent.label_
        if kind == "COURSE":
            operands.append({"course": ent.text})
        elif kind == "REQUISITE":
            original_span = find_replacement(ent.text, doc._.replacements)
            resolved = find_json_logic(original_span, doc._.json_logics)
            if resolved:
                operands.append(resolved)

    combined = operands[0] if len(operands) == 1 else {"or": operands}
    doc._.json_logics.append((matched, combined))

# Head: one COURSE/REQUISITE. Repeated 0-19 times: "or"/"," plus another
# COURSE/REQUISITE. Tail: a literal "or" plus a final COURSE/REQUISITE and an
# optional trailing ",". Unlike the and-list, the last separator must be "or".
or_list_patterns = get_dynamic_patterns(
    [
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
    ],
    [
        {"TEXT": {"IN": ["or", ","]}},
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
    ],
    range(0, 20),
    [
        {"TEXT": {"IN": ["or"]}},
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
        {"TEXT": {"IN": [","]}, "OP": "?"},
    ],
)

# or_list is invoked for each match and records the JSON logic on the doc.
structure_minor_matcher.add("A, B, C, ..., or D", or_list_patterns, greedy="LONGEST", on_match=or_list)
### A, B, C, ..., or D


@Language.component("constitute_structure_minor")
def constitute_structure(doc: Doc):
    """Replace each matched comma/and/or list with an "RQ <letter>" placeholder.

    Runs ``structure_minor_matcher`` (whose callbacks fill
    ``doc._.json_logics``), then substitutes the matched spans in the text,
    longest first. A letter is drawn from ``letters`` for every match, even when
    nothing is replaced. The rewritten text is re-parsed with ``nlp``; the new
    doc carries the accumulated replacements and the same json_logics list.

    Note: a later cell defines another function with this same Python name for
    the "constitute_structure_major" component.
    """
    text = doc.text
    found = structure_minor_matcher(doc)
    swaps = []

    # Longest spans first (token count = end - start).
    ordered = sorted(found, key=lambda m: m[2] - m[1], reverse=True)

    for _, first, last in ordered:
        placeholder = f"RQ {next(letters)}"
        matched = doc[first:last]
        # Literal replacement of every occurrence of the span text.
        updated = text.replace(matched.text, placeholder)

        if updated == text:
            continue

        text = updated
        swaps.append((placeholder, matched))

    rebuilt = nlp(text)
    rebuilt._.replacements = doc._.replacements + swaps
    rebuilt._.json_logics = doc._.json_logics
    return rebuilt

In [1893]:
# Matcher for the top-level, semicolon-separated structure. It reuses the
# and_list / or_list callbacks defined for the minor structure.
structure_major_matcher = Matcher(nlp.vocab)

### A; and B; and C; ... and D
# Repeated 1-19 times: a COURSE/REQUISITE followed by one or more "and"/";"
# tokens. Tail: the final COURSE/REQUISITE. The head is empty.
major_and_list_patterns = get_dynamic_patterns(
    [],
    [
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
        {"TEXT": {"IN": ["and", ";"]}, "OP": "+"},
    ],
    range(1, 20),
    [
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
    ],
)
structure_major_matcher.add("A; and B; and C; ... and D", major_and_list_patterns, greedy="LONGEST", on_match=and_list)
### A; and B; and C; ... and D


### A; or B; or C; ... or D
# Same shape as above with "or"/";" as the separators.
major_or_list_patterns = get_dynamic_patterns(
    [],
    [
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
        {"TEXT": {"IN": ["or", ";"]}, "OP": "+"},
    ],
    range(1, 20),
    [
        {"ENT_TYPE": {"IN": ["COURSE", "REQUISITE"]}},
    ],
)
structure_major_matcher.add("A; or B; or C; ... or D", major_or_list_patterns, greedy="LONGEST", on_match=or_list)
### A; or B; or C; ... or D

@Language.component("constitute_structure_major")
def constitute_structure(doc: Doc):
    """Replace each matched semicolon-level list with an "RQ <letter>" placeholder.

    Runs ``structure_major_matcher`` (whose callbacks fill
    ``doc._.json_logics``), then substitutes the matched spans in the text,
    longest first. A letter is drawn from ``letters`` for every match, even when
    nothing is replaced. The rewritten text is re-parsed with ``nlp``; the new
    doc carries the accumulated replacements and the same json_logics list.

    Note: this rebinds the Python name ``constitute_structure`` that an earlier
    cell used for the "constitute_structure_minor" component.
    """
    text = doc.text
    found = structure_major_matcher(doc)
    swaps = []

    # Longest spans first (token count = end - start).
    ordered = sorted(found, key=lambda m: m[2] - m[1], reverse=True)

    for _, first, last in ordered:
        placeholder = f"RQ {next(letters)}"
        matched = doc[first:last]
        # Literal replacement of every occurrence of the span text.
        updated = text.replace(matched.text, placeholder)

        if updated == text:
            continue

        text = updated
        swaps.append((placeholder, matched))

    rebuilt = nlp(text)
    rebuilt._.replacements = doc._.replacements + swaps
    rebuilt._.json_logics = doc._.json_logics
    return rebuilt

In [1894]:
# Stage 3 pipeline. Four rounds (t = 1..4) of minor-structure replacement, each
# followed by entity detection and merging so the new "RQ x" placeholders are
# recognised by the next round.
for t in range(1, 5):
    structure_nlp.add_pipe("constitute_structure_minor", f"constitute_structure_minor_{t}")
    structure_nlp.add_pipe("detect_entity", f"detect_entity_{t}")
    structure_nlp.add_pipe("merge_entity_spans", f"merge_entity_spans_{t}")

# Then four rounds (t = 6..9) of major-structure replacement. t only feeds the
# instance names, so skipping 5 keeps names unique but has no other effect.
for t in range(6, 10):
    structure_nlp.add_pipe("constitute_structure_major", f"constitute_structure_major_{t}")
    structure_nlp.add_pipe("detect_entity", f"detect_entity_{t}")
    structure_nlp.add_pipe("merge_entity_spans", f"merge_entity_spans_{t}")
    
# Displayed as the cell output to confirm the final component order.
structure_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'constitute_structure_minor_1',
 'detect_entity_1',
 'merge_entity_spans_1',
 'constitute_structure_minor_2',
 'detect_entity_2',
 'merge_entity_spans_2',
 'constitute_structure_minor_3',
 'detect_entity_3',
 'merge_entity_spans_3',
 'constitute_structure_minor_4',
 'detect_entity_4',
 'merge_entity_spans_4',
 'constitute_structure_major_6',
 'detect_entity_6',
 'merge_entity_spans_6',
 'constitute_structure_major_7',
 'detect_entity_7',
 'merge_entity_spans_7',
 'constitute_structure_major_8',
 'detect_entity_8',
 'merge_entity_spans_8',
 'constitute_structure_major_9',
 'detect_entity_9',
 'merge_entity_spans_9']

In [1895]:
def tok_format(tok):
    """Label a token as "<text> (<dependency label>)" for tree display."""
    return "{} ({})".format(tok.orth_, tok.dep_)


def to_nltk_tree(node):
    """Convert a token and its dependency subtree into an nltk Tree.

    A token without children is returned as its formatted label string;
    otherwise a Tree labelled with the token and holding the converted
    children is returned.
    """
    label = tok_format(node)
    if node.n_lefts + node.n_rights <= 0:
        return label
    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(label, subtrees)

In [1896]:
def sanity_check(span: Span):
    """Return True when the span has been reduced to a single requirement.

    That is: it has one or two tokens and the first token is a COURSE or
    REQUISITE entity. Empty and longer spans return False.
    """
    size = len(span)
    if size == 0:
        return False

    leads_with_entity = span[0].ent_type_ in ["COURSE", "REQUISITE"]
    # Two tokens are accepted as well as one - presumably the entity plus a
    # trailing period; the second token itself is not inspected.
    return leads_with_entity and size in (1, 2)
        

In [1897]:
def extract_entity(token: "Token", replacements: "list[tuple[str, Span]]", json_logics: "list[tuple[Span, dict]]"):
    """Turn a COURSE or REQUISITE token into its JSON-logic dict.

    Args:
        token: Token whose ``ent_type_`` decides how it is converted.
        replacements: ``(placeholder, span)`` pairs, as stored in
            ``doc._.replacements`` and consumed by ``find_replacement``.
        json_logics: ``(span, logic)`` pairs, as stored in ``doc._.json_logics``.

    Returns:
        ``{"course": text}`` for a COURSE token; the logic recorded for the
        span a REQUISITE placeholder stands for (None when it cannot be
        resolved); None for any other token.
    """
    if token.ent_type_ == "COURSE":
        return {"course": token.text}

    if token.ent_type_ == "REQUISITE":
        replacement = find_replacement(token.text, replacements)
        return find_json_logic(replacement, json_logics)

    # Any other token carries no requirement.
    return None

def extract_doc(doc: Doc):
    """Return the JSON logic of a fully reduced doc, or None if it is not reduced.

    When the sanity check fails a message is printed and None is returned;
    otherwise the first token is converted with extract_entity.
    """
    if sanity_check(doc):
        return extract_entity(doc[0], doc._.replacements, doc._.json_logics)

    print("A - Sanity check failed")
    return None

In [1898]:
# sent = "Actuarial Science 327; Statistics 323; 3 units from Mathematics 311, 313, 367 or 375; and 3 units from Computer Science 217, 231, 235 or Data Science 211."
# sent = "CPSC 457 and 3 units from SENG 300, 301 or ENSF 480; and admission to the Schulich School of Engineering."
# sent = "SGMA 395 or ENTI 317 or 381."
# sent = "One of FILM 321 or 323 and one of FILM 331 or 333."
# sent = "FILM 331 or 333."
# sent = "ENCI 473; and ENGG 319 or ENDG 319."
# sent = "3 units from ENCI 481, ENEE 377 or 519.09."
# sent = "ENEL 341, BMEN 327 or ENGG 225."
# sent = "ENEL 471; and one of BMEN 319 or ENGG 319 or ENEL 419."
# sent = "3 units from ENGG 319, ENDG 319 or ENEL 419."
# sent = "FILM 201 and 3 units from 305 or 321."
# sent = "INDG 201 and 3 units from INDG 303 or 345."
# sent = "One of GEOG 211, 251, 253, UBST 253, GLGY 201, 209; and consent of the Department."
# sent = "STAT 205 or 213; and admission to the Kinesiology Honours program; and consent of the Faculty."
# sent = "MATH 209 and admission to the Energy Engineering program."
# sent = "Both MATH 349 and 353; or both MATH 283 and 381; or MATH 267."
# sent = "MATH 431 or PMAT 431; MATH 429 or PMAT 429 or MATH 327 or PMAT 427."
# sent = "MATH 445 or 447; 3 units of Mathematics in the Field of Mathematics at the 400 level or above."
# sent = "MATH 383; and 6 units of Mathematics in the Field of Mathematics at the 400 level or above."
# sent = "MRSC 451 and consent of the Department."
# sent = "Admission to the Haskayne School of Business and OBHR 317."
# sent = "PHYS 211 or 221 or 227."
# sent = "MATH 277 and PHYS 259 and admission to a program in Engineering."
# sent = "PHYS 341; and 3 units from CPSC 217, 231 or DATA 211."
# sent = "ACSC 327; and MATH 323 or STAT 323."
# sent = "ANTH 203."
# Overwritten by the later `sent = ...` assignments in this cell before it is used.
sent = "One of GEOG 211, 251, 253, UBST 253, GLGY 201, 209; and consent of the Department."
# sent = "One of CORE 209, 435, KNES 355, NURS 303, 305, PSYC 203, 205, SOWK 300, 302, 304, 306, 363 or consent of the instructor(s)."

# Problematic sentences
# sent = "History 300 and one of East Asian Studies 331, 333, History 209, 301, 315, 317, 405, 407.01, 407.02, 407.03, or consent of the Department."
# sent = "Kinesiology 203, 213, 323 and admission to the Faculty of Kinesiology."
# sent = "Mathematics 30-1, Mathematics 30-2, or Mathematics 31."
# sent = "Admission to the Psychology major or Honours program and Psychology 300, 301, 369."
# sent = "Computer Science 219, 233 or Data Science 311 and enrolment in one of the Majors in Computer Science, Bioinformatics, Electrical Engineering, Software Engineering, Computer Engineering, Natural Sciences with a primary concentration in Computer Science."
# sent = "Computer Science 219, 233 or Data Science 311 and enrolment in one of the Majors in Computer Science, Bioinformatics, Electrical Engineering, Software Engineering, Computer Engineering, Natural Sciences with a primary concentration in Computer Science."
# sent = "Admission to the Haskayne School of Business, and 54 units including Accounting 217."
# sent = "3 units from Engineering 204, Chemistry 201, 209 or 211; and Chemistry 203 or 213; and 3 units from Mathematics 249, 265, 275."
# sent = "Software Engineering for Engineers 300; and 3 units from Engineering 319, Digital Engineering 319 or Electrical Engineering 419; and 3 units from Software Engineering for Engineers 337, Computer Engineering 335, 339, or Geomatics Engineering 333."
# sent = "3 units from Computer Science 219, 233 or 235; and Computer Science 251 or Statistics 213; and Mathematics 271 or 273; and 3 units from Mathematics 249, 265 or 275; and Philosophy 279 or 377."
# sent = "Psychology 200, 201, and admission to the International Indigenous Studies major."
# Only the last of these assignments takes effect; the earlier ones are
# overwritten immediately. Comment out all but the sentence under test.
sent = "Geology 201 and 202; and Mathematics 267 or 277; and Physics 211 or 221, and 223."
sent = "Physics 481; and Mathematics 433, Physics 435 or Physics Engineering 435."
sent = "Chemistry 333 or 433; and one of 353 or 355; or Chemistry 357 and 409."
sent = "Software Engineering for Engineers 300; and 3 units from Engineering 319, Digital Engineering 319 or Electrical Engineering 419; and 3 units from Software Engineering for Engineers 337, Computer Engineering 335, 339, or Geomatics Engineering 333."


# Step 0: swap full subject titles for subject codes.
sent = replace_subject_code(sent)
print("Original:", sent)


# Step 1: give every shorthand course number its own subject code.
doc = expand_nlp(sent)
print("Expand  :", doc)


# Step 2: collapse requisite phrases into "RQ x" placeholders.
# The Doc from the previous stage is passed straight into the next pipeline.
doc = constituency_nlp(doc)
print("Constituency:", doc)

# Step 3: collapse and/or lists until (ideally) one placeholder remains.
doc = structure_nlp(doc)
print("Structure:", doc)

print("")

# Placeholder -> original span, in the order the replacements were recorded.
for key, replacement in doc._.replacements:
    replacement: Span
    print(f"{key} -> {replacement}")

print("")

# Matched span -> JSON logic recorded by the matcher callbacks.
for key, json_logic in doc._.json_logics:
    json_logic: dict
    print(f"{key} -> {json_logic}")

print("")

# Final result: None (with a printed message) when the doc was not fully reduced.
j = extract_doc(doc)
pprint(j, indent=2, depth=10)


# Visual check of the entities and the dependency parse of the final doc.
displacy.render(doc, style="ent", jupyter=True, options={"compact": True, "distance": 100})
displacy.render(doc, style="dep", jupyter=True, options={"compact": True, "distance": 100})

Original: ENSF 300; and 3 units from ENGG 319, ENDG 319 or ENEL 419; and 3 units from ENSF 337, ENCM 335, 339, or ENGO 333.
Expand  : ENSF 300; and 3 units from ENGG 319, ENDG 319 or ENEL 419; and 3 units from ENSF 337, ENCM 335, ENSF 339, or ENGO 333.
Constituency: ENSF 300; and RQ B; and RQ A.
Structure: RQ C

RQ A -> 3 units from ENSF 337, ENCM 335, ENSF 339, or ENGO 333
RQ B -> 3 units from ENGG 319, ENDG 319 or ENEL 419
RQ C -> ENSF 300; and RQ B; and RQ A.

3 units from ENSF 337, ENCM 335, ENSF 339, or ENGO 333 -> {'units': {'required': 3, 'from': [{'course': 'ENSF 337'}, {'course': 'ENCM 335'}, {'course': 'ENSF 339'}, {'course': 'ENGO 333'}]}}
3 units from ENGG 319, ENDG 319 or ENEL 419 -> {'units': {'required': 3, 'from': [{'course': 'ENGG 319'}, {'course': 'ENDG 319'}, {'course': 'ENEL 419'}]}}
ENSF 300; and RQ B; and RQ A. -> {'and': [{'course': 'ENSF 300'}, {'units': {'required': 3, 'from': [{'course': 'ENGG 319'}, {'course': 'ENDG 319'}, {'course': 'ENEL 419'}]}}, {'units':

In [1899]:
def try_nlp(course: dict, sent: str):
    """Run the full requisite-parsing chain on one sentence.

    ``course`` is accepted but not used. Returns the JSON logic produced by
    extract_doc, or None when the sentence could not be fully reduced.
    """
    normalized = replace_subject_code(sent)
    parsed = structure_nlp(constituency_nlp(expand_nlp(normalized)))
    return extract_doc(parsed)

In [1900]:
# Get all courses
# Active undergraduate courses whose "prereq" field is not null.
courses = list(
    catalog.get_collection("courses").find(
        {"prereq": {"$ne": None}, "career": "Undergraduate Programs", "active": True}
    )
)

# Destructive: the output collection is emptied before it is rebuilt, so an
# exception part-way through the loop below leaves it partially filled.
courses_prereq = catalog.get_collection("courses_prereq")
courses_prereq.delete_many({})

for course in courses:
    prereq = course["prereq"]

    # Skips falsy values such as an empty string that the query still lets through.
    if prereq:
        print(prereq)

        # None when the sentence could not be reduced to a single requirement.
        result = try_nlp(course, prereq)

        print(result)
        print("")

        # The row is stored even when parsing failed (prereq = None), together
        # with the original text.
        courses_prereq.insert_one(
            {"course": course["code"], "prereq_text": prereq, "prereq": result}
        )

Admission to the Haskayne School of Business and 12 units.
{'admission': 'the Haskayne School of Business and 12 units'}

24 units including Entrepreneurship and Innovation 201.
A - Sanity check failed
None

Admission to the Haskayne School of Business, and Accounting 217.
A - Sanity check failed
None

Admission to the Haskayne School of Business and Accounting 217 and 323.
{'and': [{'admission': 'the Haskayne School of Business'}, {'course': 'ACCT 217'}, {'course': 'ACCT 323'}]}

Admission to the Haskayne School of Business and Accounting 341.
{'and': [{'admission': 'the Haskayne School of Business'}, {'course': 'ACCT 341'}]}

24 units including Accounting 217 or 301. For certain topics consent of the Haskayne School of Business will also be required.
A - Sanity check failed
None

Admission to the Haskayne School of Business and Accounting 323.
{'and': [{'admission': 'the Haskayne School of Business'}, {'course': 'ACCT 323'}]}

Admission to the Haskayne School of Business, and 54 unit