In [577]:
import itertools
import os
import re

import spacy
from nltk import Tree
from pymongo import MongoClient
from spacy import displacy
from spacy.language import Language
from spacy.tokens import Doc, Span, Token

In [578]:
# Connect to mongodb
# The connection string is taken from the MONGODB_URI environment variable so
# that credentials do not have to be edited into the notebook. When the
# variable is unset, the original local-development URI is used as a fallback.
client = MongoClient(
    os.environ.get("MONGODB_URI", "mongodb://root:password@localhost:27017/")
)
catalog = client.get_database("catalog")

In [579]:
# Load every subject-code document, longest title first.
# `replace_subject_code` walks this list in order, so longer titles are
# substituted before shorter ones (presumably so a short title that is a
# prefix of a longer one does not match first -- TODO confirm).
subject_codes_docs = sorted(
    catalog.get_collection("subject_codes").find(),
    key=lambda entry: len(entry["title"]),
    reverse=True,
)

# title -> code lookup, and the bare list of codes in the same order.
subject_codes_map = dict((entry["title"], entry["code"]) for entry in subject_codes_docs)
subject_codes = list(map(lambda entry: entry["code"], subject_codes_docs))
subject_codes

['BIMA',
 'IFPE',
 'OBHR',
 'HROD',
 'IFPB',
 'CMMB',
 'SCPA',
 'EAPP',
 'APLA',
 'ENEE',
 'ALMC',
 'LLAC',
 'ENSF',
 'IPHE',
 'CEST',
 'CAAP',
 'EVDA',
 'IFPX',
 'ISEC',
 'EESS',
 'ASHA',
 'SUSE',
 'ENTI',
 'COMS',
 'MDPA',
 'LWFT',
 'STAS',
 'BTMA',
 'SEDV',
 'SGMA',
 'MDBT',
 'ANME',
 'MDPS',
 'MGIS',
 'EVDL',
 'RMIN',
 'EDTP',
 'EVDP',
 'PLMA',
 'EDBT',
 'GSXS',
 'MUHL',
 'MUTC',
 'MHST',
 'EALS',
 'DEST',
 'CMDA',
 'MDGE',
 'CMCL',
 'MDCH',
 'ENEN',
 'ENMF',
 'TDST',
 'IDST',
 'CORE',
 'MDPR',
 'BSEN',
 'ENFD',
 'SCMA',
 'INTR',
 'GRST',
 'EDPS',
 'BMEN',
 'ENME',
 'LAND',
 'SUST',
 'ENEL',
 'COOP',
 'LAST',
 'ASL',
 'COLT',
 'TRAN',
 'ENAE',
 'ENGO',
 'ENPE',
 'ENSC',
 'OPMA',
 'ARST',
 'INDL',
 'ENCH',
 'ENCM',
 'SENG',
 'EDER',
 'NTVE',
 'CTED',
 'EVDS',
 'REAL',
 'UNEX',
 'VETM',
 'TAP',
 'PHEN',
 'SAST',
 'ENDG',
 'ENPH',
 'CUSP',
 'SASO',
 'AMAT',
 'EAST',
 'HSOC',
 'LEAD',
 'MGST',
 'TOUR',
 'INDG',
 'ENER',
 'APSY',
 'PHED',
 'MUPF',
 'RELS',
 'ACSC',
 'QUAC',
 'ENCI',
 'E

In [580]:
# Base patterns
# Subject code: three or four upper-case letters, e.g. ART, MATH.
subject_code_regex = r"([A-Z]{3,4})"
# Course number: "NN-N", "NNN.N"/"NNN.NN", or two to three digits,
# e.g. 30-1, 599.45, 101. Alternatives are tried in that order.
course_number_regex = r"(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})"
# Full course code: subject code, one space, course number (two groups).
course_code_regex = " ".join((subject_code_regex, course_number_regex))

In [581]:
def replace_subject_code(sentence: str, loose: bool = False, subject_docs=None):
    """Rewrite subject titles in ``sentence`` as their subject codes.

    Args:
        sentence: Text that may contain subject titles.
        loose: When False (default) a title is only replaced if it is
            followed by a space and a course number (``course_number_regex``);
            the number is kept. When True every occurrence of the title is
            replaced, whether or not a number follows.
        subject_docs: Optional iterable of mappings with ``"title"`` and
            ``"code"`` keys, processed in the given order. Defaults to the
            notebook-level ``subject_codes_docs``.

    Returns:
        The sentence with the matching titles replaced by their codes.
    """
    if subject_docs is None:
        subject_docs = subject_codes_docs

    for subject_doc in subject_docs:
        code = subject_doc["code"]
        # Titles are literal text: escape them so characters such as "(", "+"
        # or "." neither break the pattern nor add capture groups that would
        # shift the course-number group.
        title_pattern = re.escape(subject_doc["title"])

        if loose:
            # A callable replacement inserts the code verbatim (no template
            # processing of backslashes in the replacement).
            sentence = re.sub(title_pattern, lambda _match: code, sentence)
        else:
            # Group 1 is the course number captured by course_number_regex.
            sentence = re.sub(
                title_pattern + " " + course_number_regex,
                lambda match: code + " " + match.group(1),
                sentence,
            )
    return sentence

In [582]:
def get_replacement_letter():
    """Yield the upper-case letters A..Z endlessly, restarting at A after Z."""
    # NOTE(review): `letter_generator` in a later cell yields the same sequence.
    while True:
        yield from "ABCDEFGHIJKLMNOPQRSTUVWXYZ"


# Single shared generator instance; each next() advances it.
replacement_letters = get_replacement_letter()

In [583]:
nlp = spacy.load("en_core_web_sm")


@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token following every ";" as the start of a new sentence.

    The final token is never inspected, since nothing follows it. Returns the
    same ``doc`` so the function can be used as a pipeline component.
    """
    for position in range(len(doc) - 1):
        if doc[position].text == ";":
            doc[position + 1].is_sent_start = True
    return doc


# Registered ahead of the parser component of the pipeline.
nlp.add_pipe("set_custom_boundaries", before="parser")

<function __main__.set_custom_boundaries(doc)>

In [584]:
def fix_ent_head(doc: Doc):
    """Re-attach course-number tokens to a subject-code ancestor, in place.

    For every token tagged ``NUM`` whose text starts with a course number
    (``course_number_regex``), the first entry of ``token.ancestors`` whose
    text is in ``subject_codes`` becomes the token's new head. Tokens without
    such an ancestor are left alone. Nothing is returned.
    """
    for sentence in doc.sents:
        for tok in sentence:
            if tok.pos_ != "NUM":
                continue
            # NOTE(review): re.match only anchors at the start of the text, so
            # a longer digit run (e.g. four digits) also passes this check.
            if re.match(course_number_regex, tok.text) is None:
                continue

            code_ancestors = [anc for anc in tok.ancestors if anc.text in subject_codes]
            first_code_ancestor = code_ancestors[0] if code_ancestors else None

            if first_code_ancestor:
                tok.head = first_code_ancestor


In [585]:
def expand_course_code(doc: Doc):
    """Rebuild the text so every course number is preceded by a subject code.

    Subject-code tokens are removed from their original positions. Each token
    tagged ``NUM`` whose text starts with a course number is then prefixed
    with the first subject code found by checking the token's head and, after
    that, the tokens to its left from nearest to farthest. All other tokens
    are copied through unchanged. The rebuilt text is parsed again with
    ``nlp`` and the resulting Doc is returned.

    NOTE(review): a subject-code token that no course number picks up is
    dropped from the rebuilt text.
    """
    pieces = []

    for token in doc:
        if token.text in subject_codes:
            continue

        is_course_number = (
            token.pos_ == "NUM"
            and re.match(course_number_regex, token.text) is not None
        )
        if is_course_number:
            # Head first, then everything to the left, closest token first.
            candidates = [token.head, *reversed(list(doc[: token.i]))]
            owner = next(
                (cand for cand in candidates if cand.text in subject_codes),
                None,
            )
            if owner is not None:
                pieces.append(owner.text_with_ws)

        pieces.append(token.text_with_ws)

    return nlp("".join(pieces))

In [586]:
def letter_generator():
    """Yield the upper-case letters A..Z endlessly, restarting at A after Z."""
    for letter in itertools.cycle("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
        yield letter


# Shared generator instance read by replace_course_code_with_letter.
letters = letter_generator()


def replace_course_code_with_letter(doc: Doc):
    """Replace each course code in ``doc.text`` with a single placeholder letter.

    Every match of ``course_code_regex`` is swapped for the next letter drawn
    from the notebook-level ``letters`` generator. Because that generator is
    shared, the letters continue from wherever the previous call stopped.

    Returns:
        A tuple ``(new_doc, replacements)``: ``new_doc`` is the substituted
        text parsed with ``nlp``; ``replacements`` is a list of
        ``(letter, match)`` pairs in match order, where ``match`` is the
        ``re.Match`` object for the replaced course code.
    """
    substitutions = []

    def _swap(found):
        placeholder = next(letters)
        substitutions.append((placeholder, found))
        return placeholder

    masked_text = re.compile(course_code_regex).sub(_swap, doc.text)
    return nlp(masked_text), substitutions

In [587]:
def tok_format(tok):
    """Return the node label ``"<orth_> (<dep_>)"`` for a token."""
    return "{} ({})".format(tok.orth_, tok.dep_)


def to_nltk_tree(node):
    """Convert a token and its dependency subtree into an ``nltk.Tree``.

    A token with no left or right children is returned as its plain label
    string (see ``tok_format``); otherwise a ``Tree`` labelled with the token
    is returned, with one recursively converted entry per child.
    """
    label = tok_format(node)
    if node.n_lefts + node.n_rights <= 0:
        return label

    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(label, subtrees)

In [600]:
# Driver cell: run one prerequisite sentence through the whole pipeline and
# show the resulting dependency structure.
# Alternative sample inputs, kept for manual experimentation:
# sent = "Actuarial Science 327; Statistics 323; 3 units from Mathematics 311, 313, 367 or 375; and 3 units from Computer Science 217, 231, 235 or Data Science 211."
# sent = "CPSC 457 and 3 units from SENG 300, 301 or ENSF 480; and admission to the Schulich School of Engineering."
# sent = "SGMA 395 or ENTI 317 or 381."
# sent = "One of FILM 321 or 323 and one of FILM 331 or 333."
# sent = "FILM 331 or 333."
# sent = "ENCI 473; and ENGG 319 or ENDG 319."
# sent = "3 units from ENCI 481, ENEE 377 or 519.09."
# sent = "ENEL 341, BMEN 327 or ENGG 225."
# sent = "ENEL 471; and one of BMEN 319 or ENGG 319 or ENEL 419."
# sent = "3 units from ENGG 319, ENDG 319 or ENEL 419."
# NOTE(review): the next assignment is overwritten by the line after it, so
# only the second sentence is actually processed.
sent = "FILM 201 and 3 units from 305 or 321."
sent = "One of GEOG 211, 251, 253, UBST 253, GLGY 201, 209; and consent of the Department."

# Step 1: turn "<subject title> <number>" into "<subject code> <number>".
sent = replace_subject_code(sent)

# Step 2: parse, re-attach course numbers to subject-code ancestors, then
# rebuild the text so every course number carries its subject code.
doc = nlp(sent)
fix_ent_head(doc)
new_doc = expand_course_code(doc)

# Step 3: swap each full course code for a placeholder letter and re-parse.
new_doc, replacements = replace_course_code_with_letter(new_doc)
print("New: ", new_doc)
print("Replacements: ", replacements)

# Step 4: print the conjuncts of each sentence root and its dependency tree.
# NOTE(review): the loop variable reuses the name `sent`, so after the loop
# `sent` refers to the last sentence span rather than the input string.
for sent in new_doc.sents:
    root = sent.root

    print("Conjuncts:",  root.conjuncts)

    tree = to_nltk_tree(sent.root)
    # to_nltk_tree returns a plain string for a childless root; only a Tree
    # can be pretty-printed.
    if isinstance(tree, Tree):
        tree.pretty_print()


# Render the dependency parse of the final (letter-substituted) doc inline.
# displacy.render(doc.sents, style="dep", jupyter=True, options={"compact": True, "distance": 100})
# displacy.render(doc, style="dep", jupyter=True, options={"compact": True, "distance": 100})
displacy.render(new_doc, style="dep", jupyter=True, options={"compact": True, "distance": 100})

New:  One of U, V, W, X, Y, Z; and consent of the Department.
Replacements:  [('U', <re.Match object; span=(7, 15), match='GEOG 211'>), ('V', <re.Match object; span=(17, 25), match='GEOG 251'>), ('W', <re.Match object; span=(27, 35), match='GEOG 253'>), ('X', <re.Match object; span=(37, 45), match='UBST 253'>), ('Y', <re.Match object; span=(47, 55), match='GLGY 201'>), ('Z', <re.Match object; span=(57, 65), match='GLGY 209'>)]
Conjuncts: ()
          One (ROOT)                                                 
              |                                                       
          of (prep)                                                  
              |                                                       
           U (pobj)                                                  
     _________|____________________                                   
    |                           V (conj)                             
    |          ____________________|_________                 