In [1115]:
import itertools
import os
import re

import spacy
from nltk import Tree
from pymongo import MongoClient
from spacy import displacy
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.pipeline import SpanRuler, EntityRuler
from spacy.tokens import Doc, Span, Token

In [1116]:
# Connect to MongoDB.
# The connection string is taken from the MONGODB_URI environment variable so
# that real credentials do not have to be stored in the notebook; when the
# variable is unset, the local development default used previously applies.
client = MongoClient(
    os.environ.get("MONGODB_URI", "mongodb://root:password@localhost:27017/")
)
catalog = client.get_database("catalog")

In [1117]:
# Fetch every subject-code document, longest title first.
# replace_subject_code iterates this list in order, so longer titles are
# substituted before shorter ones.
subject_codes_docs = sorted(
    catalog.get_collection("subject_codes").find(),
    key=lambda record: len(record["title"]),
    reverse=True,
)

# title -> code lookup, and the bare list of codes in the same order
subject_codes_map = {}
subject_codes = []
for entry in subject_codes_docs:
    subject_codes_map[entry["title"]] = entry["code"]
    subject_codes.append(entry["code"])

subject_codes

['BIMA',
 'IFPE',
 'OBHR',
 'HROD',
 'IFPB',
 'CMMB',
 'SCPA',
 'EAPP',
 'APLA',
 'ENEE',
 'ALMC',
 'LLAC',
 'ENSF',
 'IPHE',
 'CEST',
 'CAAP',
 'EVDA',
 'IFPX',
 'ISEC',
 'EESS',
 'ASHA',
 'SUSE',
 'ENTI',
 'COMS',
 'MDPA',
 'LWFT',
 'STAS',
 'BTMA',
 'SEDV',
 'SGMA',
 'MDBT',
 'ANME',
 'MDPS',
 'MGIS',
 'EVDL',
 'RMIN',
 'EDTP',
 'EVDP',
 'PLMA',
 'EDBT',
 'GSXS',
 'MUHL',
 'MUTC',
 'MHST',
 'EALS',
 'DEST',
 'CMDA',
 'MDGE',
 'CMCL',
 'MDCH',
 'ENEN',
 'ENMF',
 'TDST',
 'IDST',
 'CORE',
 'MDPR',
 'BSEN',
 'ENFD',
 'SCMA',
 'INTR',
 'GRST',
 'EDPS',
 'BMEN',
 'ENME',
 'LAND',
 'SUST',
 'ENEL',
 'COOP',
 'LAST',
 'ASL',
 'COLT',
 'TRAN',
 'ENAE',
 'ENGO',
 'ENPE',
 'ENSC',
 'OPMA',
 'ARST',
 'INDL',
 'ENCH',
 'ENCM',
 'SENG',
 'EDER',
 'NTVE',
 'CTED',
 'EVDS',
 'REAL',
 'UNEX',
 'VETM',
 'TAP',
 'PHEN',
 'SAST',
 'ENDG',
 'ENPH',
 'CUSP',
 'SASO',
 'AMAT',
 'EAST',
 'HSOC',
 'LEAD',
 'MGST',
 'TOUR',
 'INDG',
 'ENER',
 'APSY',
 'PHED',
 'MUPF',
 'RELS',
 'ACSC',
 'QUAC',
 'ENCI',
 'E

In [1118]:
# Base patterns
# Each pattern is wrapped in a capture group. The strings are reused in `re`
# calls and inside spaCy token patterns elsewhere in this notebook, so they
# are deliberately left unanchored here.
subject_code_regex = r"([A-Z]{3,4})"  # ART, MATH
course_number_regex = r"(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})"  # 101, 30-1, 599.45
# A subject code, a single space, then a course number.
course_code_regex = rf"{subject_code_regex} {course_number_regex}"

In [1119]:
def replace_subject_code(sentence: str, loose: bool = False):
    """Replace spelled-out subject titles in ``sentence`` with their subject codes.

    The documents in ``subject_codes_docs`` are applied in list order; each one
    supplies a ``"title"`` to look for and the ``"code"`` to substitute.

    Args:
        sentence: Text that may contain subject titles.
        loose: When False (default) a title is only replaced if it is followed
            by a single space and a course number (``course_number_regex``);
            the course number is kept. When True every occurrence of the title
            is replaced, whatever follows it.

    Returns:
        The sentence with the matching titles replaced by their codes.
    """
    for subject_code in subject_codes_docs:
        code = subject_code["code"]
        # Titles are plain text, not regular expressions: escape them so that
        # characters such as "(", ")", "+" or "." match literally and cannot
        # shift the capture-group numbering of the course number.
        title_pattern = re.escape(subject_code["title"])

        if loose:
            # A callable replacement inserts the code verbatim.
            sentence = re.sub(title_pattern, lambda _match: code, sentence)
        else:
            # Group 1 is the course number captured by course_number_regex.
            sentence = re.sub(
                rf"{title_pattern} {course_number_regex}",
                lambda match: f"{code} {match.group(1)}",
                sentence,
            )
    return sentence

In [1120]:
def get_replacement_letter():
    """Yield the capital letters "A" to "Z" endlessly, restarting at "A" after "Z"."""
    yield from itertools.cycle("ABCDEFGHIJKLMNOPQRSTUVWXYZ")


# Module-level generator instance built from the function above.
replacement_letters = get_replacement_letter()

In [1121]:
# Register the custom `course_code` token attribute (accessed as
# `token._.course_code`), defaulting to None. force=True overwrites an
# existing registration, so re-running this cell does not raise.
Token.set_extension("course_code", default=None, force=True)

In [1122]:
# Load spaCy's small English pipeline; the custom components defined in the
# following cells are appended to it.
nlp = spacy.load("en_core_web_sm")

In [1123]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token that follows each ";" as the start of a new sentence.

    The doc is updated in place and returned.
    """
    # Stop one token early: a ";" in final position has no successor to mark.
    for position in range(len(doc) - 1):
        if doc[position].text == ";":
            doc[position + 1].is_sent_start = True
    return doc

# nlp.add_pipe("set_custom_boundaries", before="parser")


In [1124]:
@Language.component("fix_ent_head")
def fix_ent_head(doc: Doc):
    """Attach course-number tokens to the subject code they belong to.

    For every token tagged NUM whose text begins with a course-number match,
    the first of its syntactic ancestors whose text is a known subject code
    becomes its head, and the combined code (subject text immediately followed
    by the number text) is stored in ``token._.course_code``. Numbers without
    such an ancestor are left untouched.

    Returns the same doc, modified in place.
    """
    for sentence in doc.sents:
        for tok in sentence:
            if tok.pos_ != "NUM":
                continue
            if re.match(course_number_regex, tok.text) is None:
                continue

            # First subject-code token in the order token.ancestors yields them.
            subject = next(
                (candidate for candidate in tok.ancestors if candidate.text in subject_codes),
                None,
            )

            if subject:
                tok.head = subject
                tok._.course_code = f"{subject.text}{tok.text}"

    return doc

nlp.add_pipe("fix_ent_head")

<function __main__.fix_ent_head(doc: spacy.tokens.doc.Doc)>

In [1125]:
@Language.component("expand_course_code")
def expand_course_code(doc: Doc):
    """Rewrite the text so every course number is preceded by a subject code.

    Subject-code tokens are dropped from their original position and re-emitted
    directly in front of each course number: e.g. "CPSC 217, 231" becomes
    "CPSC 217, CPSC 231". The subject code used for a number is the first
    match among the number's syntactic head and then the preceding tokens,
    nearest first. A number with no subject code to its left is copied as is.

    The rebuilt text is run through ``nlp`` again with this component disabled
    (to avoid infinite recursion) and its entities are cleared.

    Returns:
        A new Doc built from the expanded text (not the doc passed in).
    """
    pieces = []

    for token in doc:
        if token.text in subject_codes:
            # Emitted later, in front of the course number(s) it applies to.
            continue

        if token.pos_ == "NUM" and re.match(course_number_regex, token.text) is not None:
            # Syntactic head first, then everything to the left, nearest first.
            candidates = [token.head, *reversed(list(doc[: token.i]))]
            subject = next(
                (candidate for candidate in candidates if candidate.text in subject_codes),
                None,
            )
            if subject is not None:
                pieces.append(subject.text_with_ws)

        pieces.append(token.text_with_ws)

    # select_pipes(disable=...) is the spaCy v3 replacement for the deprecated
    # disable_pipes(); the component is re-enabled when the block exits.
    with nlp.select_pipes(disable=["expand_course_code"]):
        new_doc = nlp("".join(pieces))
        new_doc.ents = []

    return new_doc

nlp.add_pipe("expand_course_code")

<function __main__.expand_course_code(doc: spacy.tokens.doc.Doc)>

In [1126]:
entity_ruler: EntityRuler = nlp.add_pipe("entity_ruler")

# COURSE: a token matching the subject-code regex followed by a token
# matching the course-number regex.
course_pattern = {
    "label": "COURSE",
    "pattern": [
        {"TEXT": {"REGEX": subject_code_regex}},
        {"TEXT": {"REGEX": course_number_regex}},
    ],
}

# RQ: the literal token "RQ" followed by a token containing a capital letter.
rq_pattern = {
    "label": "RQ",
    "pattern": [
        {"TEXT": "RQ"},
        {"TEXT": {"REGEX": "[A-Z]"}},
    ],
}

patterns = [course_pattern, rq_pattern]

# Reset the ruler before registering the patterns.
entity_ruler.clear()
entity_ruler.add_patterns(patterns)

In [1127]:
@Language.component("merge_entity_spans")
def merge_entity_spans(doc: Doc):
    """Collapse each entity span in ``doc.ents`` into a single token.

    The doc is retokenized in place and returned.
    """
    labelled = [entity for entity in doc.ents if entity.label_ is not None]
    with doc.retokenize() as retokenizer:
        for entity in labelled:
            retokenizer.merge(entity)
    return doc

nlp.add_pipe("merge_entity_spans")

<function __main__.merge_entity_spans(doc: spacy.tokens.doc.Doc)>

In [1128]:
# Display the names of the pipeline components, in order, to confirm the
# custom components were registered.
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'fix_ent_head',
 'expand_course_code',
 'entity_ruler',
 'merge_entity_spans']

In [1129]:
def letter_generator():
    """Yield "A" through "Z" forever, wrapping back to "A" after "Z"."""
    for letter in itertools.cycle("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
        yield letter

# Shared generator; replace_segments draws its placeholder letters from it.
letters = letter_generator()

In [1130]:
def tok_format(tok):
    """Return the display label for a token: its text, then its dependency label in parentheses."""
    # Alternative label kept for reference: "_".join([tok.orth_, tok.tag_])
    return "{} ({})".format(tok.orth_, tok.dep_)


def to_nltk_tree(node):
    """Convert the dependency subtree rooted at ``node`` into an nltk ``Tree``.

    A token without children is returned as its plain label string; otherwise
    a ``Tree`` labelled with the token is returned, with one converted entry
    per child.
    """
    label = tok_format(node)
    has_children = node.n_lefts + node.n_rights > 0
    if not has_children:
        return label

    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(label, subtrees)

In [1131]:
matcher = Matcher(nlp.vocab)

# "<digits> unit(s) <one or more adpositions>" ...
units_prefix = [
    {"IS_DIGIT": True},
    {"LEMMA": "unit"},
    {"POS": "ADP", "OP": "+"},
]

# ... optionally followed by up to seven COURSE entities, each of which may
# be followed by "and", "or" or ",". Fresh dicts are built for every slot.
optional_course_items = [
    item
    for _slot in range(7)
    for item in (
        {"ENT_TYPE": "COURSE", "OP": "?"},
        {"TEXT": {"IN": ["and", "or", ","]}, "OP": "?"},
    )
]

matcher.add(
    "X units of",
    [units_prefix + optional_course_items],
    greedy="LONGEST",
)

# "one <adposition(s)> <non-punctuation token> ..." running on through tokens
# that contain a letter, comma or space, and ending on a token that does not
# start a sentence.
matcher.add(
    "One of",
    [
        [
            {"LEMMA": "one"},
            {"POS": "ADP", "OP": "+"},
            # The Universal POS tag for punctuation is "PUNCT"; the previous
            # value "PUNC" matches no tag, so the filter excluded nothing.
            {"POS": {"NOT_IN": ["PUNCT"]}},
            {"TEXT": {"REGEX": "[A-Za-z, ]"}, "OP": "*"},
            {"IS_SENT_START": False},
        ]
    ],
    greedy="LONGEST",
)

# "consent <adposition(s)> ..." running on through tokens that contain a
# letter, comma or space, and ending on a token that does not start a sentence.
consent_pattern = [
    {"LEMMA": "consent"},
    {"POS": "ADP", "OP": "+"},
    {"TEXT": {"REGEX": "[A-Za-z, ]"}, "OP": "*"},
    {"IS_SENT_START": False},
]
matcher.add("Consent of", [consent_pattern], greedy="LONGEST")

# "admission <adposition(s)> ..." running on through tokens that contain a
# letter, comma or space, but stopping at the tokens "and" / "or".
admission_pattern = [
    {"LEMMA": "admission"},
    {"POS": "ADP", "OP": "+"},
    {"TEXT": {"REGEX": "[A-Za-z, ]", "NOT_IN": ["and", "or"]}, "OP": "*"},
]
matcher.add("Admission to", [admission_pattern], greedy="LONGEST")

# "both <alphabetic token> and <alphabetic token>" followed by one more token
# that does not start a sentence.
both_pattern = [
    {"LEMMA": "both"},
    {"IS_ALPHA": True},
    {"LEMMA": "and"},
    {"IS_ALPHA": True},
    {"IS_SENT_START": False},
]
matcher.add("Both A and B", [both_pattern], greedy="LONGEST")

def replace_segments(doc: Doc):
    """Swap every matcher hit in ``doc`` for an "RQ <letter>" placeholder.

    Each match reported by the module-level ``matcher`` takes the next letter
    from the shared ``letters`` generator, and every literal occurrence of the
    matched text in the running string is replaced by "RQ <letter>".

    Returns:
        A tuple ``(new_doc, spans)``: the placeholder text parsed by ``nlp``,
        and the matched spans of the original doc in match order.
    """
    text = doc.text
    spans = []

    for _match_id, start, end in matcher(doc):
        placeholder = f"RQ {next(letters)}"

        matched = doc[start:end]
        spans.append(matched)

        # Plain literal replacement of all occurrences of the matched text.
        text = text.replace(matched.text, placeholder)

    return nlp(text), spans

In [1134]:
# Example prerequisite strings to try. Only the LAST uncommented `sent`
# assignment below takes effect; the earlier ones are overwritten immediately.
# sent = "Actuarial Science 327; Statistics 323; 3 units from Mathematics 311, 313, 367 or 375; and 3 units from Computer Science 217, 231, 235 or Data Science 211."
# sent = "CPSC 457 and 3 units from SENG 300, 301 or ENSF 480; and admission to the Schulich School of Engineering."
# sent = "SGMA 395 or ENTI 317 or 381."
# sent = "One of FILM 321 or 323 and one of FILM 331 or 333."
# sent = "FILM 331 or 333."
# sent = "ENCI 473; and ENGG 319 or ENDG 319."
# sent = "3 units from ENCI 481, ENEE 377 or 519.09."
# sent = "ENEL 341, BMEN 327 or ENGG 225."
# sent = "ENEL 471; and one of BMEN 319 or ENGG 319 or ENEL 419."
# sent = "3 units from ENGG 319, ENDG 319 or ENEL 419."
# sent = "FILM 201 and 3 units from 305 or 321."
# sent = "INDG 201 and 3 units from INDG 303 or 345."
# sent = "One of GEOG 211, 251, 253, UBST 253, GLGY 201, 209; and consent of the Department."
# sent = "STAT 205 or 213; and admission to the Kinesiology Honours program; and consent of the Faculty."
# sent = "MATH 209 and admission to the Energy Engineering program."
# sent = "Both MATH 349 and 353; or both MATH 283 and 381; or MATH 267."
# sent = "MATH 431 or PMAT 431; MATH 429 or PMAT 429 or MATH 327 or PMAT 427."
# sent = "MATH 445 or 447; 3 units of Mathematics in the Field of Mathematics at the 400 level or above."
# sent = "MATH 383; and 6 units of Mathematics in the Field of Mathematics at the 400 level or above."
# sent = "MRSC 451 and consent of the Department."
sent = "Admission to the Haskayne School of Business and OBHR 317."
sent = "PHYS 211 or 221 or 227."
sent = "MATH 277 and PHYS 259 and admission to a program in Engineering."
sent = "PHYS 341; and 3 units from CPSC 217, 231 or DATA 211."

# Replace spelled-out subject titles that are followed by a course number
# with their subject codes (strict mode), then show the resulting text.
sent = replace_subject_code(sent)
print("Original:", sent)

# Run the full pipeline, including the custom components added above.
doc = nlp(sent)
print("NLP     :", doc)

print(doc.ents)


# Print each sentence's dependency tree and the conjuncts of its root.
# NOTE: this loop rebinds `sent` from the input string to a spaCy Span.
for sent in doc.sents:
    root = sent.root
    tree = to_nltk_tree(sent.root)
    # A root without children comes back as a plain string, which has no pretty_print.
    if isinstance(tree, Tree):
        tree.pretty_print()

    print(list(root.conjuncts))


# Replace matcher hits with "RQ <letter>" placeholders and re-parse the text.
# NOTE: `doc` is rebound to the re-parsed document from here on.
doc, spans = replace_segments(doc)
print("New: ", doc)
print("Spans: ", spans)

print(doc.ents)

# Render the entity and dependency visualisations inline in the notebook.
displacy.render(doc, style="ent", jupyter=True, options={"compact": True, "distance": 100})
displacy.render(doc, style="dep", jupyter=True, options={"compact": True, "distance": 100})
# # displacy.render(doc, style="dep", jupyter=True, options={"compact": True, "distance": 100})
# displacy.render(new_doc, style="dep", jupyter=True, options={"compact": True, "distance": 100})

Original: PHYS 341; and 3 units from CPSC 217, 231 or DATA 211.
NLP     : PHYS 341; and 3 units from CPSC 217, CPSC 231 or DATA 211.
(PHYS 341, CPSC 217, CPSC 231, DATA 211)
                             PHYS 341 (ROOT)                                                                        
     _______________________________|_______________________                                                         
    |        |         |                               units (conj)                                                 
    |        |         |             _______________________|______________                                          
    |        |         |            |                                 from (prep)                                   
    |        |         |            |                                      |                                         
    |        |         |            |                               CPSC 217 (pobj)                                 
    