In [1]:
import os
import re

import spacy
from nltk import Tree
from pymongo import MongoClient
from spacy import displacy
from spacy.language import Language
from spacy.tokens import Doc, Span, Token

In [316]:
# Connect to mongodb
# The connection string is taken from the MONGODB_URI environment variable so
# that real credentials do not have to be written into the notebook. When the
# variable is unset, the original local connection string is used as a fallback.
client = MongoClient(
    os.environ.get("MONGODB_URI", "mongodb://root:password@localhost:27017/")
)
catalog = client.get_database("catalog")

In [344]:
# Subject documents ordered by title length, longest title first.
# replace_subject_code iterates over this list in exactly this order.
subject_codes_docs = sorted(
    catalog.get_collection("subject_codes").find(),
    key=lambda entry: len(entry["title"]),
    reverse=True,
)

# title -> code lookup, and the bare list of codes in the same order.
subject_codes_map = dict(
    (entry["title"], entry["code"]) for entry in subject_codes_docs
)
subject_codes = [entry["code"] for entry in subject_codes_docs]
subject_codes

['BIMA',
 'IFPE',
 'OBHR',
 'HROD',
 'IFPB',
 'CMMB',
 'SCPA',
 'EAPP',
 'APLA',
 'ENEE',
 'ALMC',
 'LLAC',
 'ENSF',
 'IPHE',
 'CEST',
 'CAAP',
 'EVDA',
 'IFPX',
 'ISEC',
 'EESS',
 'ASHA',
 'SUSE',
 'ENTI',
 'COMS',
 'MDPA',
 'LWFT',
 'STAS',
 'BTMA',
 'SEDV',
 'SGMA',
 'MDBT',
 'ANME',
 'MDPS',
 'MGIS',
 'EVDL',
 'RMIN',
 'EDTP',
 'EVDP',
 'PLMA',
 'EDBT',
 'GSXS',
 'MUHL',
 'MUTC',
 'MHST',
 'EALS',
 'DEST',
 'CMDA',
 'MDGE',
 'CMCL',
 'MDCH',
 'ENEN',
 'ENMF',
 'TDST',
 'IDST',
 'CORE',
 'MDPR',
 'BSEN',
 'ENFD',
 'SCMA',
 'INTR',
 'GRST',
 'EDPS',
 'BMEN',
 'ENME',
 'LAND',
 'SUST',
 'ENEL',
 'COOP',
 'LAST',
 'ASL',
 'COLT',
 'TRAN',
 'ENAE',
 'ENGO',
 'ENPE',
 'ENSC',
 'OPMA',
 'ARST',
 'INDL',
 'ENCH',
 'ENCM',
 'SENG',
 'EDER',
 'NTVE',
 'CTED',
 'EVDS',
 'REAL',
 'UNEX',
 'VETM',
 'TAP',
 'PHEN',
 'SAST',
 'ENDG',
 'ENPH',
 'CUSP',
 'SASO',
 'AMAT',
 'EAST',
 'HSOC',
 'LEAD',
 'MGST',
 'TOUR',
 'INDG',
 'ENER',
 'APSY',
 'PHED',
 'MUPF',
 'RELS',
 'ACSC',
 'QUAC',
 'ENCI',
 'E

In [345]:
# Base patterns
# Subject code: one captured group of 3-4 word characters, e.g. ART, MATH.
subject_code_regex = r"(\w{3,4})"

# Course number: one captured group; the alternatives are tried left to right.
course_number_regex = "(" + "|".join(
    [
        r"\d{2}-\d",  # 30-1
        r"\d{3}\.\d{1,2}",  # 599.45
        r"\d{2,3}",  # 101
    ]
) + ")"

# Full course code: subject code, a single space, course number.
course_code_regex = " ".join((subject_code_regex, course_number_regex))

In [346]:
def replace_subject_code(sentence: str, loose: bool = False):
    """Replace subject titles in ``sentence`` with their subject codes.

    The titles and codes come from the module-level ``subject_codes_docs``
    list, which is walked in its existing order.

    Args:
        sentence: Text in which titles should be replaced.
        loose: When true, every occurrence of a title is replaced. Otherwise a
            title is replaced only when it is followed by a space and a course
            number (``course_number_regex``); the number is kept after the code.

    Returns:
        The sentence with the matching titles replaced by codes.
    """
    for subject in subject_codes_docs:
        # Titles are plain text, not patterns: escape them so that characters
        # such as "(", "+" or "." are matched literally.
        title_pattern = re.escape(subject["title"])
        code = subject["code"]

        # A callable replacement inserts the code verbatim, so nothing in it is
        # interpreted as a backslash escape or group reference.
        if loose:
            sentence = re.sub(title_pattern, lambda _match: code, sentence)
        else:
            sentence = re.sub(
                rf"{title_pattern} {course_number_regex}",
                lambda match: f"{code} {match.group(1)}",
                sentence,
            )
    return sentence

In [347]:
def process_n_units_of_things(groups, course):
    """Turn the captured groups of the "N units ..." pattern into a requirement.

    Args:
        groups: Four captured values, in order: number of units, subject
            title(s), level, and the word "above". Any of them may be empty.
        course: Unused here; accepted so all pattern processors share one
            signature.

    Returns:
        ``{"units": {...}}`` holding ``required`` plus whichever of
        ``subject`` / ``subjects`` / ``level`` / ``is_above`` apply.
    """
    requirement = {"required": int(groups[0]) if groups[0] else None}

    if groups[1]:
        # Titles become codes first, then "A, B or C" is flattened to "A, B, C".
        flattened = replace_subject_code(groups[1], loose=True).replace(" or ", ", ")
        names = [name.strip() for name in flattened.split(", ")]

        if len(names) > 1:
            requirement["subjects"] = names
        elif names[0]:
            # A single, non-empty name is reported under the singular key.
            requirement["subject"] = names[0]

    if groups[2]:
        level_value = int(groups[2])
        if level_value:
            requirement["level"] = level_value

    if groups[3] and groups[3] == "above":
        requirement["is_above"] = True

    return {"units": requirement}

In [348]:
def consent_of_dept_fac(groups, course):
    """Build a consent requirement for a department, faculty or school.

    Args:
        groups: Captured groups; ``groups[0]`` is the kind of unit whose
            consent is required ("Department", "Faculty" or "School", in any
            letter case).
        course: Course document; its optional ``"departments"`` list supplies
            the value. The list is read but never reordered.

    Returns:
        ``{"consent": {"department": <longest entry>}}`` for a department,
        ``{"consent": {"faculty": <shortest entry>}}`` for a faculty or school
        (the value is ``None`` when the course lists no departments), or
        ``None`` for any other kind.
    """
    kind = str(groups[0]).lower()

    # sorted() returns a new list, so the caller's course["departments"] keeps
    # its original order. Longest entries come first.
    depts: list[str] = sorted(course.get("departments") or [], key=len, reverse=True)

    if kind == "department":
        return {"consent": {"department": depts[0] if depts else None}}

    if kind in ("faculty", "school"):
        return {"consent": {"faculty": depts[-1] if depts else None}}

    return None
    
def consent_of_program(groups, course):
    """Build a consent requirement naming the program captured in ``groups[0]``.

    ``course`` is unused; it is accepted so all pattern processors share one
    signature.
    """
    program_name = str(groups[0])
    consent = {"program": program_name}
    return {"consent": consent}

In [349]:
# Patterns
# Each entry pairs a "regex" with a "process" callable. The callable receives
# the list of captured groups and the course dict, and returns the parsed
# requirement. try_patterns walks this list in order and uses the first regex
# that matches, so the order of the entries matters.
patterns = [
    {
        # ANTH 201.
        "regex": rf"^{course_code_regex}\.?$",
        "process": lambda groups, course: {"and": [{"course": groups[0] + groups[1]}]},
    },
    {
        # ANTH 201 and 311. OR ANTH 201, 311.
        "regex": rf"^{course_code_regex}(?: and |, ){course_number_regex}\.?$",
        "process": lambda groups, course: {
            "and": [
                {"course": groups[0] + groups[1]},
                {"course": groups[0] + groups[2]},
            ]
        },
    },
    {
        # ANTH 201 and MATH 311.
        "regex": rf"^{course_code_regex} and {course_code_regex}\.?$",
        "process": lambda groups, course: {
            "and": [
                {"course": groups[0] + groups[1]},
                {"course": groups[2] + groups[3]},
            ]
        },
    },
    {
        # ANTH 201 or 311.
        "regex": rf"^{course_code_regex} or {course_number_regex}\.?$",
        "process": lambda groups, course: {
            "or": [
                {"course": groups[0] + groups[1]},
                {"course": groups[0] + groups[2]},
            ]
        },
    },
    {
        # ANTH 201 or MATH 311.
        "regex": rf"^{course_code_regex} or {course_code_regex}\.?$",
        "process": lambda groups, course: {
            "or": [
                {"course": groups[0] + groups[1]},
                {"course": groups[2] + groups[3]},
            ]
        },
    },
    {
        # ANTH 201, 311, 329, ..., 401 and 411.
        # groups[1] is the whole comma-separated run of numbers and groups[-1]
        # is the final number after the last separator.
        "regex": rf"^{subject_code_regex} ((?:{course_number_regex})(?:, {course_number_regex})+)(?:, and | and |, ){course_number_regex}\.?$",
        "process": lambda groups, course: {
            "and": [
                {"course": groups[0] + number}
                for number in list(groups[1].split(", ")) + [groups[-1]]
            ]
        },
    },
    {
        # ANTH 201, 311, 329, ..., 401 or 411.
        # One of MATH 249, 265, ..., 271 or 275.
        "regex": rf"^(?:One of )?{subject_code_regex} ((?:{course_number_regex})(?:, {course_number_regex})+)(?:, or | or ){course_number_regex}\.?$",
        "process": lambda groups, course: {
            "or": [
                {"course": groups[0] + number}
                for number in list(groups[1].split(", ")) + [groups[-1]]
            ]
        },
    },
    {
        # ANTH 201 and either ANTH 311 or 329.
        # The subject code is repeated after "either"; the last number reuses it.
        "regex": rf"^{course_code_regex} and either {course_code_regex} or {course_number_regex}\.?$",
        "process": lambda groups, course: {
            "and": [
                {"course": groups[0] + groups[1]},
                {
                    "or": [
                        {"course": groups[2] + groups[3]},
                        {"course": groups[2] + groups[4]},
                    ]
                },
            ]
        },
    },
    {
        # This regex handles the following cases:
        # "24 units."
        # "18 units at the 200 level."
        # "6 units of English at the 400 level."
        # "6 units of courses labelled English."
        # "9 units in courses labelled Geography at the 300 level or above."
        # "9 units in courses labelled Dance, Drama, Music, Music Performance or School of Creative and Performing Arts."
        # "3 units in a course labelled Arabic Language and Muslim Cultures at the 300 level or above."
        "regex": rf"^(\d+) units(?:| (?:of|in)?(?: (?:courses|a course) labelled | )([A-Z][a-zA-Z ,]*))(?:| at the (\d+) level(?:| or (above)))\.?$",
        "process": process_n_units_of_things,
    },
    {
        # Consent of the Department.
        "regex": r"(?i)^Consent of the (Department|Faculty|School)\s?\.?$",
        "process": consent_of_dept_fac,
    },
    {
        # Consent of the BHSc Honours program.
        # The class is [A-Za-z ]: the former "A-z" range also admitted the
        # punctuation characters that sit between "Z" and "a".
        "regex": r"(?i)^Consent of the ([A-Za-z ]*?) Program\.?$",
        "process": consent_of_program,
    },
    {
        # DRAM 316 or both 301 and 302.
        # Anchored at the start like every other pattern, so leading text that
        # the pattern does not describe cannot be silently ignored.
        "regex": rf"^{course_code_regex} or both {course_number_regex} and {course_number_regex}\.?$",
        "process": lambda groups, course: {
            "or": [
                {"course": groups[0] + groups[1]},
                {
                    "and": [
                        {"course": groups[0] + groups[2]},
                        {"course": groups[0] + groups[3]},
                    ]
                },
            ]
        },
    },
]

In [350]:
# Prerequisite sentences collected while developing the patterns. Only the LAST
# entry is bound to `sent`; move a sentence to the end of the list to try it.
sent = [
    "Anthropology 201 and 311.",
    "Anthropology 210 or 213.",
    "Anthropology 203.",
    "Arabic Language and Muslim Cultures 204.",
    "Astrophysics 401 and Physics 343.",
    "Biomedical Engineering 103, 309, 310, 319, and 327.",
    "Computer Science 471 or Data Science 311.",
    "Drama 200, 223, 225, and 340.",
    "English 302 and either English 240 or 340.",
    "Geomatics Engineering 363, 401, 402 and 421.",
    "Sociology 313, 315, 325, 331, 333.",
    "Sociology 313, 315, 331, 333.",
    "Software Engineering 300 or 301.",
    "Software Engineering 300, 301, or 311.",
    "24 units.",
    "18 units at the 200 level.",
    "6 units of English at the 400 level.",
    "6 units of courses labelled English.",
    "9 units in courses labelled Geography at the 300 level or above.",
    "9 units in courses labelled Dance, Drama, Music, Music Performance or School of Creative and Performing Arts.",
    "BIOL 313, 315.",
    "3 units in a course labelled Arabic Language and Muslim Cultures at the 300 level or above.",
    "Consent of the Department.",
    "DRAM 316 or both 301 and 302.",
    "One of MATH 249, 265 or 275.",
    "Consent of the Faculty.",
    "Consent of the Department .",
    "54 units and consent of the Department.",
    "Consent of the BHSc Honours program.",
    "ENGG 225, MATH 277 and PHYS 259.",
    "SENG 315, 352, PHYS 217, 219, and CPSC 503.",
][-1]

# Minimal stand-in for a course document.
course = dict(prereq=sent, departments=["ANTH"])

# Show the sentence before and after subject titles are replaced by codes.
print(sent)
print(replace_subject_code(sent))
print("")

def try_patterns(sent: str, course: dict, patterns: list) -> dict | None:
    sent = replace_subject_code(sent)

    for pattern in patterns:
        regex, process = pattern["regex"], pattern["process"]
        matches = re.search(regex, sent)
        print(regex)

        if matches:
            groups = list(matches.groups())
            result = process(groups, course)
            return result

# Run the pattern matchers on the selected sentence and print what they return
# (None when no pattern matches).
result = try_patterns(sent, course, patterns)
print(result)

SENG 315, 352, PHYS 217, 219, and CPSC 503.
SENG 315, 352, PHYS 217, 219, and CPSC 503.

^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})(?: and |, )(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) and (\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) or (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) or (\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) ((?:(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))(?:, (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))+)(?:, and | and |, )(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(?:One of )?(\w{3,4}) ((?:(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))(?:, (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))+)(?:, or | or )(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) and either (\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) or (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\d+) units(?:| (?:of|in)?(?: (?:courses|a cou

In [324]:
def try_compound_patterns(sent: str, course: dict, patterns: list):
    """Parse a sentence made of "; "-separated clauses joined by "and".

    The sentence is handled only when it contains ";" and either every clause
    after the first, or at least the last clause, starts with "and ". Each
    clause (with a leading "and " removed) must be parsed by ``try_patterns``.

    Returns:
        ``{"and": [<result per clause>]}``, or ``None`` when the sentence does
        not have that shape or any clause cannot be parsed.
    """
    if ";" not in sent:
        return None

    clauses = sent.split("; ")

    # Every clause after the first starts with "and", or at least the last does.
    followers_all_and = all(clause.startswith("and ") for clause in clauses[1:])
    final_has_and = clauses[-1].startswith("and ")
    if not followers_all_and and not final_has_and:
        return None

    parsed = []
    for clause in clauses:
        body = clause[4:] if clause.startswith("and ") else clause
        outcome = try_patterns(body, course, patterns)
        if not outcome:
            # One unparseable clause makes the whole sentence unparseable.
            return None
        parsed.append(outcome)

    return {"and": parsed}

In [329]:
def try_n_units_from_course_set(sent: str, course: dict, patterns: list):
    """Parse sentences of the form "<N> units from <course set>.".

    The text after "from" is handed to ``try_patterns``. If its result has a
    non-empty "and" or "or" list, that list is used directly; otherwise the
    whole result is kept.

    Returns:
        ``{"units": {"required": N, "from": ...}}``, or ``None`` when the
        sentence has a different shape or the course set cannot be parsed.
    """
    found = re.search(r"^(\d+) units from (.*)\.$", sent)
    if found is None:
        return None

    unit_count, course_set_text = found.groups()

    # Check if the remaining sentence is a course set
    parsed = try_patterns(course_set_text, course, patterns)
    if not parsed:
        return None

    # Unwrap the list of alternatives when the parse produced one.
    members = parsed.get("and") or parsed.get("or")
    if members:
        parsed = members

    requirement = {"required": int(unit_count), "from": parsed}
    return {"units": requirement}

In [454]:
# NLP
nlp = spacy.load("en_core_web_sm")


@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token that follows every ";" as the start of a new sentence."""
    # The last token is skipped because nothing follows it.
    for index in range(len(doc) - 1):
        if doc[index].text == ";":
            doc[index + 1].is_sent_start = True
    return doc


# The component is inserted into the pipeline ahead of the parser.
nlp.add_pipe("set_custom_boundaries", before="parser")


def _relink_to_subject_code(token):
    """Make the nearest ancestor that is a known subject code the token's head."""
    for ancestor in token.ancestors:
        if ancestor.pos_ == "PROPN" and ancestor.text in subject_codes:
            token.head = ancestor
            break


def fix_ent_head(doc: Doc):
    """Adjust the dependency heads of a parsed prerequisite text.

    For every sentence of ``doc``:

    * a NUM token whose text starts with a course number is re-attached to the
      nearest ancestor that is a subject code;
    * any other NUM token directly followed by a token with lemma "unit" is
      attached to that token as ``nummod``;
    * a token with dependency "cc" is re-attached to the nearest ancestor that
      is a subject code.

    Each token and the decision taken for it is printed along the way.

    Args:
        doc: Parsed spaCy document; its tokens are modified in place.

    Returns:
        The list of sentence spans of ``doc``, in document order.
    """
    sentences: list[Span] = []

    for sent in doc.sents:
        print("")
        print("Sentence: ", sent.text, sent.label_)

        sentence = sent
        sentences.append(sentence)

        for token in sentence:
            print(token.text, token.pos_, token.dep_, token.head.text)

            # If the current token is a course number
            if token.pos_ == "NUM" and re.match(course_number_regex, token.text):
                print(
                    "Token: ",
                    token.text,
                    "   ",
                    token.pos_,
                )

                # Re-link its ancestor to a subject code
                _relink_to_subject_code(token)

                print("")

            elif token.pos_ == "NUM":
                # Token.nbor() raises IndexError on the last token of the doc,
                # so only look ahead when a following token exists.
                neighbor = token.nbor() if token.i + 1 < len(doc) else None

                if neighbor is not None and neighbor.lemma_ == "unit":
                    token.head = neighbor
                    token.dep_ = "nummod"

                print("Token: ", token.text)
                print("")

            elif token.dep_ == "cc":
                print("Token: ", token.text)
                # Re-link its ancestor to a subject code
                _relink_to_subject_code(token)

                print("")

    return sentences


def tok_format(tok):
    """Label a token as "<text> (<dependency>)" for display in a tree."""
    return "{} ({})".format(tok.orth_, tok.dep_)


def to_nltk_tree(node):
    """Convert a token and its dependents into an nltk ``Tree``.

    A token without children is returned as its formatted label (a plain
    string); otherwise a ``Tree`` is built recursively from its children.
    """
    label = tok_format(node)

    if node.n_lefts + node.n_rights <= 0:
        return label

    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(label, subtrees)


def try_nlp(sent: str, course: dict):
    """Parse ``sent`` with spaCy and display each sentence's dependency tree.

    The parsed document is first adjusted by ``fix_ent_head``. Every sentence
    is then shown twice: as a text tree and as a displacy dependency diagram.
    ``course`` is not used, and nothing is returned.
    """
    parsed_sentences = fix_ent_head(nlp(sent))

    for parsed_sentence in parsed_sentences:
        # Text rendering: a childless root comes back as a plain string.
        tree = to_nltk_tree(parsed_sentence.root)
        if not isinstance(tree, Tree):
            print(tree)
        else:
            tree.pretty_print()

        # Graphical rendering inside the notebook.
        displacy.render(
            parsed_sentence,
            style="dep",
            jupyter=True,
            options={"compact": True, "distance": 100},
        )

In [455]:
def try_everything(sent: str, course: dict, patterns: list):
    """Run the parsing strategies in order and return the first truthy result.

    Order: plain patterns, compound "; and" sentences, "N units from ..."
    sentences, then the NLP fallback, whose result is returned as-is when every
    earlier strategy came back empty.
    """
    # `or` evaluates lazily, so a strategy runs only if all earlier ones failed.
    return (
        try_patterns(sent, course, patterns)
        or try_compound_patterns(sent, course, patterns)
        or try_n_units_from_course_set(sent, course, patterns)
        or try_nlp(sent, course)
    )

In [456]:
# Sentences the regex patterns cannot handle yet. Only the LAST entry is bound
# to `sent`; move a sentence to the end of the list to inspect its parse.
sent = [
    "SENG 315, 352, PHYS 217, 219, and CPSC 503.",
    "CPSC 457; and 3 units from CPSC 351, MATH 271 or 273.",
    "CPSC 351; or 3 units from CPSC 219, 233 or 235 and 3 units from MATH 271, 273, 315 and 3 units from STAT 205, 213, 321.",
    "CPSC 457 and 3 units from MATH 321, STAT 205, 211, 213 or 321.",
][-1]

# Minimal stand-in for a course document.
course = dict(prereq=sent, departments=["ANTH"])

# try_nlp prints and renders the parse; the value it returns is printed last.
result = try_nlp(sent, course)
print(result)


Sentence:  CPSC 457 and 3 units from MATH 321, STAT 205, 211, 213 or 321. 
CPSC PROPN ROOT CPSC
457 NUM nummod units
Token:  457     NUM

and CCONJ cc 457
Token:  and

3 NUM conj 457
Token:  3

units NOUN appos CPSC
from ADP prep units
MATH PROPN pobj from
321 NUM nummod MATH
Token:  321     NUM

, PUNCT punct CPSC
STAT PROPN appos CPSC
205 NUM nummod STAT
Token:  205     NUM

, PUNCT punct STAT
211 NUM nummod STAT
Token:  211     NUM

, PUNCT punct 211
213 NUM conj 211
Token:  213     NUM

or CCONJ cc 213
Token:  or

321 NUM conj 213
Token:  321     NUM

. PUNCT punct CPSC
                                                                   CPSC (ROOT)                                                                     
      __________________________________________________________________|___________________________________                                        
     |          |         |         |                units (appos)                                          |            

None


In [327]:
# Get all courses
courses = list(
    catalog.get_collection("courses").find(
        {"prereq": {"$ne": None}, "career": "Undergraduate Programs"}
    )
)

# The output collection is emptied before it is refilled below.
courses_prereq = catalog.get_collection("courses_prereq")
courses_prereq.delete_many({})

for course in courses:
    prereq = course["prereq"]

    # Nothing to parse for an empty prerequisite text.
    if not prereq:
        continue

    print(prereq)

    prereq = replace_subject_code(prereq)
    result = try_everything(prereq, course, patterns)

    print(result)
    print("")

    # One document per course: the code-substituted text and its parsed form.
    courses_prereq.insert_one(
        {
            "course": course["code"],
            "prereq_text": prereq,
            "prereq": result,
        }
    )

Admission to the Haskayne School of Business and 12 units.
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})(?: and |, )(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) and (\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) or (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) or (\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) ((?:(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))(?:, (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))+)(?:, and | and |, )(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(?:One of )?(\w{3,4}) ((?:(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))(?:, (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))+)(?:, or | or )(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) and either (\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) or (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\d+) units(?:| (?:of|in)?(?: (?:courses|a course) labelled | )([A-Z][a-zA-Z