In [1]:
import os
import re

import spacy
from pymongo import MongoClient
from spacy import displacy
from spacy.language import Language, Doc

In [2]:
# Connect to MongoDB.
# The connection string is taken from the MONGODB_URI environment variable so
# that real credentials do not have to be stored in the notebook; when the
# variable is unset the local development instance is used.
MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://root:password@localhost:27017/")
client = MongoClient(MONGODB_URI)
catalog = client.get_database("catalog")

In [3]:
# Load every subject document ordered by title length, longest first
# (replace_subject_code walks this list in order).
subject_codes_docs = sorted(
    catalog.get_collection("subject_codes").find(),
    key=lambda entry: len(entry["title"]),
    reverse=True,
)

# Lookup from spelled-out title to code, plus the codes alone in the same order.
subject_codes_map = dict((entry["title"], entry["code"]) for entry in subject_codes_docs)
subject_codes = list(map(lambda entry: entry["code"], subject_codes_docs))
subject_codes

['BIMA',
 'OBHR',
 'CMMB',
 'SCPA',
 'APLA',
 'ENEE',
 'ALMC',
 'LLAC',
 'ENSF',
 'IPHE',
 'CEST',
 'ISEC',
 'EESS',
 'ASHA',
 'SUSE',
 'ENTI',
 'COMS',
 'MDPA',
 'LWFT',
 'BTMA',
 'SEDV',
 'SGMA',
 'MDBT',
 'RMIN',
 'PLMA',
 'EDBT',
 'GSXS',
 'MHST',
 'EALS',
 'DEST',
 'CMDA',
 'MDGE',
 'CMCL',
 'MDCH',
 'ENEN',
 'ENMF',
 'TDST',
 'CORE',
 'MDPR',
 'ENFD',
 'SCMA',
 'INTR',
 'GRST',
 'EDPS',
 'BMEN',
 'ENME',
 'LAND',
 'SUST',
 'ENEL',
 'COOP',
 'LAST',
 'ASL',
 'ENAE',
 'ENGO',
 'ENPE',
 'ENSC',
 'OPMA',
 'INDL',
 'ENCH',
 'ENCM',
 'SENG',
 'EDER',
 'REAL',
 'UNEX',
 'VETM',
 'TAP',
 'PHEN',
 'SAST',
 'ENDG',
 'ENPH',
 'EAST',
 'HSOC',
 'LEAD',
 'MGST',
 'TOUR',
 'INDG',
 'ENER',
 'MUPF',
 'RELS',
 'ACSC',
 'QUAC',
 'ENCI',
 'ENMG',
 'POLI',
 'STST',
 'ACWR',
 'CPSC',
 'CNST',
 'LWSO',
 'MDPH',
 'MUED',
 'MDSC',
 'AFST',
 'ROST',
 'MRSC',
 'ESCI',
 'UBST',
 'EASC',
 'SPPH',
 'PLBI',
 'BIST',
 'PPOL',
 'ARCH',
 'BCEM',
 'ANTH',
 'NEUR',
 'ASPH',
 'DATA',
 'ARHI',
 'MATH',
 'SOWK',
 'K

In [4]:
# Base patterns: the capture groups that the prerequisite regexes are built from.
subject_code_regex = r"(\w{3,4})"  # three or four word characters, e.g. ART, MATH
course_number_regex = r"(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})"  # e.g. 101, 30-1, 599.45
# A subject code, one space, then a course number (two capture groups in total).
course_code_regex = " ".join((subject_code_regex, course_number_regex))

In [5]:
def replace_subject_code(sentence: str, loose: bool = False, subjects=None):
    """Replace spelled-out subject titles in ``sentence`` with their subject codes.

    Args:
        sentence: Text to rewrite, e.g. "Drama 200, 223".
        loose: When False (default) a title is only replaced when it is
            directly followed by a space and a course number, and the number
            is kept (a title "Drama" with code "DRAM" turns "Drama 200" into
            "DRAM 200"). When True every stand-alone occurrence of a title is
            replaced, with or without a number after it.
        subjects: Optional iterable of mappings with "title" and "code" keys.
            Defaults to the module-level ``subject_codes_docs``.

    Returns:
        The rewritten sentence.
    """
    if subjects is None:
        subjects = subject_codes_docs

    # Longest titles must be tried first so a short title never rewrites part
    # of a longer one. sorted() is stable, so input that is already ordered
    # this way keeps exactly its original order.
    ordered = sorted(subjects, key=lambda doc: len(doc["title"]), reverse=True)

    for subject in ordered:
        raw_title = subject["title"]
        code = subject["code"]
        if not raw_title:
            # An empty title would match at every position of the sentence.
            continue

        # Titles are plain text, not regex: escape them so characters such as
        # "(" or "." are matched literally and cannot add capture groups that
        # would shift the course-number group below.
        title = re.escape(raw_title)

        if loose:
            # Look-arounds keep the title from matching inside a longer word.
            pattern = rf"(?<!\w){title}(?!\w)"
            sentence = re.sub(pattern, lambda match: code, sentence)
        else:
            pattern = rf"(?<!\w){title} {course_number_regex}"
            # A callable replacement avoids backslash/group-reference
            # interpretation of the code; group 1 is the course number.
            sentence = re.sub(pattern, lambda match: f"{code} {match.group(1)}", sentence)
    return sentence

In [6]:
def process_n_units_of_things(groups):
    """Build a units requirement from the captures of the "N units ..." pattern.

    Args:
        groups: Sequence of four captures, any of which may be None:
            [0] the number of units, [1] a free-text list of subjects
            separated by commas and/or "or", [2] the course level,
            [3] the literal "above" qualifier.

    Returns:
        ``{"units": {...}}``. The inner dict always contains "required" and
        additionally contains "subject" (exactly one subject was named),
        "subjects" (several were named), "level" and "is_above" only when the
        corresponding value is present and truthy.
    """
    requirement = {"required": int(groups[0]) if groups[0] else None}

    if groups[1]:
        # Titles inside a subject list are not followed by a course number,
        # hence the loose replacement.
        normalized = replace_subject_code(groups[1], loose=True)
        # Split on "or" (optionally preceded by a comma) and on plain commas,
        # so that "A, B, or C" yields three clean entries instead of leaving a
        # trailing comma on "B".
        pieces = re.split(r",?\s+or\s+|,", normalized)
        codes = [piece.strip() for piece in pieces if piece.strip()]

        if len(codes) == 1:
            requirement["subject"] = codes[0]
        elif codes:
            requirement["subjects"] = codes

    if groups[2]:
        level = int(groups[2])
        if level:
            requirement["level"] = level

    # Only recorded when the qualifier is exactly "above".
    if groups[3] and groups[3] == "above":
        requirement["is_above"] = True

    return {"units": requirement}

In [7]:
# Patterns
# Each entry pairs a "regex" (anchored with ^...$ so it must match the whole
# sentence) with a "process" callable that receives the list of captured groups
# and returns the parsed requirement. Subject and number are concatenated
# without a separator when a course is built (groups[0] + groups[1]).
patterns = [
    {
        "regex": rf"^{course_code_regex}\.?$",  # ANTH 201.
        # groups: [subject, number]
        "process": lambda groups: {"and": [{"course": groups[0] + groups[1]}]},
    },
    {
        "regex": rf"^{course_code_regex} and {course_number_regex}\.?$",  # ANTH 201 and 311.
        # groups: [subject, number, second number]; the subject is shared.
        "process": lambda groups: {
            "and": [
                {"course": groups[0] + groups[1]},
                {"course": groups[0] + groups[2]},
            ]
        },
    },
    {
        "regex": rf"^{course_code_regex} and {course_code_regex}\.?$",  # ANTH 201 and MATH 311.
        # groups: [subject, number, second subject, second number]
        "process": lambda groups: {
            "and": [
                {"course": groups[0] + groups[1]},
                {"course": groups[2] + groups[3]},
            ]
        },
    },
    {
        "regex": rf"^{course_code_regex} or {course_number_regex}\.?$",  # ANTH 201 or 311.
        # groups: [subject, number, second number]; the subject is shared.
        "process": lambda groups: {
            "or": [
                {"course": groups[0] + groups[1]},
                {"course": groups[0] + groups[2]},
            ]
        },
    },
    {
        "regex": rf"^{course_code_regex} or {course_code_regex}\.?$",  # ANTH 201 or MATH 311.
        # groups: [subject, number, second subject, second number]
        "process": lambda groups: {
            "or": [
                {"course": groups[0] + groups[1]},
                {"course": groups[2] + groups[3]},
            ]
        },
    },
    {
        # The trailing ", " alternative also accepts a list with no "and" at
        # all, e.g. "SOCI 313, 315, 331, 333."
        "regex": rf"^{subject_code_regex} ((?:{course_number_regex})(?:, {course_number_regex})+)(?:, and | and |, ){course_number_regex}\.?$",  # ANTH 201, 311, 329, ..., 401 and 411.
        # groups[1] is the whole comma-separated run of numbers and groups[-1]
        # the final number; the groups in between come from the capture group
        # inside course_number_regex and are not used.
        "process": lambda groups: {
            "and": [
                {"course": groups[0] + number}
                for number in list(groups[1].split(", ")) + [groups[-1]]
            ]
        },
    },
    {
        "regex": rf"^{subject_code_regex} ((?:{course_number_regex})(?:, {course_number_regex})+)(?:, or | or ){course_number_regex}\.?$",  # ANTH 201, 311, 329, ..., 401 or 411.
        # Same group layout as the "and" list above.
        "process": lambda groups: {
            "or": [
                {"course": groups[0] + number}
                for number in list(groups[1].split(", ")) + [groups[-1]]
            ]
        },
    },
    {
        # The course after "either" must carry its own subject code; the final
        # number reuses that subject.
        "regex": rf"^{course_code_regex} and either {course_code_regex} or {course_number_regex}\.?$",  # ANTH 201 and either ANTH 311 or 329.
        # groups: [subject, number, either-subject, either-number, last number]
        "process": lambda groups: {
            "and": [
                {"course": groups[0] + groups[1]},
                {
                    "or": [
                        {"course": groups[2] + groups[3]},
                        {"course": groups[2] + groups[4]},
                    ]
                },
            ]
        },
    },
    {
        # This regex handles the following cases:
        # "24 units."
        # "18 units at the 200 level."
        # "6 units of English at the 400 level."
        # "6 units of courses labelled English."
        # "9 units in courses labelled Geography at the 300 level or above."
        # "9 units in courses labelled Dance, Drama, Music, Music Performance or School of Creative and Performing Arts."
        # groups: [units, subject list text, level, "above"]; every group after
        # the first is None when its optional part is absent.
        "regex": rf"^(\d+) units(?:| (?:of|in)(?: courses labelled | )([a-zA-Z ,]*))(?:| at the (\d+) level(?:| or (above)))\.?$",
        "process": process_n_units_of_things,
    },
]

In [8]:
# Example prerequisite sentences, kept as data instead of a chain of
# assignments that each overwrote the previous one.
sample_sentences = [
    "Anthropology 201 and 311.",
    "Anthropology 210 or 213.",
    "Anthropology 203.",
    "Arabic Language and Muslim Cultures 204.",
    "Astrophysics 401 and Physics 343.",
    "Biomedical Engineering 103, 309, 310, 319, and 327.",
    "Computer Science 471 or Data Science 311.",
    "Drama 200, 223, 225, and 340.",
    "English 302 and either English 240 or 340.",
    "Geomatics Engineering 363, 401, 402 and 421.",
    "Sociology 313, 315, 325, 331, 333.",
    "Sociology 313, 315, 331, 333.",
    "Software Engineering 300 or 301.",
    "Software Engineering 300, 301, or 311.",
    "24 units.",
    "18 units at the 200 level.",
    "6 units of English at the 400 level.",
    "6 units of courses labelled English.",
    "9 units in courses labelled Geography at the 300 level or above.",
    "9 units in courses labelled Dance, Drama, Music, Music Performance or School of Creative and Performing Arts.",
]

# The sentence under test (the last example, as before); change the index to
# try a different one.
sent = sample_sentences[-1]

sent = replace_subject_code(sent)
print(sent)
print("")

def try_patterns(sent: str, patterns: list):
    """Parse ``sent`` with the first pattern whose regex matches it.

    Args:
        sent: The sentence to parse.
        patterns: Dicts with a "regex" string and a "process" callable; they
            are tried in list order.

    Returns:
        Whatever the matching entry's "process" returns when given the list of
        captured groups, or None when no regex matches.
    """
    for entry in patterns:
        expression = entry["regex"]
        handler = entry["process"]

        hit = re.search(expression, sent)
        if hit is None:
            continue

        return handler(list(hit.groups()))

    return None

# Parse the code-substituted sample sentence; this prints None when no pattern
# matches it.
result = try_patterns(sent, patterns)
print(result)

9 units in courses labelled Dance, Drama, Music, Music Performance or School of Creative and Performing Arts.

{'units': {'required': 9, 'subjects': ['DNCE', 'DRAM', 'MUSI', 'MUPF', 'SCPA']}}


In [9]:
# Get all undergraduate courses whose prereq field is not null
courses = list(catalog.get_collection("courses").find({"prereq": {"$ne": None}, "career": "Undergraduate Programs"}))

courses_prereq = catalog.get_collection("courses_prereq")

# Parse every prerequisite before touching the target collection, so that an
# error while parsing cannot leave it emptied or only partially refilled.
prereq_docs = []
for course in courses:
    prereq = course["prereq"]

    # The query only excludes null; skip other empty values such as "".
    if not prereq:
        continue

    print(prereq)

    prereq = replace_subject_code(prereq)
    result = try_patterns(prereq, patterns)

    print(result)
    print("")

    prereq_docs.append({
        "course": course["code"],
        "prereq_text": prereq,  # the text after subject titles were replaced by codes
        "prereq": result,  # None when no pattern matched
    })

# Rebuild the collection in one round trip instead of one insert per course.
courses_prereq.delete_many({})
if prereq_docs:  # insert_many rejects an empty document list
    courses_prereq.insert_many(prereq_docs)

Admission to the Haskayne School of Business and 12 units.
None

24 units including Entrepreneurship and Innovation 201.
None

Admission to the Haskayne School of Business, and Accounting 217.
None

Admission to the Haskayne School of Business and Accounting 217 and 323.
None

Admission to the Haskayne School of Business and Accounting 341.
None

24 units including Accounting 217 or 301. For certain topics consent of the Haskayne School of Business will also be required.
None

Admission to the Haskayne School of Business and Accounting 323.
None

Admission to the Haskayne School of Business, and 54 units including Accounting 217.
None

Admission to the Haskayne School of Business and Accounting 421.
None

Admission to the Haskayne School of Business and 54 units including Accounting 341.
None

Admission to the Haskayne School of Business and 54 units including Accounting 343.
None

Admission to the Haskayne School of Business and 54 units including Accounting 343.
None

Admission to th