In [276]:
import os
import re

import spacy
from pymongo import MongoClient
from spacy import displacy
from spacy.language import Language, Doc

In [277]:
# Connect to mongodb
# The connection string is read from the MONGODB_URI environment variable so that
# real credentials do not have to live in the notebook. When the variable is unset,
# the original local-development URI is used, so existing setups keep working.
client = MongoClient(
    os.environ.get("MONGODB_URI", "mongodb://root:password@localhost:27017/")
)
# Handle on the "catalog" database used by the cells below.
catalog = client.get_database("catalog")

In [278]:
# Fetch every document of the "subject_codes" collection, longest title first.
# replace_subject_code walks this list in order -- presumably so that a long title is
# substituted before any shorter title it contains (TODO confirm intent).
subject_codes_docs = sorted(
    catalog.get_collection("subject_codes").find(),
    key=lambda doc: len(doc["title"]),
    reverse=True,
)

# Title -> code lookup, and the bare codes in the same (title-length) order.
subject_codes_map = dict((doc["title"], doc["code"]) for doc in subject_codes_docs)
subject_codes = list(map(lambda doc: doc["code"], subject_codes_docs))
subject_codes

['BIMA',
 'OBHR',
 'CMMB',
 'SCPA',
 'APLA',
 'ENEE',
 'ALMC',
 'LLAC',
 'ENSF',
 'IPHE',
 'CEST',
 'ISEC',
 'EESS',
 'ASHA',
 'SUSE',
 'ENTI',
 'COMS',
 'MDPA',
 'LWFT',
 'BTMA',
 'SEDV',
 'SGMA',
 'MDBT',
 'RMIN',
 'PLMA',
 'EDBT',
 'GSXS',
 'MHST',
 'EALS',
 'DEST',
 'CMDA',
 'MDGE',
 'CMCL',
 'MDCH',
 'ENEN',
 'ENMF',
 'TDST',
 'CORE',
 'MDPR',
 'ENFD',
 'SCMA',
 'INTR',
 'GRST',
 'EDPS',
 'BMEN',
 'ENME',
 'LAND',
 'SUST',
 'ENEL',
 'COOP',
 'LAST',
 'ASL',
 'ENAE',
 'ENGO',
 'ENPE',
 'ENSC',
 'OPMA',
 'INDL',
 'ENCH',
 'ENCM',
 'SENG',
 'EDER',
 'REAL',
 'UNEX',
 'VETM',
 'TAP',
 'PHEN',
 'SAST',
 'ENDG',
 'ENPH',
 'EAST',
 'HSOC',
 'LEAD',
 'MGST',
 'TOUR',
 'INDG',
 'ENER',
 'MUPF',
 'RELS',
 'ACSC',
 'QUAC',
 'ENCI',
 'ENMG',
 'POLI',
 'STST',
 'ACWR',
 'CPSC',
 'CNST',
 'LWSO',
 'MDPH',
 'MUED',
 'MDSC',
 'AFST',
 'ROST',
 'MRSC',
 'ESCI',
 'UBST',
 'EASC',
 'SPPH',
 'PLBI',
 'BIST',
 'PPOL',
 'ARCH',
 'BCEM',
 'ANTH',
 'NEUR',
 'ASPH',
 'DATA',
 'ARHI',
 'MATH',
 'SOWK',
 'K

In [279]:
def process_n_units_of_a_at_b_level(groups):
    """Build the requirement dict for an "N units of <subject> at the <level> level" match.

    Args:
        groups: Captured strings in order: unit count, subject title, level.

    Returns:
        ``{"units": {"required": <int>, "subject": <code>, "level": <int>}}``.

    Raises:
        KeyError: If the subject title is not a key of the module-level
            ``subject_codes_map``.
    """
    # Resolve the title first so an unknown subject fails before any int() conversion.
    subject_code = subject_codes_map[groups[1]]

    requirement = {}
    requirement["required"] = int(groups[0])
    requirement["subject"] = subject_code
    requirement["level"] = int(groups[2])
    return {"units": requirement}

In [280]:
# Patterns
#
# Each entry pairs a "regex" with a "process" callable. The driver cell applies the
# regex with re.search and, on a match, calls process(list(match.groups())) to get
# the structured requirement. Every regex is anchored with ^...$ so that it only
# accepts a whole sentence of that exact shape.

subject_code_regex = r"(\w{3,4})"  # ART, MATH
course_number_regex = r"(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})"  # 101, 30-1, 599.45
course_code_regex = rf"{subject_code_regex} {course_number_regex}"

patterns = [
    {
        "regex": rf"^{course_code_regex}\.?$",  # ANTH 201.
        "process": lambda groups: {"and": [{"course": groups[0] + groups[1]}]},
    },
    {
        "regex": rf"^{course_code_regex} and {course_number_regex}\.?$",  # ANTH 201 and 311.
        "process": lambda groups: {
            "and": [
                {"course": groups[0] + groups[1]},
                {"course": groups[0] + groups[2]},
            ]
        },
    },
    {
        "regex": rf"^{course_code_regex} and {course_code_regex}\.?$",  # ANTH 201 and MATH 311.
        "process": lambda groups: {
            "and": [
                {"course": groups[0] + groups[1]},
                {"course": groups[2] + groups[3]},
            ]
        },
    },
    {
        "regex": rf"^{course_code_regex} or {course_number_regex}\.?$",  # ANTH 201 or 311.
        "process": lambda groups: {
            "or": [
                {"course": groups[0] + groups[1]},
                {"course": groups[0] + groups[2]},
            ]
        },
    },
    {
        "regex": rf"^{course_code_regex} or {course_code_regex}\.?$",  # ANTH 201 or MATH 311.
        "process": lambda groups: {
            "or": [
                {"course": groups[0] + groups[1]},
                {"course": groups[2] + groups[3]},
            ]
        },
    },
    {
        # groups[1] is the whole comma-separated run before the last number and
        # groups[-1] is the last number; the inner per-number groups are unused.
        "regex": rf"^{subject_code_regex} ((?:{course_number_regex})(?:, {course_number_regex})+)(?:, and | and |, ){course_number_regex}\.?$",  # ANTH 201, 311, 329, ..., 401 and 411.
        "process": lambda groups: {
            "and": [
                {"course": groups[0] + number}
                for number in list(groups[1].split(", ")) + [groups[-1]]
            ]
        },
    },
    {
        # Same group layout as the "and" list above.
        "regex": rf"^{subject_code_regex} ((?:{course_number_regex})(?:, {course_number_regex})+)(?:, or | or ){course_number_regex}\.?$",  # ANTH 201, 311, 329, ..., 401 or 411.
        "process": lambda groups: {
            "or": [
                {"course": groups[0] + number}
                for number in list(groups[1].split(", ")) + [groups[-1]]
            ]
        },
    },
    {
        # The subject code after "either" is required; the final number reuses it.
        "regex": rf"^{course_code_regex} and either {course_code_regex} or {course_number_regex}\.?$",  # ANTH 201 and either ANTH 311 or 329.
        "process": lambda groups: {
            "and": [
                {"course": groups[0] + groups[1]},
                {
                    "or": [
                        {"course": groups[2] + groups[3]},
                        {"course": groups[2] + groups[4]},
                    ]
                },
            ]
        },
    },
    {
        # NOTE(review): "units" is a bare int here but a dict in the two entries
        # below -- confirm that consumers expect both shapes.
        "regex": r"^(\d+) units\.?$",  # 12 units.
        "process": lambda groups: {"units": int(groups[0])},
    },
    {
        # Anchored with $ like every other pattern, so trailing text no longer matches.
        "regex": r"^(\d+) units at the (\d+) level\.?$",  # 18 units at the 200 level.
        "process": lambda groups: {
            "units": {"required": int(groups[0]), "level": int(groups[1])}
        },
    },
    {
        # (.+?) requires a non-empty subject title; $ rejects trailing text.
        "regex": r"^(\d+) units of (.+?) at the (\d+) level\.?$",  # 6 units of English at the 400 level.
        # Resolved at call time, so re-running the cell that defines the handler
        # takes effect without rebuilding this table.
        "process": lambda groups: process_n_units_of_a_at_b_level(groups),
    },
]

In [281]:
def replace_subject_code(sentence: str) -> str:
    """Replace "<subject title> <course number>" with "<subject code> <course number>".

    Walks the module-level ``subject_codes_docs`` in its existing order and rewrites
    every occurrence of a title that is followed by a single space and a course
    number (as defined by ``course_number_regex``).

    Args:
        sentence: Text to rewrite.

    Returns:
        The sentence with each matched title swapped for that document's code.
    """
    for subject_doc in subject_codes_docs:
        code = subject_doc["code"]
        # Escape the title so characters such as "(", "+" or "." in a subject name
        # are matched literally instead of being read as regex syntax.
        title_pattern = re.escape(subject_doc["title"]) + " " + course_number_regex
        # A callable replacement inserts the code verbatim; group 1 is the course
        # number captured by course_number_regex.
        sentence = re.sub(
            title_pattern,
            lambda match, code=code: f"{code} {match.group(1)}",
            sentence,
        )
    return sentence

In [282]:
# Sample sentences collected while developing the patterns. They are kept in a list
# rather than as a chain of overwritten assignments, so the one in use is explicit.
sample_sentences = [
    "Anthropology 201 and 311.",
    "Anthropology 210 or 213.",
    "Anthropology 203.",
    "Arabic Language and Muslim Cultures 204.",
    "Astrophysics 401 and Physics 343.",
    "Biomedical Engineering 103, 309, 310, 319, and 327.",
    "Computer Science 471 or Data Science 311.",
    "Drama 200, 223, 225, and 340.",
    "English 302 and either English 240 or 340.",
    "Geomatics Engineering 363, 401, 402 and 421.",
    "Sociology 313, 315, 325, 331, 333.",
    "Sociology 313, 315, 331, 333.",
    "Software Engineering 300 or 301.",
    "Software Engineering 300, 301, or 311.",
    "24 units.",
    "18 units at the 200 level.",
    "6 units of English at the 400 level.",
]

# Sentence under test (the last sample); change the index to try another one.
sent = sample_sentences[-1]

# Swap subject titles for subject codes before matching.
sent = replace_subject_code(sent)
print(sent)
print("")

# Try every pattern: echo each regex and, when it matches, the captured groups
# and the structured result produced by its "process" callable.
for pattern in patterns:
    regex, process = pattern["regex"], pattern["process"]
    matches = re.search(regex, sent)

    print(regex)
    if matches:
        print("")
        print(matches.groups())
        result = process(list(matches.groups()))
        print(result)

6 units of English at the 400 level.

^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) and (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) and (\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) or (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) or (\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) ((?:(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))(?:, (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))+)(?:, and | and |, )(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) ((?:(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))(?:, (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}))+)(?:, or | or )(\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) and either (\w{3,4}) (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3}) or (\d{2}-\d|\d{3}\.\d{1,2}|\d{2,3})\.?$
^(\d+) units\.?$
^(\d+) units at the (\d+) level\.?
^(\d+) units of (.*) at the (\d+) level\.?

('6', 'English', '400