# Create Catalog
## Convert PDF to TXT

In [13]:
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LTTextContainer, LAParams
from pdfminer.image import ImageWriter

def pdf_to_txt(outfile='tmp_pdf2txt.txt', infile='2023-2024-catalog-courses.pdf', 
               pages='', maxpages=0, password=b'', debug=0 ):

    # input option
    pagenos = set()
    if pages != '':
        pagenos.update( int(x)-1 for x in pages.split(',') )
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = open(outfile, 'w', encoding=encoding)
    device = TextConverter(rsrcmgr, outfp, laparams=laparams, imagewriter=None)
    with open(infile, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                      caching=caching, check_extractable=True):
            interpreter.process_page(page)
    device.close()
    outfp.close()
    return

In [14]:
pdf_to_txt()

## Functions for Creating Courses

### Regular Expressions

In [16]:
import re

# Course Start and End patterns
course_pattern = re.compile(r'^([A-Z]{3}[A-Z]? \d{3}[A-Z]?) ([A-Z].+?) \((\d(?:–\d)?)\)$')
course_start_pattern = re.compile(r'^([A-Z]{3}[A-Z]? \d{3}[A-Z]?) ([A-Z].+?)$')
course_end_pattern = re.compile(r'^(.+?) \((\d(?:–\d)?)\)$')

# Prerequisites
prerequisite_pattern = re.compile(r'Prerequisites?:\s(.*?)(?=\.\s|\.\))', re.DOTALL)
    
# Recommended
recommended_pattern = re.compile(r'Recommended:\s([^\.]+)\.\s')
    
# Substitutions
substitution_pattern = re.compile(r'may\s+receive\s+credit\s+for\s+only\s+one\s+of\s+the\s+following\s+courses: ')

# Warnings 
warning_pattern = re.compile(r'^\(([^\)]+)\)')

In [17]:
def create_new_course(name, title, credit):
    course = {
        'name': name.strip(),
        'title': title.strip(),
        'credit': credit.strip(),
        'description': '',
        'prerequisites': '',
        'recommended': '',
        'warnings': '',
        'substitutions': ''
    }
    return course

def update_description(course, sub=substitution_pattern, pre=prerequisite_pattern, 
                       warn=warning_pattern, recd=recommended_pattern):
    description = re.sub(r'\n', '', course['description'])
    # Substitutions
    submatch = sub.search(description)
    if submatch:
        start, end = submatch.span()
        course['substitutions'] = description[end:-1].strip()
    # Prerequisites
    prematch = pre.search(description)
    if prematch:
        course['prerequisites'] = prematch.group(1)
    # Recommended
    recmatch = recd.search(description)
    if recmatch:
        course['recommended'] = recmatch.group(1)
    # Warnings
    warnmatch = warn.search(description)
    if warnmatch:
        course['warnings'] = warnmatch.group(1)
    
    return course

def parse_course_info(text):
    
    # Logic Overview
    #
    # A course starts by matching a pattern, either
    #
    #     OneLine = STAT 221 Introduction to Statistics (3)
    # or
    #     TwoLine = STAT 536 A Really Long Description that 
    #               Takes Up More than One Line (1-3)
    #
    # - `course_match` will match OneLine and a new course is created
    # - `course_match_start` will match the first line of TwoLine 
    #   and make the variable `start_course = True`
    # - `course_match_end` will match the second line of TwoLine and
    #   a new course is created. This is checked only if the variable 
    #   `start_course = True`
    # - `course_match` and `course_match_end` will turn on the indicator
    #   variable `description_on = True`
    # - a blank line will set `description_on = False` and indicates
    #   the end of the course.
    # - after a course is ended, it is moved to `prior_course` and the
    #   description is parsed to populate warnings, prerequisites, and 
    #   recommended in the course
    
    courses = []
    prior_course = None
    current_course = None
    start_course = False
    add_description = False

    for line in text.split('\n'):
        
        course_match = course_pattern.search(line)
        course_match_start = course_start_pattern.search(line)
        course_match_end = course_end_pattern.search(line)

        if start_course:
            # Add the second half of a two-line course title
            if course_match_end:
                course_title2, course_credit = course_match_end.groups()
                course_title = course_title1.strip() + ' ' + course_title2.strip()
                current_course = create_new_course(course_name, course_title, course_credit)
                courses.append(current_course)
                start_course = False
                add_description = True

        elif course_match:
            # If we find a course line, extract information and start a new course
            course_name, course_title, course_credit = course_match.groups()
            current_course = create_new_course(course_name, course_title, course_credit)
            courses.append(current_course)
            start_course = False
            add_description = True

        elif course_match_start:
            course_name, course_title1 = course_match_start.groups()
            start_course = True                    
            
        elif current_course is not None:
            # If we are in the middle of a course, add the line to its description
            if add_description:
                # When the description ends, parse it to fill in prerequisites, 
                # recommended, warnings, and substitutions
                if line == '':
                    add_description = False
                    current_course = update_description(course=current_course)
                else:
                    current_course['description'] += line + '\n'

    return courses

## Read in raw file

In [19]:
infile = 'tmp_pdf2txt.txt'
with open(infile, 'r') as file:
    # Read the entire content of the file into a string
    text = file.read()
    
courses = parse_course_info(text)

### Look at the courses

In [307]:
# Create a dictionary with name as the key
classes = {course['name']: course for course in courses}

In [337]:
classes['CAPL 398A']['prerequisites']

''

In [333]:
classes['WRTG 394']['prerequisites']

'WRTG 112 or equivalent'

#### Editing the course prerequisites to make automation easier

- Adding a field ['done'] to exclude while editing

In [302]:
class_template = r'([A-Z]{3}[A-Z]? \d{3}[A-Z]?)'
A = class_template

singleton = re.compile(f'^{A}$')

course_or_list2 = re.compile(f'^{A} or {A}$')
course_or_list3 = re.compile(f'^{A}, {A}, or {A}$')
course_or_list4 = re.compile(f'^{A}, {A}, {A}, or {A}$')

course_and_list2 = re.compile(f'^{A} and {A}$')
course_and_list3 = re.compile(f'^{A}, {A}, and {A}$')
course_and_list4 = re.compile(f'^{A}, {A}, {A}, and {A}$')

In [303]:
def mark_done(class_info, pattern=singleton):
    # Mark classes with prerequisites done so we don't iterate over them anymore
    prerequisites_text = class_info['prerequisites']
    match = pattern.search(prerequisites_text)

    if prerequisites_text == '':
        class_info['pre']  = ''
        class_info['done'] = 1
    elif match:
        class_info['pre']  = prerequisites_text
        class_info['done'] = 1
    else:
        class_info['pre']  = ''
        class_info['done'] = 0

In [304]:
def mark_all_done(classes, pattern=singleton):
    # Mark classes with prerequisites done so we don't iterate over them anymore

    for class_name, class_info in classes.items():
        prerequisites_text = class_info['prerequisites']
        match = pattern.search(prerequisites_text)

        if prerequisites_text == '':
            class_info['pre']  = ''
            class_info['done'] = 1
        elif match:
            class_info['pre']  = prerequisites_text
            class_info['done'] = 1
        else:
            class_info['pre']  = ''
            class_info['done'] = 0

    return classes

In [305]:
def update_prerequisites(class_info, pattern, type='or'):
    prerequisites_text = class_info['prerequisites']
    match = pattern.search(prerequisites_text)
    if match:
        groups = match.groups()
        if type == 'or': 
            replaced_text = f'({ " | ".join(groups) })'
        elif type == 'and':
            replaced_text = f'({ " & ".join(groups) })'
        class_info['pre'] = replaced_text
        class_info['done'] = 1


In [306]:
classes = mark_all_done(classes, pattern = singleton)

In [299]:
for class_name, class_info in classes.items():
    print(class_name + ':') 
    print(class_info.pre)

ACCT 220:
ACCT 221:
ACCT 301:
ACCT 310:
ACCT 311:
ACCT 320:
ACCT 321:
ACCT 323:
ACCT 417:
ACCT 422:
ACCT 424:
ACCT 326:
ACCT 350:
ACCT 410:
ACCT 411:
ACCT 452:
ACCT 486A:
ACCT 486B:
AASP 201:
ACCT 425:
ACCT 436:
ACCT 438:
ACCT 440:
ANTH 101:
ANTH 102:
ANTH 298:
ANTH 345:
ANTH 346:
ANTH 350:
ANTH 351:
ANTH 398:
ANTH 417:
APTC 495:
ARAB 111:
ARAB 112:
ARAB 114:
ARAB 115:
ARAB 333:
ARTT 110:
ARTT 120:
ARTT 152:
ARTT 210:
ARTT 320:
ARTT 428:
ARTH 204:
ARTH 334:
ARTH 372:
ARTH 373:
ARTH 375:
ARTH 478:
ASTD 135:
ASTD 155:
ASTD 284:
ASTD 285:
ASTD 302:
ASTD 370:
ASTD 398:
ASTD 485:
BEHS 103:
BEHS 210:
BEHS 220:
BEHS 250:
BEHS 300:
BEHS 320:
BEHS 343:
BEHS 364:
BEHS 380:
BEHS 453:
BEHS 486A:
BEHS 486B:
BEHS 495:
BIOL 101:
BIOL 102:
BIOL 103:
BIOL 160:
BIOL 161:
BIOL 164:
BIOL 181:
BIOL 220:
BIOL 230:
BIOL 301:
BIOL 318:
BIOL 320:
BIOL 325:
BIOL 328:
BIOL 302:
BIOL 304:
BIOL 307:
BIOL 350:
BIOL 357:
BIOL 362:
BIOL 398:
BIOL 422:
BIOL 486A:
BIOL 486B:
BIOL 495:
BMGT 110:
BMGT 121A:
BMGT 121B:
BM

In [287]:
for class_name, class_info in classes.items():
    update_prerequisites(class_info, pattern = course_or_list2,  type = 'or')
    update_prerequisites(class_info, pattern = course_and_list2, type = 'and')
    update_prerequisites(class_info, pattern = course_or_list3,  type = 'or')
    update_prerequisites(class_info, pattern = course_and_list3, type = 'and')
    update_prerequisites(class_info, pattern = course_or_list4,  type = 'or')
    update_prerequisites(class_info, pattern = course_and_list4, type = 'and')

In [288]:
for class_name, class_info in classes.items():
    if class_info['done'] != True:
        print(class_name + ':', class_info['prerequisites'])

ACCT 320: (ACCT 220 | ACCT 301)
ACCT 323: (ACCT 220 | FINC 321)
ACCT 417: (ACCT 311 & ACCT 323)
ACCT 350: (ACCT 220 | ACCT 301)
ACCT 486A: 9 credits in the discipline and prior program approval (requirements detailed online at umgc.edu/wkpl)
ACCT 486B: 9 credits in the discipline and prior program approval (requirements detailed online at umgc.edu/wkpl)
ACCT 440: (ACCT 320 | ACCT 438)
APTC 495: 27 credits in major coursework
ARAB 112: ARAB 111 or appropriate score on a placement test
ARAB 114: ARAB 112 or appropriate score on a placement test
ARAB 115: ARAB 114 or appropriate score on a place-ment test
ASTD 302: Any writing course
ASTD 370: Any WRTG course
ASTD 485: MATH 105, STAT 200, or a higher MATH or STAT course
BEHS 300: (BEHS 210 & STAT 200)
BEHS 486A: 9 credits in the discipline and prior program approval (requirements detailed online at umgc.edu/wkpl)
BEHS 486B: 9 credits in the discipline and prior program approval (requirements detailed online at umgc.edu/wkpl)
BEHS 495: BEH

In [204]:
classes_with_or = {k: v for k, v in classes.items() if ' or ' in v['prerequisites']}

In [233]:
classes_with_or = {k: {'prerequisites': v['prerequisites']} for k, v in classes.items() if ' or ' in v['prerequisites']}

In [234]:
classes_with_or

{'ACCT 320': {'prerequisites': 'ACCT 220 or ACCT 301'},
 'ACCT 323': {'prerequisites': 'ACCT 220 or FINC 321'},
 'ACCT 350': {'prerequisites': 'ACCT 220 or ACCT 301'},
 'ACCT 440': {'prerequisites': 'ACCT 320 or ACCT 438'},
 'ARAB 112': {'prerequisites': 'ARAB 111 or appropriate score on a placement test'},
 'ARAB 114': {'prerequisites': 'ARAB 112 or appropriate score on a placement test'},
 'ARAB 115': {'prerequisites': 'ARAB 114 or appropriate score on a place-ment test'},
 'ASTD 485': {'prerequisites': 'MATH 105, STAT 200, or a higher MATH or STAT course'},
 'BIOL 164': {'prerequisites': 'BIOL 101, BIOL 103, or BIOL 160'},
 'BIOL 230': {'prerequisites': 'BIOL 103 or other introductory biology course with laboratory'},
 'BIOL 357': {'prerequisites': 'BIOL 325 or another upper-level biology course'},
 'BIOL 362': {'prerequisites': 'BIOL 101, BIOL 103, or BIOL 160'},
 'BIOL 422': {'prerequisites': 'BIOL 230, BIOL 301, BIOL 302, or BIOL 398G'},
 'BMGT 365': {'prerequisites': 'BMGT 110 o

In [246]:
phrase = 'STAT 221 or MATH 441'
match = course_or_list2.search(phrase)
match.groups()

('STAT 221', 'MATH 441')

In [236]:
if match:
    print ('(' + match.group(1) + ' | ' + match.group(2) + ')')

(STAT 221 | MATH 441)


In [250]:
phrase = 'BIOL 230, BIOL 301, BIOL 302, or BIOL 398G'
match = course_or_list4.search(phrase)
groups = match.groups()
groups

('BIOL 230', 'BIOL 301', 'BIOL 302', 'BIOL 398G')

In [252]:
# Convert to a single string with '|'
result_string = f'({ " | ".join(groups) })'
result_string

'(BIOL 230 | BIOL 301 | BIOL 302 | BIOL 398G)'

In [None]:
course_pattern = re.compile(r'([A-Z]{3}[A-Z]? \d{3}[A-Z]?)')
course_pattern_or = re.compile(r'([A-Z]{3}[A-Z]? \d{3}[A-Z]?) or ([A-Z]{3}[A-Z]? \d{3}[A-Z]?)')

def replace_classes(match):
    classes = re.split(r' or |, ', match.group(1))
    return '(' + ' | '.join(classes) + ')'

# Using regex to filter classes with prerequisites matching the pattern '[A-Z]{4} \d{3}'
pattern = re.compile('[A-Z]{4} \d{3}')
classes_with_regex = {k: v for k, v in classes.items() if course_pattern.search(v['prerequisites'])}

In [225]:
course_pattern_and = re.compile(r'([A-Z]{3}[A-Z]? \d{3}[A-Z]?) or ([A-Z]{3}[A-Z]? \d{3}[A-Z]?)')

In [None]:
phrase = 'STAT 221, STAT 536, or MATH 441'

In [231]:
course_pattern_or = re.compile(r'([A-Z]{3}[A-Z]? \d{3}[A-Z]?) or ([A-Z]{3}[A-Z]? \d{3}[A-Z]?)')
phrase = 'STAT 221 or MATH 441'
match = course_pattern_or.search(phrase)
if match:
    print ('(' + match.group(1) + ' | ' + match.group(2) + ')')

(STAT 221 | MATH 441)


In [232]:
course_pattern_and = re.compile(r'([A-Z]{3}[A-Z]? \d{3}[A-Z]?) and ([A-Z]{3}[A-Z]? \d{3}[A-Z]?)')
phrase = 'STAT 221 and MATH 441'
match = course_pattern_and.search(phrase)
if match:
    print ('(' + match.group(1) + ' & ' + match.group(2) + ')')

(STAT 221 & MATH 441)


In [None]:
modified_phrase = re.sub(course_pattern, replace_classes, phrase)

print(modified_phrase)

In [214]:
phrase = 'MATH 140, MATH 141, or PHYS 101'

modified_phrase = re.sub(course_pattern, replace_classes, phrase)

print(modified_phrase)

(MATH 140), (MATH 141), or (PHYS 101)


In [201]:
len(courses)

970

In [208]:
classes_with_and = {k: {'name': v['name'], 'prerequisites': v['prerequisites']} for k, v in classes.items() if ' and ' in v['prerequisites']}

In [209]:
classes_with_and

{'ACCT 417': {'name': 'ACCT 417', 'prerequisites': 'ACCT 311 and ACCT 323'},
 'ACCT 486A': {'name': 'ACCT 486A',
  'prerequisites': '9 credits in the discipline and prior program approval (requirements detailed online at umgc.edu/wkpl)'},
 'ACCT 486B': {'name': 'ACCT 486B',
  'prerequisites': '9 credits in the discipline and prior program approval (requirements detailed online at umgc.edu/wkpl)'},
 'BEHS 300': {'name': 'BEHS 300', 'prerequisites': 'BEHS 210 and STAT 200'},
 'BEHS 486A': {'name': 'BEHS 486A',
  'prerequisites': '9 credits in the discipline and prior program approval (requirements detailed online at umgc.edu/wkpl)'},
 'BEHS 486B': {'name': 'BEHS 486B',
  'prerequisites': '9 credits in the discipline and prior program approval (requirements detailed online at umgc.edu/wkpl)'},
 'BEHS 495': {'name': 'BEHS 495',
  'prerequisites': 'BEHS 300 and completion of all requirements for the social science major'},
 'BIOL 486A': {'name': 'BIOL 486A',
  'prerequisites': '9 credits in

### Add Data to the MySQL DB

In [28]:
from sqlalchemy import create_engine, Column, Integer, String, Sequence, Text, create_engine
from sqlalchemy.orm import declarative_base, Session

# Replace 'root' and 'root' with your MySQL username and password
db_username = 'root'
db_password = 'root'
db_name = 'umgc'
socket_path = '/tmp/mysql.sock'  # Specify the path to your MySQL socket

# Create the MySQL engine with the socket information
engine = create_engine(f'mysql+mysqlconnector://{db_username}:{db_password}@localhost/{db_name}?unix_socket={socket_path}')

Base = declarative_base()

In [29]:
# Define the Course class as a model
class Course(Base):
    __tablename__ = 'courses'

    id = Column(Integer, Sequence('course_id_seq'), primary_key=True)
    name = Column(String(20), unique=True)
    title = Column(String(120))
    credit = Column(String(5))
    description = Column(Text)
    prerequisites = Column(Text)
    recommended = Column(Text)
    warnings = Column(Text)
    substitutions = Column(Text)

# Create the table in the database
Base.metadata.create_all(engine)

In [30]:
# Insert data into the 'courses' table
with Session(engine) as session:
    for course_info in courses:
        course = Course(**course_info)
        session.add(course)
    session.commit()

In [31]:
# Query courses starting with 'STAT'
with Session(engine) as session:
    stat_courses = session.query(Course).filter(Course.name.like('STAT%')).all()

    for course in stat_courses:
        print(f"Course Name: {course.name}, Credit: {course.credit}, Prerequisites: {course.prerequisites}")

Course Name: STAT 200, Credit: 3, Prerequisites: 
Course Name: STAT 400, Credit: 3, Prerequisites: MATH 141


In [31]:
# Query courses starting with 'STAT'
with Session(engine) as session:
    stat_courses = session.query(Course).filter(Course.name.like('STAT%')).all()

    for course in stat_courses:
        print(f"Course Name: {course.name}, Credit: {course.credit}, Prerequisites: {course.prerequisites}")

Course Name: STAT 200, Credit: 3, Prerequisites: 
Course Name: STAT 400, Credit: 3, Prerequisites: MATH 141


In [38]:
classes['MATH 141']['prerequisites']

'MATH 140'

In [40]:
classes['MATH 108']['prerequisites']

'MATH 107'

In [41]:
classes['MATH 107']['prerequisites']

''

In [42]:
classes['MATH 115']['prerequisites']

''

In [43]:
# Prerequisites for STAT 400: ['MATH 141', 'MATH 140', [['MATH 108', 'MATH 107'], ['MATH 115']]]

# Prerequisites for STAT 400: ['MATH 141', 'MATH 140', [['MATH 108', 'MATH 107'], 
#                                                       ['MATH 115']]]


In [112]:
def small_pre(course_name, classes):
    if 'prerequisites' in classes[course_name]:
        prerequisite_str = classes[course_name]['prerequisites']
        if prerequisite_str == '':
            return []
        else:
            # will probably need to use regex to extract the classes
            prerequisite_list = [pre.strip() for pre in pre_str.split('or')]
            return prerequisite_list
    else:
        return []
    # returning a list for the time being, will process to return a string

In [155]:
all_prerequisites = []
course_name = 'STAT 400'

In [156]:
current_branch = []
course_list = small_pre(course_name, classes)
course_list

['MATH 141']

In [157]:
course_name = course_list[0]
current_branch.append(course_name)
current_branch

['MATH 141']

In [158]:
course_list = small_pre(course_name, classes)
course_list

['MATH 140']

In [159]:
course_name = course_list[0]
current_branch.append(course_name)
current_branch

['MATH 141', 'MATH 140']

In [160]:
course_list = small_pre(course_name, classes)
course_list

['MATH 108', 'MATH 115']

In [None]:
# make a current branch for the first course, then another for the second course

In [161]:
multi_branch = []
current_branch_1 = []

course_name = course_list[0]
course_name

'MATH 108'

In [162]:
current_branch_1.append(course_name)
current_branch_1

['MATH 108']

In [163]:
course_list = small_pre(course_name, classes)
course_list

['MATH 107']

In [164]:
course_name = course_list[0]
course_name

'MATH 107'

In [165]:
current_branch_1.append(course_name)
current_branch_1

['MATH 108', 'MATH 107']

In [166]:
course_list = small_pre(course_name, classes)
course_list

[]

In [None]:
# Since course_list is empty, append current_branch_1 to multi_branch

In [167]:
multi_branch.append(current_branch_1)
multi_branch

[['MATH 108', 'MATH 107']]

In [None]:
# now repeat the process for 'MATH 115'

In [168]:
current_branch_2 = []
## better: 
## for course_name in course_list:
# course_name = course_list[1]
course_name = 'MATH 115'
course_name

'MATH 115'

In [169]:
current_branch_2.append(course_name)
current_branch_2

['MATH 115']

In [170]:
course_list = small_pre(course_name, classes)
course_list

['STAT 101', 'MATH 100']

In [None]:
# Note: These are "and" instead of "or" 

In [122]:
classes[course_name]['prerequisites']

''

In [63]:
'pre' in classes[course_name]

False

In [64]:
'prerequisites' in classes[course_name]

True

In [54]:
classes[course_name]['prerequisites'] == ''

False

In [57]:
prerequisites_str = classes[course_name]['prerequisites']
prerequisites_str

'MATH 141'

In [59]:
prerequisites_list = [prerequisite.strip() for prerequisite in prerequisites_str.split('or')]
prerequisites_list

['MATH 141']

In [171]:
prelist = ['STAT 400', 'MATH 141', 'MATH 140', ['|', ['MATH 108', 'MATH 107'], ['MATH 115', ['&',['STAT 101'],['MATH 100']]]]] 

In [172]:
prelist

['STAT 400',
 'MATH 141',
 'MATH 140',
 ['|',
  ['MATH 108', 'MATH 107'],
  ['MATH 115', ['&', ['STAT 101'], ['MATH 100']]]]]

In [173]:
prelist[0]

'STAT 400'

In [174]:
prelist[1]

'MATH 141'

In [175]:
prelist[2]

'MATH 140'

In [176]:
prelist[3]

['|',
 ['MATH 108', 'MATH 107'],
 ['MATH 115', ['&', ['STAT 101'], ['MATH 100']]]]

In [199]:
#check if it is a list
isinstance(prelist[3], list)

True

In [200]:
len(prelist[3])

3

In [178]:
prelist[3][0]

'|'

In [180]:
prelist[3][1]

['MATH 108', 'MATH 107']

In [183]:
#check if it is a list
isinstance(prelist[3][1], list)

True

In [184]:
prelist[3][1][0]

'MATH 108'

In [185]:
#check if it is a list
isinstance(prelist[3][1][0], list)

False

In [181]:
prelist[3][2]

['MATH 115', ['&', ['STAT 101'], ['MATH 100']]]

In [186]:
#check if it is a list
isinstance(prelist[3][2], list)

True

In [187]:
prelist[3][2][0]

'MATH 115'

In [188]:
prelist[3][2][1]

['&', ['STAT 101'], ['MATH 100']]

In [189]:
#check if it is a list
isinstance(prelist[3][2][1], list)

True

In [190]:
prelist[3][2][1][0]

'&'

In [196]:
#check if it is a list
not isinstance(prelist[3][2][1][0], list) and prelist[3][2][1][0] == '|'

False

In [197]:
#check if it is a list
not isinstance(prelist[3][2][1][0], list) and prelist[3][2][1][0] == '&'

True

# Updated Code

In [146]:
def find_prerequisites(course_code, classes):
    def recursive_helper(course_code, classes):
        # Check if the course has prerequisites
        if 'prerequisites' in classes[course_code]:
            # Get the raw string of prerequisites for the current course
            prerequisites_str = classes[course_code]['prerequisites']

            # Check if prerequisites are not empty
            if prerequisites_str.strip() != '':
                # Split the raw string into individual prerequisites
                prerequisites_list = [prerequisite.strip() for prerequisite in prerequisites_str.split('or')]

                # Initialize the list to store the current branch of prerequisites
                current_branch = []

                # Process each prerequisite separately
                for prerequisite in prerequisites_list:
                    # Recursively find prerequisites for each prerequisite and add to the current branch
                    current_branch.extend(recursive_helper(prerequisite, classes))

                return current_branch

        # If the course has no prerequisites or empty prerequisites, return an empty list
        return []

    # Call the recursive helper function to get the overall list of courses
    return recursive_helper(course_code, classes)

In [147]:
# Example usage:
classes = {
    'STAT 400': {'prerequisites': 'MATH 141'},
    'MATH 141': {'prerequisites': 'MATH 140'},
    'MATH 140': {'prerequisites': 'MATH 108 or MATH 115'},
    'MATH 108': {'prerequisites': 'MATH 107'},
    'MATH 107': {'prerequisites': ''},
    'MATH 115': {'prerequisites': 'STAT 101 or MATH 100'},
    'STAT 101': {'prerequisites': ''},
    'MATH 100': {'prerequisites': ''}
}

In [148]:
course_code = 'STAT 400'
prerequisites = find_prerequisites(course_code, classes)

In [149]:
print(f"Prerequisites for {course_code}: {prerequisites}")

Prerequisites for STAT 400: []


In [144]:
# Example usage:
classes = {
    'STAT 400': {'prerequisites': 'MATH 141'},
    'MATH 141': {'prerequisites': 'MATH 140'},
    'MATH 140': {'prerequisites': 'MATH 108 or MATH 115'},
    'MATH 108': {'prerequisites': 'MATH 107'},
    'MATH 107': {'prerequisites': ''},
    'MATH 115': {'prerequisites': 'STAT 101 or MATH 100'},
    'STAT 101': {'prerequisites': ''},
    'MATH 100': {'prerequisites': ''}
}

In [145]:
course_code = 'STAT 400'
prerequisites = find_prerequisites(course_code, classes)

KeyError: ''

In [None]:
print(f"Prerequisites for {course_code}: {prerequisites}")

In [50]:
def find_prerequisites(course_name, classes):
    # Check if the course has prerequisites 
    if 'prerequisites' in classes[course_name]:
        if classes[course_name] == '':
            # If prerequisites are empty, return nothing
            return ''
        else:
            # Get the raw string of prerequisites for the current course
            prerequisites_str = classes[course_name]['prerequisites']

            # Split the raw string into individual prerequisites
            prerequisites_list = [prerequisite.strip() for prerequisite in prerequisites_str.split('or')]

            # Initialize the list to store the complete set of prerequisites
            all_prerequisites = []

            # Process each branch separately
            for prerequisite in prerequisites_list:
                # Recursively find prerequisites for each branch
                branch_prerequisites = find_prerequisites(prerequisite, classes)
                # Add the current branch to the overall list
                all_prerequisites.append(branch_prerequisites)

            return all_prerequisites
    else:
        # If the course has no prerequisites, return an empty list
        return []

In [51]:
find_prerequisites('STAT 400', classes)

KeyError: ''

In [None]:
# Example usage:
classes = {
    'STAT 400': {'prerequisites': 'MATH 141'},
    'MATH 141': {'prerequisites': 'MATH 140'},
    'MATH 140': {'prerequisites': 'MATH 108 or MATH 115'},
    'MATH 108': {'prerequisites': 'MATH 107'},
    'MATH 107': {'prerequisites': ''},
    'MATH 115': {'prerequisites': ''}
}

course_code = 'STAT 400'
prerequisites = find_prerequisites(course_code, classes)

print(f"Prerequisites for {course_code}: {prerequisites}")

In [None]:
from sqlalchemy import create_engine, Column, Integer, String, Sequence, Text, create_engine
from sqlalchemy.orm import declarative_base, Session

# Replace 'root' and 'rootroot' with your MySQL username and password
db_username = 'root'
db_password = 'root'
db_name = 'umgc'

# Create the MySQL engine
engine = create_engine(f'mysql+mysqlconnector://{db_username}:{db_password}@localhost/{db_name}')

Base = declarative_base()

# Define the Course class as a model
class Course(Base):
    __tablename__ = 'courses'

    id = Column(Integer, Sequence('course_id_seq'), primary_key=True)
    name = Column(String(20), unique=True)
    title = Column(String(100))
    credit = Column(String(5))
    description = Column(Text)
    prerequisites = Column(String(20))
    recommended = Column(String(20))
    warnings = Column(String(20))
    substitutions = Column(String(50))

# Create the table in the database
Base.metadata.create_all(engine)

# Data to be inserted into the 'courses' table
courses_data = [
    {'name': 'ACCT 221', 'title': 'Principles of Accounting II', 'credit': '3', 'description': '...', 'prerequisites': 'ACCT 220', 'recommended': '', 'warnings': '', 'substitutions': 'ACCT 221 or BMGT 221'},
    # Add more course data as needed
]

# Insert data into the 'courses' table
with Session(engine) as session:
    for course_info in courses_data:
        course = Course(**course_info)
        session.add(course)
    session.commit()


In [20]:
# Print the extracted information
for course in courses:
    print("=" * 50)
    print("Course Name:", course['name'])
    print("Course Title:", course['title'])
    print("Course Credit:", course['credit'])
    print("Course Prerequisites:", course['prerequisites'])
    print("Course Recommended:", course['recommended'])
    print("Course Warnings:", course['warnings'])
    print("Course Substitutions:", course['substitutions'])
    print("Course Description:")
    print(course['description'])
    print("=" * 50)

Course Name: ACCT 220
Course Title: Principles of Accounting I
Course Credit: 3
Course Prerequisites: 
Course Recommended: 
Course Substitutions: ACCT 220 or BMGT 220
Course Description:
An introduction to the basic theory and techniques of contem-
porary financial accounting. The objective is to identify the 
fundamental principles of accounting, identify and analyze 
business transactions, prepare financial statements, and com-
municate this information to users with different needs. Topics 
include the accounting cycle, transactions, and the preparation 
of financial statements for single-owner business organizations 
that operate as service companies or merchandisers. Students 
may receive credit for only one of the following courses: ACCT 
220 or BMGT 220.

Course Name: ACCT 221
Course Title: Principles of Accounting II
Course Credit: 3
Course Prerequisites: ACCT 220
Course Recommended: 
Course Substitutions: ACCT 221 or BMGT 221
Course Description:
Prerequisite: ACCT 220. Further

# Add courses to DB

In [None]:
import mysql.connector

In [None]:
# Connection details (later learn how to not include password here)
config = {
    'unix_socket': '/tmp/mysql.sock',
    'user': 'root',
    'password': 'root',
    'database': 'umgc',
}

# Debug 

### Debug inner loop

In [None]:
courses = parse_course_info(test_text)

In [None]:
courses[0]

In [None]:
courses[1]

## Above is not working correctly. Not separating ANTH 298 and ANTH 345. 
### Debugging below

#### Inner Loop

### End description debug

In [None]:
# Establish the connection
connection = mysql.connector.connect(**config)

# Create a cursor for executing queries
cursor = connection.cursor()

# Now you can execute queries using the cursor

# Remember to close the cursor and connection when done
cursor.close()
connection.close()

# Archive of Prior Functions (Skip Now)

In [None]:
def parse_course_info_works(text):
    courses = []
    prior_course = None
    current_course = None
    start_course = False
    warnings = None
    prereq = False

    # Define regular expressions for extracting information
    course_pattern = re.compile(r'^([A-Z]{4} \d{3}[A-Z]?) ([A-Z].+?) \((\d-?\d?)\)$')
    course_start_pattern = re.compile(r'^([A-Z]{4} \d{3}[A-Z]?) ([A-Z].+?)$')
    course_end_pattern = re.compile(r'^(.+?) \((\d-?\d?)\)$')    
    end_pattern = re.compile(r'^COURSE INFORMATION')

    # Define prerequisite regular expression patterns to capture the desired text
    prereq_oneline = re.compile(r'Prerequisite: (.+?)\.')
    prereq_start = re.compile(r'Prerequisite: (.+)$')
    prereq_end = re.compile(r'^(.+)\.')
    
    # Define substitutions regular expression pattern
    substitutions_pattern = re.compile(r'Students\s+may\s+receive\s+credit\s+for\s+only\s+one\s+of\s+the\s+following\s+courses: ')

    for line in text.split('\n'):

        end_match = end_pattern.match(line)
        course_match = course_pattern.match(line)
        course_match_start = course_start_pattern.match(line)
        course_match_end = course_end_pattern.match(line)

        if end_match:
            # Parse prior_course to add substitutions
            prior_course = current_course
            if prior_course is not None:
                prior_course = get_substitutions(course=prior_course)
            # Start a new course
            current_course = None
            continue

        elif start_course:
            # Parse prior_course to add substitutions
            prior_course = current_course
            if prior_course is not None:
                prior_course = get_substitutions(course=prior_course)
            # Add the second half of a two-line course title
            if course_match_end:
                course_title2, course_credit = course_match_end.groups()
                course_title = course_title1.strip() + ' ' + course_title2.strip()
                current_course = create_new_course(course_name, course_title, course_credit)
                courses.append(current_course)
                start_course = False

        elif course_match:
            # Parse prior_course to add substitutions
            prior_course = current_course
            if prior_course is not None:
                prior_course = get_substitutions(course=prior_course)

            # If we find a course line, extract information and start a new course
            course_name, course_title, course_credit = course_match.groups()
            current_course = create_new_course(course_name, course_title, course_credit)
            courses.append(current_course)
            start_course = False

        elif course_match_start:
            course_name, course_title1 = course_match_start.groups()
            start_course = True                    
            
        elif current_course is not None:
            # If we are in the middle of a course, add the line to its description
            current_course['description'] += line + '\n'

            ##### Warnings #####

            # Check for course warnings in the description
            if line.startswith("("):
                line = line[1:].strip()
                warnings = True
                
            # for case where warnings are not yet true or false
            if warnings is not None:
                if warnings:
                    # Check if the close parenthesis is in the line
                    closing_parenthesis_index = line.find(")")
                    if closing_parenthesis_index != -1:
                        current_course['warnings'] += line[:closing_parenthesis_index].strip()
                        warnings = False
                    else:
                        current_course['warnings'] += line.strip() + '\n'

            ##### Prerequisites #####
            
            # Check for course prerequisites in the description
            if "Prerequisite:" in line:
                wholematch = prereq_oneline.findall(line)
                if wholematch:
                    prerequisite_text = wholematch[0].strip()
                    current_course['prerequisites'] = prerequisite_text
                    prereq = False
                else:
                    startmatch = prereq_start.findall(line)
                    if startmatch:
                        prerequisite_text = startmatch[0].strip()
                        prereq = True
            elif prereq:
                # assumes prerequisites are on multiple lines
                endmatch = prereq_end.findall(line)
                if endmatch:
                    # Add end of line
                    prerequisite_text += ' ' + endmatch[0].strip()
                    current_course['prerequisites'] = prerequisite_text
                    prereq = False
                else:
                    # Add additional line
                    prerequisite_text += ' ' + line.strip() + '\n'
                    prereq = True

    return courses