# Create Catalog
## Convert PDF to TXT

In [1]:
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LTTextContainer, LAParams
from pdfminer.image import ImageWriter

def pdf_to_txt(outfile='tmp_pdf2txt.txt', infile='2023-2024-catalog-courses.pdf',
               pages='', maxpages=0, password=b'', debug=0 ):
    """Extract the text of a PDF into a UTF-8 encoded text file.

    Parameters
    ----------
    outfile : path of the text file to create (overwritten if it exists).
    infile : path of the PDF file to read.
    pages : comma-separated, 1-based page numbers to extract (e.g. '1,3,7').
        An empty string leaves the page filter empty.
    maxpages : passed unchanged to ``PDFPage.get_pages``
        (0 presumably means "no limit" -- confirm against the pdfminer docs).
    password : passed unchanged to ``PDFPage.get_pages``.
    debug : accepted for interface compatibility; not used by this function.

    Returns
    -------
    None. The extracted text is written to ``outfile``.
    """
    # pdfminer is given zero-based page indices, the caller supplies 1-based ones.
    pagenos = set()
    if pages != '':
        pagenos.update(int(x) - 1 for x in pages.split(','))

    caching = True
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=caching)

    # Both files are managed by `with`, so they are closed even when opening
    # the PDF or processing a page raises.
    with open(outfile, 'w', encoding='utf-8') as outfp, open(infile, 'rb') as fp:
        device = TextConverter(rsrcmgr, outfp, laparams=laparams, imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Release the converter before the output file itself is closed.
            device.close()

In [2]:
# Run the conversion with the default arguments: reads
# '2023-2024-catalog-courses.pdf' and writes 'tmp_pdf2txt.txt'.
pdf_to_txt()

## Parse Text to Create Course Dictionary

### Regular Expressions

In [3]:
import re

# Course Start and End patterns
#
# A course code is 3-4 capital letters, a space, 3 digits and an optional
# capital-letter suffix. A credit value is a single digit, or two single
# digits separated by an en dash (U+2013), inside parentheses.
#
# course_pattern: a complete one-line header, groups = (code, title, credit).
course_pattern = re.compile(r'^([A-Z]{3}[A-Z]? \d{3}[A-Z]?) ([A-Z].+?) \((\d(?:–\d)?)\)$')
# course_start_pattern: a code followed by text that starts with a capital
# letter, groups = (code, text). It does not require a credit, so it also
# matches any line that course_pattern matches.
course_start_pattern = re.compile(r'^([A-Z]{3}[A-Z]? \d{3}[A-Z]?) ([A-Z].+?)$')
# course_end_pattern: any line that ends in a parenthesised credit,
# groups = (text before the credit, credit).
course_end_pattern = re.compile(r'^(.+?) \((\d(?:–\d)?)\)$')

# Prerequisites
# Captures the text after "Prerequisite:" / "Prerequisites:" and one whitespace
# character, up to (not including) the first period that is followed by
# whitespace or by ")". re.DOTALL lets the capture run across newlines.
prerequisite_pattern = re.compile(r'Prerequisites?:\s(.*?)(?=\.\s|\.\))', re.DOTALL)

# Recommended
# Captures the text after "Recommended:" up to the first period; that period
# must be followed by a whitespace character for the pattern to match.
recommended_pattern = re.compile(r'Recommended:\s([^\.]+)\.\s')

# Substitutions
# Matches the introductory phrase only (no capture group); the caller uses the
# end position of the match to find the text that follows it.
substitution_pattern = re.compile(r'may\s+receive\s+credit\s+for\s+only\s+one\s+of\s+the\s+following\s+courses: ')

# Warnings
# Captures the contents of a parenthesised remark at the very start of the text.
warning_pattern = re.compile(r'^\(([^\)]+)\)')

### Functions for Creating Courses

In [4]:
def create_new_course(name, title, credit):
    """Return a new course record.

    The three header values are stored with surrounding whitespace removed
    under 'name', 'title' and 'credit'. The remaining fields ('description',
    'prerequisites', 'recommended', 'warnings', 'substitutions') start out as
    empty strings.
    """
    header = (('name', name), ('title', title), ('credit', credit))
    course = {field: value.strip() for field, value in header}

    # Fields that are filled in later, once the description has been read.
    for field in ('description', 'prerequisites', 'recommended',
                  'warnings', 'substitutions'):
        course[field] = ''
    return course

def update_description(course, sub=substitution_pattern, pre=prerequisite_pattern,
                       warn=warning_pattern, recd=recommended_pattern):
    """Fill a course's derived fields from its description text.

    The description is searched with newlines removed; the stored
    'description' itself is left unchanged. Each field is written only when
    its pattern matches.

    course : course dictionary, modified in place.
    sub, pre, warn, recd : compiled patterns for the substitution phrase,
        prerequisites, warnings and recommendations.

    Returns the same course dictionary.
    """
    flattened = course['description'].replace('\n', '')

    # Substitutions: everything after the introductory phrase, minus the
    # final character of the description.
    found = sub.search(flattened)
    if found is not None:
        course['substitutions'] = flattened[found.end():-1].strip()

    # The remaining fields each take the first capture group of their pattern.
    single_group_fields = (
        ('prerequisites', pre),
        ('recommended', recd),
        ('warnings', warn),
    )
    for field, pattern in single_group_fields:
        found = pattern.search(flattened)
        if found is not None:
            course[field] = found.group(1)

    return course

def parse_course_info(text):
    """Parse catalog text into a list of course dictionaries.

    text : the full catalog text; it is processed line by line.

    Returns the list of courses in the order their headers were found. Each
    course is built by `create_new_course` and, once its description is
    complete, passed through `update_description`.
    """
    # Logic Overview
    #
    # A course starts by matching a pattern, either
    #
    #     OneLine = STAT 221 Introduction to Statistics (3)
    # or
    #     TwoLine = STAT 536 A Really Long Description that
    #               Takes Up More than One Line (1–3)
    #
    # - `course_match` will match OneLine and a new course is created
    # - `course_match_start` will match the first line of TwoLine
    #   and make the variable `start_course = True`
    # - `course_match_end` will match the second line of TwoLine and
    #   a new course is created. This is checked only if the variable
    #   `start_course = True`
    # - `course_match` and `course_match_end` will turn on the indicator
    #   variable `add_description = True`
    # - a blank line will set `add_description = False` and indicates
    #   the end of the course description.
    # - when a description ends (blank line, next course header, or end of
    #   the text) it is parsed to populate warnings, prerequisites,
    #   recommended, and substitutions in the course
    #
    # NOTE(review): while `start_course` is True, lines that do not match
    # `course_end_pattern` are skipped until one does.

    courses = []
    current_course = None
    start_course = False
    add_description = False

    for line in text.split('\n'):

        course_match = course_pattern.search(line)
        course_match_start = course_start_pattern.search(line)
        course_match_end = course_end_pattern.search(line)

        if start_course:
            # Add the second half of a two-line course title
            if course_match_end:
                # The previous course's description was not closed by a
                # blank line; parse it before moving on.
                if add_description:
                    update_description(course=current_course)
                course_title2, course_credit = course_match_end.groups()
                course_title = course_title1.strip() + ' ' + course_title2.strip()
                current_course = create_new_course(course_name, course_title, course_credit)
                courses.append(current_course)
                start_course = False
                add_description = True

        elif course_match:
            # If we find a course line, extract information and start a new course
            if add_description:
                # Same as above: finish the course that is still open.
                update_description(course=current_course)
            course_name, course_title, course_credit = course_match.groups()
            current_course = create_new_course(course_name, course_title, course_credit)
            courses.append(current_course)
            start_course = False
            add_description = True

        elif course_match_start:
            course_name, course_title1 = course_match_start.groups()
            start_course = True

        elif current_course is not None:
            # If we are in the middle of a course, add the line to its description
            if add_description:
                # When the description ends, parse it to fill in prerequisites,
                # recommended, warnings, and substitutions
                if line == '':
                    add_description = False
                    current_course = update_description(course=current_course)
                else:
                    current_course['description'] += line + '\n'

    # The text may end without a blank line after the last description.
    if add_description:
        update_description(course=current_course)

    return courses

## Read in raw text file

In [None]:
infile = 'tmp_pdf2txt.txt'
# pdf_to_txt writes this file as UTF-8, so read it back with the same encoding
# instead of the platform default (the credit patterns rely on the en dash
# being decoded correctly).
with open(infile, 'r', encoding='utf-8') as file:
    # Read the entire content of the file into a string
    text = file.read()

courses = parse_course_info(text)

# Create a dictionary with name as the key
# (if a name occurs more than once, the last course with that name is kept)
classes = {course['name']: course for course in courses}

### Look at the courses

In [3]:
## Regular expression patterns to extract information from the catalog

# One course code, captured as a group: 3-4 capital letters, a space,
# 3 digits and an optional capital-letter suffix.
class_template = r'([A-Z]{3}[A-Z]? \d{3}[A-Z]?)'
A = class_template

## Patterns found in the catalog (add more as needed)
# All patterns are raw f-strings so that regex escapes such as \( reach the
# regex engine unchanged instead of being treated as string escapes.

# Exactly one course
singleton = re.compile(rf'^{A}$')

# Alternatives: "X or Y", "X (or Y)", "X, Y, or Z", ...
course_or_list2 = re.compile(rf'^{A} or {A}$')
course_or_list2a = re.compile(rf'^{A} \(or {A}\)$')
course_or_list3 = re.compile(rf'^{A}, {A}, or {A}$')
course_or_list4 = re.compile(rf'^{A}, {A}, {A}, or {A}$')
course_or_list5 = re.compile(rf'^{A}, {A}, {A}, {A}, or {A}$')

# Conjunctions: "X and Y", "X, Y, and Z", ...
course_and_list2 = re.compile(rf'^{A} and {A}$')
course_and_list3 = re.compile(rf'^{A}, {A}, and {A}$')
course_and_list4 = re.compile(rf'^{A}, {A}, {A}, and {A}$')
course_and_list5 = re.compile(rf'^{A}, {A}, {A}, {A}, and {A}$')

# A course followed by the start of the placement-test phrase
# (not anchored at the end, so the rest of the sentence may vary).
language_pattern = re.compile(rf'^{A} or appropriate score on a place')

# The literal phrase, captured as group 1, anywhere in the text
prior_pattern = re.compile(r'(prior program approval)')

NameError: name 're' is not defined

In [7]:
# Spot check: prerequisite text extracted for CAPL 398A
classes['CAPL 398A']['prerequisites']

''

In [8]:
# Spot check: prerequisite text extracted for WRTG 394
classes['WRTG 394']['prerequisites']

'WRTG 112 or equivalent'

#### Editing the course prerequisites to make automation easier

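- Add a `done` field to each course so that courses whose prerequisites have already been handled are skipped by later passes, and a `pre` field to hold the rewritten prerequisite expression.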

In [None]:
def initialize_done(classes):
    """Add the bookkeeping fields 'pre' and 'done' to every course.

    'pre' is reset to an empty string. 'done' is 1 when the course has no
    prerequisite text (nothing left to process) and 0 otherwise, so later
    passes can restrict themselves to courses with done == 0.

    classes : dict mapping a course name to its course dictionary,
        modified in place.

    Returns the same dictionary.
    """
    for info in classes.values():
        nothing_to_parse = info['prerequisites'] == ''
        info['pre'] = ''
        info['done'] = 1 if nothing_to_parse else 0

    return classes

def mark_all_singles(classes, pattern=singleton):
    """Finish every pending course whose prerequisite text matches `pattern`.

    For each course with done == 0 whose 'prerequisites' text matches, the
    text is copied unchanged into 'pre' and the course is marked done so that
    later passes skip it.

    classes : dict mapping a course name to its course dictionary,
        modified in place.
    pattern : compiled pattern to search for (by default, a single course).

    Returns the same dictionary.
    """
    for info in classes.values():
        if info['done'] != 0:
            continue

        text = info['prerequisites']
        if pattern.search(text):
            info['pre'] = text
            info['done'] = 1

    return classes

def update_all_prerequisites(classes, pattern, type='or'):
    """Rewrite matching prerequisite lists as a parenthesised expression.

    For each course with done == 0 whose 'prerequisites' text matches
    `pattern`, the captured groups are joined with ' | ' (type='or') or
    ' & ' (type='and'), wrapped in parentheses, stored in 'pre', and the
    course is marked done.

    classes : dict mapping a course name to its course dictionary,
        modified in place.
    pattern : compiled pattern whose groups capture the individual courses.
    type : 'or' or 'and'.

    Returns the same dictionary.

    Raises ValueError if `type` is neither 'or' nor 'and'.
    """
    joiners = {'or': ' | ', 'and': ' & '}
    # Reject an unknown type up front; previously it surfaced only at the
    # first match, as an UnboundLocalError.
    if type not in joiners:
        raise ValueError(f"type must be 'or' or 'and', got {type!r}")
    joiner = joiners[type]

    for class_info in classes.values():
        if class_info['done'] != 0:
            continue

        match = pattern.search(class_info['prerequisites'])
        if match:
            class_info['pre'] = '(' + joiner.join(match.groups()) + ')'
            class_info['done'] = 1

    return classes

def update_language(classes, pattern=language_pattern):
    """Rewrite "course or placement score" prerequisites.

    For each course with done == 0 whose 'prerequisites' text matches
    `pattern`, 'pre' becomes '(<group 1> | "placement test")' and the course
    is marked done.

    classes : dict mapping a course name to its course dictionary,
        modified in place.
    pattern : compiled pattern whose first group is the alternative course.

    Returns the same dictionary.
    """
    for info in classes.values():
        if info['done'] != 0:
            continue

        found = pattern.search(info['prerequisites'])
        if found:
            info['pre'] = ''.join(('(', found.group(1), ' | "placement test")'))
            info['done'] = 1

    return classes

def update_prior_program(classes, pattern=prior_pattern):
    """Rewrite prerequisites that mention the pattern's phrase.

    For each course with done == 0 whose 'prerequisites' text matches
    `pattern`, 'pre' becomes the first captured group wrapped in double
    quotes and the course is marked done.

    classes : dict mapping a course name to its course dictionary,
        modified in place.
    pattern : compiled pattern whose first group is the phrase to keep.

    Returns the same dictionary.
    """
    for info in classes.values():
        if info['done'] != 0:
            continue

        found = pattern.search(info['prerequisites'])
        if found:
            info['pre'] = ''.join(('"', found.group(1), '"'))
            info['done'] = 1

    return classes

In [None]:
# Courses without prerequisites, then courses with a single prerequisite,
# are marked done first.
classes = mark_all_singles(initialize_done(classes))

# Known list shapes, applied in order: first the "or" lists, then the "and" lists.
_known_lists = (
    ('or', (course_or_list2, course_or_list2a, course_or_list3,
            course_or_list4, course_or_list5)),
    ('and', (course_and_list2, course_and_list3,
             course_and_list4, course_and_list5)),
)
for _kind, _patterns in _known_lists:
    for _pattern in _patterns:
        classes = update_all_prerequisites(classes, pattern=_pattern, type=_kind)

# Special cases: language placement and prior program approval.
classes = update_prior_program(update_language(classes))

In [55]:
# Example of a prerequisite shape not yet covered by the patterns above
classes['CMSC 465']['prerequisites']

'MATH 141 and CMSC 315 (or CMSC 350)'

In [56]:
# "X and Y (or Z)": three course codes captured as groups 1-3.
# Raw f-string so that \( and \) reach the regex engine as written.
course_and_or_list3 = re.compile(rf'^{A} and {A} \(or {A}\)$')

In [62]:
def update_course3(classes, pattern=course_and_or_list3):
    """Rewrite "X and Y (or Z)" prerequisites as '(X & (Y | Z))'.

    For each course with done == 0 whose 'prerequisites' text matches
    `pattern`, the three captured groups are combined into that expression,
    stored in 'pre', and the course is marked done.

    classes : dict mapping a course name to its course dictionary,
        modified in place.
    pattern : compiled pattern with three capture groups.

    Returns the same dictionary.
    """
    for info in classes.values():
        if info['done'] != 0:
            continue

        found = pattern.search(info['prerequisites'])
        if found:
            required, option_a, option_b = found.group(1, 2, 3)
            info['pre'] = ''.join(
                ('(', required, ' & (', option_a, ' | ', option_b, '))')
            )
            info['done'] = 1

    return classes

In [63]:
# Apply the "X and Y (or Z)" rewrite to every course that is still pending
classes = update_course3(classes)

In [58]:
# Try the "X and Y (or Z)" pattern on one known example
match = course_and_or_list3.search(classes['CMSC 465']['prerequisites'])

In [59]:
# Show the captured course codes
match.groups()

('MATH 141', 'CMSC 315', 'CMSC 350')

In [61]:
# Build the rewritten expression by hand (the same formula update_course3 uses)
'(' + match.group(1) + ' & (' + match.group(2) + ' | ' + match.group(3) + '))'

'(MATH 141 & (CMSC 315 | CMSC 350))'

In [64]:
# Check status: list every course whose prerequisites are still unprocessed
for course_name, course_info in classes.items():
    if course_info['done'] != 0:
        continue
    print(course_name + ':', course_info['prerequisites'])

APTC 495: 27 credits in major coursework
ASTD 302: Any writing course
ASTD 370: Any WRTG course
ASTD 485: MATH 105, STAT 200, or a higher MATH or STAT course
BEHS 495: BEHS 300 and completion of all requirements for the social science major
BIOL 230: BIOL 103 or other introductory biology course with laboratory
BIOL 357: BIOL 325 or another upper-level biology course
BMGT 485: BMGT 317, BMGT 364, BMGT 464 (or BMGT 465), and BMGT 484
BMGT 495: BMGT 364, BMGT 365, FINC 330 (or BMGT 340), and MRKT 310
COMM 300: WRTG 112 or equiva-lent
COMM 202: WRTG 112 or equiva-lent
COMM 400: WRTG 112 or equivalent
COMM 492: WRTG 112 or equivalent
COMM 495: COMM 300, COMM 302, and at least 9 additional credits of upper-level COMM, SPCH, and/or JOUR courses
CMIT 265: CMIT 202 or CMSC 115 (or CMIS 141)
CMIT 320: CMIT 265 or CompTIA Network+ certification
CMIT 424: CMIT 202 (or CompTIA A+ certification), CMIT 320 (or CompTIA Security+ certification), and CCJS 321
CMIT 495: Completion of at least 27 credits

In [24]:
# Try the language placement pattern on one course's prerequisite text
testmatch = language_pattern.search(classes['ARAB 112']['prerequisites'])