In [5]:
import calendar
import json
import re
import time
from datetime import datetime

import dateutil
import dateutil.parser
import requests
import unidecode
from bs4 import BeautifulSoup


In [163]:
# Ask the course-search API for every course in one term.
url = "https://courses.yale.edu/api/?page=fose&route=search"
# 'srcdb' names the term database to search; the empty 'criteria' list adds
# no filters.
payload = {'other': {'srcdb': '201803'}, 'criteria': []}

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# JSON or KeyError on the next line.
r = requests.post(url, data=json.dumps(payload), timeout=30)
r.raise_for_status()

courses = r.json()["results"]

courses[0]

{'key': '1',
 'code': 'ACCT 270',
 'title': 'Foundations of Accounting and Valuation',
 'crn': '12603',
 'no': '01',
 'total': '1',
 'schd': 'H',
 'stat': 'A',
 'isCancelled': '',
 'meets': 'MW 8:45-10a',
 'mpkey': '2260',
 'instr': 'R. Antle',
 'meetingTimes': '[{"meet_day":"0","start_time":"845","end_time":"1000"},{"meet_day":"2","start_time":"845","end_time":"1000"}]',
 'start_date': '2018-08-29',
 'end_date': '2018-12-19',
 'srcdb': '201803'}

In [167]:
# Fetch the detail record for a small sample of the courses found by the search.
# The endpoint is the same for every request, so it is set once, outside the loop.
url = "https://courses.yale.edu/api/?page=fose&route=details"

course_infos = []

# Only the first five courses are requested.
for course in courses[:5]:
    payload = {"group": "code:" + course["code"],
               "key": "crn:" + course["crn"],
               "srcdb": course["srcdb"],
               "matched": "crn:" + course["crn"]}

    # Time out instead of hanging, and fail loudly on an HTTP error instead of
    # trying to parse an error response as a course record.
    response = requests.post(url, data=json.dumps(payload), timeout=30)
    response.raise_for_status()

    course_infos.append(response.json())

In [168]:
# Inspect the raw detail records collected by the fetch loop (one dict per course).
course_infos

[{'key': '1',
  'gmods': 'Y,1,A,Q,F,C,V,2,3,7,4,G,K,J,W,L,8,Z,5,H,M,6,N,9,D,B,X,I,P,R,O,U,S,T',
  'col': 'YC',
  'stat': 'A',
  'mpkey': '2260',
  'code': 'ACCT 270',
  'section': '01',
  'crn': '12603',
  'title': 'Foundations of Accounting and Valuation',
  'xlist': '',
  'yc_attrs': '',
  'ci_attrs': '',
  'description': '<p>Modern accounting practices and their use in distinguishing value creation from value redistribution. Basic determinants of value and the techniques used to assess it; the creation of value through the production and delivery of goods or services; the conversion of that value into cash flows; basic financial statements, balance sheets, income statements, and cash flow statements, and the accounting mechanics with which they are built. Undergraduate enrollment limited to 50. Juniors and seniors only.</p>',
  'hours': '1',
  'regnotes': '',
  'rp_attr': '',
  'instructordetail_html': '<div class="instructor-detail"><div class="instructor-name"><a href="#" data-act

In [12]:
def professors_from_html(html):
    """Extract instructor names from a course's instructor-detail HTML.

    Parameters
    ----------
    html : str
        HTML fragment containing ``<div class="instructor-name">`` elements.

    Returns
    -------
    list of str
        ASCII-transliterated names in document order. Empty names and the
        placeholder ``"Staff"`` are left out.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines. "lxml" is the parser course_times_from_fields uses.
    soup = BeautifulSoup(html, features="lxml")

    names = []

    for div in soup.find_all("div", {"class": "instructor-name"}):
        text = div.get_text()

        # Skip blocks whose visible text contains a mailto link.
        if "mailto:" in text:
            continue

        # Transliterate accented characters to plain ASCII.
        name = unidecode.unidecode(text)

        if len(name) > 0 and name != "Staff":
            names.append(name)

    return names

In [159]:
# Scratch check of the 'code' field on the currently selected detail record.
# NOTE(review): in the visible part of this notebook, course_json is only
# assigned in a later cell, so this cell would raise NameError on a fresh
# Restart & Run All - confirm, then move or delete it.
course_json["code"]

'AFAM 170'

In [18]:
def course_codes_from_fields(code, xlist, section, crn):
    """Bundle a course's identifying fields into one dictionary.

    ``code`` is split on single spaces; the first piece is used as the subject
    and the second as the course number (``"ACCT 270"`` -> ``ACCT`` / ``270``).
    ``section`` is converted with ``int``. ``xlist`` is accepted but is not
    used by this function.

    Returns a dict with the keys ``"oci_id"`` (``crn`` unchanged),
    ``"section"`` and ``"listings"`` (a one-element list of
    ``{"subject": ..., "number": ...}``).
    """
    pieces = code.split(" ")
    subject, number = pieces[0], pieces[1]

    listing = dict(subject=subject, number=number)

    return dict(oci_id=crn, section=int(section), listings=[listing])

In [25]:
def days_of_week_from_letters(letters):
    """Expand a compact weekday string into a list of full day names.

    ``"M-F"`` is treated as the whole Monday-to-Friday week. For any other
    input each weekday is searched for with its own regular expression, and
    the names of the days that match are returned in Monday-to-Friday order.
    """
    if letters == "M-F":
        return ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']

    # (pattern, day name) pairs. The Tuesday pattern only accepts a "T" that
    # is not followed by "h", so "Th" on its own counts as Thursday only.
    patterns = (
        ("M", "Monday"),
        ("(T[^h]|T$)", "Tuesday"),
        ("W", "Wednesday"),
        ("Th", "Thursday"),
        ("F", "Friday"),
    )

    # The trailing space gives a final "T" a following character to match.
    padded = letters + " "

    return [day for pattern, day in patterns if re.search(pattern, padded)]

In [76]:
def time_of_day_float_from_string(string):
    """Convert a 12-hour clock time such as ``"8:45am"`` or ``"7pm"`` to a float.

    The result is the 24-hour hour plus ``minutes / 100`` - for example
    ``"8:45am"`` -> 8.45 and ``"7pm"`` -> 19.0. It is not a fractional number
    of hours.

    Only the first ``am``/``pm`` time found in ``string`` is used.

    Raises
    ------
    IndexError
        If ``string`` contains no ``am``/``pm`` time.
    ValueError
        If the matched time has no hour digits.
    """
    hour_text, minute_text, meridiem = re.findall('([0-9]*):?([0-9]*)(am|pm)', string)[0]

    # Named "hour" rather than "time" so the imported time module is not shadowed.
    hour = int(hour_text)

    if meridiem == "pm" and hour != 12:
        hour += 12
    elif meridiem == "am" and hour == 12:
        # 12:xx am is just after midnight, i.e. hour 0 on a 24-hour clock.
        # Without this branch "12:30am" came out as 12.3, the same as 12:30pm.
        hour = 0

    # A time written without minutes ("7pm") is on the hour.
    minutes = int(minute_text) if minute_text else 0

    return hour + minutes / 100

In [101]:
def course_times_from_fields(meeting_html, all_sections_remove_children):
    """Parse a course's meeting HTML into a list of session dictionaries.

    Every ``<div>`` in ``meeting_html`` is read as one meeting whose text is
    shaped like ``"<days> <start>-<end> in <location>"``.

    Parameters
    ----------
    meeting_html : str
        HTML fragment with one ``<div>`` per meeting.
    all_sections_remove_children : str
        Fallback text: if no meeting was parsed and this contains ``"HTBA"``,
        a single HTBA placeholder session is returned.

    Returns
    -------
    list of dict
        Each dict has the keys ``"days"``, ``"start_time"``, ``"end_time"``
        and ``"location"``. Parsed meetings carry a list of day names and float
        times; HTBA placeholders carry ``["HTBA"]`` and the strings ``"1"``.
        NOTE(review): the placeholder times are strings while real times are
        floats - kept as-is because downstream consumers are not visible here.
    """
    soup = BeautifulSoup(meeting_html, features="lxml")

    def htba_course_time():
        # Build a fresh placeholder each time so that entries in the result
        # never share (and accidentally co-mutate) one dict object.
        return {
            "days": ["HTBA"],
            "start_time": "1",
            "end_time": "1",
            "location": ""
        }

    matched_meetings = []

    for meeting in soup.find_all("div"):
        # Surrounding whitespace is dropped so it cannot shift the
        # space-separated fields parsed below.
        meeting_text = "".join(meeting.find_all(text=True)).strip()

        # Skip blank entries. This used to be "pass", which fell through to the
        # parsing code and raised IndexError on meeting_parts[1].
        if not meeting_text:
            continue

        if "HTBA" in meeting_text:
            matched_meetings.append(htba_course_time())
            continue

        meeting_parts = meeting_text.split(" ")

        days = days_of_week_from_letters(meeting_parts[0])

        times = meeting_parts[1].split("-")
        start = time_of_day_float_from_string(times[0])
        end = time_of_day_float_from_string(times[1])

        # Anything after " in " is taken as the location; it is optional.
        location_matches = re.findall(' in ([^<]*)', meeting_text)
        location = location_matches[0] if location_matches else ''

        matched_meetings.append({
            "days": days,
            "start_time": start,
            "end_time": end,
            "location": location
        })

    # Nothing parsed at all: fall back to the section-level text.
    if not matched_meetings and "HTBA" in all_sections_remove_children:
        matched_meetings.append(htba_course_time())

    return matched_meetings

In [107]:
# Labels searched for in a course's 'yc_attrs' text, mapped to the short skill
# codes reported for the course.
skills_map = dict([
    ('Writing', 'WR'),
    ('Quantitative Reasoning', 'QR'),
])
# The five language levels all follow one pattern: 'Language (n)' -> 'Ln'.
skills_map.update(
    ('Language ({})'.format(level), 'L{}'.format(level)) for level in range(1, 6)
)

# Same idea for area codes. The leading '>' on '>Sciences' presumably keeps
# the substring search from also matching inside 'Social Sciences'.
areas_map = dict([
    ('Humanities', 'Hu'),
    ('Social Sciences', 'So'),
    ('>Sciences', 'Sc'),
])


In [109]:
def found_items(text, mapping):
    """Return the codes from ``mapping`` whose search text occurs in ``text``.

    ``mapping`` maps a search string to a code. The test is a plain substring
    check, and the codes come back in the mapping's iteration order.
    """
    return [code for needle, code in mapping.items() if needle in text]

In [1]:
import dateutil


In [3]:
# Sanity check: dateutil parses a "Month day year" string into a datetime.
dateutil.parser.parse("September 7 2019")

datetime.datetime(2019, 9, 7, 0, 0)

In [144]:
def exam_from_field(final_exam):
    """Turn a course's ``final_exam`` text into a structured exam record.

    Returns a dict with the keys ``'group'``, ``'date'``, ``'day_of_week'``
    and ``'time'``:

    * group 0 - the text is empty or contains "No regular final examination"
    * group 1 - the text contains "HTBA"
    * group 2 - a scheduled exam; the text is split on ``" at "`` into a date
      part and a time part

    Groups 0 and 1 carry the placeholder values ``'1000-01-01'``, ``''`` and
    ``0.0``. For group 2, ``'date'`` is formatted as ``YYYY-MM-DD`` and
    ``'time'`` is the value from ``time_of_day_float_from_string``.

    Raises ``IndexError`` if a scheduled-looking text has no ``" at "`` part.
    """
    def unscheduled(group):
        # Record used when there is no concrete exam date to report.
        return {
            'group': group,
            'date': '1000-01-01',
            'day_of_week': '',
            'time': 0.0
        }

    no_final = "No regular final examination" in final_exam

    if len(final_exam) == 0 or no_final:
        return unscheduled(0)

    if "HTBA" in final_exam:
        return unscheduled(1)

    split_date_time = final_exam.split(" at ")

    # Use the parsed datetime directly. It used to be pushed through
    # time.mktime() and datetime.fromtimestamp(), which adds nothing for a
    # calendar date, depends on the machine's local-time rules, and can fail
    # for dates the platform's mktime cannot represent.
    exam_date = dateutil.parser.parse(split_date_time[0])

    time_float = time_of_day_float_from_string(split_date_time[1])

    return {
        'group': 2,
        'date': exam_date.strftime('%Y-%m-%d'),
        'day_of_week': calendar.day_name[exam_date.weekday()],
        'time': time_float
    }

In [145]:
# Try exam_from_field on one fetched detail record (index 3 of course_infos).
course_json = course_infos[3]

exam_from_field(course_json["final_exam"])

{'group': 2, 'date': '2018-12-15', 'day_of_week': 'Saturday', 'time': 19.0}

In [165]:
def extract_course_info(course_json):
    """Build a flat course record from one raw detail record.

    Parameters
    ----------
    course_json : dict
        A detail record. The keys read are ``description``, ``hours``,
        ``title``, ``stat``, ``instructordetail_html``, ``code``, ``xlist``,
        ``section``, ``crn``, ``meeting_html``,
        ``all_sections_remove_children``, ``yc_attrs``, ``final_exam``,
        ``ci_attrs`` and ``resources``; a missing key raises ``KeyError``.

    Returns
    -------
    dict
        Keys ``description``, ``requirements``, ``title``, ``long_title``,
        ``extra_info``, ``professors``, ``course_codes``, ``sessions``,
        ``skills``, ``areas``, ``exam`` and ``extra_flags``, plus
        ``course_home_url`` / ``syllabus_url`` only when the matching link is
        found in ``resources``.
    """
    course_info = {}

    # Explicit parser, matching course_times_from_fields, so parsing does not
    # depend on which parsers happen to be installed.
    raw_description = BeautifulSoup(course_json["description"], features="lxml")

    # find() returns None when there is no <p> at all (for example an empty
    # description). Calling len() on that used to raise TypeError.
    description = raw_description.find('p')

    if description is not None:
        course_info["description"] = description.get_text()
    else:
        course_info["description"] = ""

    prereqs = raw_description.find_all("p", {"class": "prerequisites"})

    # Requirements are only taken when exactly one such paragraph exists.
    if len(prereqs) == 1:
        course_info["requirements"] = prereqs[-1].get_text()
    else:
        course_info["requirements"] = ""

    # Credit values other than the usual "1" are appended to the description.
    if course_json["hours"] != "1" and course_json["hours"] != "":
        course_info["description"] += "\n\n" + course_json["hours"] + " Yale College course credits"

    # NOTE(review): a truncated title is 30 characters plus "...", i.e. 33
    # characters, which is longer than the 32-character threshold - confirm
    # this is intended.
    if len(course_json["title"]) > 32:
        course_info["title"] = course_json["title"][:30] + "..."
    else:
        course_info["title"] = course_json["title"]

    course_info["long_title"] = course_json["title"]

    course_info["extra_info"] = course_json["stat"]

    # Status "C" replaces the short title; status "F" prefixes it.
    if course_info["extra_info"] == "C":
        course_info["title"] = "CANCELLED"
    elif course_info["extra_info"] == "F":
        course_info["title"] = "FULL:" + course_info["title"]

    course_info["professors"] = professors_from_html(course_json["instructordetail_html"])

    course_info["course_codes"] = course_codes_from_fields(
        course_json["code"],
        course_json["xlist"],
        course_json["section"],
        course_json["crn"]
    )

    course_info["sessions"] = course_times_from_fields(
        course_json["meeting_html"],
        course_json["all_sections_remove_children"]
    )

    course_info["skills"] = found_items(course_json["yc_attrs"],
                                        skills_map)
    course_info["areas"] = found_items(course_json["yc_attrs"],
                                       areas_map)

    course_info["exam"] = exam_from_field(course_json["final_exam"])

    course_info["extra_flags"] = []

    if len(course_json["ci_attrs"]) > 0:
        course_info["extra_flags"].append(course_json["ci_attrs"])

    # The link keys are only set when a matching anchor is present.
    matched_homepage = re.findall('href="([^"]*)"[^>]*>HOMEPAGE</a>', course_json["resources"])

    if len(matched_homepage) > 0:
        course_info["course_home_url"] = matched_homepage[0]

    matched_syllabus = re.findall('href="([^"]*)"[^>]*>SYLLABUS</a>', course_json["resources"])

    if len(matched_syllabus) > 0:
        course_info["syllabus_url"] = matched_syllabus[0]

    return course_info

In [166]:
# Run the extractor over every fetched detail record and display the results.
list(map(extract_course_info, course_infos))


[{'description': 'Modern accounting practices and their use in distinguishing value creation from value redistribution. Basic determinants of value and the techniques used to assess it; the creation of value through the production and delivery of goods or services; the conversion of that value into cash flows; basic financial statements, balance sheets, income statements, and cash flow statements, and the accounting mechanics with which they are built. Undergraduate enrollment limited to 50. Juniors and seniors only.',
  'requirements': '',
  'title': 'Foundations of Accounting and ...',
  'long_title': 'Foundations of Accounting and Valuation',
  'extra_info': 'A',
  'professors': ['Rick Antle'],
  'course_codes': {'oci_id': '12603',
   'section': 1,
   'listings': [{'subject': 'ACCT', 'number': '270'}]},
  'sessions': [{'days': ['Monday', 'Wednesday'],
    'start_time': 8.45,
    'end_time': 10.0,
    'location': 'EVANS 4200'}],
  'skills': [],
  'areas': [],
  'exam': {'group': 2,
 

In [160]:
# Scratch check of the 'stat' field on the currently selected detail record.
course_json["stat"]

'A'