# Web Scraping UCSD Course Catalog
**DSGN Department**

In [189]:
# Web Scraping Code
# Downloads one department page of the UCSD course catalog and parses it into
# `html_soup`, the BeautifulSoup tree that the following cells search.
from bs4 import BeautifulSoup
from requests import get

url = 'https://www.ucsd.edu/catalog/courses/DSGN.html'    # Change URL to your UCSD department

# requests has no default timeout, so give up after 30 seconds instead of
# hanging the kernel on an unresponsive server.
response = get(url, timeout=30)
# Stop here on an HTTP error status rather than parsing an error page as if
# it were the catalog.
response.raise_for_status()

html_soup = BeautifulSoup(response.text, 'html.parser')   # html_soup = all html code on course catalog page
#print(html_soup)

# Remove every tag that contains no text, so empty course-name/description
# paragraphs are not picked up by the searches in the next cell.
for tag in html_soup.find_all():
  if len(tag.get_text(strip=True)) == 0:
    tag.extract()

In [190]:
# Course headings: every <p> tag carrying the CSS class 'course-name'
classes = html_soup.find_all('p', attrs={'class': 'course-name'})

# Course descriptions: every <p> tag carrying the CSS class 'course-descriptions'
req_sub = html_soup.find_all('p', attrs={'class': 'course-descriptions'})

In [191]:
# *****CELL #3*****
# Builds three parallel views of the scraped catalog:
#   class_code - course codes, in page order
#   req_super  - description tags, in page order
#   classDict  - course code -> prerequisite text ('' when none is listed)

class_code = []
req_super = []
classDict = {}

# Pair the i-th heading with the i-th description tag.
# The course code is whatever precedes the first '.' of the heading text.
# If a department has unusual headings, inspect `code` here and special-case
# them (e.g. append "" to class_code instead).
for idx, name_tag in enumerate(classes):
  code = name_tag.text.partition(".")[0]
  class_code.append(code)
  description_tag = req_sub[idx]
  req_super.append(description_tag)
  classDict[code] = description_tag

#print(class_code)
#print(req_super)

# Swap each bs4 Tag for the text that follows the first 'Prerequisites:'.
# partition() yields '' as its last element when the marker is absent,
# which is exactly the "no prerequisites" value.
for code, description_tag in classDict.items():
  classDict[code] = description_tag.text.partition("Prerequisites:")[2]

classDict   # View prerequisite course description text

{'DSGN 1': '',
 'DSGN 100': ' DSGN 1.',
 'DSGN 119': ' COMM 124A or COGS 10 or DSGN 1. ',
 'DSGN 160': ' upper-division standing or consent of instructor.',
 'DSGN 161': ' upper-division standing or consent of instructor.',
 'DSGN 195': ' upper-division standing, 3.0 GPA, consent of instructor, and department approval.',
 'DSGN 198': ' upper-division standing, 2.5 GPA, consent of instructor, and department approval.',
 'DSGN 199': ' upper-division standing, 2.5 GPA, consent of instructor, and department approval.\t\t',
 'DSGN 260': ' graduate standing and consent of instructor.',
 'DSGN 261': ' graduate standing and consent of instructor.',
 'DSGN 299': ' graduate standing and consent of instructor.\t\t',
 'DSGN 90': '',
 'DSGN 99': ' lower-division standing, completion of thirty units of UC San Diego undergraduate study, a minimum UC San Diego GPA of 3.0, and a completed and approved Special Studies form.'}

In [192]:
import re

# Function for converting clean course description into Lists
# May have to fix for different departments (some departments split with commas, 'and's, etc)
def convertDescrToPrereqList(descr):
  """Convert a cleaned prerequisite string into a nested list of course codes.

  The string is split on ' and ' first; each resulting part is then split on
  the standalone word 'or'. The outer list therefore holds requirements that
  must all be met, and each inner list holds the interchangeable alternatives
  for one requirement.

  Inputs: descr = course description (string)
  Output: prereqList = list of prerequisites (nested list). An empty list is
          returned when descr has 5 or fewer characters or contains no digit,
          i.e. when it cannot contain a course code.
  """
  # A course code needs a number, so anything short or digit-free has no prereqs
  if len(descr) <= 5 or not any(ch.isdigit() for ch in descr):
    #print('Error: ', descr)     # double check if these should be included
    return []

  prereqList = []
  for item in descr.split(' and '):     # First split by 'and's, might have to change to ','
    # Trim whitespace before the comma so that 'DSGN 1, ' loses its comma too
    item = item.strip().strip(',').strip()
    # Split on the word 'or' only; a plain split('or') would also cut words
    # such as 'instructor' or 'form' in half
    prereqList.append([alt.strip() for alt in re.split(r'\bor\b', item)])
  return prereqList

# Test this function with your own examples!!
#print(convertDescrToPrereqList(' upper-division standing or consent of instructor. '))
#print(convertDescrToPrereqList(' COMM 124A and COMM 1 or COGS 10 or DSGN 1. ',))

In [193]:
# Loop through course descriptions and clean prereqs
# Goal: reduce every description to course codes joined by 'and' / 'or'.

# A short word containing any of these fragments is treated as filler.
# Edit accordingly to the words on your course description.
noise_fragments = ('.', 'of', 'GPA', 'UC', 'San')

for key in classDict:
  kept = []
  for raw in classDict[key].strip().split(' '):
    word = raw.strip().strip('.')    # Can keep adding .strip('#') with other punctuation if necessary
    # Words longer than 4 characters cannot be part of a course code here;
    # shorter ones are dropped when they are the article 'a' or contain a
    # filler fragment.
    is_noise = len(word) > 4 or word == 'a' or any(frag in word for frag in noise_fragments)
    # Dropped words become '' (not removed), so the joined spacing is unchanged
    kept.append('' if is_noise else word)
  reqs = ' '.join(kept).strip()
  print(reqs)          # print to check for edge cases

  # Uncomment when descriptions are clean: reduced to course codes + 'and' + 'or'
  #classDict[key] = convertDescrToPrereqList(reqs)     # Each KEY (class code) has VALUE (list of prereqs)

#classDict
# *NOTE* IF YOU HAVE ERROR WHEN RERUNNING THIS CELL:
# You must rerun cell #3 so the classDict contains strings again




and   and    form
DSGN 1
COMM 124A or COGS 10 or DSGN 1
or
or
and
and
and
and
and
and


In [182]:
# Result: classDict = dictionary of all course codes and list of prerequisites
#
# Nested lists for 'and' + 'or' explanation
# Example: 'COGS 1 and COGS 2 and MATH 1 or MATH 2 or MATH 3'
#
# 1. Split by 'and's first
#     ['COGS 1', 'COGS 2', 'MATH 1 or MATH 2 or MATH 3']
#
# 2. Then split each by 'or's
#     [ [COGS 1], [COGS 2], [MATH 1, MATH 2, MATH 3] ]
#
# The explanation is written as comments and classDict is kept as the last
# expression, because a notebook cell only displays its final expression.
classDict

{'DSGN 1': [],
 'DSGN 100': [['DSGN 1']],
 'DSGN 119': [['COMM 124A', 'COGS 10', 'DSGN 1']],
 'DSGN 160': [],
 'DSGN 161': [],
 'DSGN 195': [],
 'DSGN 198': [],
 'DSGN 199': [],
 'DSGN 260': [],
 'DSGN 261': [],
 'DSGN 299': [],
 'DSGN 90': [],
 'DSGN 99': []}

# Neo4j Section
This section is still in progress; please ignore it for now.

In [116]:
# Neo4j Code w/ py2neo
# The py2neo setup below is wrapped in a string literal, so it is not executed.
"""
from py2neo import *
graph = Graph("http://localhost:7474/browser/", user="", password="")
g = graph.begin()
"""

# Create Neo4j Graph with Prereqs relationships
#matcher = NodeMatcher(graph)
print(classDict)
for key, prereq_groups in classDict.items():
  """
  c = matcher.match("Class", name=key).first()
  if c == None:
    g.create(Node("Class", name=key))
  g.commit()    # commit to display on graph
  """
  # Courses without prerequisites have an empty list, so nothing is printed for them
  for group in prereq_groups:
    print(len(group))



{'DSGN 1': [], 'DSGN 90': [], 'DSGN 99': [], 'DSGN 100': ['DSGN 1'], 'DSGN 119': ['COMM 124A', 'COGS 10', 'DSGN 1'], 'DSGN 160': [], 'DSGN 161': [], 'DSGN 195': [], 'DSGN 198': [], 'DSGN 199': [], 'DSGN 260': [], 'DSGN 261': [], 'DSGN 299': []}
6
9
7
6


In [None]:
# Neo4j Graph Functions
# Draft only: everything below is inside a string literal, so none of it runs.
# The draft creates one "Class" node per entry of class_code, skipping codes
# for which a node with that name already exists.
# It relies on `graph` and `g`, which are only defined in the (also disabled)
# py2neo setup of the previous Neo4j cell.
"""
matcher = NodeMatcher(graph)    # Create matcher object to find specific nodes on graph

# Create all class nodes into Neo4j graph
for code in class_code:
  c = matcher.match("Class", name=code).first()     # check if node exists with course label and saves as c
  if c == None:
    g.create(Node("Class", name=code))              # if node doesn't exist, creates new node
g.commit()    # commit to display on graph
matcher = NodeMatcher(graph)
"""