# Dynamic Web scraper

This notebook scrapes the MSU courses website. 

In [None]:
# Load Selenium and automatically install the Chrome Driver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import re

from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from getpass import getpass
from bs4 import BeautifulSoup

In [None]:
# Chrome Driver
# options = webdriver.ChromeOptions()
# driver = webdriver.Chrome(options=options)

In [None]:
# FireFox Driver
# Start a Firefox session; `driver` is the browser handle every later cell reuses.
driver = webdriver.Firefox()
time.sleep(5) # Fixed pause before the next cell starts talking to the browser

In [None]:
# Open the public class-schedule page, pick a semester, and search one subject.
url = "https://student.msu.edu/psc/public/EMPLOYEE/SA/c/NUI_FRAMEWORK.PT_AGSTARTPAGE_NUI.GBL?CONTEXTIDPARAMS=TEMPLATE_ID%3aPTPPNAVCOL&scname=MSU_AA_SCHEDULE_NEW0&PanelCollapsible=Y"
driver.get(url)
time.sleep(5) # Fixed pause so the page can finish rendering before it is parsed

body = driver.page_source
soup = BeautifulSoup(body, 'html.parser')
cells = soup.find_all("tr", class_="ps_grid-row psc_rowact")
semesters = dict()
for cell in cells: # Map each semester's label to the href stored on its link
    semester = cell.find("a", class_="ps-link")
    if semester is None: # Row without a semester link: nothing to record
        continue
    semesters[semester.text] = semester.get("href")

# Fail with the list of labels that were found instead of a bare KeyError
if "Spring Semester 2024" not in semesters:
    raise KeyError(f"'Spring Semester 2024' not found; available semesters: {list(semesters)}")
url = semesters["Spring Semester 2024"]
driver.execute_script(url) # The stored href is run as a script to open the semester
time.sleep(5)

element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_SUBJECT')
element.send_keys("CMSE") #pick cmse for example

url = "javascript:submitAction_win0(document.win0,'MSU_CLSRCH_WRK_SSR_PB_SEARCH');"
driver.execute_script(url) # Hit search
time.sleep(5)

In [None]:
def get_basic_info(soup):
    '''Collect the first three text fields of every class row in the search results.

    soup is the parsed first results page. The total result count is read from
    the 'MSU_RSLT_NAV_WK_PTPG_ROWS_GRID' span and turned into a page count at
    50 results per page. For every page after the first, the module-level
    `driver` is told to run the "next page" script and the new page source is
    parsed for more rows.

    Returns a DataFrame with the columns 'Course', 'Type' and 'Section'.
    '''
    # Leading token of the counter text is the total number of results
    counter = soup.find('span', id='MSU_RSLT_NAV_WK_PTPG_ROWS_GRID')
    total = int(counter.get_text(strip=True).split()[0])
    full_pages, leftover = divmod(total, 50)
    page_total = full_pages + (1 if leftover else 0)

    rows = soup.find_all("tr", class_="ps_grid-row psc_rowact") # Class rows on the current page
    for _ in range(page_total - 1): # One "next page" click per extra page
        driver.execute_script("javascript:submitAction_win0(document.win0,'MSU_RSLT_NAV_WK_SEARCH_CONDITION2$46$');")
        time.sleep(5)
        page = BeautifulSoup(driver.page_source, 'html.parser')
        rows += page.find_all("tr", class_="ps_grid-row psc_rowact")

    # Keep the non-empty lines of each row, minus the last three
    # (other columns will be scraped from SIS)
    records = [
        [piece for piece in row.text.split("\n") if piece != ""][:-3]
        for row in rows
    ]
    return pd.DataFrame(records, columns=['Course', 'Type', 'Section'])
        
# Parse the results page the browser is currently showing and build the
# Course / Type / Section table from it.
body = driver.page_source
soup = BeautifulSoup(body, 'html.parser')
df = get_basic_info(soup)  # starts from this page; get_basic_info itself steps through any further result pages

In [None]:
# Go to the student-system splash page and open its login form.
url = "https://student.msu.edu/splash.html"
driver.get(url)

element = driver.find_element(By.ID, 'loginUrl1') # Hit the login button
element.click() 
time.sleep(5) # Fixed pause while the sign-in form loads

In [None]:
# Signing in
element = driver.find_element(By.ID, 'input28') # Find the MSU email input
if not element.get_attribute('value'):
    print("Please enter your MSU email: ")
    email = input()
    element.send_keys(email)
    
element = driver.find_element(By.ID, 'input36') # Find the password input
print("Please enter your password: ")
password = getpass()
element.send_keys(password)

element = driver.find_element(By.CLASS_NAME, 'o-form-button-bar') # Click Sign in
element.click()
time.sleep(5)

In [None]:
# Finding the authentication method by phone, might not be in the right window
# If not in the right window, click 'Verify with something else'
# If there's an error, run the cell again
buttons = driver.find_elements(By.CLASS_NAME, 'authenticator-button')
for button in buttons:
    if button.get_attribute('data-se') == 'phone_number':
        button.click() # Click on authenticate by phone
        # Stop here: once the page reacts to the click, the remaining handles in
        # `buttons` may be stale and touching them can raise an error.
        break
time.sleep(5)

element = driver.find_element(By.TAG_NAME, 'input')
element.click() # Click on Receive code via SMS
time.sleep(5)

element = driver.find_element(By.TAG_NAME, "input") # Find the code input
print("Please enter your SMS code: ")
code = input()
element.send_keys(code)

element = driver.find_element(By.CLASS_NAME, 'o-form-button-bar') # Click Verify
element.click()
time.sleep(5)

In [None]:
# After login: open Classes -> Class Search & Enroll, pick the semester, search one subject.
try:
    element = driver.find_element(By.ID, "win0groupletPTNUI_LAND_REC_GROUPLET$1") # Click on Classes
    element.click()
    time.sleep(5)
except Exception: # In case the website wants to login again (Ctrl-C is no longer swallowed here)
    element = driver.find_element(By.ID, 'loginUrl1') # Click on Login
    element.click()
    time.sleep(5)

    element = driver.find_element(By.ID, "win0groupletPTNUI_LAND_REC_GROUPLET$1")  # Click on Classes
    element.click()
    time.sleep(5)

try:
    element = driver.find_element(By.ID, 'SCC_LO_FL_WRK_SCC_VIEW_BTN$2')  # Click on Class Search & Enroll
    element.click()
    time.sleep(5)
except Exception:
    pass # If there's no Class Search & Enroll, proceed

body = driver.page_source
soup = BeautifulSoup(body, 'html.parser')
cells = soup.find_all("tr", class_="ps_grid-row psc_rowact")
semesters = dict()
for cell in cells: # Map each semester's label to the href stored on its link
    semester = cell.find("a", class_="ps-link")
    if semester is None: # Row without a semester link: nothing to record
        continue
    semesters[semester.text] = semester.get("href")

# Fail with the list of labels that were found instead of a bare KeyError
if "Spring Semester 2024" not in semesters:
    raise KeyError(f"'Spring Semester 2024' not found; available semesters: {list(semesters)}")
url = semesters["Spring Semester 2024"]
driver.execute_script(url) # The stored href is run as a script to open the semester
time.sleep(5)

element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_SUBJECT')
element.send_keys("CMSE") #pick cmse for example

url = "javascript:submitAction_win9(document.win9,'MSU_CLSRCH_WRK_SSR_PB_SEARCH');"
driver.execute_script(url) # Hit search
time.sleep(5)

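Count the result pages, then define the function that gets each class's detailed info (schedule, dates, location, mode, and instructors)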

In [None]:
# Read the result counter from the page the browser is showing now.
# `pages` stays at module level because add_info() reads it to know how many
# result pages to walk through.
body = driver.page_source
soup = BeautifulSoup(body, 'html.parser')
result_element = soup.find('span', id='MSU_RSLT_NAV_WK_PTPG_ROWS_GRID')
# Extract the text content
result_text = result_element.get_text(strip=True)
result = int(result_text.split()[0]) # Leading token of the counter text is the number of results
pages = (result + 49) // 50 # get the number of aggregated pages

def get_advanced_info(soup):
    '''Extract per-meeting details from a parsed class-detail page.

    A class can have several meetings; each one contributes one entry to every
    returned list, so all six lists always have the same length (one entry per
    "E_HTMLAREA3" cell found). That equal length is what lets the columns be
    exploded together later.

    Returns (schedules, dates, locs, modes, names, emails):
      schedules / dates - text of the meeting's schedule and date areas, or
                          None when that area is not on the page
      locs / modes      - first and last text line of the location/instructor
                          area, or None when the area is missing
      names / emails    - instructors of the meeting joined with "\\n", or ''
                          when the meeting lists no instructor link
    If the page has no location/instructor cells at all, six Nones are returned.
    '''
    date_tds = soup.find_all("td", class_="ps_grid-cell E_HTMLAREA2") # Find class dates
    loc_ins_tds = soup.find_all("td", class_="ps_grid-cell E_HTMLAREA3") # Find class locations, instructors, and modes

    if not loc_ins_tds: # If these cannot be found, return all None
        return None, None, None, None, None, None

    schedules, dates, locs, modes, names, emails = [], [], [], [], [], []
    for i, loc_ins_td in enumerate(loc_ins_tds):
        # Look the schedule area up per meeting instead of assuming at most three
        schedule_div = soup.find("div", id=f"win9divMSU_CLS_DTL_WK2_HTMLAREA1$160$${i}")
        schedules.append(_htmlarea_text(schedule_div))
        dates.append(_htmlarea_text(date_tds[i] if i < len(date_tds) else None))

        loc_ins_div = loc_ins_td.find('div', class_="ps-htmlarea")
        if loc_ins_div is None:
            # Keep the lists aligned even when the cell has no content area
            locs.append(None)
            modes.append(None)
            names.append('')
            emails.append('')
            continue

        lines = loc_ins_div.get_text(strip=True, separator='\n').split('\n')
        locs.append(lines[0])
        modes.append(lines[-1])

        # Several instructors in one meeting are separated with '\n'.
        # A meeting without instructor links gets '' so names/emails stay the
        # same length as the other lists.
        anchors = loc_ins_div.find_all("a")
        emails.append('\n'.join(_mailto_address(anchor) for anchor in anchors).strip())
        names.append('\n'.join(anchor.text for anchor in anchors).strip())

    return schedules, dates, locs, modes, names, emails


def _htmlarea_text(container):
    '''Stripped text of the "ps-htmlarea" div inside container; None if either is missing.'''
    if container is None:
        return None
    area = container.find('div', class_="ps-htmlarea")
    return area.get_text(strip=True) if area is not None else None


def _mailto_address(anchor):
    '''Address part of a link href ("mailto:x@y" -> "x@y"); the raw href if it has no colon.'''
    href = anchor.get('href') or ''
    pieces = href.split(':')
    return pieces[1] if len(pieces) > 1 else href

In [None]:
def add_info(df):
    '''Add the details shown in each class's pop-up window to df.

    Walks every result page in the browser (module-level `driver`, page count
    from module-level `pages`), clicks each class row, parses the detail frame
    with get_advanced_info and closes it again. The collected values are stored
    in the new columns 'Schedule', 'Dates', 'Location', 'Mode', 'email' and
    'Instructor'; each cell holds the list (or None) returned for that class.

    df is modified in place and also returned. pandas raises ValueError if the
    number of rows visited in the browser differs from len(df).
    '''
    from selenium.common.exceptions import NoSuchElementException

    schedules = []
    dates = []
    locations = []
    modes = []
    emails = []
    names = []
    for page_index in range(pages): # Go through all pages
        # NOTE(review): assumes row ids restart at 0 on every page - confirm on a multi-page search
        for row_index in range(50): # Go through all cells in a page
            rowname = f"DESCR100$0_row_{row_index}"
            try:
                element = driver.find_element(By.ID, rowname)
            except NoSuchElementException: # No such row: this page has fewer than 50 classes
                break
            print(rowname)
            driver.execute_script("arguments[0].click();", element) # Click on cell
            time.sleep(3) # If too slow can change this line, if there's an error, increase the time
            driver.switch_to.frame(0) # Switch to Class Description frame
            body = driver.page_source
            soup = BeautifulSoup(body, 'html.parser')
            schedule, date, loc, mode, name, email = get_advanced_info(soup)
            schedules.append(schedule)
            dates.append(date)
            locations.append(loc)
            modes.append(mode)
            emails.append(email)
            names.append(name)
            cancel_cmd="javascript:doUpdateParent(document.win9,'#ICCancel');"
            driver.execute_script(cancel_cmd) # Close the frame
            driver.switch_to.default_content() # Switch to the main page
            time.sleep(2)

        # Compare against the PAGE counter (the row counter used to shadow it),
        # so Next is clicked between pages and never after the last one.
        if page_index < pages - 1:
            url = "javascript:submitAction_win9(document.win9,'MSU_RSLT_NAV_WK_SEARCH_CONDITION2$46$');"
            driver.execute_script(url)
            time.sleep(5)

    df['Schedule'] = schedules
    df['Dates'] = dates
    df['Location'] = locations
    df['Mode'] = modes
    df['email'] = emails
    df['Instructor'] = names
    return df

In [None]:
# Add the Schedule / Dates / Location / Mode / email / Instructor columns scraped
# from each class's detail window, then save the result without the row index.
df = add_info(df)
df.to_csv("Spring2024.csv", index=False)

In [None]:
def make_courses_df(df):
    '''Reshape the scraped class table into one row per class meeting.

    df must have the text columns 'Course', 'Type' and 'Section' plus the
    list-valued columns 'Schedule', 'Dates', 'Location', 'Mode', 'email' and
    'Instructor' (lists of equal length within a row). The input is not
    modified; a new DataFrame is returned with the columns

        Subject, Course Number, Course Name, Type, Units, Status, Section,
        Class Nbr, Academic Session, Days, Time, Dates, Location, Mode,
        email, Instructor

    Parts that a row does not contain (for example no "(N units)" in Type)
    come out as missing values instead of raising.
    '''
    def _split_into(frame, source, sep, targets):
        '''Split frame[source] on sep into exactly len(targets) text columns.'''
        parts = frame[source].str.split(sep, n=len(targets) - 1, expand=True)
        # Pad to the expected width: when no row contains the separator the
        # split yields fewer columns and a direct multi-column assignment
        # fails. Object dtype keeps the .str accessor usable on padded columns.
        parts = parts.reindex(columns=range(len(targets))).astype(object)
        for position, target in enumerate(targets):
            frame[target] = parts[position]

    # Explode all these columns, will result in duplicate rows for classes that have multiple schedules
    df = df.explode(['Schedule', 'Dates', 'Location', 'Mode', 'email', 'Instructor'])

    _split_into(df, 'Schedule', ':', ['Days', 'Time'])
    _split_into(df, 'Course', ':', ['Course Code', 'Course Name'])
    _split_into(df, 'Type', '(', ['Type', 'Units'])
    if df['Units'].isna().all():
        # No row carried a "(... units)" part: keep Units as empty text
        df['Units'] = ''
    _split_into(df, 'Section', '/', ['Section', 'Class Nbr', 'Academic Session'])
    _split_into(df, 'Units', ')', ['Units', 'Status'])
    _split_into(df, 'Course Code', ' ', ['Subject', 'Course Number'])

    # Strip the marker labels that end up glued to the scraped text.
    # Non-text entries (None / NaN from classes without details) pass through.
    df['Dates'] = df['Dates'].apply(
        lambda x: x.replace("Approval Required", '').strip() if isinstance(x, str) else x)
    df['Status'] = df['Status'].str.replace('Reserved Capacity', '', regex=False).str.strip()
    df['Course Name'] = df['Course Name'].str.replace('Cross-Listed', '', regex=False).str.strip()
    df['Course Name'] = df['Course Name'].str.replace('Approval Required', '', regex=False).str.strip()

    # Finish every column edit before selecting the final columns, so nothing
    # is assigned on a column-subset slice.
    df['Units'] = df['Units'].str.replace(' units', '', regex=False)
    df['Section'] = df['Section'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    df['Class Nbr'] = df['Class Nbr'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)

    return df[['Subject','Course Number','Course Name','Type','Units','Status',
               'Section','Class Nbr','Academic Session','Days','Time','Dates',
               'Location', 'Mode', 'email', 'Instructor']]

In [None]:
# Replace the raw scrape with the tidy one-row-per-meeting table and display it.
# Not re-runnable as is: the result no longer has the 'Course' and 'Schedule'
# columns that make_courses_df needs, so re-run add_info's cell first.
df = make_courses_df(df)
df

In [None]:
# Save the tidy table (this overwrites the raw checkpoint of the same name).
# index=False matches the earlier write: after the explode the index only
# repeats the original row numbers and would land in the file as an unnamed column.
df.to_csv('Spring2024.csv', index=False)