# Dynamic Web scraper

This notebook scrapes the MSU courses website. 

In [None]:
# Load Selenium and automatically install the Chrome Driver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import re

from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from getpass import getpass
from bs4 import BeautifulSoup

In [None]:
# Chrome Driver: start a Chrome session with default options.
# `driver` is the module-level browser handle used by every later cell.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)

In [None]:
# FireFox Driver
# driver = webdriver.Firefox()
# time.sleep(5)

In [None]:
# Load the public class-search start page and pause so the page can finish rendering.
url = "https://student.msu.edu/psc/public/EMPLOYEE/SA/c/NUI_FRAMEWORK.PT_AGSTARTPAGE_NUI.GBL?CONTEXTIDPARAMS=TEMPLATE_ID%3aPTPPNAVCOL&scname=MSU_AA_SCHEDULE_NEW0&PanelCollapsible=Y"
driver.get(url)
time.sleep(5)

In [None]:
# Parse the rendered page and map each semester label to the href of its link.
body = driver.page_source
soup = BeautifulSoup(body, 'html.parser')
cells = soup.find_all("tr", class_="ps_grid-row psc_rowact")  # Table rows on the page (semesters)
semesters = {}

for cell in cells:
    semester = cell.find("a", class_="ps-link")
    if semester is None:
        # find() returns None when a row has no matching link; skip the row
        # instead of failing with AttributeError on `.text`.
        continue
    semesters[semester.text] = semester.get("href")

In [None]:
# Run the link target stored for the Fall 2024 semester, then wait for the page to load.
url = semesters["Fall Semester 2024"]
driver.execute_script(url)
time.sleep(5)

In [None]:
# Type the subject code into the search form's subject field.
element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_SUBJECT')
element.send_keys("CMSE")  # CMSE is used as the example subject

In [None]:
# Submit the search form through the page's own JavaScript action, then wait for the results.
url = "javascript:submitAction_win0(document.win0,'MSU_CLSRCH_WRK_SSR_PB_SEARCH');"
driver.execute_script(url)
time.sleep(5)

Function to get basic class information from the search results

In [None]:
# Parse the search-results page that is currently loaded in the browser.
body = driver.page_source
soup = BeautifulSoup(body, 'html.parser')

# Locate the element holding the result count.
result_element = soup.find('span', id='MSU_RSLT_NAV_WK_PTPG_ROWS_GRID')
# The count is read as the first whitespace-separated token of the element's text.
result_text = result_element.get_text(strip=True)
result = int(result_text.split()[0])
# Ceiling division by 50 (assumes 50 rows per results page — TODO confirm).
# `pages` is read as a global by get_class_info.
pages = (result + 49) // 50

def get_class_info(soup):
    '''Build a DataFrame of class listings from the search-results page(s).

    Collects the result rows from `soup`; when the module-level `pages` is
    greater than 1, it also runs the next-page action `pages - 1` times through
    the module-level `driver` and appends the rows of every page it lands on.
    The row text is then split into columns.

    Parameters:
        soup: parsed HTML of the results page currently shown in the browser.

    Returns:
        DataFrame with the columns Subject, Course Number, Course Name, Type,
        Units, Status, Section, Class Nbr, Academic Session, Days, Time, Dates.

    Side effects:
        Navigates the browser to the following results pages, sleeping 5
        seconds after each navigation.
    '''

    reshaped_list = []
    cells = soup.find_all("tr", class_="ps_grid-row psc_rowact") # Result rows on the page passed in (classes)
    if pages > 1:
        for i in range(pages - 1):
            # Run the page's next-page action, wait, then re-parse the new page source
            url = f"javascript:submitAction_win0(document.win0,'MSU_RSLT_NAV_WK_SEARCH_CONDITION2$46$');"
            driver.execute_script(url)
            time.sleep(5)
            body = driver.page_source
            next_soup = BeautifulSoup(body, 'html.parser')
            cells += next_soup.find_all("tr", class_="ps_grid-row psc_rowact")

    for cell in cells:
        values = cell.text.split("\n") # One entry per text line of the row
        values = list(filter(lambda x: x != "", values)) # Drop empty lines
        values.pop() # Discard the last remaining entry (presumably not a data field — TODO confirm)
        brs = cell.find_all("br") # Rows containing <br> are treated as classes with two meeting times
        if brs: # Rebuild the last two fields as "first line\nsecond line"
            # NOTE(review): indexes brs[1], so at least two <br> tags are assumed whenever any is present
            values[-2] = brs[0].previous_sibling.text.strip() + "\n" + brs[0].next_sibling.text.strip()
            values[-1] = brs[1].previous_sibling.text.strip() + "\n" + brs[1].next_sibling.text.strip()
        reshaped_list.append(values)

    # Create a DataFrame from the reshaped list
    # NOTE(review): assumes each row yields the five fields named below — confirm
    col_names = ['Course', 'Type', 'Section', 'Schedule', 'Dates']
    df = pd.DataFrame(reshaped_list, columns=col_names)

    # Two-line schedules that mention "To Be Announced" keep only their first line
    # (both in Schedule and in Dates)
    for index, row in df.iterrows():
        if "\n" in row['Schedule']:
            if "To Be Announced" in row['Schedule']:
                df.loc[index, 'Schedule'] = row['Schedule'].partition("\n")[0]
                df.loc[index, 'Dates'] = row['Dates'].partition("\n")[0]

    # Split Schedule into Days / Time at the first ':'
    df[['Days', 'Time']] = df['Schedule'].str.split(':',n=1,expand=True)
    # Rows that still hold two schedule lines: split each line at ' : ' and
    # join the two Days parts and the two Time parts with a newline
    subdf = df[df['Schedule'].apply(lambda x: "\n" in x)]
    for index, row in subdf.iterrows():
        first_sc = row['Schedule'].partition('\n')[0]
        second_sc = row['Schedule'].partition('\n')[-1]
        df.loc[index, 'Days'] = first_sc.partition(' : ')[0] + "\n" + second_sc.partition(' : ')[0]
        df.loc[index, 'Time'] = first_sc.partition(' : ')[-1] + "\n" + second_sc.partition(' : ')[-1]

    # Course -> Course Code / Course Name at the first ':'
    df[['Course Code', 'Course Name']] = df['Course'].str.split(':', n=1, expand=True)
    # Type -> Type / Units at the first '('
    split_result = df['Type'].str.split('(', n=1, expand=True)
    # Check if the split operation resulted in two columns
    if len(split_result.columns) == 2:
        df[['Type', 'Units']] = split_result
    else:
        # Handle the case where the split didn't result in two columns
        df['Type'] = split_result[0]  # Assign the first part to 'Type'
        df['Units'] = ''
    # Section -> Section / Class Nbr / Academic Session, split on '/'
    df[['Section', 'Class Nbr', 'Academic Session']] = df['Section'].str.split('/', n=2, expand=True)
    # Units -> Units / Status at the first ')'
    df[['Units','Status']] = df['Units'].str.split(')',n=1,expand=True)
    # Course Code -> Subject / Course Number at the first space
    df[['Subject','Course Number']] = df['Course Code'].str.split(' ',n=1,expand=True)
    # Remove the marker phrases that end up inside the text fields
    df['Dates'] = df['Dates'].apply(lambda x: x.replace("Approval Required", '').strip() if x else x)
    df['Status'] = df['Status'].str.replace('Reserved Capacity', '').str.strip()

    df['Course Name'] = df['Course Name'].str.replace('Cross-Listed', '').str.strip()
    df['Course Name'] = df['Course Name'].str.replace('Approval Required', '').str.strip()

    # Drop the raw columns and put the remaining ones in their final order
    df = df.drop(['Course', 'Schedule','Course Code'], axis=1)
    df = df[['Subject','Course Number','Course Name','Type','Units','Status',
             'Section','Class Nbr','Academic Session','Days','Time','Dates']]
    df['Units'] = df['Units'].str.replace(' units', '')
    # Keep only the first number (optionally with a decimal part) found in each value
    df['Section'] = df['Section'].str.extract(r'(\d+(?:\.\d+)?)')
    df['Class Nbr'] = df['Class Nbr'].str.extract(r'(\d+(?:\.\d+)?)')
    return df

df = get_class_info(soup)  # starts from the first page; get_class_info also steps through any further result pages

In [None]:
# Keep only the rows that have a course name, then display the table.
df = df.loc[df['Course Name'].notna()]
df

In [None]:
# Save the basic class table to CSV without the index column.
df.to_csv("Fall2024.csv", index=False)

In [None]:
# Reload the table written to Fall2024.csv by the earlier save step.
df = pd.read_csv("Fall2024.csv")

# Open the student portal splash page.
url = "https://student.msu.edu/splash.html"
driver.get(url)

# Click the element with id 'loginUrl1' (presumably the login link) and wait for the next page.
element = driver.find_element(By.ID, 'loginUrl1')
element.click()
time.sleep(5)

In [None]:
# Signing in
# Email field: prompt for the address only when the field has no value yet.
element = driver.find_element(By.ID, 'input28')
if not element.get_attribute('value'):
    print("Please enter your MSU email: ")
    email = input()
    element.send_keys(email)

# Password field: getpass() reads the password without echoing it.
element = driver.find_element(By.ID, 'input36')
print("Please enter your password: ")
password = getpass()
element.send_keys(password)

# Click the form's button bar (presumably submits the form) and wait for the next step.
element = driver.find_element(By.CLASS_NAME, 'o-form-button-bar')
element.click()
time.sleep(5)

In [None]:
# Pick the phone authentication method. The browser might not be on the right
# screen; if so, click 'Verify with something else' in the browser first.
buttons = driver.find_elements(By.CLASS_NAME, 'authenticator-button')
for button in buttons:
    if button.get_attribute('data-se') == 'phone_number':
        button.click()
        # Stop here: once the click changes the page, the remaining buttons
        # may be stale and raise on the next get_attribute call.
        break
time.sleep(5)

# Click the first <input> on the following screen, then wait.
element = driver.find_element(By.TAG_NAME, 'input')
element.click()
time.sleep(5)

In [None]:
# Enter SMS code: read the code from the user and type it into the first <input> on the page.
element = driver.find_element(By.TAG_NAME, "input")
print("Please enter your SMS code: ")
code = input()
element.send_keys(code)

# Click the form's button bar (presumably submits the code) and wait for the next page.
element = driver.find_element(By.CLASS_NAME, 'o-form-button-bar')
element.click()
time.sleep(5)

In [None]:
from selenium.common.exceptions import WebDriverException

# Click the landing-page element; if Selenium cannot find or click it, go
# through the login link once and try again.
try:
    element = driver.find_element(By.ID, "win0groupletPTNUI_LAND_REC_GROUPLET$1")
    element.click()
except WebDriverException:  # In case the website wants to log in again
    # Only Selenium errors are handled here, so Ctrl-C and unrelated bugs are
    # no longer swallowed the way a bare `except:` would.
    element = driver.find_element(By.ID, 'loginUrl1')
    element.click()
    time.sleep(5)

    element = driver.find_element(By.ID, "win0groupletPTNUI_LAND_REC_GROUPLET$1")
    element.click()
time.sleep(5)

# Click the view button that leads to the semester list (presumably the
# class-search entry — confirm against the page).
element = driver.find_element(By.ID, 'SCC_LO_FL_WRK_SCC_VIEW_BTN$2')
element.click()
time.sleep(5)

# Map each semester label to the href of its link.
body = driver.page_source
soup = BeautifulSoup(body, 'html.parser')
cells = soup.find_all("tr", class_="ps_grid-row psc_rowact")
semesters = {}

for cell in cells:
    semester = cell.find("a", class_="ps-link")
    if semester is None:
        # find() returns None when a row has no matching link; skip the row
        # instead of failing with AttributeError on `.text`.
        continue
    semesters[semester.text] = semester.get("href")

# Open the Fall 2024 semester by running its link target.
url = semesters["Fall Semester 2024"]
driver.execute_script(url)
time.sleep(5)

# Type the subject code into the search form's subject field.
element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_SUBJECT')
element.send_keys("CMSE")  # CMSE is used as the example subject

# Submit the search; this cell addresses the form as win9.
url = "javascript:submitAction_win9(document.win9,'MSU_CLSRCH_WRK_SSR_PB_SEARCH');"
driver.execute_script(url)
time.sleep(5)

In [None]:
# Parse the search-results page that is currently loaded in the browser.
body = driver.page_source
soup = BeautifulSoup(body, 'html.parser')

# Locate the element holding the result count.
result_element = soup.find('span', id='MSU_RSLT_NAV_WK_PTPG_ROWS_GRID')
# The count is read as the first whitespace-separated token of the element's text.
result_text = result_element.get_text(strip=True)
result = int(result_text.split()[0])
# Ceiling division by 50 (assumes 50 rows per results page — TODO confirm).
# `pages` is read as a global by add_info.
pages = (result + 49) // 50

def get_advanced_info(soup):
    '''Extract location and instructor contact details from a class detail page.

    The non-empty texts of all `div.ps-htmlarea` elements are collected; the
    sixth one is cut at 'Instructor:' and the part before it is the location.
    The first anchor whose href contains "mailto:" supplies the e-mail address
    and the instructor name.

    Returns a `(location, email, name)` tuple. All three are None when fewer
    than six non-empty texts exist; email and name are None when no mailto
    link is found.
    '''
    texts = [div.get_text(strip=True)
             for div in soup.find_all("div", class_="ps-htmlarea")]
    texts = [text for text in texts if text != ""]
    if len(texts) <= 5:
        return None, None, None

    loc = texts[5].split('Instructor:')[0]

    for anchor in soup.find_all('a'):
        # Anchors without an href cannot carry an e-mail address
        if 'href' not in anchor.attrs:
            continue
        href = anchor['href']
        if 'mailto:' not in href:
            continue
        # First mailto link wins: the address is whatever follows the first ':'
        return loc, href.split(':')[1], anchor.text

    return loc, None, None

In [None]:
def add_info(df):
    '''Add location and instructor details from each course's detail window.

    For every results page (module-level `pages`), each row element with id
    `DESCR100$0_row_<n>` is clicked, the detail frame is parsed with
    get_advanced_info, and the window is cancelled again. The collected values
    are stored in the new columns 'Location', 'email' and 'Instructor'.

    Parameters:
        df: DataFrame of classes; it must have exactly one row per visited
            result row, otherwise the column assignment raises.

    Returns:
        The same DataFrame with the three columns added.

    Relies on the module-level `driver` being on the first results page.
    '''
    from selenium.common.exceptions import NoSuchElementException

    rows_per_page = 50
    location = []
    emails = []
    names = []
    # The page and row indices need distinct names: the last-page test below
    # must compare against the page index, not against the last row index.
    for page in range(pages):
        for row in range(rows_per_page):
            rowname = f"DESCR100$0_row_{row}"
            try:
                element = driver.find_element(By.ID, rowname)
            except NoSuchElementException:
                # No such row: this page holds fewer than rows_per_page rows
                break
            print(rowname)
            # Click through JavaScript rather than element.click()
            driver.execute_script("arguments[0].click();", element)
            time.sleep(5) # If too slow can change this line
            # The detail window is read from the first frame
            driver.switch_to.frame(0)
            body = driver.page_source
            soup = BeautifulSoup(body, 'html.parser')
            loc, email, name = get_advanced_info(soup)
            location.append(loc)
            emails.append(email)
            names.append(name)
            # Cancel the detail window, then return to the main document
            cancel_cmd = "javascript:doUpdateParent(document.win9,'#ICCancel');"
            driver.execute_script(cancel_cmd)
            driver.switch_to.default_content()
            time.sleep(5) # If too slow can change this line

        # Move to the next results page unless this was the last one
        if page < pages - 1:
            url = "javascript:submitAction_win9(document.win9,'MSU_RSLT_NAV_WK_SEARCH_CONDITION2$46$');"
            driver.execute_script(url)
            time.sleep(5)

    df['Location'] = location
    df['email'] = emails
    df['Instructor'] = names
    return df

In [None]:
# Add the Location, email and Instructor columns by visiting each class's detail window.
df = add_info(df)

In [None]:
# Display the table.
df

In [None]:
# Write the table back to Fall2024.csv (same file as the earlier save), without the index column.
df.to_csv("Fall2024.csv", index=False)