# Scrape by Instructors

This notebook takes in a data frame of instructors' names and filters classes based on them.

In [None]:
# Load Selenium and automatically install the Chrome Driver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import re

from selenium.webdriver.chrome.service import  Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from getpass import getpass
from bs4 import BeautifulSoup

In [None]:
# Firefox driver: start the browser session that every later cell drives
# through the module-level name `driver`.
driver = webdriver.Firefox()
time.sleep(5)  # fixed pause before the first page is requested

In [None]:
# Load the portal splash page, then follow its login button.
url = "https://student.msu.edu/splash.html"
driver.get(url)

driver.find_element(By.ID, 'loginUrl1').click()  # Hit the login button
time.sleep(5)  # fixed pause while the sign-in form loads

In [None]:
# Signing in
# The e-mail is only typed when the input is still empty, so an address that
# is already filled in is left untouched.
element = driver.find_element(By.ID, 'input28') # Find the MSU email input
if not element.get_attribute('value'):
    print("Please enter your MSU email: ")
    email = input()
    element.send_keys(email)

element = driver.find_element(By.ID, 'input36') # Find the password input
print("Please enter your password: ")
# Pass the secret straight to the form instead of binding it to a notebook
# global, so the plaintext password does not stay in the kernel namespace.
element.send_keys(getpass())

element = driver.find_element(By.CLASS_NAME, 'o-form-button-bar') # Click Sign in
element.click()
time.sleep(5)  # fixed pause while the next page loads

In [None]:
# Finding the authentication method by phone, might not be in the right window
# If not in the right window, click 'Verify with something else'
# If there's an error, run the cell again
time.sleep(5)
buttons = driver.find_elements(By.CLASS_NAME, 'authenticator-button')
for button in buttons:
    if button.get_attribute('data-se') == 'phone_number':
        button.click() # Click on authenticate by phone
        # Stop after the first match: the click may change the page, and
        # querying the remaining (possibly stale) button handles could raise.
        break
time.sleep(5)

# The first <input> on the page is used both for requesting the code and,
# after the page updates, for typing it in.
element = driver.find_element(By.TAG_NAME, 'input')
element.click() # Click on Receive code via SMS
time.sleep(5)

element = driver.find_element(By.TAG_NAME, "input") # Find the code input
print("Please enter your SMS code: ")
code = input()
element.send_keys(code)

element = driver.find_element(By.CLASS_NAME, 'o-form-button-bar') # Click Verify
element.click()
time.sleep(5)

In [None]:
# Only Selenium failures are treated as "the page is not in the expected
# state"; a bare `except:` would also hide KeyboardInterrupt and plain bugs.
from selenium.common.exceptions import WebDriverException

try:
    element = driver.find_element(By.ID, "win0groupletPTNUI_LAND_REC_GROUPLET$1") # Click on Classes
    element.click()
    time.sleep(5)
except WebDriverException: # In case the website wants to login again
    element = driver.find_element(By.ID, 'loginUrl1') # Click on Login
    element.click()
    time.sleep(5)

    element = driver.find_element(By.ID, "win0groupletPTNUI_LAND_REC_GROUPLET$1")  # Click on Classes
    element.click()
    time.sleep(5)

try:
    element = driver.find_element(By.ID, 'SCC_LO_FL_WRK_SCC_VIEW_BTN$2')  # Click on Class Search & Enroll
    element.click()
    time.sleep(5)
except WebDriverException:
    pass # If there's no Class Search & Enroll, proceed

# Build `semesters_df`: one row per semester shown on the current page,
# indexed by the link text, with the link's href in the 'href' column.
body = driver.page_source
soup = BeautifulSoup(body, 'html.parser')
cells = soup.find_all("tr", class_="ps_grid-row psc_rowact")
semesters = dict()
for cell in cells: # Find each semester's ID
    semester = cell.find("a", class_="ps-link")
    # Skip rows without a usable link instead of failing on None.
    if semester is None or semester.get("href") is None:
        continue
    semesters[semester.text] = semester.get("href")

# Links containing "CSTRMPRV" are the previous semesters; everything else is
# treated as a current (following) semester.
previous_semesters = {
    name: link for name, link in semesters.items() if "CSTRMPRV" in link
}
current_semesters = {
    name: link for name, link in semesters.items() if "CSTRMPRV" not in link
}

# Previous semesters are reversed before being placed ahead of the current
# ones (presumably the page lists them newest-first - verify on the site),
# so the combined mapping runs in chronological order.
previous_semesters = dict(reversed(list(previous_semesters.items())))
semesters = {**previous_semesters, **current_semesters}
semesters_df = pd.DataFrame.from_dict(semesters, orient='index', columns=['href'])

In [None]:
# Instructor names to look up, read from the CSV next to the notebook.
df = pd.read_csv("CMSE_Names.csv")

In [None]:
def search_single_semester_by_instructor(instructor, semester):
    """Collect the classes listed for one instructor in one semester.

    Drives the module-level ``driver`` through the class-search page: opens
    the semester, starts a new search, types ``instructor`` into the
    instructor filter, ticks the first matching row, runs the search, then
    unticks every result-filter row whose name differs from the instructor
    and reads the class names from the filtered results.

    Args:
        instructor: Instructor name as typed into the filter field. It is
            compared against the page text with its spaces removed.
        semester: Index label in the module-level ``semesters_df`` whose
            'href' script opens that semester.

    Returns:
        A set of class-name strings (at most 50 result rows are read), or
        ``None`` when no matching instructor row is found in the filter.
    """
    # Local import: keeps the exception types next to the handlers that use
    # them without touching the notebook's import cell.
    from selenium.common.exceptions import (
        NoSuchElementException,
        WebDriverException,
    )

    def js_click(target):
        # Every click in this function is dispatched through JavaScript.
        driver.execute_script("arguments[0].click();", target)

    href = semesters_df.loc[semester, 'href']
    driver.execute_script(href)
    time.sleep(5)

    js_click(driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_CLEAR_FLDS_PB'))  # Click New Search
    time.sleep(3)

    js_click(driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_GROUP3'))  # Click the Instructors filter
    time.sleep(1)
    try:
        # Best effort: the Load Instructors button is optional.
        js_click(driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_MSU_LOAD_INSTRUCTS'))
        time.sleep(5)
    except WebDriverException:
        pass

    element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_LAST_NAME')  # Find the input field
    element.send_keys(instructor)
    time.sleep(1)

    try:
        element = driver.find_element(By.ID, "win9divMSU_CLSRCH_WRK2_GROUPBOX15")
        # First table row without a style attribute (presumably the first
        # visible match - verify against the page markup).
        first = element.find_element(By.CSS_SELECTOR, "tr:not([style])")
    except NoSuchElementException:
        return None

    js_click(first.find_element(By.CLASS_NAME, "ps-checkbox"))  # Click the first check box

    js_click(driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_SEARCH_BTN'))  # Hit Search
    time.sleep(10)

    # Walk the instructor rows of the result filter by index until a row id
    # no longer exists, unticking everyone except the requested instructor.
    wanted_name = instructor.replace(' ', '')
    i = 0
    while True:
        try:
            element = driver.find_element(By.ID, f"MSU_CLSRCH_I_DV$0_row_{i}")
        except NoSuchElementException:  # No more rows
            break
        name = element.find_element(By.ID, f"MSU_CLSRCH_WRK2_INSTR_NAME${i}").text
        if name != wanted_name:
            checkbox = element.find_element(By.ID, f"MSU_CLSRCH_I_DV$selm${i}$$0")
            js_click(checkbox)
        i += 1

    js_click(driver.find_element(By.ID, "MSU_CLSRCH_WRK2_APPLY_PB$186$"))  # Hit Filter Results
    time.sleep(5)

    classes = set()
    for i in range(50):
        try:  # Find the class that the instructor taught
            element = driver.find_element(By.ID, f"DESCR100$0_row_{i}")
        except NoSuchElementException:
            break
        class_name = element.find_element(By.XPATH, f".//div[@id='win9divMSU_RSLT_NAV_WK_HTMLAREA${i}']").text
        classes.add(class_name)  # Add the class name to the set

    return classes

# df["Instructor's Classes"] = df["CMSE Faculty Name"].apply(
#     lambda name: search_single_semester_by_instructor(name, "Spring 2024"))
# df

In [None]:
def search_multiple_semesters_by_instructor(instructor, start_semester=None, end_semester=None):
    """Run the single-semester search over a range of semesters.

    Args:
        instructor: Instructor name passed through to
            ``search_single_semester_by_instructor``.
        start_semester: First index label of ``semesters_df`` to include;
            defaults to the first row when falsy.
        end_semester: Last index label of ``semesters_df`` to include
            (inclusive label slice); defaults to the last row when falsy.

    Returns:
        A dict mapping each semester label in the range to whatever
        ``search_single_semester_by_instructor`` returned for it.
    """
    # Local import: only Selenium failures should trigger the fallback path.
    from selenium.common.exceptions import WebDriverException

    if not start_semester:
        start_semester = semesters_df.index[0]
    if not end_semester:
        end_semester = semesters_df.index[-1]
    subdf = semesters_df.loc[start_semester:end_semester]

    change_semester = "javascript:submitAction_win9(document.win9,'DERIVED_SSR_FL_SSR_CHANGE_BTN');"
    prof_classes = dict()
    for index, href in subdf['href'].items():
        try:
            # Trigger the change-semester action, then run the semester's
            # link script inside the first frame.
            driver.execute_script(change_semester)
            time.sleep(2)
            driver.switch_to.frame(0)
            driver.execute_script(href)
            time.sleep(2)
            driver.switch_to.default_content()
        except WebDriverException:
            # The failure may have happened after entering the frame, so
            # return to the top-level document before the fallback and the
            # search that follows.
            driver.switch_to.default_content()
            driver.execute_script(href)
            time.sleep(2)

        prof_classes[index] = search_single_semester_by_instructor(instructor, index)
    return prof_classes

# Example run: one instructor, from the given start semester through the last
# semester in `semesters_df`.
search_multiple_semesters_by_instructor(
    "Wang, Jianrong",
    start_semester="Spring Semester 2020",
)

In [None]:
# Write without the row index: this file is read back with pd.read_csv above,
# and saving the index would add a new unnamed column on every round trip.
df.to_csv('CMSE_Names.csv', index=False)