# Scrape by Instructors

This notebook takes a data frame of instructors' names and, for each instructor, filters the class search to find the classes they taught.

In [None]:
# Load Selenium and automatically install the Chrome Driver
import re
import time
from getpass import getpass

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

from Login import login_to_SIS

In [None]:
# Firefox driver: start a Firefox session controlled by Selenium. The resulting
# module-level `driver` is the browser handle every later cell works through.
driver = webdriver.Firefox()
time.sleep(5)  # fixed pause before logging in (presumably to let the browser finish starting)

# Hand the browser to the project's login helper (imported from Login).
login_to_SIS(driver)

In [None]:
# Navigate from the landing page to the class-search page.
#
# Only Selenium errors are treated as "that element is not there"; a bare
# `except:` would also swallow KeyboardInterrupt, so interrupting the cell
# would trigger the re-login path instead of stopping.

def _click_by_id(element_id, pause=2):
    """Click the element with the given id, then sleep `pause` seconds."""
    driver.find_element(By.ID, element_id).click()
    time.sleep(pause)


try:
    _click_by_id("win0groupletPTNUI_LAND_REC_GROUPLET$1")  # Click on Classes
except WebDriverException:  # In case the website wants us to log in again
    _click_by_id('loginUrl1')  # Click on Login
    _click_by_id("win0groupletPTNUI_LAND_REC_GROUPLET$1")  # Click on Classes

try:
    _click_by_id('SCC_LO_FL_WRK_SCC_VIEW_BTN$2')  # Click on Class Search & Enroll
except WebDriverException:
    pass  # If there's no Class Search & Enroll, proceed

In [None]:
# Parse the page currently shown by the driver. Each matching grid row holds a
# link whose text is the semester name and whose href is the script that
# search_by_instructor later executes to open that semester.
term_rows = BeautifulSoup(driver.page_source, 'html.parser').find_all(
    "tr", class_="ps_grid-row psc_rowact")
term_links = [row.find("a", class_="ps-link") for row in term_rows]
semesters_dict = {link.text: link.get("href") for link in term_links}

# An href containing "CSTRMPRV" marks a previous semester; everything else is
# a current (following) semester.
previous_semesters = [
    term for term, href in semesters_dict.items() if "CSTRMPRV" in href
]
current_semesters = [
    term for term, href in semesters_dict.items() if "CSTRMPRV" not in href
]

# Previous semesters are flipped before being joined with the current ones
# (presumably the page lists them newest-first; verify against the site).
previous_semesters = previous_semesters[::-1]
semesters_list = previous_semesters + current_semesters

In [None]:
# Load the instructor roster. search_multiple_semesters reads its
# "CMSE Faculty Name" column and adds one result column per semester.
df = pd.read_csv('CMSE_Names.csv')

In [None]:
def search_by_instructor(instructor, semester):
    """Return the classes one instructor is listed for in one semester.

    Drives the module-level Selenium ``driver`` through the class-search
    page: switches to ``semester`` if a different term is showing, opens the
    instructor filter, ticks the first instructor row that is not hidden
    after typing ``instructor``, runs the search, unticks every other
    instructor in the results filter, collects the listed class names, and
    finally clicks "New Search" so the page is ready for the next call.

    Args:
        instructor: Text typed into the instructor filter box. Result rows
            are compared against this value with its spaces removed.
        semester: A key of ``semesters_dict``; also used as the link text
            when picking the term in the change-term dialog.

    Returns:
        A set of class-name strings, or ``None`` when no instructor row is
        found for the typed name or the search reports no results.
    """
    wait = WebDriverWait(driver, 20)

    # --- 1. Make sure the requested semester is the one on screen -----------
    try:
        cur_page = driver.find_element(By.ID, "TERM_VAL_TBL_DESCR").text
    except WebDriverException:
        cur_page = None

    if not cur_page:
        # No term shown yet: run the semester link's script directly.
        driver.execute_script(semesters_dict[semester])
        time.sleep(2)
    elif cur_page != semester.replace('Semester ', ''):
        change_semester = "javascript:submitAction_win9(document.win9,'DERIVED_SSR_FL_SSR_CHANGE_BTN');"
        driver.execute_script(change_semester)
        # The term picker opens inside the first frame.
        wait.until(EC.frame_to_be_available_and_switch_to_it(0))
        time.sleep(1)
        if semester in previous_semesters:
            element = driver.find_element(By.ID, "DERIVED_SSR_FL_SSR_CSTRMPRV_GRP")
            element.click()
            time.sleep(1)
        element = driver.find_element(By.LINK_TEXT, semester)
        driver.execute_script("arguments[0].click();", element)
        time.sleep(1)
        driver.switch_to.default_content()

    # --- 2. Reach the instructor name box, opening the filter if needed -----
    try:
        name_box = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_LAST_NAME')
    except NoSuchElementException:
        name_box = None

    if name_box is None:
        element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_GROUP3')  # Click the Instructors filter
        driver.execute_script("arguments[0].click();", element)
        time.sleep(1)

        try:
            element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_MSU_LOAD_INSTRUCTS')  # Click Load Instructors
            driver.execute_script("arguments[0].click();", element)
            wait.until(EC.visibility_of_element_located((By.ID, 'MSU_CLSRCH_WRK2_LAST_NAME')))
        except WebDriverException:
            pass  # Some older semesters automatically click Load Instructors for us

        name_box = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_LAST_NAME')

    name_box.clear()
    name_box.send_keys(instructor)
    time.sleep(1)

    # --- 3. Tick the first instructor row that is not hidden ----------------
    try:
        element = driver.find_element(By.ID, "win9divMSU_CLSRCH_WRK2_GROUPBOX15")
        first = element.find_element(By.CSS_SELECTOR, "tr:not([style='display: none;'])")
    except NoSuchElementException:
        return None

    driver.execute_script("arguments[0].click();", first.find_element(By.CLASS_NAME, "ps-checkbox"))

    # --- 4. Run the search ---------------------------------------------------
    element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_SEARCH_BTN')  # Hit Search
    driver.execute_script("arguments[0].click();", element)
    # Wait for EITHER the first result checkbox OR the error dialog's OK
    # button. Waiting for the checkbox alone times out (and raises) whenever
    # the search has no results, so the handling below was never reached.
    wait.until(EC.any_of(
        EC.visibility_of_element_located((By.ID, "MSU_CLSRCH_I_DV$selm$0$$0")),
        EC.visibility_of_element_located((By.ID, '#ICOK')),
    ))

    try:  # For some cases, the search returns no results and will throw up an error window
        element = driver.find_element(By.ID, '#ICOK')
        element.click()  # Click OK to get out of the error
        time.sleep(1)
        return None
    except WebDriverException:
        pass

    # --- 5. Untick every instructor in the results filter except ours -------
    target_name = instructor.replace(' ', '')
    row_count = 0
    while True:
        try:  # Find a filter row based on index
            row = driver.find_element(By.ID, f"MSU_CLSRCH_I_DV$0_row_{row_count}")
        except NoSuchElementException:  # No more rows
            break
        name = row.find_element(By.ID, f"MSU_CLSRCH_WRK2_INSTR_NAME${row_count}").text
        if name != target_name:
            # Not our instructor: click to unselect the checkbox.
            checkbox = row.find_element(By.ID, f"MSU_CLSRCH_I_DV$selm${row_count}$$0")
            driver.execute_script("arguments[0].click();", checkbox)
        row_count += 1

    if row_count > 1:
        # Re-filter only when more than one instructor row was listed, then
        # wait for the results text to change.
        result_i = driver.find_element(By.ID, 'MSU_RSLT_NAV_WK_PTPG_ROWS_GRID').text
        element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_APPLY_PB$186$')  # Click Filter Results
        driver.execute_script("arguments[0].click();", element)
        wait.until(lambda drv: drv.find_element(By.ID, 'MSU_RSLT_NAV_WK_PTPG_ROWS_GRID').text != result_i)

    # --- 6. Collect the class names from the result rows --------------------
    # NOTE(review): at most 50 rows are inspected; presumably that is the
    # page size of the results grid -- confirm.
    classes = set()
    for i in range(50):
        try:  # Find the class that the instructor taught
            element = driver.find_element(By.ID, f"DESCR100$0_row_{i}")
        except NoSuchElementException:
            break
        class_name = element.find_element(By.XPATH, f".//div[@id='win9divMSU_RSLT_NAV_WK_HTMLAREA${i}']").text
        classes.add(class_name)

    # --- 7. Reset the form for the next search -------------------------------
    element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_CLEAR_FLDS_PB')  # Click New Search
    driver.execute_script("arguments[0].click();", element)
    wait.until(EC.invisibility_of_element_located((By.ID, "MSU_CLSRCH_I_DV$selm$0$$0")))

    return classes

In [None]:
def search_multiple_semesters(df, start_semester=None, end_semester=None):
    """Run the instructor search over a contiguous range of semesters.

    For every semester from ``start_semester`` to ``end_semester``
    (inclusive, in ``semesters_list`` order) a column named after the
    semester is added to ``df``; each cell holds what
    ``search_by_instructor`` returned for that row's "CMSE Faculty Name".

    Args:
        df: DataFrame with a "CMSE Faculty Name" column; modified in place.
        start_semester: First semester to search; a falsy value means the
            first entry of ``semesters_list``.
        end_semester: Last semester to search; a falsy value means the last
            entry of ``semesters_list``.

    Returns:
        The same ``df`` object, with the new columns added.

    Raises:
        ValueError: If either semester is not in ``semesters_list``.
    """
    first_term = start_semester or semesters_list[0]
    last_term = end_semester or semesters_list[-1]
    lo = semesters_list.index(first_term)
    hi = semesters_list.index(last_term)

    for semester in semesters_list[lo:hi + 1]:
        def classes_for(name, term=semester):
            # `term` is bound per iteration so each column uses its own semester.
            return search_by_instructor(name, term)

        df[f"{semester}"] = df["CMSE Faculty Name"].apply(classes_for)
    return df

# Search every semester from Fall 2021 through the last entry of
# semesters_list (end_semester is left at its default). The result columns
# are added to df in place.
search_multiple_semesters(df, start_semester="Fall Semester 2021")

In [None]:
# driver.execute_script(semesters_dict['Spring Semester 2024'])
# time.sleep(1)

# Scratch cell: runs the same instructor-search steps as search_by_instructor
# for one hard-coded name against whichever semester page is currently open,
# stopping once the results have been filtered down to that instructor.
wait = WebDriverWait(driver, 15)

# `name_box` rather than `input`: rebinding `input` at notebook level would
# shadow the builtin for every later cell.
try:
    name_box = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_LAST_NAME')
except NoSuchElementException:
    name_box = None

if name_box is None:
    element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_GROUP3')  # Click the Instructors filter
    driver.execute_script("arguments[0].click();", element)
    time.sleep(1)

    try:
        element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_MSU_LOAD_INSTRUCTS')  # Click Load Instructors
        driver.execute_script("arguments[0].click();", element)
        # Wait for the name box itself instead of sleeping a fixed 5 seconds.
        wait.until(EC.visibility_of_element_located((By.ID, 'MSU_CLSRCH_WRK2_LAST_NAME')))
    except WebDriverException:
        pass  # Some older semesters automatically click Load Instructors for us

    name_box = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_LAST_NAME')

instructor = 'Wang, Jianrong'
name_box.clear()
name_box.send_keys(instructor)
time.sleep(1)

try:
    element = driver.find_element(By.ID, "win9divMSU_CLSRCH_WRK2_GROUPBOX15")
    first = element.find_element(By.CSS_SELECTOR, "tr:not([style='display: none;'])")  # Find the first check box
except NoSuchElementException as err:
    # Stop with a clear message; carrying on would hit an unbound (or stale,
    # from an earlier run of this cell) `first` on the next line.
    raise RuntimeError(f"No instructor row is visible for {instructor!r}") from err

driver.execute_script("arguments[0].click();", first.find_element(By.CLASS_NAME, "ps-checkbox"))  # Click the first check box

element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_SEARCH_BTN')  # Hit Search
driver.execute_script("arguments[0].click();", element)
# Wait for either the first result checkbox or the error dialog's OK button;
# waiting for the checkbox alone times out when the search has no results.
wait.until(EC.any_of(
    EC.element_to_be_clickable((By.ID, "MSU_CLSRCH_I_DV$selm$0$$0")),
    EC.visibility_of_element_located((By.ID, '#ICOK')),
))

try:  # For some cases, the search returns no results and will throw up an error window
    element = driver.find_element(By.ID, '#ICOK')  # Click OK to get out of the error
    element.click()
    time.sleep(1)
except WebDriverException:
    pass

target_name = instructor.replace(' ', '')
i = 0
while True:
    try:  # Find a filter row based on index
        element = driver.find_element(By.ID, f"MSU_CLSRCH_I_DV$0_row_{i}")
    except NoSuchElementException:  # No more rows
        break
    name = element.find_element(By.ID, f"MSU_CLSRCH_WRK2_INSTR_NAME${i}").text
    if name != target_name:
        # Not the instructor we want: click to unselect the checkbox.
        checkbox = element.find_element(By.ID, f"MSU_CLSRCH_I_DV$selm${i}$$0")
        driver.execute_script("arguments[0].click();", checkbox)
    i += 1

if i > 1:
    # Only look up the results text when a re-filter actually happens, as
    # search_by_instructor does; with no result rows the lookup is skipped.
    result_i = driver.find_element(By.ID, 'MSU_RSLT_NAV_WK_PTPG_ROWS_GRID').text
    element = driver.find_element(By.ID, 'MSU_CLSRCH_WRK2_APPLY_PB$186$')  # Click Filter Results
    driver.execute_script("arguments[0].click();", element)
    wait.until(lambda drv: drv.find_element(By.ID, 'MSU_RSLT_NAV_WK_PTPG_ROWS_GRID').text != result_i)

In [None]:
# Write the roster plus the scraped semester columns back to the file the
# notebook reads at the start. index=False keeps the row index out of the
# CSV; otherwise every run would add another "Unnamed: 0" column on re-read.
df.to_csv('CMSE_Names.csv', index=False)