In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

from time import sleep
from os import getcwd
from tqdm import tqdm
from warnings import warn

In [None]:
class Data:
    """Survey results scraped from a single SOCT result page.

    Instances carry the text attributes ``prof``, ``course_desc``, ``subject``,
    ``course``, ``subject_desc`` and ``semesters``, plus ``table``: a DataFrame
    built with ``columns``/``index`` below and then transposed, so its rows are
    the response levels and its columns are the survey questions.
    """

    columns = ["strongly_agree", "agree", "neutral", "disagree", "strongly_disagree", "na", "responses"]
    index = ["effective_prof", "worthwhile_course", "help_available", "workload", "course_organization", "course_interest"]

    # Text attributes persisted one value per line in "<_output_dir>/<name>.txt".
    _text_fields = ("prof", "course_desc", "subject", "course", "subject_desc", "semesters")
    _output_dir = "output"

    def read_page(self, driver=None):
        """Populate this object from the result page currently shown in the browser.

        Parameters
        ----------
        driver : optional
            Object exposing ``find_elements(by, value)``. When omitted, the
            notebook-level ``soct_driver`` is used, as before.

        Raises
        ------
        ValueError
            If the page does not contain exactly three ``<dd>`` elements, or the
            course text cannot be split into the expected number of pieces.
        """
        if driver is None:
            # Looked up lazily so the class can be defined before the driver exists.
            driver = soct_driver

        info_prof, info_course, info_semesters = driver.find_elements(By.XPATH, "//dd")
        self.prof = info_prof.text
        # Text before the first " - " is split again on spaces into three pieces.
        course_code, self.course_desc = info_course.text.split(" - ", 1)
        self.subject, self.course, self.subject_desc = course_code.split(" ", 2)
        self.semesters = info_semesters.text

        table = []
        for row in driver.find_elements(By.XPATH, "//table[@id='student-responses']/tbody/tr"):
            row_data = [
                val.text
                for val in row.find_elements(By.XPATH, "./td[@class='bar-cell text-center']/table/tbody/tr/td[1]/p")
            ]
            # The response count is the text after " = " in the un-classed cell.
            num_responses = row.find_element(By.XPATH, "./td[not(@class)]/p[1]").text.split(" = ")[1]
            row_data.append(num_responses)
            table.append(row_data)
        self.table = pd.DataFrame(table, columns=Data.columns, index=Data.index).T

    @staticmethod
    def save_data(data_list):
        """Write every object in ``data_list`` to the output directory.

        Each text attribute goes to ``<attribute>.txt`` (one line per object) and
        each table to ``dataframe<i>.csv``. The directory is created if missing.
        """
        import os

        os.makedirs(Data._output_dir, exist_ok=True)

        for field in Data._text_fields:
            with open(os.path.join(Data._output_dir, field + ".txt"), "w") as handle:
                for data in data_list:
                    handle.write(getattr(data, field) + "\n")

        for i, data in enumerate(data_list):
            data.table.to_csv(os.path.join(Data._output_dir, "dataframe" + str(i) + ".csv"))

    @staticmethod
    def load_data():
        """Rebuild the list of ``Data`` objects written by ``save_data``.

        Returns
        -------
        list of Data
            One object per line of the text files, in file order.

        Raises
        ------
        RuntimeError
            If the text files do not all contain the same number of lines.
        """
        import os

        values = {}
        for field in Data._text_fields:
            with open(os.path.join(Data._output_dir, field + ".txt"), "r") as handle:
                values[field] = [line.rstrip() for line in handle]

        lengths = {len(lines) for lines in values.values()}
        if len(lengths) != 1:
            raise RuntimeError("Files contain data of different lengths")
        count = lengths.pop()

        output = []
        for i in range(count):
            data_obj = Data()
            for field in Data._text_fields:
                setattr(data_obj, field, values[field][i])
            data_obj.table = pd.read_csv(
                os.path.join(Data._output_dir, "dataframe" + str(i) + ".csv"), index_col=0
            )
            output.append(data_obj)

        return output

    @staticmethod
    def clean_data(data_list):
        """Placeholder for a cleaning pass over ``data_list``; currently a no-op."""
        for data in data_list:
            pass

In [None]:
# Launch Chrome using the chromedriver binary in the current working directory,
# then open the SOCT site.
chromedriver_service = Service(f'{getcwd()}/chromedriver.exe')
soct_driver = webdriver.Chrome(service=chromedriver_service)
soct_driver.get("https://soct.msu.edu/")

def log_in(username=None, password=None):
    """Fill in and submit the login form on the current page.

    Credentials are never stored in the notebook. Each value is taken from the
    argument if given, otherwise from the ``SOCT_USERNAME`` / ``SOCT_PASSWORD``
    environment variable, otherwise from an interactive prompt.

    Parameters
    ----------
    username : str, optional
        Value typed into the ``msu-id`` field.
    password : str, optional
        Value typed into the ``password`` field.
    """
    import os
    from getpass import getpass

    if username is None:
        username = os.environ.get("SOCT_USERNAME") or input("MSU ID: ")
    if password is None:
        # getpass keeps the password out of the notebook's saved output.
        password = os.environ.get("SOCT_PASSWORD") or getpass("Password: ")

    soct_driver.find_element(By.ID, "msu-id").send_keys(username)
    soct_driver.find_element(By.ID, "password").send_keys(password)
    soct_driver.find_element(By.ID, "login-submit").click()
    
def home():
    """Click the element whose id is ``section-home``."""
    home_link = soct_driver.find_element(By.ID, "section-home")
    home_link.click()
        
def back():
    """Step one entry back in the browser history via JavaScript."""
    go_back_script = "window.history.go(-1)"
    soct_driver.execute_script(go_back_script)

def get_subjects():
    """Return a ``Select`` wrapping the element with id ``Select_SubjectCode``."""
    subject_dropdown = soct_driver.find_element(By.ID, "Select_SubjectCode")
    return Select(subject_dropdown)
    
def get_courses():
    """Return a ``Select`` wrapping the element with id ``Select_CourseNumber``."""
    course_dropdown = soct_driver.find_element(By.ID, "Select_CourseNumber")
    return Select(course_dropdown)
    
def get_profs():
    """Return a ``Select`` wrapping the element with id ``Select_Instructor``."""
    instructor_dropdown = soct_driver.find_element(By.ID, "Select_Instructor")
    return Select(instructor_dropdown)
    
def check_received():
    """Return True unless the page contains the text 'No SOCT forms were received'."""
    notices = soct_driver.find_elements(By.XPATH, "//*[contains(text(), 'No SOCT forms were received')]")
    return len(notices) == 0

def check_sufficient():
    """Return True unless the page has an ``<h1>`` containing 'Insufficient data'."""
    headings = soct_driver.find_elements(By.XPATH, "//h1[contains(text(), 'Insufficient data')]")
    return len(headings) == 0

def subjects_iterate():
    """Select each subject option in turn and crawl its courses."""
    subject_select = get_subjects()
    # The option count is read once, before any navigation happens.
    subject_count = len(subject_select.options)
    for position in range(subject_count):
        subject_select.select_by_index(position)
        submit_button = soct_driver.find_element(By.ID, "submit_subjectcode_1")
        submit_button.click()
        courses_iterate()
        back()
        # Look the dropdown up again after navigating back; the earlier
        # reference presumably no longer points at a live element.
        subject_select = get_subjects()
        
def courses_iterate():
    """Select each course option in turn and crawl its instructors."""
    course_select = get_courses()
    # The option count is read once, before any navigation happens.
    course_count = len(course_select.options)
    for position in range(course_count):
        course_select.select_by_index(position)
        submit_button = soct_driver.find_element(By.ID, "submit_coursenumber_1")
        submit_button.click()
        # Only descend when the page does not report that no forms were received.
        forms_received = check_received()
        if forms_received:
            profs_iterate()
        back()
        # Look the dropdown up again after navigating back; the earlier
        # reference presumably no longer points at a live element.
        course_select = get_courses()

def profs_iterate():
    """Select each instructor option in turn and record its result page.

    Every page that passes ``check_sufficient`` is parsed into a ``Data``
    object and appended to the module-level ``searches`` list.
    """
    instructor_select = get_profs()
    # The option count is read once, before any navigation happens.
    instructor_count = len(instructor_select.options)
    for position in range(instructor_count):
        instructor_select.select_by_index(position)
        submit_button = soct_driver.find_element(By.ID, "submit_instructor_1")
        submit_button.click()
        enough_data = check_sufficient()
        if enough_data:
            record = Data()
            record.read_page()
            searches.append(record)
        back()
        # Look the dropdown up again after navigating back; the earlier
        # reference presumably no longer points at a live element.
        instructor_select = get_profs()
        
# Run the full crawl. The browser is always shut down afterwards, even when the
# crawl raises, so no Chrome/chromedriver process is left behind. On failure the
# records collected so far remain available in `searches`.
searches = []
try:
    log_in()
    subjects_iterate()
finally:
    soct_driver.quit()

# Data objects hold plain text and DataFrames, so saving needs no live browser.
Data.save_data(searches)

In [None]:
# Rebuild `searches` from the files under output/ via Data.load_data().
searches = Data.load_data()

In [None]:
# List the attribute names available on the first loaded record.
dir(searches[0])

In [None]:
# Show the subject description stored on the first loaded record.
searches[0].subject_desc

In [None]:
# Bar chart of the "effective_prof" response distribution for the first 100 records.
# The results frame lives on `Data.table`; there is no `.data` attribute.
for search in searches[0:100]:
    # First five rows are the agreement levels; "na" and "responses" are skipped.
    responses = search.table.iloc[0:5, :]
    percentages = responses["effective_prof"].str.strip("%").astype(float)

    fig, ax = plt.subplots()
    ax.bar(responses.index, percentages)
    ax.set(
        title=f"{search.prof} - {search.subject} {search.course}",
        xlabel="Response",
        ylabel="effective_prof (%)",
    )
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Response levels used for the median calculation: the first five table rows
# (the remaining rows, "na" and "responses", are excluded).
categories = searches[0].table.index[:5]
medians_list = []

for search in searches:
    # Turn the "NN%" strings into floats and accumulate them down the levels.
    cumulative = search.table.iloc[:5].apply(lambda x: x.str.strip("%").astype(float).cumsum())
    # Scale each question's cumulative values by that question's maximum.
    cumulative = cumulative.apply(lambda x: x/max(x))
    # Median level per question: the first level whose scaled cumulative value
    # reaches 0.5. list.append takes exactly one argument, so only this Series
    # is stored; ordering of the levels is applied when the results are plotted.
    medians_list.append(cumulative.apply(lambda x: (x>=0.5).idxmax()))

In [None]:
# Stack the per-record median Series into one frame: one row per record,
# one column per survey question.
medians = pd.concat(medians_list, axis=1).T
# The results frame lives on `Data.table`; there is no `.data` attribute.
medians.columns = searches[0].table.columns

In [None]:
# Show the first column of `medians`.
medians.iloc[:,0]

In [None]:
# Show the results frame of the first record (stored on `.table`, not `.data`).
searches[0].table

In [None]:
# Count how often each response level is the median, per survey question.
# value_counts() orders by frequency and omits levels that never occur, so the
# counts are reindexed onto the five levels in their defined order (missing -> 0).
response_levels = Data.columns[:5]

for col in medians:
    level_counts = medians[col].value_counts().reindex(response_levels, fill_value=0)

    fig, ax = plt.subplots()
    ax.bar(response_levels, list(level_counts))
    ax.set(title=str(col), xlabel="Median response", ylabel="Number of records")
    plt.show()
    plt.close(fig)