In [88]:
import pandas as pd

## SET Evals PDF Scraping
Used [Tabula](https://tabula.technology/) to initially extract the tables from the PDF file.

In [89]:
# Load the comment table that Tabula exported from the course-evaluation PDF.
SET_COMMENTS_CSV = 'tabula-ECON 2010-F23-Course Comments.csv'
set_df = pd.read_csv(SET_COMMENTS_CSV)

In [90]:
# Remove the rows whose value is the literal header text "Comments", i.e. table
# column headers that ended up inside the scraped data.
is_header_row = set_df['Comments'] == "Comments"
set_df = set_df.drop(index=set_df.index[is_header_row])

In [91]:
# Report (rows, columns) of the scraped frame, then preview its first five rows.
print(set_df.shape)
set_df.head(n=5)

(636, 1)


Unnamed: 0,Comments
0,I don't really understand the complexities bet...
1,"I really enjoyed Professor Elzinga's lectures,..."
2,El Zinga's the goat.
3,I feel like the course material is covered in ...
4,It was a positive experience.


### Initial cleaning of the dataset

In [92]:
# Drop rows that carry no feedback: missing values and the bare answers "No" / "no".
# NOTE(review): the earlier comment also mentioned "none", but no such value is
# filtered here — confirm whether "None"/"none" answers should be removed as well.
is_missing = set_df['Comments'].isnull()
is_bare_no = set_df['Comments'].isin(["No", "no"])
set_df = set_df.drop(index=set_df.index[is_missing | is_bare_no])

In [93]:
# List every comment containing "nothing" (case-insensitive) so the ones that say
# nothing else can be picked out by hand and dropped by index label.
mentions_nothing = set_df['Comments'].str.contains("nothing", case=False)
set_df.loc[mentions_nothing]

Unnamed: 0,Comments
58,"Nothing really, maybe just that the lectures c..."
61,"Nothing in particular, sometimes the lectures ..."
71,nothing really
78,Nothing much
100,This course was confusing and enlightening to ...
106,Nothing
151,The learning is very self–involved. Most of my...
395,nothing


In [94]:
# Index labels of the comments that consist only of a "nothing" variant,
# chosen by hand from the listing produced by the previous cell.
NOTHING_ONLY_LABELS = [71, 78, 106, 395]
set_df = set_df.drop(index=NOTHING_ONLY_LABELS)

In [95]:
# Check that they actually got dropped: re-run the same case-insensitive
# "nothing" search and show whatever still matches.
still_mentions_nothing = set_df['Comments'].str.contains("nothing", case=False)
set_df.loc[still_mentions_nothing]

Unnamed: 0,Comments
58,"Nothing really, maybe just that the lectures c..."
61,"Nothing in particular, sometimes the lectures ..."
100,This course was confusing and enlightening to ...
151,The learning is very self–involved. Most of my...


In [96]:
# save cleaned dataset
# NOTE(review): the export is left commented out, presumably so that re-running
# the notebook does not overwrite an existing file; uncomment the next line to
# write the cleaned comments to 'cleaned-set.csv' without the index column.
# set_df.to_csv('cleaned-set.csv', index=False) 

In [97]:
# A notebook cell only displays its last expression, so a bare `set_df.head()`
# placed before `set_df.shape` was evaluated and thrown away. Print the shape
# explicitly and leave the preview as the final expression so both are shown,
# matching the shape-then-head pattern used by the other preview cells.
print(set_df.shape)
set_df.head()

(615, 1)

## The Course Forum Webscraping

In [146]:
import requests
from bs4 import BeautifulSoup

In [147]:
# Course page on thecourseforum.com whose reviews are scraped below.
URL = "https://thecourseforum.com/course/748/691/"
# Give up after 30 seconds instead of hanging the kernel on a stalled connection
# (requests has no default timeout).
page = requests.get(URL, timeout=30)
# Fail fast on a 4xx/5xx response so an error page is never parsed as if it
# were the review listing.
page.raise_for_status()

In [148]:
# Parse the downloaded page bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features="html.parser")

In [149]:
# Collect every <div> whose class attribute is exactly "review card mb-2";
# each one is treated as a single review by the parsing loop that follows.
REVIEW_CARD_CLASS = "review card mb-2"
reviews = soup.find_all(name="div", attrs={"class": REVIEW_CARD_CLASS})

In [None]:
def _clean_text(tag):
    """Return the whitespace-stripped text of ``tag``, or None when ``tag`` is None.

    ``find`` / ``find_parent`` return None when nothing matches; mapping that to a
    None cell keeps one incomplete review card from aborting the whole scrape
    with an AttributeError.
    """
    return None if tag is None else tag.get_text().strip()


def _rating_beside_icon(review, icon_class):
    """Return the stripped text of the <div> enclosing the <i> icon with ``icon_class``.

    Returns None when the icon (or an enclosing <div>) is not present in ``review``.
    """
    icon = review.find('i', class_=icon_class)
    return None if icon is None else _clean_text(icon.find_parent('div'))


# Column order must match the order of the values appended for each review.
# The spelling "Enoyability" is kept as-is so anything already relying on this
# column name (later cells, exported CSVs) keeps working.
TCF_COLUMNS = ["Semester", "Last Updated", "Review Text", "Upvotes", "Downvotes", "Review Average", "Instr Rating", "Enoyability", "Recommend", "Difficulty", "Hours Per Week"]

rows = []
for review in reviews:
    semester = _clean_text(review.find("h6", class_="font-weight-bold text-tcf-orange"))
    last_updated = _clean_text(review.find("h6", class_="text-muted"))
    if last_updated is not None:
        # keep only the date the comment was last updated
        last_updated = last_updated.replace('Updated ', '')
    review_text = _clean_text(review.find("div", class_="review-text-full"))
    upvotes = _clean_text(review.find("span", class_="upvoteCount"))
    downvotes = _clean_text(review.find("span", class_="downvoteCount"))
    review_avg = _clean_text(review.find('div', id='review-average'))
    # Each remaining rating is read from the <div> that wraps its icon.
    instr_rating = _rating_beside_icon(review, 'fa-user')
    enjoyability = _rating_beside_icon(review, 'far fa-smile-beam')
    recommend = _rating_beside_icon(review, 'fas fa-heart')
    difficulty = _rating_beside_icon(review, 'fa fa-dumbbell fa-fw')
    weekly_hrs = _rating_beside_icon(review, 'fa fa-hourglass-half fa-fw')
    rows.append([semester, last_updated, review_text, upvotes, downvotes, review_avg, instr_rating, enjoyability, recommend, difficulty, weekly_hrs])

# One row per review card; all values are kept as the text scraped from the page
# (no numeric conversion is done here), with None where an element was missing.
tcf_df = pd.DataFrame(rows, columns=TCF_COLUMNS)


In [153]:
# Report (rows, columns) of the scraped reviews, then preview the first five rows.
print(tcf_df.shape)
tcf_df.head(n=5)

(408, 11)


Unnamed: 0,Semester,Last Updated,Review Text,Upvotes,Downvotes,Review Average,Instr Rating,Enoyability,Recommend,Difficulty,Hours Per Week
0,Fall 2023,7/25/24,This class was a decent introduction to microe...,0,0,3.0,3,2,4,2,6
1,Fall 2023,4/27/24,TAKE DOYLE TAKE DOYLE TAKE DOYLE. This guy is ...,1,0,1.0,1,1,1,5,13
2,Fall 2023,4/11/24,"If you're gonna take Econ201, do it with Elzin...",0,0,3.67,4,3,4,3,5
3,Fall 2023,4/11/24,This is a class you most likely will have to t...,0,0,3.0,3,3,3,3,4
4,Fall 2023,4/02/24,The lectures are super boring but the micro co...,0,0,4.67,4,5,5,3,3


In [152]:
# save webscraped dataset
# NOTE(review): the export is left commented out, presumably so that re-running
# the notebook does not overwrite an existing file; uncomment the next line to
# write the scraped reviews to 'cleaned-tcf.csv' without the index column.
# tcf_df.to_csv('cleaned-tcf.csv', index=False) 