In [1]:
# Import required modules
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# For headless mode
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1080")  

In [3]:
# This function scrapes all the info in a review
def scrapeReview(url):
    """
    url = course url for which we will scrape review
    returns = all the review info"""
    
    # Initialize webdrive in headless mode
    driver = webdriver.Chrome(r"/home/faysal/Documents/utilities/chromedriver", options=options)
    driver.get(url)
    time.sleep(4)
    
    # Close accept cookies
    try:
        driver.find_element_by_id("onetrust-accept-btn-handler").click()
    except:
        pass

    # Locate view more button by link text
    try:
        viewMore = driver.find_element_by_link_text("View more")
    except:
        pass

    # Keep clicking till "View more" exists, otherwise just break out of the loop
    while True:
        try:
            viewMore.click()
            time.sleep(2)
        except:
            break
    
    # This main container holds all the review info
    mainCont = driver.find_elements_by_css_selector("div#providerCourseReviews article")
    # Extract main cont text
    mainCont = [x.text for x in mainCont]
    
    # Create a df off main cont
    df = pd.DataFrame({"mainCont":mainCont})
    
    # Insert the course link
    df["courseLink"] = url
    driver.close()
    return df

# Extract required info from main cont
def clean(df):
    """return = final cleaned df"""
    df = df.copy()
    df["courseId"] = df.courseLink.str.split("/").str[5].str.replace("#", "").str.strip()
    df["review"] = df.mainCont.str.split("\n").str[3].str.strip()
    df["rating"] = df.mainCont.str.split("Star").str[0].str.strip()
    df["reviewDate"] = df.mainCont.str.split("\n").str[1].str.strip()
    df["reviewedBy"] = df.mainCont.str.split("Review by").str[1].str.split("for").str[0].str.strip()
    return df

In [4]:
# This is to scrape by chunks
def main(s1, s2, fileName):
    """
    s1 = start index of the link
    s2 = end index of the link
    fileName = file name from which we read the link"""
    
    # Read the course link from a file to send requests to
    courseLinkDf = pd.read_excel(f"{fileName}.xlsx")
    
    # Scrape by chunks
    df = pd.concat(list(map(scrapeReview, courseLinkDf.courseLink.iloc[s1:s2])))
    
    # Extract info from main cont
    finalDf = clean(df)
    
    # Drop unnecessary columns
    finalDf.drop(["mainCont", "courseLink"], axis=1, inplace=True)
    return finalDf

In [5]:
%%time
chunk1 = main(0, 30, "excelWithBusiness")

CPU times: user 478 ms, sys: 131 ms, total: 610 ms
Wall time: 5min 55s


In [6]:
%%time
chunk2 = main(30, None, "excelWithBusiness")

CPU times: user 375 ms, sys: 142 ms, total: 517 ms
Wall time: 5min 36s


In [7]:
# Concat all the chunks
allReview = pd.concat([chunk1, chunk2]).reset_index(drop=True)
allReview.courseId = allReview.courseId.astype("int")

# Merge review data with course data
courseData = pd.read_excel("excelWithBusiness.xlsx")
courseWithReview = pd.merge(courseData, allReview, on="courseId", how="left")

# Let's have a look at our final data
courseWithReview.head()

Unnamed: 0,date,courseId,courseTitle,courseLink,subtitle,courseProvider,offerPrice,originalPrice,unitSold,category,...,soldOrEnq,savingsPercent,broadCategory1,broadCategory2,subCategory1,subCategory2,review,rating,reviewDate,reviewedBy
0,02_Dec,74821,Black Friday Special - The Ultimate Microsoft ...,https://www.reed.co.uk/courses/black-friday-sp...,This is the First time we have ever offered Li...,Excel with Business,39.99,199,5217,"[['IT', 'Business analysis'], ['Office skills'...",...,1,79,IT,"Admin, secretarial & PA",Business analysis,Microsoft Excel,Straightforward and logically laid out so far....,4.0,12 Sep 2021,Julie Bridger
1,02_Dec,74821,Black Friday Special - The Ultimate Microsoft ...,https://www.reed.co.uk/courses/black-friday-sp...,This is the First time we have ever offered Li...,Excel with Business,39.99,199,5217,"[['IT', 'Business analysis'], ['Office skills'...",...,1,79,IT,"Admin, secretarial & PA",Business analysis,Microsoft Excel,Good content for an advanced course on Excel. ...,5.0,15 May 2021,Julian
2,02_Dec,74821,Black Friday Special - The Ultimate Microsoft ...,https://www.reed.co.uk/courses/black-friday-sp...,This is the First time we have ever offered Li...,Excel with Business,39.99,199,5217,"[['IT', 'Business analysis'], ['Office skills'...",...,1,79,IT,"Admin, secretarial & PA",Business analysis,Microsoft Excel,"A very thorough and informative course, provid...",4.6,29 Mar 2021,Angela Metcalfe
3,02_Dec,245094,Microsoft Office Essentials - 3 Course Bundle,https://www.reed.co.uk/courses/microsoft-offic...,"Expert training in Excel, Word and PowerPoint,...",Excel with Business,19.0,177,198,"[['Office skills', 'Microsoft Office', 'Micros...",...,1,89,Office skills,Office skills,Microsoft Excel,Microsoft Word,Reviewed by Craig Jackson,5.0,12 Oct 2020,Craig Jackson
4,02_Dec,74828,Advanced Microsoft Excel Course (CPD Accredited),https://www.reed.co.uk/courses/advanced-micros...,Learn Advanced Techniques with the Leading Glo...,Excel with Business,59.0,59,327,"[['Office skills', 'Microsoft Office', 'Micros...",...,1,0,Office skills,Business,Microsoft Excel,Microsoft Excel,Reviewed by Amelia Davies-Smith,5.0,16 Apr 2020,Amelia Davies-Smith


In [8]:
# Who got the best avg stars?
courseWithReview.rating = courseWithReview.rating.fillna(0).astype(float)
courseWithReview.groupby("courseTitle").rating.agg(["count", "mean"]).sort_values("mean", ascending=False).round(2)

Unnamed: 0_level_0,count,mean
courseTitle,Unnamed: 1_level_1,Unnamed: 2_level_1
Microsoft Office Essentials - 3 Course Bundle,1,5.0
Critical Thinking & Problem Solving Skills Course,2,5.0
Introduction To Leadership & Management,1,5.0
Excel Essentials 3 Course Bundle,1,5.0
Microsoft Excel - Beginner & Advanced Bundle,3,5.0
...,...,...
Microsoft OneNote Online Course,1,0.0
Business Administration Bundle,1,0.0
Microsoft Power BI Online Course,1,0.0
Microsoft Project 2010 Course,1,0.0
