In [1]:
# Import required modules
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# This function returns all the cover page links (urls) from which we will scrape review information
def generateCoverPage(url, reviewsPerPage=20, timeout=30):
    """Build the list of cover page links to scrape for one company.

    url = url of the company we would like to scrape reviews for
    reviewsPerPage = number of reviews assumed to be listed on each cover page
    timeout = seconds to wait for the server before the request is abandoned
    return = list of cover page links: the requested url first, followed by
    url/1, url/2, ... up to url/(totalPage - 1)
    raises = requests.HTTPError when the server answers with a 4xx/5xx status,
    ValueError when reviewsPerPage is not positive or the review count
    cannot be found on the page
    """
    if reviewsPerPage <= 0:
        raise ValueError("reviewsPerPage must be a positive integer")

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status reports a bad response directly instead of failing
    # later while parsing an error page.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    s = BeautifulSoup(r.text, "lxml")

    # Extract total reviews (displayed with thousands separators, e.g. "1,185")
    counters = s.findAll(
        "span",
        class_="TextBody TextBody--sm TextBody--inline js-reviewsio-review-count",
    )
    if not counters or counters[0].strong is None:
        raise ValueError(f"Could not find the review count on {url}")
    totalReviews = int(counters[0].strong.text.strip().replace(",", ""))

    # Total number of pages: integer ceiling division, no float round-trip
    totalPage = -(-totalReviews // reviewsPerPage)

    # The requested url is the first cover page; the remaining pages are
    # addressed by appending /1, /2, ... Strip a trailing slash first so the
    # generated links never contain "//".
    base = url.rstrip("/")
    return [url] + [f"{base}/{pg}" for pg in range(1, totalPage)]

In [3]:
# This function scrapes review, rating, reviewer, and review date
def scrapeReviewInfo(url, timeout=30):
    """Scrape review, reviewer, rating and review date from one cover page.

    url = cover page url
    timeout = seconds to wait for the server before the request is abandoned
    return = dataframe with one row per review container and the columns
    review, reviewer, rating, reviewDate. rating is the number of full-star
    icons inside the review's overall-stars block (0 when there is none);
    a text field that is missing from a review is stored as "na".
    """
    columns = ["review", "reviewer", "rating", "reviewDate"]

    def firstText(cont, className):
        # Stripped text of the first element carrying className inside one
        # review container, or "na" when the review has no such element.
        found = cont.findAll(class_=className)
        return found[0].text.strip() if found else "na"

    # Making requests. The HTTP status is deliberately not checked here: a
    # page without any review containers simply produces an empty dataframe.
    r = requests.get(url, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Build one record per review container so the four fields of a review
    # always stay on the same row. Collecting each field in its own pass
    # over the page misaligns rows (or fails on unequal lengths) as soon as
    # a single review lacks one of the elements.
    records = []
    for cont in s.findAll("div", class_="Review"):
        starBlocks = cont.findAll("div", class_="Review__overallStars__stars")
        if starBlocks:
            # Rating = how many full-star icons the overall-stars block holds
            fullStars = len(
                starBlocks[0].findAll("i", class_="stars__icon icon-full-star-01")
            )
        else:
            fullStars = 0

        records.append({
            "review": firstText(cont, "Review__body"),
            "reviewer": firstText(cont, "Review__author"),
            "rating": fullStars,
            "reviewDate": firstText(cont, "Review__dateSource"),
        })

    # Create a df of scraped variables; passing columns keeps the layout
    # stable even when the page has no reviews at all
    df = pd.DataFrame(records, columns=columns)
    df["rating"] = df["rating"].astype("int")
    return df

In [4]:
# Entry point: build the cover page list, scrape every page, and combine the results
def main(url):
    """Scrape every cover page of a company and return one combined dataframe.

    url = url of the company we would like to scrape reviews for
    return = dataframe holding the rows of every cover page, in page order,
    re-indexed from 0
    """
    coverPage = generateCoverPage(url)

    # Threads rather than processes: the work is network-bound, and worker
    # processes cannot import functions that were defined in a notebook /
    # interactive __main__ on platforms that spawn new interpreters.
    with ThreadPoolExecutor(max_workers=4) as ex:
        # ex.map yields the results in the same order as coverPage
        pageDfs = list(ex.map(scrapeReviewInfo, coverPage))

    return pd.concat(pageDfs).reset_index(drop=True)

In [5]:
# Scrape the One Education reviews from reviews.io by running main on the
# company's review page, then preview the first 10 rows of the result
df = main("https://www.reviews.io/company-reviews/store/one-education")
df.head(10)

Unnamed: 0,review,reviewer,rating,reviewDate
0,"“I registerd through the ""get 7 days for free""...",Gabor,1,Posted 1 week ago
1,“My videos are not opening.I tried lots of tim...,Sonu,1,Posted 1 week ago
2,“Excellent. I took the course Blockchain & cry...,Anonymous,5,Posted 2 weeks ago
3,"“Honestly, the best online course I’ve done! I...",Anonymous,5,Posted 3 weeks ago
4,“First of all i would like to admire the effor...,F Tadesse,5,Posted 1 month ago
5,“iv been a stay at home mum for the past 11 ye...,Anonymous,5,Posted 1 month ago
6,"“very excited, been looking for a course like ...",Dean Husbands,5,Posted 1 month ago
7,"“This course was very informative, helpful, in...",Ewa Adach-Miluska,5,Posted 1 month ago
8,“I love my course learnt alot. I would definit...,Earleen Alcide,5,Posted 1 month ago
9,“I love my course learnt alot. I would definit...,Earleen Alcide,5,Posted 1 month ago


In [6]:
# Data dimensions of the scraped dataframe as (rows, columns)
df.shape

(1185, 4)