In [1]:
# Import required modules
import ast
import re
import time
from concurrent.futures import ProcessPoolExecutor
from itertools import chain

import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup

In [2]:
# Suppress urllib3 warnings (presumably the InsecureRequestWarning that is
# emitted because every request below is sent with verify=False)
urllib3.disable_warnings()

# Define headers and date
# HEADERS is sent with every request so the scraper identifies as a desktop browser
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
# Date stamp formatted as day_month_year (e.g. 08_May_21); used for the "date" column and the csv file name
today = pd.to_datetime("today").strftime("%d_%b_%y")


# Generate cover pages links
def generateCoverPageLink(url, stopPage):
    """Build the paginated listing ("cover page") links for one course listing.

    url = base listing url the pagination query string is appended to,
    stopPage = number of the last cover page to generate (numbering starts at 1),
    return = list of cover page links, one per page, in page order"""

    # Only the page number varies; sort order and page size are fixed
    return [
        url + f"?pageno={pageNo}&sortby=MostPopular&pagesize=100"
        for pageNo in range(1, stopPage + 1)
    ]



# This function scrapes individual course links from every cover page links
def scrapeIndividualCourseLink(url, timeout=30):
    """Scrapes individual course links from one cover page.

    url = cover page link,
    timeout = seconds to wait for the server before giving up (default 30),
    return = list of absolute individual course links (empty if the page lists none)

    Errors raised by requests (connection failures, timeouts) propagate to the caller."""

    # Making request. verify=False disables TLS certificate verification;
    # the timeout keeps a stalled connection from blocking a worker forever.
    r = requests.get(url, headers=HEADERS, verify=False, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Scrape course links and store
    courseLink = []
    for lnk in s.find_all("div", class_="course-overview"):
        heading = lnk.find("h2")
        anchor = heading.find("a") if heading is not None else None
        href = anchor.get("href") if anchor is not None else None
        # Skip a malformed card instead of losing every link on the page
        if href:
            courseLink.append("https://www.reed.co.uk" + href)
    return courseLink



# This function scrapes individual course info from individual course link
def scrapeCourseInfo(url, timeout=30):
    """Scrapes one individual course page into a single-row dataframe.

    url = individual course link,
    timeout = seconds to wait for the server before giving up (default 30),
    return = scraped course info as a one-row dataframe with every value cast to str

    A field that cannot be located on the page falls back to a placeholder:
    "na" for text fields, 0 for unit sold, cpd point and the badge flags.
    Errors raised by requests (connection failures, timeouts) propagate to the caller."""

    def textOf(locate, default="na"):
        """Return the stripped text of the tag produced by locate(), or default if the lookup fails."""
        try:
            return locate().text.strip()
        except Exception:
            # Usually AttributeError: a find() in the chain returned None.
            # "except Exception" (not a bare except) lets KeyboardInterrupt through.
            return default

    # Making request. verify=False disables TLS certificate verification;
    # the timeout keeps a stalled connection from blocking a worker forever.
    r = requests.get(url, headers=HEADERS, verify=False, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Scrape course name and subtitle
    courseTitle = textOf(lambda: s.find("div", class_="course-title").find("h1"))
    subtitle = textOf(lambda: s.find("div", class_="course-title").find("h2"))

    # Scrape offer price and original price (raw text, cleaned later)
    offerPrice = textOf(lambda: s.find("span", class_="current-price"))
    originalPrice = textOf(lambda: s.find("small", class_="vat-status"))

    # Scrape course provider: hyperlinked provider first, plain thumbnail text otherwise
    courseProvider = textOf(lambda: s.find("a", class_="provider-link"), default=None)
    if courseProvider is None:
        courseProvider = textOf(lambda: s.find("span", class_="thumbnail"))

    # Scrape unit sale (raw text; 0 when the counter is absent)
    unitSold = textOf(lambda: s.find(id="number-enquiries-purchases"), default=0)

    # Scrape category: one list of breadcrumb items per breadcrumb trail on the page
    try:
        category = [
            [item.text.strip() for item in trail.find_all("li")]
            for trail in s.find_all("ol", class_="breadcrumb pb-0")
        ]
    except Exception:
        category = ["na"]

    # Scrape savings
    savings = textOf(lambda: s.find("span", class_="icon-savings-tag price-saving"))

    # Does the course have CPD? 1 if the badge is present, otherwise 0
    haveCpd = 1 if s.find("div", class_="badge badge-dark badge-cpd mt-2") else 0

    # Scrape awarding body: try the "Endorsed by" layout first, then the "Awarded by" layout
    awardingBody = textOf(lambda: s.find("div", class_="col").find("a"), default=None)
    if awardingBody is None:
        awardingBody = textOf(lambda: s.find("div", class_="small").find("div").find("a"))

    # Scrape qualification name, only for awarded courses
    qualName = textOf(lambda: s.find("div", class_="small").find("h3", class_="h4"))

    # Scrape cpd point. The pattern is a raw string so \d and \s reach the regex engine
    # unchanged instead of being treated as (invalid) string escapes.
    # NOTE(review): newer BeautifulSoup releases prefer string= over text= — confirm installed version.
    try:
        cpdPoint = s.body.find_all(text=re.compile(r"\d{1,3}\sCPD hours / points"))[0].strip()
    except Exception:
        cpdPoint = 0

    # Is the course regulated? Assign 1 if regulated, otherwise 0
    isRegulated = 1 if s.find("div", class_="badge badge-dark badge-regulated mt-2") else 0

    # Does the course offer professional certification?
    hasProfCert = 1 if s.find("div", class_="badge badge-dark badge-professional mt-2") else 0

    # Check if the course is sold or enquired:
    # 2 = both purchase and enquiry buttons, 1 = purchase button only, 0 = otherwise
    canBuy = s.find(id="addToBasket")
    canEnquire = s.find(id="enquireNow")
    soldOrEnq = 2 if (canBuy and canEnquire) else 1 if canBuy else 0

    # Create a one-row df off scraped variables
    df = pd.DataFrame({
        "courseTitle": [courseTitle],
        "courseLink": [url],
        "subtitle": [subtitle],
        "courseProvider": [courseProvider],
        "offerPrice": [offerPrice],
        "originalPrice": [originalPrice],
        "unitSold": [unitSold],
        "category": [category],  # Whole list of trails stored in a single cell
        "haveCpd": [haveCpd],
        "cpdPoint": [cpdPoint],
        # Stored as a one-element list: the cleaning step strips the "['" and "']"
        # left by str() from this cell
        "awardingBody": [[awardingBody]],
        "qualName": [qualName],
        "isRegulated": [isRegulated],
        "hasProfCert": [hasProfCert],
        "savings": [savings],
        "soldOrEnq": [soldOrEnq]
    })
    return df.astype(str)



# This function cleans scraped data
def cleanAndExtractFeature(df, date=None):
    """Cleans the scraped data and derives extra feature columns.

    df = dataFrame to clean (all-string columns as returned by scrapeCourseInfo); it is not modified,
    date = value written to the "date" column; defaults to the module-level `today` stamp,
    return = final cleaned data, one row per courseId, without the raw "savings" column"""

    if date is None:
        date = today

    def pickCategory(trails, outer, inner):
        """Return trails[outer][inner], or NaN when that element does not exist."""
        try:
            return trails[outer][inner]
        except (IndexError, TypeError):
            return np.nan

    # Copy the input data
    finalDf = df.copy()

    # Create course id (6th "/"-separated part of the link, without "#") and insert to the df
    courseId = finalDf["courseLink"].str.split("/").str.get(5).str.replace("#", "", regex=False)
    finalDf.insert(loc=0, column="courseId", value=courseId)

    # Insert date
    finalDf.insert(loc=0, column="date", value=date)

    # Clean unit sold: keep only the digits, anything unparsable becomes 0
    unitDigits = finalDf["unitSold"].str.replace(r"\D", "", regex=True)
    finalDf["unitSold"] = pd.to_numeric(unitDigits, errors="coerce").fillna(0).astype("int")

    # Clean saving percent: text after "Save" without the "%" sign, 0 when absent
    savingsText = (finalDf["savings"].str.split("Save").str[-1]
                   .str.replace("%", "", regex=False).str.strip())
    finalDf["savingsPercent"] = pd.to_numeric(savingsText, errors="coerce").fillna(0).astype("int")

    # Clean offer price (kept as text: whatever follows the last "£", without thousands separators)
    finalDf["offerPrice"] = finalDf["offerPrice"].str.split("£").str[-1].str.replace(",", "", regex=False)

    # Clean original price. regex=False treats "," and ")" as literal characters
    # regardless of the pandas version's default for str.replace.
    originalText = (finalDf["originalPrice"].str.split("£").str[-1]
                    .str.replace(",", "", regex=False).str.replace(")", "", regex=False))
    finalDf["originalPrice"] = pd.to_numeric(originalText, errors="coerce")

    # If savings is 0, make a missing original price equal to the offer price.
    # The offer price is converted first so the column stays numeric instead of mixing floats and text.
    offerNumeric = pd.to_numeric(finalDf["offerPrice"], errors="coerce")
    finalDf["originalPrice"] = np.where(finalDf["savingsPercent"] == 0,
                                        finalDf["originalPrice"].fillna(offerNumeric),
                                        finalDf["originalPrice"])

    # Extract CPD point: the number in front of "CPD", 0 when absent
    cpdText = finalDf["cpdPoint"].str.split("CPD").str[0].str.strip()
    finalDf["cpdPoint"] = pd.to_numeric(cpdText, errors="coerce").fillna(0).astype(int)

    # Parse the stringified category back into a list of breadcrumb trails.
    # literal_eval only accepts Python literals, so scraped text can never execute code (unlike eval).
    finalDf["category"] = finalDf["category"].apply(ast.literal_eval)

    # broadCategory1 / subCategory1 = first / last item of the first trail,
    # broadCategory2 / subCategory2 = first / last item of the last trail
    for column, outer, inner in (("broadCategory1", 0, 0), ("broadCategory2", -1, 0),
                                 ("subCategory1", 0, -1), ("subCategory2", -1, -1)):
        finalDf[column] = finalDf["category"].map(
            lambda trails, outer=outer, inner=inner: pickCategory(trails, outer, inner))

    # Extract qualification name: entries mentioning "CPD" are not qualification names
    finalDf["qualName"] = np.where(finalDf["qualName"].str.contains("CPD"), "na", finalDf["qualName"])

    # Clean awarding body: strip the "['" / "']" wrapper left by str(), blank becomes "na"
    finalDf["awardingBody"] = (finalDf["awardingBody"].str[1:-1].str[1:-1].str.strip()
                               .replace(r"^\s*$", np.nan, regex=True).fillna("na"))

    # Drop "savings"
    finalDf = finalDf.drop(columns="savings")

    # Drop duplicates by "courseId"
    return finalDf.drop_duplicates("courseId")

In [3]:
# Wrap all the functions inside main
def main(url, stopPage, maxWorkers=6, passes=2):
    """Runs the whole pipeline: cover pages -> course links -> course info -> cleaned data.

    url = listing url to request,
    stopPage = total cover pages to scrape,
    maxWorkers = number of worker processes used for scraping (default 6),
    passes = how many times the cover pages are scraped before de-duplicating (default 2),
    Returns the final data frame

    Raises ValueError when no course link could be collected.
    Prints one summary line with the listing name, link count and duration."""

    # Record execution time
    startTime = time.time()

    # Generate cover page links
    coverPageLink = generateCoverPageLink(url, stopPage)

    # Scrape the cover pages several times and pool every link found; the original
    # author's intent is to maximise the number of course links collected
    # (presumably the listing can differ between requests — TODO confirm).
    courseLink = []
    for _ in range(passes):
        with ProcessPoolExecutor(max_workers=maxWorkers) as ex:
            for pageLinks in ex.map(scrapeIndividualCourseLink, coverPageLink):
                courseLink.extend(pageLinks)

    if not courseLink:
        raise ValueError(f"No course links found under {url}")

    # Keep only the first link seen for each course id. The id is the 6th "/"-separated
    # part of the link; de-duplicating on it directly works for links with any number
    # of segments and leaves the links themselves untouched.
    allLinks = pd.Series(courseLink).astype("str")
    courseIds = allLinks.str.split("/").str.get(5)
    courseLink = allLinks[~courseIds.duplicated(keep="first")].reset_index(drop=True)

    # Scrapes course info from course link
    with ProcessPoolExecutor(max_workers=maxWorkers) as ex:
        courseInfo = pd.concat(list(ex.map(scrapeCourseInfo, courseLink))).reset_index(drop=True)

    # Cleans and engineers new features from the scraped dataframe and returns the final dataframe
    finalDf = cleanAndExtractFeature(courseInfo).reset_index(drop=True)

    # Measure execution time and return the final df
    endTime = time.time()
    durationInMins = round((endTime-startTime)/60, 2)
    print(f"{url.split('/')[4].capitalize()} ==> {len(courseLink)} Records ==> {durationInMins} Minutes")
    return finalDf

In [4]:
%%time
# Scrape the data: first 5 cover pages of the "discount" listing, then preview the first 10 cleaned rows
finalDf = main("https://www.reed.co.uk/courses/discount", 5)
finalDf.head(10)



Discount ==> 500 Records ==> 3.6 Minutes
CPU times: user 1.59 s, sys: 166 ms, total: 1.76 s
Wall time: 3min 36s


Unnamed: 0,date,courseId,courseTitle,courseLink,subtitle,courseProvider,offerPrice,originalPrice,unitSold,category,...,awardingBody,qualName,isRegulated,hasProfCert,soldOrEnq,savingsPercent,broadCategory1,broadCategory2,subCategory1,subCategory2
0,08_May_21,162300,Food Hygiene Course - Level 2,https://www.reed.co.uk/courses/food-hygiene-co...,"ROSPA Accredited, Institute of Hospitality End...",The Training Terminal,10,30.0,10527,"[[Food science, Food safety], [Health and safe...",...,Royal Society for the Prevention of Accidents,Level 2 Award,0,0,1,66,Food science,Hospitality & catering,Food safety,Food hygiene
1,08_May_21,274566,Food Hygiene and Safety Level 3,https://www.reed.co.uk/courses/food-hygiene-an...,**Bank Holiday Gift: Free PDF Certificate+Free...,Training Express Ltd,10,299.0,2537,"[[Health and safety, Food safety], [Hospitalit...",...,Institute of Hospitality,na,0,0,1,96,Health and safety,Hospitality & catering,Food safety,Food hygiene
2,08_May_21,266620,Leadership & Management,https://www.reed.co.uk/courses/leadership-mana...,Level 7 Advanced Diploma | 150 CPD Points |*FR...,CPD Courses,10,1050.0,9159,"[[HR, Leadership], [Management, Leadership & m...",...,The Quality Licence Scheme,na,0,0,1,99,HR,Management,Leadership,Leadership & management
3,08_May_21,227846,Microsoft Excel,https://www.reed.co.uk/courses/microsoft-excel...,Spring Sale! | FREE PDF Certificate | CPD Cert...,Janets,11,1049.0,1963,"[[Office skills, Microsoft Office, Microsoft E...",...,na,na,0,0,1,98,Office skills,Business,Microsoft Excel,Microsoft Excel
4,08_May_21,98276,Project Management,https://www.reed.co.uk/courses/project-managem...,Advanced Diploma QLS Level 7 |*FREE PDF Certif...,Oxford Home Study College,10,1050.0,5489,"[[IT, Project management], [Project management...",...,The Quality Licence Scheme,na,0,0,1,99,IT,Project management,Project management,PMI
5,08_May_21,228114,British Sign Language (BSL) Level 1 & 2,https://www.reed.co.uk/courses/british-sign-la...,Spring Mega Sale! | FREE PDF Certificate & Tut...,Janets,11,549.0,5618,"[[Language, Sign language], [Teaching and chil...",...,na,na,0,0,1,97,Language,Language,Sign language,BSL
6,08_May_21,51553,CACHE Level 3 Teaching Assistant Qualification...,https://www.reed.co.uk/courses/cache-level-3-t...,Accepted in ALL Schools |Government Regulated ...,The Learning College,269,499.0,9633,"[[Teaching and child care, Teaching, SEN teach...",...,"Council for Awards in Care, Health and Education",Level 3 Award in Supporting Teaching and Learn...,1,0,0,46,Teaching and child care,Teaching and child care,SEN teaching assistant,Teaching assistant
7,08_May_21,227847,Microsoft Office,https://www.reed.co.uk/courses/microsoft-offic...,"Spring Mega Sale | FREE PDF Certificate, Minut...",Janets,11,1599.0,2099,"[[Admin, secretarial & PA, Microsoft Office], ...",...,na,na,0,0,1,99,"Admin, secretarial & PA",Office skills,Microsoft Office,Microsoft Office
8,08_May_21,264321,First Aid at Work,https://www.reed.co.uk/courses/first-aid-at-wo...,***Bank Holiday Gifts: Free PDF Certificate + ...,Training Express Ltd,10,55.0,3134,"[[Health and safety, First aid], [Health and s...",...,na,na,0,0,1,81,Health and safety,Security,First aid,First aid at work
9,08_May_21,227973,Sage 50 Payroll,https://www.reed.co.uk/courses/sage-50-payroll...,Spring Mega Sale! | FREE PDF Certificate & Tut...,Janets,11,579.0,3423,"[[Accounting and finance, Payroll, Sage Payrol...",...,na,na,0,0,1,98,Accounting and finance,Finance,Sage Payroll,Sage Payroll


In [5]:
# Dimension of the scraped data as (rows, columns)
finalDf.shape

(500, 22)

In [6]:
# Save the first 500 rows as a date-stamped csv; index=None is intended to leave the index column out
# NOTE(review): the target directory is an absolute, machine-specific path and 500 is a magic number —
# consider moving both to configurable constants
finalDf.iloc[0:500].to_csv(f"/home/faysal/Desktop/masterData/top500/{today}_top_500_courses.csv", index=None)

In [7]:
# Check for "na": list the courses whose provider could not be scraped (placeholder value "na")
finalDf[finalDf.courseProvider=="na"]

Unnamed: 0,date,courseId,courseTitle,courseLink,subtitle,courseProvider,offerPrice,originalPrice,unitSold,category,...,awardingBody,qualName,isRegulated,hasProfCert,soldOrEnq,savingsPercent,broadCategory1,broadCategory2,subCategory1,subCategory2
