In [1]:
# Import basic and advanced modules
import pandas as pd
import numpy as np
import re
import time
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import aiohttp
import asyncio
import nest_asyncio
from IPython.core.display import clear_output

In [2]:
# Date stamp in day_month-abbreviation form; it prefixes the CSV file name written at the end
today = f"{datetime.today():%d_%b}"

In [3]:
# Apply nest_asyncio before any asyncio.run() call in this notebook; without it the call
# fails with "RuntimeError: asyncio.run() cannot be called from a running event loop"
nest_asyncio.apply()
# Placeholder stored whenever a field cannot be located on a course page
_MISSING = "missing"
# Number of courses requested per provider listing page
_PAGE_SIZE = 100
# Listing hrefs are prefixed with the site root to obtain absolute course URLs
_SITE_ROOT = "https://www.reed.co.uk"
# Seconds to wait for the initial blocking request to the provider page
_REQUEST_TIMEOUT = 30
# Used to tell a CPD accreditation line apart from any other certification line
_CPD_PATTERN = re.compile("CPD")
# Raw (uncleaned) fields collected for every course, one row per course page
_RAW_COLUMNS = [
    "courseTitle",
    "courseLink",
    "subTitle",
    "courseProvider",
    "offerPrice",
    "originalPrice",
    "unitSold",
    "soldOrEnq",
    "haveCpd",
    "othersAsCpd",
    "cpdAccreditedBy",
    "haveProfQual",
    "isRegulated",
    "awrBodyQualName",
    "awrBodyName",
]


def _textOrMissing(locateTag, strip=False):
    """Return the text of the tag produced by ``locateTag()``, or "missing" if it cannot be located."""
    try:
        text = locateTag().text
    except (AttributeError, IndexError):
        # AttributeError: a find() returned None; IndexError: fewer matches than expected
        return _MISSING
    return text.strip() if strip else text


def _yesNo(tag):
    """Map the result of a ``find()`` call to "yes" (tag found) or "no" (None)."""
    return "yes" if tag is not None else "no"


def _accreditorFrom(tag):
    """Return the stripped text that follows the last "by" in the tag's text."""
    return tag.text.strip().split("by")[-1].strip()


def _parseCoursePage(soupInfo):
    """Extract the raw fields of a single course page.

    The returned dict always carries the same keys, with "missing" (or "no" for
    the badge flags) standing in for anything that is absent from the page, so
    every course contributes exactly one complete row.
    """
    # The last <div> inside the first "col" block holds the accreditation line, if any
    try:
        lastColDiv = soupInfo.find("div", class_="col").find_all("div")[-1]
    except (AttributeError, IndexError):
        lastColDiv = None

    cpdTag = soupInfo.find("div", class_="badge badge-dark badge-cpd mt-2")

    # When the CPD badge is shown, the accreditation line is recorded whatever its wording
    if cpdTag is not None and lastColDiv is not None:
        othersAsCpd = _accreditorFrom(lastColDiv)
    else:
        othersAsCpd = _MISSING

    # The accreditor only counts as a CPD accreditor when the line itself mentions "CPD"
    if lastColDiv is not None and lastColDiv.find_all(string=_CPD_PATTERN, limit=1):
        cpdAccreditedBy = _accreditorFrom(lastColDiv)
    else:
        cpdAccreditedBy = _MISSING

    return {
        "courseTitle": _textOrMissing(lambda: soupInfo.find("h1")),
        "subTitle": _textOrMissing(lambda: soupInfo.find("h2")),
        "courseProvider": _textOrMissing(
            lambda: soupInfo.find("section", class_="sidebar-actions").find("a", class_="provider-link")
        ),
        "offerPrice": _textOrMissing(lambda: soupInfo.find("span", class_="current-price")),
        "originalPrice": _textOrMissing(lambda: soupInfo.find("small", class_="vat-status")),
        "unitSold": _textOrMissing(lambda: soupInfo.find_all("strong")[1]),
        "soldOrEnq": _textOrMissing(
            lambda: soupInfo.find_all("div", class_="summary-content")[-1], strip=True
        ),
        "haveCpd": _yesNo(cpdTag),
        "othersAsCpd": othersAsCpd,
        "cpdAccreditedBy": cpdAccreditedBy,
        "haveProfQual": _yesNo(soupInfo.find("div", class_="badge badge-dark badge-professional mt-2")),
        "isRegulated": _yesNo(soupInfo.find("div", class_="badge badge-dark badge-regulated mt-2")),
        "awrBodyQualName": _textOrMissing(
            lambda: soupInfo.find("div", class_="col").find_all("div")[0], strip=True
        ),
        "awrBodyName": _textOrMissing(
            lambda: soupInfo.find("div", class_="col").find_all("div")[1], strip=True
        ),
    }


async def _fetchText(session, url):
    """GET ``url`` through the given aiohttp session and return the response body as text."""
    async with session.get(url) as response:
        return await response.text()


async def _collectCourseRecords(providerUrl, pageCount, providerName):
    """Gather the course links of a provider, then parse every course page.

    Pages are requested one after another through a single session.
    Returns a list with one raw-field dict (including "courseLink") per course.
    """
    async with aiohttp.ClientSession() as session:
        # Step 1: walk the listing pages (most popular first) and collect absolute course links
        courseLinks = []
        for page in range(1, pageCount + 1):
            listingHtml = await _fetchText(
                session,
                providerUrl + f"?pageno={page}&sortby=MostPopular&pagesize={_PAGE_SIZE}",
            )
            listingSoup = BeautifulSoup(listingHtml, "html.parser")
            for heading in listingSoup.find_all("h2", class_="mt-4 mt-sm-1 mr-5 mb-0"):
                anchor = heading.find("a")
                href = anchor.get("href") if anchor is not None else None
                # Headings without a usable link are skipped instead of aborting the run
                if href:
                    courseLinks.append(_SITE_ROOT + href)

        # Step 2: request each course page and extract its fields
        records = []
        for done, link in enumerate(courseLinks, start=1):
            pageHtml = await _fetchText(session, link)
            print(f"{providerName} => Requests Completed: {done} out of {len(courseLinks)}")
            # wait=True postpones the clear until the next output, so only the latest progress line shows
            clear_output(wait=True)
            record = _parseCoursePage(BeautifulSoup(pageHtml, "html.parser"))
            record["courseLink"] = link
            records.append(record)
        return records


def scrapeByProvider(providerUrl):
    """Scrape every course of one provider and return the cleaned records.

    Parameters
    ----------
    providerUrl : str
        Address of the provider listing page, e.g.
        "https://www.reed.co.uk/courses/one-education/p1812". The fifth
        "/"-separated piece of the URL is used as the provider label in
        progress messages.

    Returns
    -------
    pandas.DataFrame
        One row per course with the columns courseId, courseTitle, subTitle,
        courseLink, courseProvider, unitSold, soldOrEnq, offerPrice,
        originalPrice, haveCpd, cpdAccreditedBy, othersAsCpd, haveProfQual,
        isRegulated, awrBodyName, awrBodyQualName and courseLevel.
        Text fields that could not be found hold "missing"; numeric fields that
        could not be parsed hold 0.

    Raises
    ------
    requests.HTTPError
        If the provider page answers with an error status.
    ValueError
        If the total course count cannot be found or parsed on the provider page.
    """
    startTime = time.time()

    # The provider page shows the total number of courses, which fixes how many listing pages exist
    responseStopPage = requests.get(providerUrl, timeout=_REQUEST_TIMEOUT)
    responseStopPage.raise_for_status()
    countTag = BeautifulSoup(responseStopPage.content, "html.parser").find("span", class_="h1")
    if countTag is None:
        raise ValueError(f"Could not find the total course count on {providerUrl}")
    totalCourses = int(countTag.text.strip().replace(",", ""))  # drop thousands separators
    # Built-in int() replaces np.int, which no longer exists in current NumPy releases
    pageCount = int(np.ceil(totalCourses / _PAGE_SIZE))

    # Provider label taken from the URL, shown while requests are in progress
    providerName = providerUrl.split("/")[4].title()

    records = asyncio.run(_collectCourseRecords(providerUrl, pageCount, providerName))

    # Raw scraped values; explicit columns keep the frame well-formed even with zero courses
    rawDf = pd.DataFrame(records, columns=_RAW_COLUMNS)

    # --- Cleaning and feature engineering (rawDf itself is left untouched) ---
    # Course id is the sixth "/"-separated piece of the link, minus any "#"
    courseId = rawDf.courseLink.str.split("/").str.get(5).str.replace("#", "", regex=False)

    # Original price: keep digits and dots only; anything unparsable becomes 0
    originalPriceDigits = rawDf.originalPrice.apply(lambda text: "".join(re.findall(r"[0-9\.]", text)))
    originalPrice = pd.to_numeric(originalPriceDigits, errors="coerce").fillna(0)

    # Offer price: drop the pound sign and commas; non-numeric text (e.g. FREE) becomes 0
    offerPriceText = rawDf.offerPrice.str.replace("£", "", regex=False).str.replace(",", "", regex=False)
    offerPrice = pd.to_numeric(offerPriceText, errors="coerce").fillna(0)

    # Awarding body name sits on the second line of the scraped text
    awrBodyName = rawDf.awrBodyName.str.split("\n").str.get(1).fillna(_MISSING)

    # Units sold: drop commas; non-numeric text becomes 0
    unitSoldText = rawDf.unitSold.str.replace(",", "", regex=False)
    unitSold = pd.to_numeric(unitSoldText, errors="coerce").fillna(0).astype(int)

    # Course level is the second word of the qualification name when that word is numeric
    courseLevel = pd.to_numeric(
        rawDf.awrBodyQualName.str.split(" ").str.get(1), errors="coerce"
    ).fillna(_MISSING)

    finalDf = pd.DataFrame({
        "courseId": courseId,
        "courseTitle": rawDf.courseTitle,
        "subTitle": rawDf.subTitle,
        "courseLink": rawDf.courseLink,
        "courseProvider": rawDf.courseProvider,
        "unitSold": unitSold,
        "soldOrEnq": rawDf.soldOrEnq,
        "offerPrice": offerPrice,
        "originalPrice": originalPrice,
        "haveCpd": rawDf.haveCpd,
        "cpdAccreditedBy": rawDf.cpdAccreditedBy,
        "othersAsCpd": rawDf.othersAsCpd,
        "haveProfQual": rawDf.haveProfQual,
        "isRegulated": rawDf.isRegulated,
        "awrBodyName": awrBodyName,
        "awrBodyQualName": rawDf.awrBodyQualName,
        "courseLevel": courseLevel,
    })

    # Report the elapsed time in minutes
    durationInMins = np.round((time.time() - startTime) / 60, 2)
    # Fall back to the URL-derived label when the provider has no courses at all
    providerLabel = finalDf.courseProvider.iloc[0] if len(finalDf) else providerName
    print(f"{providerLabel} => Time Required to Scrape {finalDf.shape[0]} Records => {durationInMins} Minutes")
    return finalDf

In [4]:
# Collect all courses listed on the One Education provider page
oneEducation = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/one-education/p1812"
)

One Education => Time Required to Scrape 429 Records => 5.73 Minutes


In [5]:
# Collect all courses listed on the Course Gate provider page
courseGate = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/course-gate/p1834"
)

Course Gate => Time Required to Scrape 365 Records => 4.08 Minutes


In [6]:
# Collect all courses listed on the Janets provider page
janets = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/janets/p1778"
)

Janets => Time Required to Scrape 432 Records => 4.69 Minutes


In [7]:
# Collect all courses listed on the Euston College provider page
eustonCollege = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/euston-college/p2128"
)

Euston College => Time Required to Scrape 76 Records => 1.33 Minutes


In [8]:
# Collect all courses listed on the Training Express Ltd provider page
trainingExpress = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/training-express-ltd/p2079"
)

Training Express Ltd => Time Required to Scrape 35 Records => 0.57 Minutes


In [9]:
# Collect all courses listed on the Be-a.co.uk provider page
beaco = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/be-acouk/p545"
)

Be-a.co.uk => Time Required to Scrape 338 Records => 3.6 Minutes


In [10]:
# Collect all courses listed on the CPD Courses provider page
cpdCourses = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/cpd-courses/p1534"
)

CPD Courses => Time Required to Scrape 756 Records => 8.27 Minutes


In [11]:
# Collect all courses listed on the Brentwood Open Learning College provider page
brentwood = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/brentwood-open-learning-college/p438"
)

Brentwood Open learning College => Time Required to Scrape 226 Records => 2.64 Minutes


In [12]:
# Collect all courses listed on the Oplex Careers provider page
oplexCareers = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/oplex-careers/p630"
)

Oplex Careers => Time Required to Scrape 384 Records => 5.6 Minutes


In [13]:
# Collect all courses listed on the Oxford Home Study College provider page
oxford = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/oxford-home-study-college/p1245"
)

Oxford Home Study College => Time Required to Scrape 193 Records => 2.22 Minutes


In [14]:
# Collect all courses listed on The Training Terminal provider page
trainingTerminal = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/the-training-terminal/p1064"
)

The Training Terminal => Time Required to Scrape 88 Records => 1.11 Minutes


In [15]:
# Collect all courses listed on the Excel with Business provider page
excelWithBusiness = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/excel-with-business/p930"
)

Excel with Business => Time Required to Scrape 60 Records => 0.91 Minutes


In [16]:
# Collect all courses listed on the OfCourse provider page
ofCourse = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/ofcourse/p675"
)

OfCourse => Time Required to Scrape 647 Records => 7.44 Minutes


In [17]:
# Collect all courses listed on the Trendimi provider page
trendimi = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/trendimi/p964"
)

Trendimi => Time Required to Scrape 40 Records => 0.59 Minutes


In [18]:
# Collect all courses listed on the Centre of Excellence provider page
centreOfExcellence = scrapeByProvider(
    providerUrl="https://www.reed.co.uk/courses/centre-of-excellence-online/p652"
)

Centre of Excellence => Time Required to Scrape 348 Records => 3.81 Minutes


In [19]:
# Per-provider frames, in the order they were scraped above
providerFrames = [
    oneEducation, courseGate, janets, eustonCollege, trainingExpress,
    beaco, cpdCourses, brentwood, oplexCareers, oxford,
    trainingTerminal, excelWithBusiness, ofCourse, trendimi, centreOfExcellence,
]

# Stack them into a single frame and renumber the rows from zero
mergedDf = pd.concat(providerFrames).reset_index(drop=True)

# Preview the first five rows of the combined data
mergedDf.head()

Unnamed: 0,courseId,courseTitle,subTitle,courseLink,courseProvider,unitSold,soldOrEnq,offerPrice,originalPrice,haveCpd,cpdAccreditedBy,othersAsCpd,haveProfQual,isRegulated,awrBodyName,awrBodyQualName,courseLevel
0,234929,Car Mechanic Training,Level 3 Endorsed Diploma by TQUK | 15 CPD Poin...,https://www.reed.co.uk/courses/car-mechanic-tr...,One Education,1461,"1,461 students purchased this course",14.0,199.0,yes,missing,Training Qualifications UK Ltd,yes,no,Training Qualifications UK Ltd,Level 3 Diploma in Car Mechanic (Endorsed Cert...,3
1,233863,120 hours TEFL (TESOL) Masterclass,Level 3 Endorsed Diploma by TQUK | 40 CPD Poin...,https://www.reed.co.uk/courses/120-hours-tefl-...,One Education,668,668 students purchased this course,14.0,199.0,yes,missing,Training Qualifications UK Ltd,yes,no,Training Qualifications UK Ltd,Level 3 Diploma in TEFL/TESOL Masterclass (End...,3
2,246722,Level 3 Award in Education and Training - AET ...,Awarded by TQUK I Ofqual Regulated I Free PDF ...,https://www.reed.co.uk/courses/level-3-award-i...,One Education,661,661 students purchased this course,10.0,399.0,no,missing,missing,yes,yes,Training Qualifications UK Ltd,Level 3 Award in Education and Training (QCF),3
3,237280,Microsoft Excel Ultimate 5 Courses Bundle with...,Level 5 Endorsed Diploma by TQUK | 40 CPD Poin...,https://www.reed.co.uk/courses/microsoft-excel...,One Education,1425,"1,425 students purchased this course",16.0,199.0,yes,missing,Training Qualifications UK Ltd,yes,no,Training Qualifications UK Ltd,Level 5 Diploma in Microsoft Excel (Endorsed C...,5
4,233696,UK Employment Law,Level 5 Endorsed Diploma by TQUK | 20 CPD Poin...,https://www.reed.co.uk/courses/uk-employment-l...,One Education,1246,"1,246 students purchased this course",14.0,199.0,yes,missing,Training Qualifications UK Ltd,yes,no,Training Qualifications UK Ltd,Level 5 Diploma in UK Employment Law (Endorsed...,5


In [20]:
# Write the combined frame to a CSV file whose name starts with today's date stamp
outputCsvName = f"{today}_15_providers_cpdUpdated.csv"
mergedDf.to_csv(outputCsvName, index=None)