In [1]:
# Import basic and advanced modules
import pandas as pd
import numpy as np
import re
import time
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import aiohttp
import asyncio
import nest_asyncio
from IPython.core.display import clear_output

In [2]:
# Day and abbreviated month name (e.g. "05_Mar"), used to date-stamp the exported csv file name
today = f"{datetime.today():%d_%b}"

In [3]:
# Patch asyncio so that asyncio.run() may be called while an event loop is already running.
# Without this line the scraper below fails with
# "RuntimeError: asyncio.run() cannot be called from a running event loop".
nest_asyncio.apply()
# Placeholder recorded for every field that cannot be located on a course page
_MISSING = "missing"
# Listing pages carry relative course links; they are made absolute against this host
_REED_BASE_URL = "https://www.reed.co.uk"
# Class attribute of the <h2> headings that wrap each course link on a listing page
_COURSE_HEADING_CLASS = "mt-4 mt-sm-1 mr-5 mb-0"
# Raw fields collected for every course (one dict per course, always with all keys)
_PRIMARY_COLUMNS = [
    "courseTitle", "courseLink", "subTitle", "courseProvider", "offerPrice",
    "originalPrice", "savings", "unitSold", "soldOrEnq", "haveCpd",
    "cpdAccreditedBy", "haveProfQual", "isRegulated",
]


def _tagText(getTag, strip=False):
    """Return the text of the tag located by getTag(), or "missing" when there is no such tag."""
    try:
        text = getTag().text
    except (AttributeError, IndexError):
        # find() returned None, or an indexed find_all() result was too short
        return _MISSING
    return text.strip() if strip else text


def _accreditedBy(text):
    """Return what follows the last standalone word "by" in text (the whole text if there is none)."""
    # Split on the word "by" only: a plain split("by") also cuts inside words,
    # turning "Accredited by Derby College" into "College".
    return re.split(r"\bby\b", text.strip())[-1].strip()


def _parseCoursePage(soupInfo):
    """Extract every scraped field of one parsed course page into a dict (courseLink excluded)."""

    def hasBadge(kind):
        # "yes" when the page shows the badge of the given kind, otherwise "no"
        badgeTag = soupInfo.find("div", class_=f"badge badge-dark badge-{kind} mt-2")
        return "yes" if badgeTag is not None else "no"

    # Who provides cpd accreditation? Taken from the last <div> nested in the first "col" div.
    try:
        cpdAccreditedBy = _accreditedBy(soupInfo.find("div", class_="col").find_all("div")[-1].text)
    except (AttributeError, IndexError):
        cpdAccreditedBy = _MISSING

    # NOTE(review): the awarding body name / qualification name were scraped by the previous
    # version but never exported (their columns were commented out), so they are not collected.
    return {
        "courseTitle": _tagText(lambda: soupInfo.find("h1")),
        "subTitle": _tagText(lambda: soupInfo.find("h2")),
        "courseProvider": _tagText(
            lambda: soupInfo.find("section", class_="sidebar-actions").find("a", class_="provider-link")
        ),
        "offerPrice": _tagText(lambda: soupInfo.find("span", class_="current-price")),
        "originalPrice": _tagText(lambda: soupInfo.find("small", class_="vat-status")),
        "savings": _tagText(lambda: soupInfo.find("span", class_="icon-savings-tag price-saving")),
        # The unit sale figure is read from the second <strong> tag of the page
        "unitSold": _tagText(lambda: soupInfo.find_all("strong")[1]),
        # Whether the course is sold or enquired: last "summary-content" div
        "soldOrEnq": _tagText(lambda: soupInfo.find_all("div", class_="summary-content")[-1], strip=True),
        "haveCpd": hasBadge("cpd"),
        "cpdAccreditedBy": cpdAccreditedBy,
        "haveProfQual": hasBadge("professional"),
        "isRegulated": hasBadge("regulated"),
    }


async def _fetchText(session, pageUrl):
    """Request pageUrl through the given aiohttp session and return the response body as text."""
    async with session.get(pageUrl) as response:
        return await response.text()


async def _collectCourseLinks(session, url, stopPage):
    """Return the absolute links of all courses listed on pages 1..stopPage (most popular first)."""
    courseLink = []
    for page in range(1, stopPage + 1):
        htmlCourseLink = await _fetchText(session, url + f"?pageno={page}&sortby=MostPopular&pagesize=100")
        soupCourseLink = BeautifulSoup(htmlCourseLink, "html.parser")
        for heading in soupCourseLink.find_all("h2", class_=_COURSE_HEADING_CLASS):
            anchor = heading.find("a")
            href = anchor.get("href") if anchor is not None else None
            # A heading without a usable link is skipped instead of aborting the whole scrape
            if href:
                courseLink.append(_REED_BASE_URL + href)
    return courseLink


async def _scrapeCourses(session, courseLink):
    """Request every course link in order and return one dict of raw fields per course."""
    records = []
    total = len(courseLink)
    for req, lnk in enumerate(courseLink, start=1):
        htmlInfo = await _fetchText(session, lnk)
        print(f"Most Popular Discount Courses => Requests Completed: {req} out of {total}")
        # Clear all the outputs except the current one in notebook console
        clear_output(wait=True)
        record = _parseCoursePage(BeautifulSoup(htmlInfo, "html.parser"))
        record["courseLink"] = lnk
        records.append(record)
    return records


def _buildFinalFrame(primaryDf):
    """Clean the raw scraped frame and return the frame that is exported (primaryDf is not mutated)."""

    def toNumber(series):
        # Anything that is not a number (e.g. "", "missing", "FREE") becomes nan and then 0
        return pd.to_numeric(series, errors="coerce").fillna(0)

    # The course id is the sixth "/"-separated part of the link, without its trailing "#"
    courseId = primaryDf["courseLink"].str.split("/").str.get(5).str.replace("#", "", regex=False)
    # Original price: keep digits and dots only
    originalPrice = toNumber(primaryDf["originalPrice"].str.replace(r"[^0-9.]", "", regex=True))
    # Offer price: drop the pound sign and thousands separators
    offerPrice = toNumber(
        primaryDf["offerPrice"].str.replace("£", "", regex=False).str.replace(",", "", regex=False)
    )
    # Units sold: drop thousands separators and cast into integer
    unitSold = toNumber(primaryDf["unitSold"].str.replace(",", "", regex=False)).astype(int)

    # NOTE(review): "savings" is scraped but was never part of the exported file; it is left
    # out here as well so that the csv schema stays unchanged.
    return pd.DataFrame({
        "courseId": courseId,
        "courseTitle": primaryDf["courseTitle"],
        "subTitle": primaryDf["subTitle"],
        "courseLink": primaryDf["courseLink"],
        "courseProvider": primaryDf["courseProvider"],
        "unitSold": unitSold,
        "soldOrEnq": primaryDf["soldOrEnq"],
        "offerPrice": offerPrice,
        "originalPrice": originalPrice,
        "haveCpd": primaryDf["haveCpd"],
        "cpdAccreditedBy": primaryDf["cpdAccreditedBy"],
        "haveProfQual": primaryDf["haveProfQual"],
        "isRegulated": primaryDf["isRegulated"],
    })


def topDiscountCourses(url, stopPage):
    """Scrape the most popular discount courses and write them to a dated csv file.

    Listing pages are requested with a page size of 100, so stop page 1 corresponds
    to 100 courses, stop page 5 to the top 500 courses and stop page 30 to 3000 courses.
    Every course found is then requested individually, one after another, and its
    details are extracted. A detail that cannot be found is recorded as "missing"
    (or as 0 once a numeric column has been cleaned).

    Parameters
    ----------
    url : str
        Address of the listing page, without query string.
    stopPage : int
        Number of listing pages to walk through, starting at page 1.

    Returns
    -------
    None
        The result is written to "<today>_top_<number of courses>_courses.csv" in the
        current working directory, where `today` is the module-level date stamp.
    """
    # To calculate the execution time
    startTime = time.time()

    async def scrape():
        # One session serves both the listing pages and the individual course pages
        async with aiohttp.ClientSession() as session:
            courseLink = await _collectCourseLinks(session, url, stopPage)
            return await _scrapeCourses(session, courseLink)

    records = asyncio.run(scrape())

    # All the variables are extracted: build the raw frame, then the cleaned frame to export
    primaryDf = pd.DataFrame(records, columns=_PRIMARY_COLUMNS)
    finalDf = _buildFinalFrame(primaryDf)

    # Calculate the program execution time in mins
    durationInMins = round((time.time() - startTime) / 60, 2)
    print(f"Most Popular {finalDf.shape[0]} Discount Courses => Time Required to Scrape {finalDf.shape[0]} Records => {durationInMins} Minutes")
    # Write the file as csv with today's date
    return finalDf.to_csv(f"{today}_top_{finalDf.shape[0]}_courses.csv", index=False)

In [4]:
# Scrape the top 500 discount courses: 5 listing pages of 100 courses each
topDiscountCourses(
    url="https://www.reed.co.uk/courses/discount",
    stopPage=5,
)

Most Popular 500 Discount Courses => Time Required to Scrape 500 Records => 4.01 Minutes
