In [1]:
# Import basic and advanced modules
import pandas as pd
import numpy as np
import re
import time
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import aiohttp
import asyncio
import nest_asyncio
from IPython.core.display import clear_output

In [2]:
# Today's date as "<day>_<abbreviated month>" (e.g. "05_Mar"); needed to save the data as CSV with today's date in the name
today = datetime.today().strftime('%d_%b')

In [3]:
# nest_asyncio.apply() must run before any asyncio.run() call in this notebook; without it
# asyncio.run() throws "RuntimeError: asyncio.run() cannot be called from a running event loop"
nest_asyncio.apply()
# Number of results requested per listing page; also used to derive how many pages to walk.
_PAGE_SIZE = 100
# Course hrefs on the listing pages are prefixed with the site root to form absolute links.
_SITE_ROOT = 'https://www.reed.co.uk'
# Column order of the scraped frame (the derived `id` column is appended afterwards).
_COURSE_COLUMNS = ['title', 'link', 'provider', 'subtitle', 'price', 'original_price', 'sold',
                   'saving', 'sold_or_enq', 'cpd', 'prof', 'regu', 'awr_body_qua_name',
                   'awr_body_name']


def _text_or_missing(tag, strip=False):
    """Return the tag's text (optionally stripped), or 'missing' when the tag was not found."""
    if tag is None:
        return 'missing'
    return tag.text.strip() if strip else tag.text


def _item_or_none(items, index):
    """Return items[index], or None when the index is out of range."""
    try:
        return items[index]
    except IndexError:
        return None


def _yes_no(tag):
    """Map the presence of a tag to the 'yes'/'no' flags stored in the frame."""
    return 'yes' if tag is not None else 'no'


def _listing_query(keyword, page=None):
    """Build the query parameters of a listing request; the HTTP client URL-encodes the values."""
    query = {'keywords': keyword, 'sortby': 'MostPopular', 'pagesize': str(_PAGE_SIZE)}
    if page is not None:
        query['pageno'] = str(page)
    return query


def _parse_course_page(soup, link):
    """Extract one course record (a dict keyed by the output column names) from a parsed course page."""
    # The provider link is looked up inside the sidebar section; either element may be absent.
    sidebar = soup.find('section', class_='sidebar-actions')
    provider_tag = sidebar.find('a', class_='provider-link') if sidebar is not None else None

    # The first two <div>s inside the first 'col' div are read as qualification name and awarding body.
    award_col = soup.find('div', class_='col')
    award_divs = award_col.find_all('div') if award_col is not None else []

    return {
        'title': _text_or_missing(soup.find('h1')),
        'link': link,
        'provider': _text_or_missing(provider_tag),
        'subtitle': _text_or_missing(soup.find('h2')),
        'price': _text_or_missing(soup.find('span', class_='current-price')),
        'original_price': _text_or_missing(soup.find('small', class_='vat-status')),
        # Units sold are read from the second <strong> element on the page.
        'sold': _text_or_missing(_item_or_none(soup.find_all('strong'), 1)),
        'saving': _text_or_missing(soup.find('span', class_='icon-savings-tag price-saving')),
        # The last 'summary-content' div tells whether the course was sold or enquired about.
        'sold_or_enq': _text_or_missing(
            _item_or_none(soup.find_all('div', class_='summary-content'), -1), strip=True),
        'cpd': _yes_no(soup.find('div', class_='badge badge-dark badge-cpd mt-2')),
        'prof': _yes_no(soup.find('div', class_='badge badge-dark badge-professional mt-2')),
        'regu': _yes_no(soup.find('div', class_='badge badge-dark badge-regulated mt-2')),
        'awr_body_qua_name': _text_or_missing(_item_or_none(award_divs, 0), strip=True),
        'awr_body_name': _text_or_missing(_item_or_none(award_divs, 1), strip=True),
    }


async def _fetch_text(session, url, params=None):
    """GET `url` (with optional query parameters) and return the response body as text."""
    async with session.get(url, params=params) as response:
        return await response.text()


async def _collect_course_links(url, keyword, page_count):
    """Walk listing pages 1..page_count and return the absolute links of the listed courses."""
    links = []
    async with aiohttp.ClientSession() as session:
        # Pages are awaited one at a time, in order.
        for page in range(1, page_count + 1):
            html = await _fetch_text(session, url, _listing_query(keyword, page))
            soup = BeautifulSoup(html, 'html.parser')
            for heading in soup.find_all('h2', class_="mt-4 mt-sm-1 mr-5 mb-0"):
                anchor = heading.find('a')
                href = anchor.get('href') if anchor is not None else None
                if href:  # skip headings without a usable link instead of crashing
                    links.append(_SITE_ROOT + href)
    return links


async def _collect_course_records(links, label):
    """Fetch every course page in `links` and return one parsed record per link, printing progress."""
    records = []
    async with aiohttp.ClientSession() as session:
        for done, link in enumerate(links, start=1):
            html = await _fetch_text(session, link)
            print(f"{label} => Requests Completed: {done} out of {len(links)}")
            # wait=True defers the clear until the next output arrives, so only the
            # most recent progress line stays visible in the notebook.
            clear_output(wait=True)
            records.append(_parse_course_page(BeautifulSoup(html, 'html.parser'), link))
    return records


def scrapeByKeyword(url,keyword):
    """Scrape the courses that match `keyword` and return them as a cleaned DataFrame.

    The function reads the total number of matching courses from the first listing
    page, walks every listing page (100 results per page, sorted by most popular) to
    collect the course links, then fetches each course page and extracts its details.
    Fields that cannot be found on a page are stored as 'missing' (or 'no' for the
    cpd/prof/regu flags).

    Parameters
    ----------
    url : str
        Listing URL, e.g. 'https://www.reed.co.uk/courses/'. Query parameters are
        appended by the HTTP clients.
    keyword : str
        Search keyword. It is URL-encoded by the HTTP clients, so characters such as
        '&' are sent as part of the keyword.

    Returns
    -------
    pandas.DataFrame
        Columns: title, link, provider, subtitle, price (float), original_price,
        sold (int), saving, sold_or_enq, cpd, prof, regu, awr_body_qua_name,
        awr_body_name, id.

    Raises
    ------
    ValueError
        If the listing page has no result-count element, or its text is not a number.

    Notes
    -----
    Uses asyncio.run(); inside a notebook this needs nest_asyncio.apply() to have been
    called first, otherwise asyncio.run() raises RuntimeError.
    """
    startTime = time.time()

    # Get the number of listing pages to scrape from the total course count.
    res = requests.get(url, params=_listing_query(keyword), timeout=30)
    soup = BeautifulSoup(res.content,'html.parser')
    count_tag = soup.find('span',class_='h1')
    if count_tag is None:
        raise ValueError(
            f"Could not find the course count for keyword {keyword!r} "
            f"(HTTP status {res.status_code})")
    total_course = int(count_tag.text.strip().replace(',',''))  # remove thousands separators
    # int() replaces np.int, which was removed in NumPy 1.24; range() needs a plain integer.
    stop_page = int(np.ceil(total_course/_PAGE_SIZE))

    # Stage 1: collect course links; stage 2: fetch and parse every course page.
    link = asyncio.run(_collect_course_links(url, keyword, stop_page))
    records = asyncio.run(_collect_course_records(link, keyword.title()))

    # Create a df of the extracted records; `columns` fixes the order and keeps an
    # empty result well-formed.
    df = pd.DataFrame(records, columns=_COURSE_COLUMNS)
    duration = np.round((time.time()-startTime)/60,2)

    # Course id: sixth '/'-separated piece of the link, with any '#' removed.
    df['id'] = df.link.str.split('/').str[5].str.replace('#','',regex=False)

    # Clean original price by dropping the first 15 characters and the last one.
    # NOTE(review): assumes the 'vat-status' text wraps the amount in a fixed-length
    # prefix/suffix - confirm against the live markup. 'missing' becomes ''.
    df['original_price'] = df.original_price.str[15:-1]

    # Clean price: remove '£' and ',' BEFORE pd.to_numeric(), otherwise values with a
    # comma are coerced to NaN and end up as zero.
    df['price'] = pd.to_numeric(
        df.price.str.replace('£','',regex=False).str.replace(',','',regex=False),
        errors='coerce').fillna(0).astype(float)

    # Sold: same comma handling; non-numeric text becomes 0, then cast to int.
    df['sold'] = pd.to_numeric(
        df.sold.str.replace(',','',regex=False), errors='coerce').fillna(0).astype(int)

    # Drop duplicated ids as a precaution.
    # NOTE(review): keep=False removes EVERY row whose id occurs more than once (no copy
    # is kept); this matches the original behaviour - confirm it is intended.
    df = df.drop_duplicates(subset=['id'],keep=False)

    print(f'{keyword} Courses: Time required to scrape {len(df)} observation: {duration} minutes')
    return df

#### AAT
# Scrape the courses matching the keyword "aat"; the returned DataFrame is not assigned to a variable
scrapeByKeyword('https://www.reed.co.uk/courses/','aat')

In [4]:
# AET: scrape the courses matching the keyword "aet" and preview the first rows
aetCourses = scrapeByKeyword('https://www.reed.co.uk/courses/','aet')
aetCourses.head()

aet Courses: Time required to scrape 41 observation: 0.45 minutes


Unnamed: 0,title,link,provider,subtitle,price,original_price,sold,saving,sold_or_enq,cpd,prof,regu,awr_body_qua_name,awr_body_name,id
0,Level 3 Award in Education and Training - AET ...,https://www.reed.co.uk/courses/level-3-award-i...,One Education,Awarded by TQUK I Ofqual Regulated I Free PDF ...,10.0,399,660,Save 97%,660 students purchased this course,no,yes,yes,Level 3 Award in Education and Training (QCF),Awarded by\nTraining Qualifications UK Ltd,246722
1,Award in Education and Training (A.E.T.) Level...,https://www.reed.co.uk/courses/award-in-educat...,Study365,Pearson BTEC Level 3 Award in Education and Tr...,29.0,399,724,Save 92%,724 students purchased this course,no,yes,yes,Level 3 Award in Education and Training (QCF),Awarded by\nEdexcel,176029
2,Level 3 Award in Education and Training - BTEC...,https://www.reed.co.uk/courses/level-3-award-i...,1 Training,Pearson | BTEC | Ofqual Regulated |Tutor Suppo...,10.0,299,742,Save 96%,742 students purchased this course,no,yes,yes,Level 3 Award in Education and Training (QCF),Awarded by\nPearson Education Ltd,131479
3,Level 3 Award in Education and Training - AET ...,https://www.reed.co.uk/courses/level-3-award-i...,1 Training,Awarded by NCFE CACHE | Ofqual Regulated | Ful...,159.0,169,94,Save 5%,94 students enquired about this course,no,yes,yes,Level 3 Award in Education and Training,Awarded by\nPearson,248534
4,Level 3 Award in Education and Training,https://www.reed.co.uk/courses/level-3-award-i...,Future Step Education,Online based with full tutor support and live ...,140.0,280,0,Save 50%,Tutor is available to students,no,yes,no,Level 3 Award in Education and Training (RQF),Awarded by\nActive IQ,268901


In [5]:
# Scrape the courses matching the keyword "zeology"
# NOTE(review): "zeology" may be a misspelling of "zoology" - confirm the intended keyword
zeologyCourses = scrapeByKeyword('https://www.reed.co.uk/courses','zeology')

zeology Courses: Time required to scrape 23 observation: 0.33 minutes


In [6]:
# Scrape the courses matching the keyword "ecology"
ecologyCourses = scrapeByKeyword('https://www.reed.co.uk/courses','ecology')

ecology Courses: Time required to scrape 57 observation: 0.54 minutes


# One scrape per awarding-body / provider keyword. The returned DataFrames are not
# assigned to variables here.

#### CMI
scrapeByKeyword('https://www.reed.co.uk/courses/','cmi')

#### CACHE
scrapeByKeyword('https://www.reed.co.uk/courses/','cache')

#### TQUK
scrapeByKeyword('https://www.reed.co.uk/courses/','tquk')

#### CIPD
scrapeByKeyword('https://www.reed.co.uk/courses/','cipd')

#### City and guilds 
scrapeByKeyword('https://www.reed.co.uk/courses/','city&guilds')

#### IIRSM
scrapeByKeyword('https://www.reed.co.uk/courses/','iirsm')

# The three calls below previously used `scrape`, a name that is never defined in this
# notebook (NameError); the scraper defined here is `scrapeByKeyword`.

#### CIMA
scrapeByKeyword('https://www.reed.co.uk/courses/','cima')

#### CompTIA
scrapeByKeyword('https://www.reed.co.uk/courses/','comptia')

#### ABC Awards
scrapeByKeyword('https://www.reed.co.uk/courses/','abc')