## This script scrapes course data for 15 providers from Reed; daily revenue and market share will be calculated from this data.

In [1]:
# Import basic and advanced modules
from IPython.core.display import clear_output
import pandas as pd
import numpy as np
import aiohttp
import asyncio
import time
from bs4 import BeautifulSoup
import requests
import nest_asyncio
from datetime import datetime

In [2]:
## Any providers
# scrape() below calls asyncio.run(); nest_asyncio patches asyncio so that call
# is allowed while the notebook's own event loop is already running.
nest_asyncio.apply()

# Placeholder stored whenever an expected tag is absent from a course page.
_MISSING = 'missing'
# Courses requested per listing page; also used to derive the number of pages.
_PAGE_SIZE = 100
# Prefix prepended to the relative course links found on the listing pages.
_SITE_ROOT = 'https://www.reed.co.uk'
# Column order of the raw (pre-cleaning) DataFrame.
_COLUMNS = ['title', 'link', 'provider', 'subtitle', 'price',
            'original_price', 'sold', 'saving', 'sold_or_enq']


def _tag_text(tag, strip=False):
    """Return the text of a BeautifulSoup tag, or 'missing' when the tag is None."""
    if tag is None:
        return _MISSING
    return tag.text.strip() if strip else tag.text


def _item_or_none(items, index):
    """Return items[index], or None when that position does not exist."""
    try:
        return items[index]
    except IndexError:
        return None


def _parse_course_page(soup):
    """Extract the raw text fields of one parsed course page into a dict."""
    # The provider link is looked up inside the sidebar section; guard against
    # the section itself being absent before searching within it.
    sidebar = soup.find('section', class_='sidebar-actions')
    provider_tag = None
    if sidebar is not None:
        provider_tag = sidebar.find('a', class_='provider-link')

    # Units sold are read from the second <strong> element and the
    # sold/enquired flag from the last "summary-content" div; either may be
    # absent, in which case the placeholder is recorded instead of raising.
    sold_tag = _item_or_none(soup.find_all('strong'), 1)
    sold_or_enq_tag = _item_or_none(
        soup.find_all('div', class_='summary-content'), -1)

    return {
        'title': _tag_text(soup.find('h1')),
        'provider': _tag_text(provider_tag),
        'subtitle': _tag_text(soup.find('h2')),
        'price': _tag_text(soup.find('span', class_='current-price')),
        'original_price': _tag_text(soup.find('small', class_='vat-status')),
        'sold': _tag_text(sold_tag),
        'saving': _tag_text(
            soup.find('span', class_='icon-savings-tag price-saving')),
        'sold_or_enq': _tag_text(sold_or_enq_tag, strip=True),
    }


def _clean_courses(df):
    """Add the course id, normalise price/sold to numbers and drop duplicated ids."""
    # The id is the sixth '/'-separated piece of the course link, without '#'.
    df['id'] = df['link'].str.split('/').str[5].str.replace('#', '', regex=False)

    # Keep only the slice between the first 15 characters and the final
    # character of the scraped text (the 'missing' placeholder becomes '').
    df['original_price'] = [text[15:-1] for text in df['original_price']]

    # Remove '£' and ',' before pd.to_numeric(); otherwise values containing a
    # comma would be coerced to NaN and end up as zero.
    price_text = (df['price'].str.replace('£', '', regex=False)
                             .str.replace(',', '', regex=False))
    df['price'] = pd.to_numeric(price_text, errors='coerce').fillna(0).astype(float)

    # Same treatment for units sold: non-numeric text becomes 0, then cast to int.
    sold_text = df['sold'].str.replace(',', '', regex=False)
    df['sold'] = pd.to_numeric(sold_text, errors='coerce').fillna(0).astype(int)

    # Precautionary de-duplication on course id.
    # NOTE(review): keep=False removes *every* row of a duplicated id, not just
    # the extra copies - confirm this is intended (keep='first' would retain one).
    df.drop_duplicates(subset=['id'], keep=False, inplace=True)
    return df


def scrape(url):
    """Scrape every course listed on a Reed provider page into a DataFrame.

    The provider page is fetched once to read the total course count, the
    paginated listing is walked to collect course links, and each course page
    is then fetched and parsed. Requests are awaited one at a time.

    Parameters
    ----------
    url : str
        Provider listing URL, e.g.
        'https://www.reed.co.uk/courses/one-education/p1812'.

    Returns
    -------
    pandas.DataFrame
        One row per course with columns title, link, provider, subtitle,
        price (float), original_price, sold (int), saving, sold_or_enq and id.
        Text fields whose tag was not found hold 'missing'.

    Raises
    ------
    ValueError
        If the course-count element is not found on the provider page, or its
        text is not a number.

    Notes
    -----
    Uses asyncio.run(); when called from a running event loop (e.g. a
    notebook) nest_asyncio.apply() must have been called beforehand.
    """
    start = time.time()

    # Work out how many listing pages there are from the total course count.
    response = requests.get(url)
    listing = BeautifulSoup(response.content, 'html.parser')
    count_tag = listing.find('span', class_='h1')
    if count_tag is None:
        raise ValueError(f'Could not find the course count on {url}')
    # Strip whitespace and thousands separators, e.g. ' 1,234 ' -> 1234.
    total_courses = int(''.join(count_tag.text.split()).replace(',', ''))
    page_count = -(-total_courses // _PAGE_SIZE)  # ceiling division

    async def fetch(session, target):
        async with session.get(target) as resp:
            return await resp.text()

    async def crawl():
        links = []
        records = []
        async with aiohttp.ClientSession() as session:
            # Phase 1: collect the course links from every listing page.
            for page in range(1, page_count + 1):
                html = await fetch(
                    session,
                    url + f'?pageno={page}&sortby=MostPopular&pagesize={_PAGE_SIZE}')
                page_soup = BeautifulSoup(html, 'html.parser')
                for heading in page_soup.find_all('h2', class_="mt-4 mt-sm-1 mr-5 mb-0"):
                    anchor = heading.find('a')
                    href = anchor.get('href') if anchor is not None else None
                    if href is None:
                        # Heading without a usable link: nothing to scrape.
                        continue
                    links.append(_SITE_ROOT + href)

            # Phase 2: fetch and parse each course page.
            for done, course_url in enumerate(links, start=1):
                html = await fetch(session, course_url)
                print(f'Requests Completed: {done} out of {len(links)}')
                course_soup = BeautifulSoup(html, 'html.parser')
                # Clear all the outputs except the current one in notebook console
                clear_output(wait=True)

                record = _parse_course_page(course_soup)
                record['link'] = course_url
                records.append(record)
        return records

    records = asyncio.run(crawl())

    # Create a df of extracted variables
    df = pd.DataFrame(records, columns=_COLUMNS)
    duration = round((time.time() - start) / 60, 2)

    df = _clean_courses(df)

    # An empty result (no courses, or all ids duplicated) has no provider row.
    provider_name = df['provider'].iloc[0] if not df.empty else 'Unknown provider'
    print(f'{provider_name} Courses: Time required to scrape {len(df)} observation: {duration} minutes')
    return df

In [3]:
# Scrape the Reed listing of each of the 15 providers, one DataFrame per provider.
# All provider pages live under the same base path.
REED_COURSES = 'https://www.reed.co.uk/courses'

# One Education
oneEducation = scrape(f'{REED_COURSES}/one-education/p1812')

# Course Gate
courseGate = scrape(f'{REED_COURSES}/course-gate/p1834')

# Janets
janets = scrape(f'{REED_COURSES}/janets/p1778')

# Euston College
eustonCollege = scrape(f'{REED_COURSES}/euston-college/p2128')

# Training Express
trainingExpress = scrape(f'{REED_COURSES}/training-express-ltd/p2079')

# Be-a.co.uk
beaco = scrape(f'{REED_COURSES}/be-acouk/p545')

# Brentwood Open Learning College
brentwood = scrape(f'{REED_COURSES}/brentwood-open-learning-college/p438')

# Oplex Careers
oplexCareers = scrape(f'{REED_COURSES}/oplex-careers/p630')

# Oxford Home Study College
oxford = scrape(f'{REED_COURSES}/oxford-home-study-college/p1245')

# CPD Courses
cpdCourses = scrape(f'{REED_COURSES}/cpd-courses/p1534')

# OfCourse
ofCourse = scrape(f'{REED_COURSES}/ofcourse/p675')

# The Training Terminal
trainingTerminal = scrape(f'{REED_COURSES}/the-training-terminal/p1064')

# Trendimi
trendimi = scrape(f'{REED_COURSES}/trendimi/p964')

# Excel with Business
excelWithBusiness = scrape(f'{REED_COURSES}/excel-with-business/p930')

# Centre of Excellence
centreOfExcellence = scrape(f'{REED_COURSES}/centre-of-excellence-online/p652')

Centre of Excellence Courses: Time required to scrape 347 observation: 3.12 minutes


In [4]:
# Stack the per-provider frames into a single frame with a fresh 0..n-1 index.
provider_frames = [
    oneEducation,
    courseGate,
    janets,
    eustonCollege,
    trainingExpress,
    beaco,
    brentwood,
    oplexCareers,
    oxford,
    cpdCourses,
    ofCourse,
    trainingTerminal,
    trendimi,
    excelWithBusiness,
    centreOfExcellence,
]
mergedDf = pd.concat(provider_frames, axis=0, ignore_index=True)

# The file name is prefixed with today's day and abbreviated month (e.g. 05_Mar).
today = datetime.today().strftime('%d_%b')
csv_path = f"{today}_15_providers.csv"

# Write the merged data without the index column.
mergedDf.to_csv(csv_path, index=False)