In [13]:
#Import required modules
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests 
import re
from datetime import date

In [16]:
# Create a function to take different category urls
def extract(url):
    
    # Initialize empty list of variables to extract
    title = []
    link = []
    link_abs = []
    price = []
    duration = []

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    response = requests.get(url,headers=headers)
    soup = BeautifulSoup(response.content,'html.parser')
    
    # Title
    title_tag = soup.find_all('div',class_='product__text-title') 
    for i in title_tag:
        title.append(i.a.text.strip() if title_tag is not None else 'missing')
        
    # Course link
    link_tag = soup.find_all('div',class_='product__text-title') 
    for i in link_tag:
        link.append(i.a.get('href') if link_tag is not None else 'missing')
        
    # Price
    price_tag = soup.find_all('div',class_='product__text-price')
    for i in price_tag:
        price.append(i.text.strip() if price_tag is not None else 'missing')
        
    # Duration
    duration_tag = soup.find_all('div',class_='product__text-duration')
    for i in duration_tag:
        duration.append(i.text.strip() if duration_tag is not None else 'missing')
    
    # Create a df with those extracted variables
    df = pd.DataFrame({'title':title,'link':link,'price_vat_excl':price,'duration_hrs':duration})
    
    # Remove £, +VAT from price column
    df.price_vat_excl = df.price_vat_excl.apply(lambda x: re.sub(r'\D','',x))
    
    # Remove 'Duration' :  hours from duration_hrs
    df.duration_hrs = df.duration_hrs.str.replace('Duration','').str.replace(':','').str.replace('hours','').str.replace('Hours','').str.replace('hour','').str.replace('Hour','')
    
    
    # Link is as relative path. Remove .. from link and append 'https://www.highspeedtraining.co.uk' to every link
    # to make them absolute url
    for lnk in df.link:
        link_abs.append(str('https://www.highspeedtraining.co.uk') + lnk.replace('..',''))
    df['link'] = link_abs
    
    # Extract category from individual link
    df['category'] = df.link.str.split('/').str.get(3)
    today = date.today().strftime('%d_%b')
    return df.to_csv(f'{today}.csv',index=False)


# Extract all the courses
extract('https://www.highspeedtraining.co.uk/available-courses/')    

In [72]:
# Read the data and set the title as index. Since we are checking new courses by title
df = pd.read_csv('24_Jan.csv')
df = df.set_index('title')
df

Unnamed: 0_level_0,link,price_vat_excl,duration_hrs,category
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Level 2 Food Hygiene and Safety for Catering,https://www.highspeedtraining.co.uk/food-safet...,20,2,food-safety
Asbestos Awareness (Category A) Training Course,https://www.highspeedtraining.co.uk/health-and...,25,2-3,health-and-safety
Manual Handling Training,https://www.highspeedtraining.co.uk/health-and...,25,3,health-and-safety
Level 3 Supervising Food Safety in Catering,https://www.highspeedtraining.co.uk/food-safet...,125,8-10,food-safety
Work At Height Training,https://www.highspeedtraining.co.uk/health-and...,25,1-2,health-and-safety
GDPR,https://www.highspeedtraining.co.uk/business-s...,25,1,business-skills
Workplace First Aid Training,https://www.highspeedtraining.co.uk/health-and...,25,2-3,health-and-safety
Designated Safeguarding Officer Training\r\n\t\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t\t(Level 3 Safeguarding Children),https://www.highspeedtraining.co.uk/safeguardi...,60,3,safeguarding-people
Fire Warden Training,https://www.highspeedtraining.co.uk/health-and...,35,2-3,health-and-safety
Level 2 Health and Safety in the Workplace,https://www.highspeedtraining.co.uk/health-and...,30,3-4,health-and-safety


In [73]:
#Let's check for new courses set the title as index. Since we are checking new courses by title
df_22_oct = pd.read_csv('highspeed_training_22_oct.csv')
df_22_oct = df_22_oct.set_index('title')
df_22_oct.head()

Unnamed: 0_level_0,price,link
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Level 2 Food Hygiene and Safety for Catering,20 +VAT,https://www.highspeedtraining.co.uk/food-safet...
Asbestos Awareness (Category A) Training Course,25 +VAT,https://www.highspeedtraining.co.uk/health-and...
Manual Handling Training,25 +VAT,https://www.highspeedtraining.co.uk/health-and...
Level 3 Supervising Food Safety in Catering,125 +VAT,https://www.highspeedtraining.co.uk/food-safet...
Work At Height Training,25 +VAT,https://www.highspeedtraining.co.uk/health-and...


In [63]:
# Set 24 jan index like 22 oct
df_set_22 = df.reindex_like(df_22_oct)

# Courses lost from 22 oct. Not in 24 jan
df_set_22[df_set_22.link.isna()==True]

Unnamed: 0_level_0,price,link
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Level 2 Food Hygiene and Safety for Catering,,https://www.highspeedtraining.co.uk/food-safet...
Asbestos Awareness (Category A) Training Course,,https://www.highspeedtraining.co.uk/health-and...
Manual Handling Training,,https://www.highspeedtraining.co.uk/health-and...
Level 3 Supervising Food Safety in Catering,,https://www.highspeedtraining.co.uk/food-safet...
Work At Height Training,,https://www.highspeedtraining.co.uk/health-and...
GDPR,,https://www.highspeedtraining.co.uk/business-s...
Workplace First Aid Training,,https://www.highspeedtraining.co.uk/health-and...
Designated Safeguarding Officer Training (Level 3 Safeguarding),,
Fire Warden Training,,https://www.highspeedtraining.co.uk/health-and...
Level 2 Health and Safety in the Workplace,,https://www.highspeedtraining.co.uk/health-and...


In [71]:
# Only common rows
pd.Series(list(set(df.title).intersection(set(df_22_oct.title))))

0                    Personal Protective Equipment (PPE)
1                         Anaphylaxis Awareness Training
2                Safeguarding Children with Disabilities
3                                      Networking Skills
4      Advanced Safeguarding Children (Level 2 Safegu...
5                                    Resilience Training
6                       Customer Service Training Course
7                             Drug and Alcohol Awareness
8                        Office Health & Safety Training
9                            Needles and Sharps Training
10             Level 3 Supervising Food Safety in Retail
11                     Display Screen Equipment Training
12                             Online Bookkeeping Course
13                   Conflict Management Training Course
14                           Internet Safety for Schools
15                     Health & Safety for Food Handlers
16                  Domestic Violence and Abuse Training
17                             

In [74]:
df_22_oct[~df_22_oct.index.isin(df.index)]

Unnamed: 0_level_0,price,link
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Designated Safeguarding Officer Training (Level 3 Safeguarding),60 +VAT,https://www.highspeedtraining.co.uk/safeguardi...
Permit to Work Training,25 +VAT,https://www.highspeedtraining.co.uk/health-and...
Information Governance Training,25 +VAT,https://www.highspeedtraining.co.uk/business-s...
Performance Appraisal Training,30 +VAT,https://www.highspeedtraining.co.uk/business-s...
Deprivation of Liberty Safeguards (DoLS),30 +VAT,https://www.highspeedtraining.co.uk/safeguardi...
Managing Contractors Training,30 +VAT,https://www.highspeedtraining.co.uk/health-and...
Good Manufacturing Practice,20 +VAT,https://www.highspeedtraining.co.uk/food-safet...


In [75]:
# Not in 24jan
df[~df.index.isin(df_22_oct.index)]

Unnamed: 0_level_0,link,price_vat_excl,duration_hrs,category
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Designated Safeguarding Officer Training\r\n\t\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t\t(Level 3 Safeguarding Children),https://www.highspeedtraining.co.uk/safeguardi...,60,3,safeguarding-people
Respirable Crystalline Silica Awareness Training,https://www.highspeedtraining.co.uk/health-and...,25,2,health-and-safety
Permit to Work Training,https://www.highspeedtraining.co.uk/health-and...,25,1-2,health-and-safety
Information Governance Training,https://www.highspeedtraining.co.uk/business-s...,25,2,business-skills
Performance Appraisal Training,https://www.highspeedtraining.co.uk/business-s...,30,3-4,business-skills
Deprivation of Liberty Safeguards (DoLS),https://www.highspeedtraining.co.uk/safeguardi...,30,1-2,safeguarding-people
Managing Contractors Training,https://www.highspeedtraining.co.uk/health-and...,30,2-3,health-and-safety
Good Manufacturing Practice,https://www.highspeedtraining.co.uk/food-safet...,20,1,food-safety
