In [1]:
# Import required modules
from concurrent.futures import ProcessPoolExecutor
import pandas as pd
from requests_html import HTMLSession
from selenium import webdriver
import re
import time
from itertools import chain

In [2]:
# This function creates cover pages
def generateCoverPage(url):
    """url = a single cover page,
    return = cover pages"""
    
    # Initialize web drivers
    driver = webdriver.Chrome("/home/faysal/Documents/utilities/chromedriver")
    driver.get(url)
    time.sleep(3)
    
    # Initiate empty list to store cover pages
    coverPage = []
    
    # If there are more than 1 page
    try:
        totalPage = driver.find_element_by_css_selector(".pagination-numbers-container").text.split("\n")[-1]
        totalPage = int(totalPage)
    # If there is only one page
    except:
        totalPage = 1
    
    # Create cover pages and store them
    for pg in range(1, totalPage+1):
        coverPage.append(url + f"&page={pg}")
    driver.close()
    return coverPage



# This function creates individual course links
def scrapeCourseLink(url):
    """url = a single cover page created by earlier function,
    return = individual course links"""
    
    # Initialize web drivers
    driver = webdriver.Chrome("/home/faysal/Documents/utilities/chromedriver")
    driver.get(url)
    time.sleep(3)
    
    # Initiate empty list to store course links
    courseLink = []
    
    # SCrape individual course links
    for lnk in driver.find_elements_by_css_selector("div.cui-content a"):
        courseLink.append(lnk.get_attribute("href"))
    driver.close()
    return courseLink

In [3]:
# This function scrapes individual course info
def scrapeCourseInfo(url):
    """Scrapes required variables by requesting each course link,
    url = individual course link,
    return = a df of course information"""
    
    # Initialize session and make request
    r = HTMLSession()
    s = r.get(url)
    
    # Scrape main container
    # Main container holds course tile, unit sale, offer price, original price and savings
    mainContainer = [x.text.strip() for x in s.html.find("div.option-details.c-txt-gray-dk")]
    
    # Create a dataframe for futher extracting the variables
    mainDf = pd.DataFrame({
        "mainCont":mainContainer
    })
    
   # Scrape location
    try:
        mainDf["location"] = s.html.find(".merchant-info-anchor")[0].text.strip()
    except:
        mainDf["location"] = "na"
    
    # Extract course link
    mainDf["courseLink"] = url
    
    # Scrape course provider
    try:
        mainDf["courseProvider"] = s.html.find("#deal-title")[0].text.strip()
    except:
        mainDf["courseProvider"] = "na"
    
    return mainDf




# This cleans the scraped variables and create new features.
def cleanAndEngineerFeature(df):
    """df = df to clean,
    return = final cleaned df"""

    # Create a copy of the input dataframe
    cleanedDf = df.copy()
    
    # Extract title
    cleanedDf["courseTitle"] = cleanedDf.mainCont.str.split("\n").str.get(0)

    # Extract unit sold and clean it
    cleanedDf["unitSold"] = cleanedDf.mainCont.str.extract(r"(\d+,?\d+\+\s(?:bought|purchased))")
    cleanedDf.unitSold = cleanedDf.unitSold.str.split("+").str[0].str.replace(",", "").fillna(0).astype(int)
    
    # Extract original price and clean it
    cleanedDf["originalPrice"] = cleanedDf.mainCont\
    .apply(lambda x: re.findall(r"($|£|€|AED)\s?(\d+[,.]?\d*)", x)).str[0].str[-1].str.replace(",", "")
    cleanedDf.originalPrice = pd.to_numeric(cleanedDf.originalPrice.str.replace(",", ""), errors="coerce")

    # Extract offer price and clean it
    cleanedDf["offerPrice"] = cleanedDf.mainCont\
    .apply(lambda x: re.findall(r"($|£|€|AED)\s?(\d+[,.]?\d*)", x)).str[1].str[-1].str.replace(",", "")
    cleanedDf.offerPrice = cleanedDf.offerPrice.fillna(cleanedDf.originalPrice).astype(float)
    
    # Drop courses with live location
    cleanedDf = cleanedDf[cleanedDf.location=="na"]
    
    # Drop variables
    cleanedDf.drop(["mainCont", "location"], axis=1, inplace=True)
    
    # Return the final data as intended order
    cleanedDf = cleanedDf[["courseTitle", "courseLink", "courseProvider", "unitSold", "offerPrice", "originalPrice"]]
    return cleanedDf

In [4]:
# Wrap all the function inside main
def main(url):
    """url = single cover page,
    return = final cleaned data"""
    
    # Generate cover pages
    coverPage = generateCoverPage(url)
    
    # Scrape course links
    courseLink = list(map(scrapeCourseLink, coverPage))
    courseLink = list(chain.from_iterable(courseLink))
    
    # Scrape course info
    with ProcessPoolExecutor(max_workers=6) as ex:
        courseInfo = pd.concat(list(ex.map(scrapeCourseInfo, courseLink)))
    
    # Clean the scraped data
    finalDf = cleanAndEngineerFeature(courseInfo)
    
    # Insert course category
    finalDf["category"] = url.split("=")[-1].replace("-", " ").title()
    
    # Clean category column
    # Remove course|courses, lesson|lessons, class|classes.
    # Also replace 1 or more than 1 space with 1 space
    finalDf.category = finalDf.category\
    .apply(lambda x: re.sub("courses?|lessons?|classes?", "", x, flags=re.I)).str.replace("\s{1,}", " ")\
    .str.strip()
    return finalDf

In [5]:
# These are the category cover pages link
categoryCoverPages = [
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=skills-and-hobbies",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=health-and-fitness-classes",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=business-training-classes",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=cookery-courses",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=v-dance-classes",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=language-courses",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=pet-care-classes",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=academic-courses",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=project-management-courses",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=personal-development",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=digital-marketing-courses",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=accounting-and-finance-courses",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=it-certifications",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=driving-lessons",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=microsoft-office-classes",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=web-and-app-development",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=programming-classes",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=multimedia-audio-and-video-courses",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=tuition",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=reading-and-writing-classes",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=flying-lessons",
    "https://www.groupon.co.uk/browse/london?topcategory=local&subcategory2=bartending-course"
]

In [6]:
# Apply the function on each category pages
masterDf = pd.concat(list(map(main, categoryCoverPages))).reset_index(drop=True)

  finalDf.category = finalDf.category\


In [7]:
# Preview the data
masterDf.head(10)

Unnamed: 0,courseTitle,courseLink,courseProvider,unitSold,offerPrice,originalPrice,category
0,Drums: one-hour Skype lesson; valid 7 days a week,https://www.groupon.co.uk/deals/waithe-studios...,Waithe Tuition,0,20.0,50.0,Skills And Hobbies
1,Guitar: one-hour Skype lesson; valid 7 days a ...,https://www.groupon.co.uk/deals/waithe-studios...,Waithe Tuition,0,20.0,50.0,Skills And Hobbies
2,Drums: one-hour Skype lesson; valid 7 days a week,https://www.groupon.co.uk/deals/waithe-studios...,Waithe Tuition,0,20.0,50.0,Skills And Hobbies
3,Guitar: one-hour Skype lesson; valid 7 days a ...,https://www.groupon.co.uk/deals/waithe-studios...,Waithe Tuition,0,20.0,50.0,Skills And Hobbies
4,Create Together Arty or Crafty Session on Zoom...,https://www.groupon.co.uk/deals/deana-kim-page...,Deana Kim Page - Art Lessons & Workshops,0,4.0,6.0,Skills And Hobbies
5,Theme Tuesday or Thursday Art Session on Zoom ...,https://www.groupon.co.uk/deals/deana-kim-page...,Deana Kim Page - Art Lessons & Workshops,0,4.0,6.0,Skills And Hobbies
6,Re-create a Famous Masterpiece on zoom - 3 hou...,https://www.groupon.co.uk/deals/deana-kim-page...,Deana Kim Page - Art Lessons & Workshops,0,4.0,6.0,Skills And Hobbies
7,Culinary herbs online course,https://www.groupon.co.uk/deals/online-academy-21,Online Academy,0,12.0,20.0,Skills And Hobbies
8,20-module online photography course,https://www.groupon.co.uk/deals/photography-ma...,Photography Made Easy,1000,14.0,295.0,Skills And Hobbies
9,Relaxation massage online masterclass,https://www.groupon.co.uk/deals/skill-success-165,Skill Success,110,19.0,158.63,Skills And Hobbies
