In [1]:
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from itertools import chain

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Request headers sent with every HTTP call in this notebook.
# The User-Agent value is a desktop Chrome (macOS) browser string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    )
}

# Function to generate cover pages
def generate_cover_pages():
    """Build the URLs of every listing ("cover") page of the online-courses tag.

    Fetches page 1 of the tag listing, reads the total page count from the
    last ``button`` element carrying the ``text--color-brandPrimary`` class,
    and returns one URL per page.

    Returns:
        list[str]: URLs for pages 1..N. When the page count cannot be read
        (no matching button, or its text is not an integer) a warning is
        emitted and only the page-1 URL is returned.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    import warnings

    base_url = "https://www.hotukdeals.com/tag/online-courses?page={}"

    # A timeout keeps a stalled connection from hanging the notebook forever,
    # and the status check stops an error page from being parsed as a listing.
    r = requests.get(base_url.format(1), headers=HEADERS, timeout=30)
    r.raise_for_status()
    s = BeautifulSoup(r.text, "lxml")

    # Scrape total page count from the last matching pagination button
    buttons = s.find_all("button", class_="text--color-brandPrimary")
    try:
        total_page = int(buttons[-1].text.strip())
    except (IndexError, ValueError):
        # No pagination control was found (e.g. a single-page listing);
        # warn so a markup change does not go unnoticed.
        warnings.warn("Could not read the total page count; falling back to 1 page.")
        total_page = 1

    # Always return at least the first page
    total_page = max(total_page, 1)
    return [base_url.format(pg) for pg in range(1, total_page + 1)]


# Function to scrape individual deal links
def scrape_deal_links(url, timeout=30):
    """Collect the individual deal links found on one cover page.

    Args:
        url: URL of a cover (listing) page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: The ``href`` of the first anchor inside every element
        with the ``threadGrid`` class. Containers without an anchor, and
        anchors without a usable ``href``, are skipped.

    Raises:
        requests.RequestException: if the request fails or times out.
            The HTTP status is not checked; whatever body is returned is parsed.
    """
    # Making requests
    r = requests.get(url, headers=HEADERS, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # To store deal links
    deal_links = []

    # Iterate over the main containers and scrape deal links
    for cont in s.find_all(class_="threadGrid"):
        anchor = cont.a
        if anchor is None:
            # Container has no link at all
            continue
        href = anchor.get("href")
        # Skip missing/empty hrefs so no None reaches the downstream requests;
        # a literal "na" is still excluded, as it was before.
        if href and href != "na":
            deal_links.append(href)

    return deal_links


# Function to scrape individual deal info
def _first_text(soup, default, name=None, class_=None):
    """Return the stripped text of the first matching element, or ``default`` if none matches."""
    node = soup.find(name, class_=class_)
    if node is None:
        return default
    return node.text.strip()


def scrape_deal_info(url, timeout=30):
    """Scrape the details of a single deal page.

    Args:
        url: Link of an individual deal.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame: One row (index 0) with the columns ``deal_title``,
        ``deal_link``, ``deal_temperature``, ``deal_price``,
        ``deal_provider``, ``deal_publisher`` and ``is_active``.
        A field that cannot be found is ``np.nan`` (``"na"`` for the title).
        If the request itself fails, every scraped field is missing and only
        ``deal_link`` is filled, so one bad page does not abort a whole batch.
    """
    # Making requests; a failed request is treated like an empty page
    try:
        html = requests.get(url, headers=HEADERS, timeout=timeout).text
    except requests.RequestException:
        html = ""
    s = BeautifulSoup(html, "lxml")

    # Scrape deal title (falls back to "na", unlike the other fields)
    title = _first_text(s, "na", class_="thread-title--item")

    # Scrape deal temperature. Checks for 3 classes. 2 for active deals, 1 for inactive deals
    temperature = _first_text(s, np.nan, 'span', [
        'cept-vote-temp vote-temp vote-temp--hot',
        'cept-vote-temp vote-temp vote-temp--warm',
        'space--h-2 text--b',
    ])

    # Scrape deal price. Checks for 2 classes. 1 for active deals, 1 for inactive deals
    price = _first_text(s, np.nan, 'span', [
        'thread-price text--b cept-tp size--all-l size--fromW3-xl',
        'thread-price text--b cept-tp size--all-l size--fromW3-xl text--color-greyShade',
    ])

    # Scrape deal provider. Checks for 2 classes. 1 for active deals, 1 for inactive deals
    provider_node = s.find('span', class_=[
        'cept-merchant-name text--b text--color-brandPrimary link',
        'cept-merchant-name text--b text--color-greyShade link',
    ])
    if provider_node is None:
        provider = np.nan
    else:
        # Drop the "Deals" word from the merchant label before stripping
        provider = provider_node.text.replace("Deals", "").strip()

    # Scrape deal publisher
    publisher = _first_text(s, np.nan, class_="thread-username")

    # Checks if the deal is currently active: 0 when the header meta text
    # mentions "expired", 1 otherwise, NaN when the header is missing
    header_meta = s.find(class_="threadItem-headerMeta")
    if header_meta is None:
        is_active = np.nan
    else:
        is_active = 0 if "expired" in header_meta.text.lower() else 1

    # Create a one-row dataframe off the scraped variables
    temp_df = pd.DataFrame({
        "deal_title": title,
        "deal_link": url,
        "deal_temperature": temperature,
        "deal_price": price,
        "deal_provider": provider,
        "deal_publisher": publisher,
        "is_active": is_active
    }, index=[0])

    return temp_df

In [3]:
# Wrap all the functions inside main
def main(s1, s2):
    """Run the full scrape: cover pages -> deal links -> deal info.

    Args:
        s1: Start index (inclusive) into the list of collected deal links.
        s2: End index (exclusive) into the list of collected deal links.

    Returns:
        pandas.DataFrame: The per-deal frames produced by
        ``scrape_deal_info`` concatenated with a fresh 0..n-1 index.
        An empty DataFrame is returned when the slice selects no links.
    """
    # Generate cover pages
    cover_pages = generate_cover_pages()

    # Scrape deal links. Threads are used because the work is network-bound
    # and because process pools cannot run functions defined interactively
    # (e.g. in a notebook cell) under the "spawn" start method.
    with ThreadPoolExecutor(max_workers=4) as ex:
        deal_links = list(chain.from_iterable(ex.map(scrape_deal_links, cover_pages)))

    # Nothing selected: pd.concat would raise on an empty list
    selected_links = deal_links[s1:s2]
    if not selected_links:
        return pd.DataFrame()

    # Scrape deal info
    with ThreadPoolExecutor(max_workers=6) as ex:
        frames = list(ex.map(scrape_deal_info, selected_links))

    return pd.concat(frames).reset_index(drop=True)

In [4]:
# Run the scraper on the deal links at positions 0-199 (up to 200 deals)
deal_info = main(s1=0, s2=200)

# Show the first 10 scraped rows
deal_info.head(n=10)

Unnamed: 0,deal_title,deal_link,deal_temperature,deal_price,deal_provider,deal_publisher,is_active
0,Complete Portuguese Course: Portuguese for Beg...,https://www.hotukdeals.com/deals/complete-port...,89°,FREE,Udemy,aclondon,1.0
1,Rock Music Production For TV/Film & Video Game...,https://www.hotukdeals.com/deals/rock-music-pr...,89°,FREE,Udemy,Wonka,1.0
2,"Free Udemy Courses: Excel Lessons, Cryptocurre...",https://www.hotukdeals.com/deals/free-udemy-co...,483°,FREE,Udemy,MartianMan,1.0
3,15 AWS Courses: AWS Certified Solutions Archit...,https://www.hotukdeals.com/deals/15-aws-course...,78°,£9.99,Udemy,MartianMan,1.0
4,Arabic Language Course - free with code @ Udemy,https://www.hotukdeals.com/deals/arabic-langua...,267°,FREE,Udemy,aclondon,1.0
5,Delicious Japanese language for foodies - free...,https://www.hotukdeals.com/deals/delicious-jap...,250°,FREE,Udemy,aclondon,0.0
6,Free Udemy Courses: Python Network Programming...,https://www.hotukdeals.com/deals/free-udemy-co...,577°,FREE,Udemy,MartianMan,0.0
7,Complete Turkish Course: Learn Turkish for Beg...,https://www.hotukdeals.com/deals/complete-turk...,842°,FREE,Udemy,aclondon,0.0
8,Spanish for beginners: Level 1 - free @ Udemy,https://www.hotukdeals.com/deals/spanish-for-b...,551°,FREE,Udemy,aclondon,0.0
9,"Free Udemy Courses: Heal From Within, Canva, M...",https://www.hotukdeals.com/deals/free-udemy-co...,437°,FREE,Udemy,MartianMan,0.0
