In [1]:
# Import required modules
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import chain
from concurrent.futures import ProcessPoolExecutor

In [2]:
# Define headers
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}

# This function creates all the cover pages
def generate_cover_pages(url):
    """url = ist cover page,
    return = all the cover pages as a list"""
    
    # Make request
    r = requests.get(url, headers=HEADERS)
    s = BeautifulSoup(r.text, "lxml")
    
    # Total pages
    total_page = int(s.find_all(class_="pagination-module_item__3XB-l")[-1].text.strip())
    
    
    # Generate cover pages
    cover_pages = [f"https://www.futurelearn.com/courses?filter_category=open&filter_course_type=open&filter_availability=started&page={pg}#courses-grid-start" for pg in range(1, total_page+1)]
    return cover_pages


# Function to scrape indvidual course links
def scrape_course_links(url):
    """url = cover page links,
    return = individual course links as a list"""
    
    # Make request
    r = requests.get(url, headers=HEADERS)
    s = BeautifulSoup(r.text, "lxml")
    
    # Array to store course links
    course_link = []
    
    # Scrape ind course links
    for lnk in s.find_all(class_="m-card Container-wrapper_1lZbP Container-grey_1l9VP"):
        for lnk1 in lnk.find_all("a"):
            course_link.append(lnk1.get("href"))
    
    # Removes duplicates
    course_link = list(set(course_link))
    return course_link


# Function to scrape individual course info
def scrape_course_info(url):
    """url = individual course link,
    return = course info dataframe"""
    
    # Make request
    r = requests.get(url, headers=HEADERS)
    s = BeautifulSoup(r.text, "lxml")
    
    # Scrape course title
    try:
        course_title = s.find(class_="stack-module_wrapper__3ZERF").h1.text.strip()
    except:
        course_title = "na"
    
    # Scrape course category
    try:
        category = s.find_all(class_="breadcrumbs-module_item__3SxlK")[-1].text.strip()
    except:
        category = "na"
    
    # Scrape total enrollments
    try:
        unit_sold = s.find(class_="spacer-module_default__3N2H9 spacer-module_vertical-4__5ZLo8").strong.text.strip().replace(",", "")
    except:
        unit_sold = 0
    
    # Create a temporary df
    temp_df = pd.DataFrame({
        "course_title":course_title,
        "course_link":course_link,
        "category":category,
        "unit_sold":unit_sold
    }, index=[0])
    
    return temp_df


# Wrap all the function inside main func
def main(url):
    """url = ist cover page,
    return = course info df"""
    
    # Generate cover page
    cover_pages = generate_cover_pages(f"https://www.futurelearn.com{url}")
    
    # Scrape ind course links. We're scraping for ist 2 cover pages
    course_links = list(chain.from_iterable(list(map(scrape_course_links, cover_pages[:2]))))
    
    # Scrape ind course info
    with ProcessPoolExecutor(max_workers=6) as ex:
        course_info = pd.concat(list(ex.map(scrape_course_info, course_links))).reset_index(drop=True)
    return course_info

In [3]:
# Scrape data
final_df = main("https://www.futurelearn.com/courses?filter_category=open&filter_course_type=open&filter_availability=started&page=1#courses-grid-start")
final_df.head()

ConnectionError: HTTPSConnectionPool(host='www.futurelearn.comhttps', port=443): Max retries exceeded with url: //www.futurelearn.com/courses?filter_category=open&filter_course_type=open&filter_availability=started&page=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f8a2e189430>: Failed to establish a new connection: [Errno -2] Name or service not known'))