In [3]:
import json
import os
import platform
import re

from bs4 import BeautifulSoup
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

# If pages are parsed before they have finished loading, uncomment the import
# below and add time.sleep() calls after driver.get()
# import time

# Chrome configuration used when the webdriver session is created.
options = webdriver.ChromeOptions()
# options.add_argument("headless")  # uncomment to run Chrome without a window
options.add_experimental_option("excludeSwitches", ["enable-automation"])

# The chromedriver binary is looked up in the current working directory;
# only its file name differs between Windows and every other platform.
driverpath = os.getcwd() + (
    "/chromedriver.exe" if platform.system() == "Windows" else "/chromedriver"
)


def get_links(page, lang=True):
    """Collect course names and URLs from one Coursera directory page.

    Loads the directory page through the module-level ``driver`` and reads
    every ``<a class="c-directory-link">`` that carries an ``href``.

    Args:
        page: Directory page number; it is appended to the directory URL.
        lang: When true (the default), keep only links whose text
            ``langdetect`` classifies as English. When false, keep every link.

    Returns:
        dict mapping the link text to ``"https://coursera.org" + href``.
        If two links share the same text, the later one wins.
    """
    url = "https://www.coursera.org/directory/courses?page=" + str(page)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    anchors = soup.find_all("a", {"class": "c-directory-link"}, href=True)

    collected = {}
    for anchor in anchors:
        title = anchor.text
        if lang:
            try:
                if detect(title) != "en":
                    continue
            except LangDetectException:
                # langdetect raises when the text has no usable features
                # (e.g. an empty link); such a link cannot be confirmed as
                # English, so it is skipped instead of aborting the crawl.
                continue
        collected[title] = "https://coursera.org" + anchor["href"]
    return collected


In [22]:

def _parse_count(text):
    """Return the integer embedded in ``text``, or the string "None" if there is none.

    Everything except digits and dots is removed, then leading/trailing dots
    (such as sentence punctuation) are dropped. Text that still is not a plain
    integer yields "None" rather than raising.
    """
    cleaned = re.sub("[^0-9.]+", "", text).strip(".")
    try:
        return int(cleaned)
    except ValueError:
        return "None"


def _tag_text(tag, default="None"):
    """Return ``tag.text``, or ``default`` when the lookup produced no tag."""
    return tag.text if tag is not None else default


def _last_span_count(soup, css_class):
    """Parse the number in the last <span> of the first <div> with ``css_class``."""
    boxes = soup.find_all("div", {"class": css_class})
    if not boxes:
        return "None"
    spans = boxes[0].find_all("span")
    if not spans:
        return "None"
    return _parse_count(spans[-1].text)


def crawl_add(url):
    """Scrape the additional statistics and instructor details of one course page.

    The page is loaded through the module-level ``driver``. Any value that is
    absent from the page is reported as the string "None", the sentinel this
    notebook uses throughout.

    Args:
        url: Address of the course page to load.

    Returns:
        dict with the keys:
            "Enroll": int from the ``_1fpiay2`` block, or "None".
            "Views": int from the ``_bd90rg`` block, or "None".
            "Instructor_info": "None" when the page has no
                ``instructor-wrapper`` blocks; otherwise a dict keyed by
                instructor name whose values hold "Instructor_title",
                "Instructor_department", "Learners" and "Open_courses".
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")

    # NOTE(review): these class names are opaque generated identifiers; the
    # result keys suggest they hold the enrollment and view counts - confirm
    # against the live page markup.
    enroll = _last_span_count(soup, "_1fpiay2")
    views = _last_span_count(soup, "_bd90rg")

    cards = soup.find_all("div", {"class": "instructor-wrapper"})
    if not cards:
        instructor_information = "None"
    else:
        instructor_information = {}
        for card in cards:
            # Every lookup is guarded the same way: a missing element becomes
            # "None" instead of raising AttributeError on ``.text``.
            instructor_name = _tag_text(card.find("h3"))
            instructor_title = _tag_text(
                card.find("span", {"class": "instructor-title"})
            )
            instructor_department = _tag_text(
                card.find(
                    "div", {"class": "instructor-department caption-text color-black"}
                )
            )

            detail = card.find("div", {"class": "instructor-expertise"})
            if detail is not None:
                learners = _parse_count(_tag_text(detail.find("span"), ""))
                courses = _parse_count(
                    _tag_text(detail.find("div", {"class": "courses-count"}), "")
                )
            else:
                learners = "None"
                courses = "None"

            # Instructors sharing a name overwrite each other (dict keyed by name).
            instructor_information[instructor_name] = {
                "Instructor_title": instructor_title,
                "Instructor_department": instructor_department,
                "Learners": learners,
                "Open_courses": courses,
            }

    return {
        "Enroll": enroll,
        "Views": views,
        "Instructor_info": instructor_information,
    }

In [23]:
# Start the Chrome session used by get_links() and crawl_add().
# `options=` replaces the deprecated `chrome_options=` keyword, which made
# Selenium emit a DeprecationWarning on every run.
# NOTE(review): the positional driver path is kept as in the original call;
# confirm it is still accepted by the installed Selenium version.
driver = webdriver.Chrome(driverpath, options=options)
data = {}

# Both answers are directory page numbers: the loop below covers every page
# from `start` through `page` inclusive, so the second prompt asks for the
# last page rather than a page count.
start = input("Which page to start?: ")
page = input("Which page to stop at? (Max 194...): ")
try:
    for i in range(int(start), int(page) + 1):

        print("Start scraping...")

        links = get_links(i, lang=True)  # Detect language, and if not English, skip

        for course, link in tqdm(links.items()):
            data[course] = crawl_add(link)

        if i % 10 == 0:
            print("Page {} out of {}...".format(i, page))
            # backup: write everything collected so far, so a later failure
            # does not lose the whole crawl
            with open("additional_data_backup.json", "w", encoding="utf-8") as backup_file:
                json.dump(data, backup_file)
            print("Saved backup")
finally:
    # Always close the browser, even when a page fails part-way through.
    driver.quit()

  """Entry point for launching an IPython kernel.


Which page to start?: 106
How many pages to crawl? (Max 194...): 106
Start scraping...


100%|██████████| 33/33 [01:23<00:00,  2.54s/it]


In [24]:
# Show the records gathered by the crawl loop (course name -> scraped fields)
data

{'Introduction to Project Management (Coursera Project Network)': {'Enroll': 31875,
  'Views': 'None',
  'Instructor_info': {'Megan Peck': {'Instructor_title': 'Project Manager',
    'Instructor_department': 'None',
    'Learners': 31875,
    'Open_courses': 1}}},
 'Introduction to Project Management with ClickUp (Coursera Project Network)': {'Enroll': 10995,
  'Views': 'None',
  'Instructor_info': {'Abby Saey': {'Instructor_title': '',
    'Instructor_department': 'Freedom Learning Group',
    'Learners': 43756,
    'Open_courses': 18}}},
 'Introduction to Psychology (University of Toronto)': {'Enroll': 343928,
  'Views': 200689,
  'Instructor_info': {'Steve Joordens': {'Instructor_title': 'Professor',
    'Instructor_department': 'Department of Psychology',
    'Learners': 489827,
    'Open_courses': 3}}},
 'Introduction to Psychology (Yale University)': {'Enroll': 608457,
  'Views': 1086040,
  'Instructor_info': {'Paul Bloom': {'Instructor_title': 'Brooks and Suzanne Ragen Professor

In [None]:
# Serialize the collected records and write them to additional_data.json
with open("additional_data.json", mode="w", encoding="utf-8") as out_file:
    serialized = json.dumps(data)
    out_file.write(serialized)