In [4]:
import time
import tqdm
import csv
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

In [5]:
class ScraperConfig:
    """XPath selectors and constants used by the Coursera scraping helpers.

    Selectors starting with ``//`` are evaluated against the whole document.
    Selectors starting with ``.//`` are evaluated relative to an element that
    was located first (a course card, an instructor block, a review block).
    """

    # --- page-level metadata ---
    rating_selector = '//div[contains(@aria-label,"stars")]'
    rating_count = '//p[contains(text(),"reviews")]'
    level_selector = '//div[contains(text(),"level")]'

    # Overview element for a programme that consists of several courses.
    cours_ovw = '//div[contains(text(), "months")]'

    # --- list of courses inside a programme ---
    # Outer container, then one card per course, then the title link in a card.
    cour_cont = '//span[text()="Course 1"]/ancestor::div[contains(@class, "css-")][last()]'
    cour_path = './/h3/ancestor::div[1]'
    cour_title = './/h3/a'
    # Its presence marks the page as a multi-course page.
    isCourse = '//a[text()="Courses"]'

    # --- instructors ---
    # Pages with several instructors show a "+N" button that opens a dialog.
    ins_button = '//button[.//span[contains(text(),"+")]]'
    ins = '//div[@role="dialog"]//div[contains(@class,"cds-grid-item")]'
    close_ins = '//button[@aria-label="Close"]'
    isMoreIns = '//span[contains(text(),"+")]'

    # Single-instructor layout: the label span and the link inside it.
    one_ins = '//span[contains(normalize-space(.), "Instructor:")]'
    one_ins_link = './/a'

    # Relative to one instructor block inside the dialog.
    more_ins_name = './/a/span'
    more_ins_link = './/a'

    # --- offering organisation ---
    offer_by = '//h3[contains(text(), "Offered by")]/following::a[1]'

    # --- reviews ---
    rev_cont = '//div[contains(@class, "review review-text review-page-review")]'
    rating_star = './/div[@role="img"]//span'
    review_button = '//a[.//span[contains(normalize-space(.), "View more reviews")]]'
    isReview = '//span[contains(normalize-space(.), "View more reviews")]'
    review_name = './/p[contains(@class,"reviewerName")]'

    # --- skills ---
    # FIXME: this selector is imprecise; get_skills() post-filters its matches.
    skills_selector = '//h2[contains(.,"Skills")]/following::ul[1]//span'
    view_skills = '//button[.//span[text()="View all skills"]]'

    # NOTE: the attribute name is misspelled ("deafult"); kept as-is so any
    # existing reference keeps working.
    deafult_link = 'https://www.coursera.org/'


# Every helper in this notebook reads selectors through ``sc``. Define it next
# to the class so a fresh kernel (Restart & Run All) does not hit a NameError.
sc = ScraperConfig()

In [6]:
# Chrome session shared by every helper in this notebook (they read the
# module-level ``driver``).
options = webdriver.ChromeOptions()
# Run without a visible browser window.
options.add_argument("--headless") 
# NOTE(review): the next two flags are presumably for container/CI runs - confirm.
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# webdriver_manager resolves the chromedriver binary that Service launches.
driver  = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()), 
    options=options
)

In [7]:
def _check_var(var):
    return var in globals()

def _let_sleep_for(seconds):
    time.sleep(seconds)

In [8]:
def _get_element(path, driver=driver):
    """Return the first element matching an XPath, or ``False`` if the lookup fails.

    Args:
        path: XPath expression to search for.
        driver: Object exposing ``find_element`` - the browser session or an
            already located element to search within. The default is the
            module-level ``driver`` as it existed when this cell was run.

    Returns:
        The matching element, or ``False`` when the lookup raised.
    """
    try:
        return driver.find_element(By.XPATH, path)
    except Exception:
        # Best-effort lookup: any lookup error maps to the False sentinel.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit still stop a long scrape.
        return False

def _get_elements(path, driver=driver):
    """Return all elements matching an XPath, or ``False`` if the lookup fails.

    Args:
        path: XPath expression to search for.
        driver: Object exposing ``find_elements`` - the browser session or an
            already located element to search within. The default is the
            module-level ``driver`` as it existed when this cell was run.

    Returns:
        Whatever ``find_elements`` returns, or ``False`` when the lookup raised
        (for example because ``driver`` is itself the ``False`` sentinel).
    """
    try:
        return driver.find_elements(By.XPATH, path)
    except Exception:
        # ``Exception`` (not a bare except) keeps Ctrl-C / kernel interrupts
        # working while still treating every lookup error as "nothing found".
        return False

In [9]:
def press_button(path, timeout=3):
    """Click the element at ``path`` once it becomes clickable.

    Args:
        path: XPath of the element to click.
        timeout: Maximum number of seconds to wait for it to be clickable.

    Returns:
        True when the click script was executed, False when the wait or the
        click failed.
    """
    try:
        button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, path))
        )
        # The click is issued through JavaScript rather than a native click.
        driver.execute_script("arguments[0].click();", button)
        return True
    except Exception:
        # Missing/unclickable buttons are expected on some pages; report them
        # as False. KeyboardInterrupt/SystemExit are deliberately not caught.
        return False

In [10]:
def get_level():
    """Return the text of the difficulty-level element, or None if it is absent.

    ``_get_element`` signals "not found" with ``False``; dereferencing that
    sentinel would raise AttributeError and abort the whole scrape.
    """
    level_element = _get_element(sc.level_selector)
    if level_element is False:
        return None
    return level_element.text

In [11]:
def get_page_intution():
    """Probe the loaded page for three marker elements.

    Returns:
        A tuple ``(isCourse, isMoreIns, isReview)`` of booleans, each True when
        the element for the matching ``sc`` selector was found.
    """
    def _present(xpath):
        # _get_element returns False when nothing matched.
        return True if _get_element(xpath) != False else False

    return _present(sc.isCourse), _present(sc.isMoreIns), _present(sc.isReview)

In [12]:
def get_courses():
    """Collect the titles and links of the courses listed on the page.

    Returns:
        ``(titles, links)`` - two lists of equal length. When the course
        container (or its cards) cannot be located, both lists hold a single
        ``None`` placeholder. Collection stops at the first card whose title
        link cannot be read.
    """
    container = _get_element(sc.cour_cont)
    if container is False:
        return [None], [None]

    cards = _get_elements(sc.cour_path, container)
    if cards is False:
        return [None], [None]

    title = []
    links = []
    for card in cards:
        anchor = _get_element(sc.cour_title, card)
        if anchor is False:
            break
        try:
            # Read both values before appending so the lists can never end up
            # with different lengths.
            text = anchor.text
            href = anchor.get_attribute("href")
        except Exception:
            break
        title.append(text)
        links.append(href)

    return title, links

In [13]:
def get_skills():
    """Return the skill names shown on the page.

    Clicks the "View all skills" button (best effort), reads every element
    matched by ``sc.skills_selector`` and keeps every second text, excluding
    the last match.

    Returns:
        A list of strings; empty when the lookup fails.
    """
    press_button(sc.view_skills)
    skills = _get_elements(sc.skills_selector)
    if skills is False:
        # Lookup failed: there is nothing to iterate over.
        return []
    # NOTE(review): the [:-1:2] slice presumably compensates for the selector
    # matching extra spans - confirm against the page markup.
    return [s.text for s in skills][:-1:2]

In [14]:
def get_instructor(isMoreIns):
    """Collect instructor names and profile links from the page.

    Args:
        isMoreIns: True when the page shows the "+N" instructors button; the
            instructor dialog is then opened, read and closed. Otherwise the
            single "Instructor:" label is read.

    Returns:
        ``(names, links)`` - two lists of equal length. A link that cannot be
        read is ``None``. In the single-instructor layout a missing instructor
        yields ``([None], [None])``.
    """
    names = []
    links = []
    if isMoreIns:
        press_button(sc.ins_button)
        blocks = _get_elements(sc.ins)
        if blocks is False:
            blocks = []
        for block in blocks:
            try:
                # A missing element is the False sentinel (no ``.text``); the
                # except below covers that and any other read error.
                name = _get_element(sc.more_ins_name, block).text
            except Exception:
                name = None
            if not name:
                # Stop at the first block without a usable name.
                break
            try:
                link = _get_element(sc.more_ins_link, block).get_attribute("href")
            except Exception:
                link = None

            names.append(name)
            links.append(link)

        _let_sleep_for(1)
        press_button(sc.close_ins)
    else:
        name = None
        link = None
        ins = _get_element(sc.one_ins)
        if ins is not False:
            try:
                name = ins.text
            except Exception:
                name = None
        if name is not None:
            try:
                link = _get_element(sc.one_ins_link, ins).get_attribute("href")
            except Exception:
                link = None
        # Append exactly one entry to each list so they stay aligned even when
        # the name is readable but the link is not.
        names.append(name)
        links.append(link)

    return names, links

In [15]:
def who_offer():
    """Return ``(href, text)`` of the link that follows the "Offered by" heading.

    Returns ``(None, None)`` when the link cannot be located, instead of
    failing on the ``False`` sentinel returned by ``_get_element``.
    """
    offer = _get_element(sc.offer_by)
    if offer is False:
        return None, None
    return offer.get_attribute('href'), offer.text

In [16]:
def get_rating():
    """Return ``(rating, rcount)`` read from the page.

    ``rcount`` is the first space-separated token of the review-count text
    with leading/trailing ``(`` removed. ``rating`` is the text of the rating
    element. Either value is ``None`` when its element cannot be located.
    """
    count_element = _get_element(sc.rating_count)
    rating_element = _get_element(sc.rating_selector)

    rcount = None
    if count_element is not False:
        rcount = count_element.text.split(' ')[0].strip('(')

    rating = None
    if rating_element is not False:
        rating = rating_element.text

    return rating, rcount

In [17]:
def get_review_rating(review_block):
    """Count the filled stars inside one review block.

    Args:
        review_block: Element exposing ``find_elements``; searched with
            ``sc.rating_star``.

    Returns:
        The number of matched elements whose outer HTML contains "Filled Star".
    """
    star_spans = review_block.find_elements(By.XPATH, sc.rating_star)
    return sum(
        1
        for span in star_spans
        if "Filled Star" in span.get_attribute("outerHTML")
    )

def get_reviews():
    """Open the reviews view and collect star ratings and reviewer names.

    Returns:
        ``(stars, names)`` - two parallel lists: the filled-star count of each
        review and the reviewer name without its leading "By " label.

    Raises:
        Exception: "Something went wrong" when any review cannot be read; the
            original error is attached as ``__cause__``.
    """
    press_button(sc.review_button)
    # NOTE(review): there is no wait between the click and the lookup below;
    # confirm the review blocks are present by then.
    prefix = 'By '
    try:
        stars = []
        name = []
        for review in _get_elements(sc.rev_cont):
            stars.append(get_review_rating(review))
            reviewer = _get_element(sc.review_name, review).text
            # str.strip('By ') removes the *characters* 'B', 'y' and ' ' from
            # both ends ("By Bobby" -> "obb"); drop the literal prefix instead.
            if reviewer.startswith(prefix):
                reviewer = reviewer[len(prefix):]
            name.append(reviewer.strip())

        return stars, name
    except Exception as exc:
        raise Exception("Something went wrong") from exc

In [18]:
def csv_handler(file_path, data, headers=None):
    """Append one record to a CSV file, writing the header row when needed.

    Args:
        file_path: Path of the CSV file; created if it does not exist.
        data: Mapping of column name to value. List values are flattened into
            one ``', '``-separated cell; ``None`` items become empty strings so
            that parallel lists (e.g. names and links) keep their positions.
        headers: Optional explicit column order. Defaults to the keys of
            ``data``. With explicit headers, ``csv.DictWriter`` raises
            ValueError if ``data`` has a key that is not listed.

    The header row is written when the file is new or empty.
    """
    def _cell(value):
        # Lists may hold None placeholders or non-string items, which
        # str.join would reject with a TypeError.
        if isinstance(value, list):
            return ', '.join('' if item is None else str(item) for item in value)
        return value

    data_to_write = {key: _cell(value) for key, value in data.items()}
    fieldnames = list(headers) if headers is not None else list(data_to_write.keys())

    with open(file_path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is new or empty.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerow(data_to_write)

In [19]:
class CoursesPage:
    """Scrapes the currently loaded multi-course page into ``courses.csv``.

    Attributes are set in ``__init__``: ``isMoreIns`` is forwarded to
    ``get_instructor``; ``data`` holds the last scraped record (None until
    ``scrape`` has run).
    """

    def __init__(self, isMoreIns):
        self.data = None
        self.isMoreIns = isMoreIns

    def scrape(self):
        """Read every field from the loaded page, store it and append it to the CSV."""
        page_title = driver.title
        course_level = get_level()
        stars, review_total = get_rating()
        org_link, org_name = who_offer()
        course_titles, course_links = get_courses()
        instructor_names, instructor_links = get_instructor(self.isMoreIns)
        # Skills are read last, after the instructor dialog has been handled.
        skill_names = get_skills()

        record = dict([
            ('Title', page_title),
            ('Level', course_level),
            ('Organization', org_name),
            ('Organization_Link', org_link),
            ('Rating', stars),
            ('rcount', review_total),
            ('Skills', skill_names),
            ('Courses ', course_titles),
            ('Courses_Link', course_links),
            ('Instructors', instructor_names),
            ('links', instructor_links),
        ])
        self.data = record
        csv_handler('courses.csv', self.data)

In [20]:
class CoursePage:
    """Scrapes the currently loaded single-course page.

    Course fields go to ``course.csv`` and the per-review data to
    ``user_rating.csv``. ``data`` and ``user_data`` hold the last scraped
    records (None until ``scrape`` has run).
    """

    def __init__(self, isMoreIns):
        self.isMoreIns = isMoreIns
        self.data = None
        self.user_data = None

    def scrape(self):
        """Read every field from the loaded page and append it to the CSV files."""
        title = driver.title
        level = get_level()
        rating, rcount = get_rating()
        olink, offer = who_offer()
        names, links = get_instructor(self.isMoreIns)
        # Read the skills BEFORE the reviews: get_reviews() clicks the
        # "View more reviews" link, after which the skills section of the
        # course page is presumably no longer in the document.
        skills = get_skills()
        reviews, user = get_reviews()

        self.data = {
            'Title': title,
            'Level': level,
            'Organization': offer,
            'Organization_Link': olink,
            'Rating': rating,
            'rcount': rcount,
            'Skills': skills,
            'Instructors': names,
            'links': links,
        }

        self.user_data = {
            'Title': title,
            'User': user,
            # Star counts are ints; csv_handler joins list items as text.
            'Reviews': [str(r) for r in reviews]
        }

        csv_handler('course.csv', self.data)
        csv_handler('user_rating.csv', self.user_data)

In [21]:
def scrape_links(links):
    """Visit each URL, scrape it with the matching page class, then quit the driver.

    Args:
        links: Iterable of page URLs.

    A random 3-10 second pause precedes every page load. Pages that are
    neither a multi-course page nor a page with a reviews link are skipped.
    The browser session is always closed, even when a scrape raises, so the
    module-level ``driver`` cannot be used after this call.
    """
    try:
        for link in links:
            delay = random.uniform(3, 10)
            print(f'Waiting for {delay} sec......')
            time.sleep(delay)
            driver.get(link)

            isCourse, isMoreIns, isReview = get_page_intution()

            if isCourse:
                CoursesPage(isMoreIns).scrape()
            elif isReview:
                CoursePage(isMoreIns).scrape()
            # Otherwise: unrecognised page layout, nothing to scrape.
    finally:
        # Release the Chrome process even if one of the pages failed.
        driver.quit()

In [22]:
# Pages to scrape: one professional-certificate page and one single-course page.
links = [
    'https://www.coursera.org/professional-certificates/ai-engineer',
    'https://www.coursera.org/learn/machine-learning?specialization=machine-learning-introduction'
]

In [29]:
# Scratch check: load one course page to debug the skills selector by hand.
driver.get('https://www.coursera.org/learn/machine-learning?specialization=machine-learning-introduction')

In [30]:
# Scratch check: step 1 of get_skills() - click the "View all skills" button.
press_button(sc.view_skills)

True

In [33]:
# Display the selector under test (this cell ran as In [33], after the two cells below).
sc.skills_selector

'//h2[contains(.,"Skills")]/following::ul[1]//span'

In [31]:
# Scratch check: step 2 of get_skills() - collect every element the selector matches.
skills = _get_elements(sc.skills_selector)

In [32]:
# Scratch check: step 3 of get_skills() - keep every second text, excluding the last match.
[s.text for s in skills][:-1:2]

['Classification And Regression Tree (CART)',
 'Feature Engineering',
 'Artificial Intelligence',
 'Applied Machine Learning',
 'Statistical Modeling',
 'Python Programming',
 'Predictive Modeling',
 'Supervised Learning',
 'Jupyter',
 'Data Transformation',
 'Machine Learning',
 'NumPy',
 'Regression Analysis',
 'Scikit Learn (Machine Learning Library)']