In [1]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import time

### Metadatos de un curso

In [2]:
def get_metadata_page(url):
    """Download one course page and extract its metadata.

    Parameters
    ----------
    url : str
        Absolute URL of the course page to fetch.

    Returns
    -------
    dict
        Mapping with the keys 'title', 'description', 'difficulty_level',
        'duration', 'keywords' and 'url'. Any field that cannot be located
        in the page is returned as an empty string; 'url' echoes the input.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urllib.request.urlopen`` when the page cannot be
        fetched.
    """
    # Use the response as a context manager so the connection is closed
    # even if reading fails.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Title: text of the first <h1>, or '' when the page has none.
    h1_tag = soup.find('h1')
    title = h1_tag.get_text() if h1_tag is not None else ''

    # Description: text of the 'content-inner' <div>, or '' when absent.
    description_tag = soup.find('div', {'class': 'content-inner'})
    description = description_tag.get_text() if description_tag is not None else ''

    # Difficulty and duration are both rendered as <h4> elements; tell them
    # apart by their visible text (not the raw markup, whose attributes
    # could contain the same words). The last matching <h4> wins.
    difficulty_level = ''
    duration = ''
    for h4 in soup.find_all('h4'):
        h4_text = h4.get_text()
        if 'Level' in h4_text:
            difficulty_level = h4_text
        if 'Approx' in h4_text:
            duration = h4_text

    # Keywords: every child of the keywords container contributes its text
    # followed by ', ' (the trailing separator is kept so the exported data
    # keeps the same format as before).
    keywords = ''
    keyword_box = soup.find('div', {'class': 'Box_120drhm-o_O-displayflex_poyjc-o_O-wrap_rmgg7w'})
    if keyword_box is not None:
        try:
            keywords = ''.join(child.get_text() + ', ' for child in keyword_box)
        except AttributeError:
            # A child without get_text() (e.g. a bare text node on older
            # BeautifulSoup releases) discards the keywords entirely rather
            # than returning a partial list.
            keywords = ''

    return {'title': title, 'description': description, 'difficulty_level': difficulty_level,
            'duration': duration, 'keywords': keywords, 'url': url}

### Iterar sobre todos los cursos

In [3]:
def iterator(url, n_pages):
    """Walk the paginated search results and collect metadata for each course.

    Opens ``url`` in a Chrome session, and for each of ``n_pages`` pages
    scrapes every link whose href contains "/learn/" with
    ``get_metadata_page``, then clicks the "next page" arrow.

    Parameters
    ----------
    url : str
        URL of the first search-results page.
    n_pages : int
        Number of result pages to process.

    Returns
    -------
    pandas.DataFrame
        One row per successfully scraped course, with the columns 'title',
        'description', 'difficulty_level', 'duration', 'keywords' and 'url'.
        Empty (but with those columns) when nothing could be scraped.
    """
    columns = ['title', 'description', 'difficulty_level', 'duration', 'keywords', 'url']
    # Rows are accumulated in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []

    exeDriver = "chromedriver.exe"
    # NOTE(review): passing the driver path positionally is the Selenium 3
    # calling convention; recent Selenium 4 releases expect a Service object
    # instead - confirm against the installed Selenium version.
    driver = webdriver.Chrome(exeDriver)
    try:
        driver.get(url)

        # Move through the result pages.
        for i in range(n_pages):
            print(i+1, '--------------------------------------------------------------------------')
            time.sleep(2)  # give the page time to render its results

            # Collect the course links on the current page.
            try:
                soup = BeautifulSoup(driver.page_source, "html.parser")
                links = soup.find_all('a', attrs={'href': re.compile("/learn/")})
            except Exception as exc:
                print('Error consiguiendo metadatos *************************************************************************', exc)
                links = []

            # Scrape each course independently so that one failing page does
            # not discard the remaining courses of the same results page.
            for link in links:
                href = 'https://www.coursera.org' + link.get('href')
                print(href)
                try:
                    records.append(get_metadata_page(href))
                except Exception as exc:
                    print('Error consiguiendo metadatos *************************************************************************', exc)

            # Go to the next results page.
            try:
                time.sleep(1)
                # find_element("id", ...) works on both Selenium 3 and 4;
                # find_element_by_id was removed in Selenium 4.3.
                driver.find_element('id', 'pagination_right_arrow_button').click()
            except Exception as exc:
                print('Error pasando de pagina *************************************************************************', exc)
    finally:
        # Always release the browser, even if navigation or scraping raised
        # or the run was interrupted from the keyboard.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [4]:
# Beginner: English-language courses filtered to the "Beginner" difficulty.
# The search URL is written one query parameter per line; adjacent string
# literals are concatenated into the same single URL.
url = (
    'https://www.coursera.org/search?query='
    '&indices%5Bprod_all_products_term_optimization%5D%5Bpage%5D=1'
    '&indices%5Bprod_all_products_term_optimization%5D%5Bconfigure%5D%5BclickAnalytics%5D=true'
    '&indices%5Bprod_all_products_term_optimization%5D%5Bconfigure%5D%5BhitsPerPage%5D=10'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Beginner'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BallLanguages%5D%5B0%5D=English'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BentityTypeDescription%5D%5B0%5D=Courses'
    '&configure%5BclickAnalytics%5D=true'
)
# Scrape 100 result pages for this difficulty level.
dfB = iterator(url, 100)

1 --------------------------------------------------------------------------
https://www.coursera.org/learn/ai-for-everyone
https://www.coursera.org/learn/technical-support-fundamentals
https://www.coursera.org/learn/sql-for-data-science
https://www.coursera.org/learn/bcg-uva-darden-digital-transformation
https://www.coursera.org/learn/what-is-datascience
https://www.coursera.org/learn/aws-fundamentals-going-cloud-native
https://www.coursera.org/learn/financial-markets-global
https://www.coursera.org/learn/uva-darden-digital-product-management
https://www.coursera.org/learn/uva-darden-design-thinking-innovation
https://www.coursera.org/learn/deep-neural-network


In [5]:
# Mixed: English-language courses filtered to the "Mixed" difficulty.
# The search URL is written one query parameter per line; adjacent string
# literals are concatenated into the same single URL.
url = (
    'https://www.coursera.org/search?query='
    '&indices%5Bprod_all_products_term_optimization%5D%5Bpage%5D=1'
    '&indices%5Bprod_all_products_term_optimization%5D%5Bconfigure%5D%5BclickAnalytics%5D=true'
    '&indices%5Bprod_all_products_term_optimization%5D%5Bconfigure%5D%5BhitsPerPage%5D=10'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Mixed'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BallLanguages%5D%5B0%5D=English'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BentityTypeDescription%5D%5B0%5D=Courses'
    '&configure%5BclickAnalytics%5D=true'
)
# Scrape 100 result pages for this difficulty level.
dfM = iterator(url, 100)

1 --------------------------------------------------------------------------
https://www.coursera.org/learn/machine-learning
https://www.coursera.org/learn/google-cbrs-cpi-training
https://www.coursera.org/learn/python
https://www.coursera.org/learn/the-science-of-well-being
https://www.coursera.org/learn/python-data
https://www.coursera.org/learn/learning-how-to-learn
https://www.coursera.org/learn/negotiation-skills
https://www.coursera.org/learn/private-equity
https://www.coursera.org/learn/indigenous-canada
https://www.coursera.org/learn/data-scientists-tools


In [6]:
# Intermediate: English-language courses filtered to the "Intermediate" difficulty.
# The search URL is written one query parameter per line; adjacent string
# literals are concatenated into the same single URL.
url = (
    'https://www.coursera.org/search?query='
    '&indices%5Bprod_all_products_term_optimization%5D%5Bpage%5D=1'
    '&indices%5Bprod_all_products_term_optimization%5D%5Bconfigure%5D%5BclickAnalytics%5D=true'
    '&indices%5Bprod_all_products_term_optimization%5D%5Bconfigure%5D%5BhitsPerPage%5D=10'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Intermediate'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BallLanguages%5D%5B0%5D=English'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BentityTypeDescription%5D%5B0%5D=Courses'
    '&configure%5BclickAnalytics%5D=true'
)
# Scrape 77 result pages for this difficulty level.
dfI = iterator(url, 77)

1 --------------------------------------------------------------------------
https://www.coursera.org/learn/neural-networks-deep-learning
https://www.coursera.org/learn/gcp-fundamentals
https://www.coursera.org/learn/python-data-analysis
https://www.coursera.org/learn/introduction-tensorflow
https://www.coursera.org/learn/convolutional-neural-networks
https://www.coursera.org/learn/machine-learning-with-python
https://www.coursera.org/learn/google-kubernetes-engine
https://www.coursera.org/learn/aws-fundamentals-cloud-migration
https://www.coursera.org/learn/gcp-big-data-ml-fundamentals
https://www.coursera.org/learn/aws-machine-learning


In [7]:
# Advanced: English-language courses filtered to the "Advanced" difficulty.
# The search URL is written one query parameter per line; adjacent string
# literals are concatenated into the same single URL.
url = (
    'https://www.coursera.org/search?query='
    '&indices%5Bprod_all_products_term_optimization%5D%5Bpage%5D=1'
    '&indices%5Bprod_all_products_term_optimization%5D%5Bconfigure%5D%5BclickAnalytics%5D=true'
    '&indices%5Bprod_all_products_term_optimization%5D%5Bconfigure%5D%5BhitsPerPage%5D=10'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Advanced'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BallLanguages%5D%5B0%5D=English'
    '&indices%5Bprod_all_products_term_optimization%5D%5BrefinementList%5D%5BentityTypeDescription%5D%5B0%5D=Courses'
    '&configure%5BclickAnalytics%5D=true'
)
# Scrape 13 result pages for this difficulty level.
dfA = iterator(url, 13)

1 --------------------------------------------------------------------------
https://www.coursera.org/learn/site-reliability-engineering-slos
https://www.coursera.org/learn/medical-neuroscience
https://www.coursera.org/learn/preparing-cloud-professional-cloud-architect-exam
https://www.coursera.org/learn/advanced-valuation-and-strategy
https://www.coursera.org/learn/probabilistic-graphical-models
https://www.coursera.org/learn/intro-self-driving-cars
https://www.coursera.org/learn/intro-to-deep-learning
https://www.coursera.org/learn/competitive-data-science
https://www.coursera.org/learn/end-to-end-ml-tensorflow-gcp
https://www.coursera.org/learn/social-economic-networks


### Concatenando los 4 DataFrames

In [13]:
# Stack the four per-difficulty frames (Beginner, Mixed, Intermediate,
# Advanced, in that order) into one frame with a fresh 0..n-1 index.
df = pd.concat(
    objs=[dfB, dfM, dfI, dfA],
    ignore_index=True,
)
# Bare last expression so the notebook renders the combined frame.
df

Unnamed: 0,title,description,difficulty_level,duration,keywords,url
0,AI For Everyone,AI is not only for engineers. If you want your...,Beginner Level,Approx. 9 hours to complete,"Workflow of Machine Learning projects, AI term...",https://www.coursera.org/learn/ai-for-everyone
1,Technical Support Fundamentals,This course is the first of a series that aims...,Beginner Level,Approx. 20 hours to complete,"Binary Code, Customer Support, Linux, Troubles...",https://www.coursera.org/learn/technical-suppo...
2,SQL for Data Science,As data collection has increased exponentially...,Beginner Level,Approx. 20 hours to complete,"Data Science, Data Analysis, Sqlite, SQL,",https://www.coursera.org/learn/sql-for-data-sc...
3,Digital Transformation,Digital transformation is a hot topic--but wha...,Beginner Level,Approx. 20 hours to complete,"Technology Disruption, Digital Trends, Competi...",https://www.coursera.org/learn/bcg-uva-darden-...
4,What is Data Science?,The art of uncovering the insights and trends ...,Beginner Level,Approx. 6 hours to complete,,https://www.coursera.org/learn/what-is-datasci...
5,AWS Fundamentals: Going Cloud-Native,This course will introduce you to Amazon Web S...,Beginner Level,Approx. 7 hours to complete,"Cloud Computing Security, AWS cloud, Cloud Sto...",https://www.coursera.org/learn/aws-fundamental...
6,Financial Markets,"An overview of the ideas, methods, and institu...",Beginner Level,Approx. 38 hours to complete,"Behavioral Finance, Financial Markets, Finance...",https://www.coursera.org/learn/financial-marke...
7,Digital Product Management: Modern Fundamentals,"Not so long ago, the job of product manager wa...",Beginner Level,Approx. 19 hours to complete,"Product/Market Fit, Product Management, Design...",https://www.coursera.org/learn/uva-darden-digi...
8,Design Thinking for Innovation,Today innovation is everyone's business. Wheth...,Beginner Level,Approx. 10 hours to complete,"Strategic Thinking, Design Thinking, Innovatio...",https://www.coursera.org/learn/uva-darden-desi...
9,Improving Deep Neural Networks: Hyperparameter...,"This course will teach you the ""magic"" of gett...",Beginner Level,Approx. 15 hours to complete,"Hyperparameter, Tensorflow, Hyperparameter Opt...",https://www.coursera.org/learn/deep-neural-net...


In [14]:
# Persist the combined metadata: a JSON array with one object per course,
# and an Excel workbook with a single sheet named "data".
df.to_json("metadataCoursera2.json", orient='records')
# sheet_name is passed by keyword: positional use is deprecated and every
# to_excel argument after the writer/path becomes keyword-only in pandas 3.0.
df.to_excel("metadataCoursera2.xlsx", sheet_name="data")