# Scrape Datacamp

In [1]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
# turn a string into a clickable URL

def make_click(val):
    return '<a href="{}">{}</a>'.format(val,val)

In [3]:
# get and parse a desired page - URL given in 2 parts

def get_page(url1, url2):
    url = url1 + url2
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser') # parse the page
    return soup

In [4]:
# search the page for classes 
# extract the class information and create a dataframe of the info

def parse_page(page_soup, topic):
    # find the section with all the courses
    all_courses = page_soup.find(class_='courses__explore-list row')
    
    # get all the sections which contain a class description
    all_courses_names = all_courses.find_all('a', class_='course-block__link')
    
    # create a matrix of all the data
    data = []

    # parse each course for the course specific content
    for course in all_courses_names:
        title = course.find_all(class_='course-block__title')[0].get_text()
        description = course.find_all(class_='course-block__description')[0].get_text().strip()
        time = course.find_all(class_='course-block__length')[0].get_text()
        link = course['href']
        data.append([title, description, time, link, topic])
        
    df_classes = pd.DataFrame(data)
    
    if len(data) == 0:
        # there are no courses on the page requested
        df_classes = df_classes.append(pd.Series(['','','','',topic]),ignore_index=True)
    else:
        # create a pandas dataframe with all the data
        df_classes = pd.DataFrame(data)
    
    column_names = ['Title', 'Description', 'Time', 'Link', 'Topic']
    df_classes.columns = column_names

    return df_classes
    

In [5]:
# page URL building
r_url = "https://www.datacamp.com/courses/tech:r"
py_url = "https://www.datacamp.com/courses/tech:python"

# there are 10 topics with the URL in the form:
#   - https://www.datacamp.com/courses/tech:python/topic:programming
topics = ['programming',
          'importing_and_cleaning_data',
          'data_manipulation',
          'data_visualization',
          'probablity_and_statistics',
          'machine_learning',
          'applied_finance',
          'reporting',
          'case_studies',
          'other']

In [6]:
r_frames = []
py_frames = []

for topic in topics:
    # create data frame of R courses:
    soup_r = get_page(r_url, '/topic:'+topic)
    tmp_r = parse_page(soup_r, topic)
    r_frames.append(tmp_r)

    # create data frame of Python courses:
    soup_py = get_page(py_url, '/topic:'+topic)
    tmp_py = parse_page(soup_py, topic)    
    py_frames.append(tmp_py)

In [7]:
df_py = pd.concat(py_frames)

In [8]:
df_r = pd.concat(r_frames)

In [9]:
# turn the "link" into a full URL
df_r['URL'] = 'https://www.datacamp.com' + df_r['Link'].astype(str)
df_py['URL'] = 'https://www.datacamp.com' + df_py['Link'].astype(str)

In [10]:
# delete the old "Link" column
df_r = df_r.drop('Link', 1)
df_py = df_py.drop('Link', 1)

# Display the Output

In [11]:
df_r

Unnamed: 0,Title,Description,Time,Topic,URL
0,Introduction to R,Master the basics of data analysis by manipula...,4 hours,programming,https://www.datacamp.com/courses/free-introduc...
1,Intermediate R,Continue your journey to become an R ninja by ...,6 hours,programming,https://www.datacamp.com/courses/intermediate-r
2,Writing Functions in R,Learn the fundamentals of writing functions in...,4 hours,programming,https://www.datacamp.com/courses/writing-funct...
3,Introduction to the Tidyverse,Get started on the path to exploring and visua...,4 hours,programming,https://www.datacamp.com/courses/introduction-...
4,Writing Efficient R Code,"Learn to write faster R code, discover benchma...",4 hours,programming,https://www.datacamp.com/courses/writing-effic...
5,Building Web Applications in R with Shiny,Build interactive web apps straight from R wit...,4 hours,programming,https://www.datacamp.com/courses/building-web-...
6,String Manipulation in R with stringr,"Learn how to pull character strings apart, put...",4 hours,programming,https://www.datacamp.com/courses/string-manipu...
7,Object-Oriented Programming in R: S3 and R6,Manage the complexity in your code using objec...,4 hours,programming,https://www.datacamp.com/courses/object-orient...
8,Working with Dates and Times in R,"Learn the essentials of parsing, manipulating ...",4 hours,programming,https://www.datacamp.com/courses/working-with-...
9,Scalable Data Processing in R,Learn how to write scalable code for working w...,4 hours,programming,https://www.datacamp.com/courses/scalable-data...


In [12]:
df_py

Unnamed: 0,Title,Description,Time,Topic,URL
0,Intro to Python for Data Science,Master the basics of data analysis in Python. ...,4 hours,programming,https://www.datacamp.com/courses/intro-to-pyth...
1,Intermediate Python for Data Science,Level up your data science skills by creating ...,4 hours,programming,https://www.datacamp.com/courses/intermediate-...
2,Python Data Science Toolbox (Part 1),Learn the art of writing your own functions in...,3 hours,programming,https://www.datacamp.com/courses/python-data-s...
3,Python Data Science Toolbox (Part 2),Continue to build your modern Data Science ski...,4 hours,programming,https://www.datacamp.com/courses/python-data-s...
4,Data Types for Data Science,Consolidate and extend your knowledge of Pytho...,4 hours,programming,https://www.datacamp.com/courses/data-types-fo...
0,Importing Data in Python (Part 1),Learn to import data into Python from various ...,3 hours,importing_and_cleaning_data,https://www.datacamp.com/courses/importing-dat...
1,Cleaning Data in Python,This course will equip you with all the skills...,4 hours,importing_and_cleaning_data,https://www.datacamp.com/courses/cleaning-data...
2,Importing Data in Python (Part 2),Improve your Python data importing skills and ...,2 hours,importing_and_cleaning_data,https://www.datacamp.com/courses/importing-dat...
0,pandas Foundations,Learn how to use the industry-standard pandas ...,4 hours,data_manipulation,https://www.datacamp.com/courses/pandas-founda...
1,Manipulating DataFrames with pandas,"You will learn how to tidy, rearrange, and res...",4 hours,data_manipulation,https://www.datacamp.com/courses/manipulating-...


# Playing with word count occurrences 

In [13]:
# # try nltk...
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')

In [14]:
# # get all the words in the "Title" column with counts of each word 
# unq_words_counts = df_classes['Title'].str.split(' ', expand=True).stack().value_counts()

In [15]:
# df_title_words = unq_words_counts.to_frame() # turn this series into a dataframe
# df_title_words.reset_index(inplace=True) # put the words as a column, and reset the index
# df_title_words.columns = ['Word','Count'] # title the columns 
# df_title_words.head()

In [16]:
# # drop the rows if the word in "Word" is a stopword
# df_title_words.drop(df_title_words.index[df_title_words['Word'].isin(stopwords.words('english'))], inplace=True)

In [17]:
# # need to remove a few other words too...
# bad_list = ['R','R:','Working','Using','The','-','Exploring']
# df_title_words.drop(df_title_words.index[df_title_words['Word'].isin(bad_list)], inplace=True)