<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/online_learning_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Udacity courses analytics

Data about Udacity courses and programs can be found [here](https://www.udacity.com/courses/all?type=free%20courses)

## Uploading packages and data

In [None]:
#Importing HTTP libraries
from bs4 import BeautifulSoup as bs

In [None]:
#Importing data manipulation libraries
import numpy as np
import pandas as pd

##Retrieving data from URL

In [None]:
#Removing previous version of the file
!rm udacity.txt

In [None]:
#Uploading file from local drive
from google.colab import files
my_text_1 = files.upload()

Saving udacity.txt to udacity.txt


In [None]:
#Extracting html from txt file
html1 = my_text_1['udacity.txt']

In [None]:
#Create a function to retrieve data from given URL
def get_courses(url):
  """Retrieve data from given URL and return a dataframe"""
  
  #Get data and parse with BeautifulSoup
  try:
    soup = bs(url, 'html.parser')
    
    #Save retrieved data to lists
    d1_names = [i.text for i in soup.find_all('h2')]
    d1_schools = [i.text for i in soup.find_all('h3')]
    d1_description = [i.p.text for i in soup.find_all(class_='card_body__1fi66')]
    d1_skills = [i.p.text if i.p != None else None for i in soup.find_all(class_='card_overview__G6gIz')]
    d1_flag = [i.small.text if i.small != None else 'New' for i in soup.find_all(class_='card_container__aBcfz')]
    d1_level = [i.li.text if i.li.text != '' else 'beginner' for i in soup.find_all(class_='card_stats__StvYz')]
    d1_duration = [i.text for i in soup.find_all(attrs={'data-duration': True})]
    d1_url = ['https://www.udacity.com' + i.a['href'] for i in soup.find_all(class_='card-list_catalogCardListItem__aUQtx')]
    
    #Save data to dataframe
    df = pd.DataFrame()
    df = df.assign(name=d1_names, school=d1_schools, description=d1_description,
                   skills = d1_skills, flag=d1_flag, level=d1_level,
                   duration= d1_duration, url=d1_url)
    return df

  except TypeError:
    print('TypeError')

In [None]:
#Generating a dataframe from url
df1 = get_courses(html1)
df1.tail(5)

Unnamed: 0,name,school,description,skills,flag,level,duration,url
259,Front-End Interview Prep,Career Advancement,Answer front-end technical and behavioral inte...,"Interview questions, Common FE Questions, Whit...",Free,intermediate,1 Week,https://www.udacity.com/course/front-end-inter...
260,Full-Stack Interview Prep,Career Advancement,Answer common full stack and web security inte...,"Interview practice, Common FS Questions, White...",Free,intermediate,1 Week,https://www.udacity.com/course/full-stack-inte...
261,Data Structures & Algorithms in Swift,Career Advancement,Review and practice the skills technical inter...,"Interview practice, Swift, Data structures, Ca...",Free,intermediate,4 Weeks,https://www.udacity.com/course/data-structures...
262,iOS Interview Prep,Career Advancement,Answer iOS and mobile development interview qu...,"Interview practice, Common iOS Questions, Whit...",Free,intermediate,1 Week,https://www.udacity.com/course/ios-interview-p...
263,VR Interview Prep,Career Advancement,Learn how to tackle interview questions for te...,"Interview practice, Common VR Questions, White...",Free,intermediate,1 Week,https://www.udacity.com/course/vr-interview-pr...


In [None]:
#Checking if there are missing values in any column
df1[df1.isna()].sum()

name           0.0
school         0.0
description    0.0
skills         0.0
flag           0.0
level          0.0
duration       0.0
url            0.0
dtype: float64

In [None]:
#Checking for unique values in given column
df1['duration'].unique()

array(['5 Months', '4 Months', '3 Months', '6 Months', '2 Months',
       '12 Weeks', '160 Hours', '1 Month', '16 Weeks', '4 Weeks', '',
       '8 Weeks', '7 Weeks', '2 Weeks', '1 Week', '10 Weeks', '3 Weeks',
       '1 Hour', '6 Weeks', '10 Hours', '1 Day', '5 Weeks', '3 Days',
       '2 Days', '7 Days', '60 Hours', '6 Days'], dtype=object)

In [None]:
#Exporting to excel into local disk
from google.colab import files
df1.to_excel('2021_udacity_courses.xlsx', index=False) #==> Excluding index from file
files.download('2021_udacity_courses.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#Udemy courses finder

Data about Udemy courses with discount coupons can be found [here](https://yofreesamples.com/courses/free-discounted-udemy-courses-list/)

## Uploading packages and data

In [None]:
#Importing HTTP libraries
import requests
from bs4 import BeautifulSoup as bs

In [None]:
#Importing data manipulation libraries
import numpy as np
import pandas as pd
from time import sleep
from random import randint
from collections import Counter

##Scraping courses from URL

In [None]:
#Create a function to retrieve data from given URL
def find_udemy_courses(url):
  """Retrieve udemy courses from given URL and return a dataframe"""
  
  #Get data and parse with BeautifulSoup
  try:
    sleep(randint(5,9))
    response = requests.get(url)
    print(response)
    if response.status_code == 200:
      soup = bs(response.text, 'html.parser')
      
      #Save retrieved data to lists
      d2_categories = [list(i)[1].replace('.', '').strip() for i in soup.find_all('p') if 'Category:' in i.text]
      d2_names = [i.text for i in soup.find_all(class_='course_title')]
      d2_urls = [i['href'] for i in soup.find_all(class_='course_title')]
      d2_ratings = [i.text.split(' ')[1] for i in soup.find_all('p') if 'Rating:' in i.text]
      d2_date = soup.find('h1').text.split('–')[1].strip()
      
      #Save data to dataframe
      df = pd.DataFrame()
      df = df.assign(category=d2_categories, name=d2_names,
                    url=d2_urls, rating=d2_ratings, date=d2_date)
      return df
    
    else:
      return pd.DataFrame({'Response': [str(response.status_code)]})

  except TypeError:
    print('TypeError')

In [None]:
#Generating a dataframe of courses from url
url = 'https://yofreesamples.com/courses/free-discounted-udemy-courses-list/'

df2 = find_udemy_courses(url)

<Response [403]>


In [None]:
#Converting column to datetime
df2['date'] = pd.to_datetime(df2['date'])
df2.tail()

In [None]:
#Counting values from unique categories
categ_2 = Counter(df2['category'])
repr(categ_2)

"Counter({'Business': 21, 'Marketing': 19, 'Development': 17, 'IT & Software': 15, 'Finance & Accounting': 10, 'Design': 7, 'Office Productivity': 5, 'Personal Development': 5, 'Teaching & Academics': 3, 'Health & Fitness': 3})"

In [None]:
#Filtering only data science courses
dsc_list = ['Artificial', 'Machine', 'Deep', 'Data', 'SQL', 'Spark', 'Azure']
df2_dsc = df2[(df2['name'].str.contains('Python|python', regex=True))].reset_index(drop=True)
              #(df2['name'].str.contains('|'.join(dsc_list), regex=True))].reset_index(drop=True)
df2_dsc

Unnamed: 0,category,name,url,rating,date
0,IT & Software,Python para no matemáticos: De 0 hasta reconoc...,https://www.udemy.com/course/python-de-cero-ha...,4.5.,2021-08-24
1,Development,Python and Data Handling Libraries Fully Diplo...,https://www.udemy.com/course/mastering-python-...,4.4.,2021-08-24
2,Development,Python for beginners – Learn all the basics of...,https://www.udemy.com/course/python-for-beginn...,4.3.,2021-08-24
3,Development,Python | Proje ve Uygulama ile Sıfırdan Zirvey...,https://www.udemy.com/course/python-proje-ve-u...,4.8.,2021-08-24
4,Business,"Decision Trees, Random Forests, AdaBoost & XGB...",https://www.udemy.com/course/machine-learning-...,4.0.,2021-08-24
5,Development,The Python Programming A-Z Definitive Diploma ...,https://www.udemy.com/course/the-ultimate-pyth...,4.3.,2021-08-24
6,IT & Software,Python Bootcamp 2021 Build 15 working Applicat...,https://www.udemy.com/course/python-complete-b...,4.4.,2021-08-24
7,Development,The Python Developer Essentials 2021 Immersive...,https://www.udemy.com/course/new-python-progra...,4.4.,2021-08-24


In [None]:
#Extracting URL of desired course
df2_dsc['url'][0]

'https://www.udemy.com/course/machine-learning-advanced-decision-trees-in-python/?couponCode=DTPYAUFR1'

#Scraping of certifications and courses

Scraping data and prices from several online courses

## Uploading packages and data

In [1]:
#Importing HTTP libraries
import requests
from bs4 import BeautifulSoup as bs

In [2]:
#Importing data manipulation libraries
import numpy as np
import pandas as pd
from time import sleep
from random import randint
from collections import Counter

##Scraping courses

In [3]:
#Create a function to retrieve data from given URL
def scrape_data_courses():
  """Retrieve data science courses from given URL and return a dictionary"""
  
  course_list = {'TinyML': {'Url': 'https://www.edx.org/professional-certificate/harvardx-tiny-machine-learning',
                            'Attr': 'main d-flex flex-wrap'},
                 'Data Engineer Udacity': {'Url': 'https://www.udacity.com/course/data-engineer-nanodegree--nd027',
                                           'Attr': '_price-card_pricingTemplate__Am1WB'},
                 'Data Engineering': {'Url': 'https://learndataengineering.com/p/academy',
                                      'Attr': 'block__pricing__plan__price'},
                 '100 Days of Python': {'Url': 'https://www.appbrewery.co/', 'Attr': 'btn btn-hg btn-primary btn-enroll'},
                 'AZ-220 IoT Developer': {'Url': 'https://docs.microsoft.com/en-us/learn/certifications/azure-iot-developer-specialty/',
                                          'Attr': 'font-size-h5'}
                 }
  my_courses = {}
  
  #Get data and parse with BeautifulSoup
  for i in course_list.keys():
    sleep(randint(3,10))
    response = requests.get(course_list[i]['Url'])
    print(f'{i}: {response}')
    soup = bs(response.text, 'html.parser')
      
    #Save retrieved data to dictionary
    if response.status_code == 200:
      my_courses.update({i: n.text for n in soup.find_all(class_=course_list[i]['Attr'])})
    else:
      my_courses.update({i: str(response.status_code)})
  
  return my_courses

In [4]:
#Generating a list of dictionaries from a list of urls

course_list = scrape_data_courses()

TinyML: <Response [200]>
Data Engineer Udacity: <Response [200]>
Data Engineering: <Response [200]>
100 Days of Python: <Response [200]>
AZ-220 IoT Developer: <Response [200]>


In [5]:
#Printing the dictionary of scraped courses
course_list

{'100 Days of Python': 'Hurry Just $12.99 Today',
 'AZ-220 IoT Developer': '',
 'Data Engineer Udacity': '\n\n \n\n\n\nfor - access\n\n',
 'Data Engineering': '3 payments of $80/month',
 'TinyML': 'Discounted price: $537.30Pre-discounted price: $597USD'}