<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/online_learning_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Udacity courses analytics

Data about Udacity courses and programs can be found [here](https://www.udacity.com/courses/all?type=free%20courses)

## Uploading packages and data

In [1]:
#Importing HTTP libraries
from bs4 import BeautifulSoup as bs

In [2]:
#Importing data manipulation libraries
import numpy as np
import pandas as pd

##Retrieving data from URL

In [5]:
#Removing previous version of the file
!rm udacity.txt

In [6]:
#Uploading file from local drive
from google.colab import files
my_text_1 = files.upload()

Saving udacity.txt to udacity.txt


In [7]:
#Extracting html from txt file
html1 = my_text_1['udacity.txt']

In [205]:
#Create a function to retrieve data from given URL
def get_courses(url):
  """Retrieve data from given URL and return a dataframe"""
  
  #Get data and parse with BeautifulSoup
  try:
    soup = bs(url, 'html.parser')
    
    #Save retrieved data to lists
    d1_names = [i.text for i in soup.find_all('h2')]
    d1_schools = [i.text for i in soup.find_all('h3')]
    d1_description = [i.p.text for i in soup.find_all(class_='card_body__1fi66')]
    d1_skills = [i.p.text if i.p != None else None for i in soup.find_all(class_='card_overview__G6gIz')]
    d1_flag = [i.small.text if i.small != None else 'New' for i in soup.find_all(class_='card_container__aBcfz')]
    d1_level = [i.li.text if i.li.text != '' else 'beginner' for i in soup.find_all(class_='card_stats__StvYz')]
    d1_duration = [i.text for i in soup.find_all(attrs={'data-duration': True})]
    d1_url = ['https://www.udacity.com' + i.a['href'] for i in soup.find_all(class_='card-list_catalogCardListItem__aUQtx')]
    
    #Save data to dataframe
    df = pd.DataFrame()
    df = df.assign(name=d1_names, school=d1_schools, description=d1_description,
                   skills = d1_skills, flag=d1_flag, level=d1_level,
                   duration= d1_duration, url=d1_url)
    return df

  except TypeError:
    print('TypeError')

In [206]:
#Generating a dataframe from url
df1 = get_courses(html1)
df1.tail(5)

Unnamed: 0,name,school,description,skills,flag,level,duration,url
259,Front-End Interview Prep,Career Advancement,Answer front-end technical and behavioral inte...,"Interview questions, Common FE Questions, Whit...",Free,intermediate,1 Week,https://www.udacity.com/course/front-end-inter...
260,Full-Stack Interview Prep,Career Advancement,Answer common full stack and web security inte...,"Interview practice, Common FS Questions, White...",Free,intermediate,1 Week,https://www.udacity.com/course/full-stack-inte...
261,Data Structures & Algorithms in Swift,Career Advancement,Review and practice the skills technical inter...,"Interview practice, Swift, Data structures, Ca...",Free,intermediate,4 Weeks,https://www.udacity.com/course/data-structures...
262,iOS Interview Prep,Career Advancement,Answer iOS and mobile development interview qu...,"Interview practice, Common iOS Questions, Whit...",Free,intermediate,1 Week,https://www.udacity.com/course/ios-interview-p...
263,VR Interview Prep,Career Advancement,Learn how to tackle interview questions for te...,"Interview practice, Common VR Questions, White...",Free,intermediate,1 Week,https://www.udacity.com/course/vr-interview-pr...


In [209]:
#Checking if there are missing values in any column
df1[df1.isna()].sum()

name           0.0
school         0.0
description    0.0
skills         0.0
flag           0.0
level          0.0
duration       0.0
url            0.0
dtype: float64

In [185]:
#Checking for unique values in given column
df1['duration'].unique()

array(['5 Months', '4 Months', '3 Months', '6 Months', '2 Months',
       '12 Weeks', '160 Hours', '1 Month', '16 Weeks', '4 Weeks', '',
       '8 Weeks', '7 Weeks', '2 Weeks', '1 Week', '10 Weeks', '3 Weeks',
       '1 Hour', '6 Weeks', '10 Hours', '1 Day', '5 Weeks', '3 Days',
       '2 Days', '7 Days', '60 Hours', '6 Days'], dtype=object)

In [210]:
#Exporting to excel into local disk
from google.colab import files
df1.to_excel('2021_udacity_courses.xlsx', index=False) #==> Excluding index from file
files.download('2021_udacity_courses.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#Udemy courses finder

Data about Udemy courses with discount coupons can be found [here](https://yofreesamples.com/courses/free-discounted-udemy-courses-list/)

## Uploading packages and data

In [2]:
#Importing HTTP libraries
import requests
from bs4 import BeautifulSoup as bs

In [3]:
#Importing data manipulation libraries
import numpy as np
import pandas as pd

##Scraping courses from URL

In [None]:
#Removing previous version of the file
!rm udacity.txt

In [None]:
#Uploading file from local drive
from google.colab import files
my_text_1 = files.upload()

Saving udacity.txt to udacity.txt


In [156]:
#Create a function to retrieve data from given URL
def find_udemy_courses(url):
  """Retrieve udemy courses from given URL and return a dataframe"""
  
  #Get data and parse with BeautifulSoup
  try:
    response = requests.get(url)
    print(response)
    soup = bs(response.text, 'html.parser')
    
    #Save retrieved data to lists
    d2_categories = [list(i)[1].replace('.', '').strip() for i in soup.find_all('p') if 'Category:' in i.text]
    d2_names = [i.text for i in soup.find_all(class_='course_title')]
    d2_urls = [i['href'] for i in soup.find_all(class_='course_title')]
    d2_ratings = [i.text.split(' ')[1] for i in soup.find_all('p') if 'Rating:' in i.text]
    d2_date = soup.find('h1').text.split('–')[1].strip()
    
    #Save data to dataframe
    df = pd.DataFrame()
    df = df.assign(category=d2_categories, name=d2_names,
                   url=d2_urls, rating=d2_ratings, date=d2_date)
    return df

  except TypeError:
    print('TypeError')

In [157]:
#Generating a dataframe of courses from url
url = 'https://yofreesamples.com/courses/free-discounted-udemy-courses-list/'

df2 = find_udemy_courses(url)

<Response [200]>
8/8/2021


In [158]:
#Converting column to datetime
df['date'] = pd.to_datetime(df['date'])
df2.tail()

Unnamed: 0,category,name,url,rating,date
104,IT & Software,AWS Services for Solutions Architect Associate...,https://www.udemy.com/course/aws-services-for-...,4.4.,2021-08-08
105,IT & Software,Time Series Analysis Real World Projects in Py...,https://www.udemy.com/course/time-series-analy...,4.4.,2021-08-08
106,Development,PHP for Beginners: PHP Crash Course 2021 CDeve...,https://www.udemy.com/course/learn-php-for-beg...,4.1.,2021-08-08
107,Photography & Video,Color Correction & Grading with Adobe Premiere...,https://www.udemy.com/course/color-correction-...,3.8.,2021-08-08
108,Photography & Video,Video Editing with Adobe Premiere Pro CC 2021 ...,https://www.udemy.com/course/video-editing-wit...,3.9.,2021-08-08


In [160]:
#Checking the unique values from rating column
df2['category'].unique()

array(['Data Science', 'IT & Software', 'Office Productivity',
       'Photography & Video', 'Business', 'Finance & Accounting',
       'Teaching & Academics', 'Design', 'Marketing',
       'Personal Development', 'Development',
       'Project Management & Operations'], dtype=object)

In [185]:
#Filtering only data science courses
#dsc_list = ['Artificial', 'Machine', 'Deep', 'Data', 'SQL', 'Spark']
df2_dsc = df2[(df2['name'].str.contains('Python', regex=True))]
              #| (df2['name'].str.contains('|'.join(dsc_list), regex=True))
df2_dsc

Unnamed: 0,category,name,url,rating,date
0,Data Science,Machine Learning and AI: Support Vector Machin...,https://www.udemy.com/course/support-vector-ma...,4.7.,2021-08-08
4,Data Science,Cutting-Edge AI: Deep Reinforcement Learning i...,https://www.udemy.com/course/cutting-edge-arti...,4.6.,2021-08-08
31,Development,Build & Host Artificial Intelligence Apps (Str...,https://www.udemy.com/course/ai-app-streamlit-...,4.8.,2021-08-08
49,Development,Python Network Programming for Network Enginee...,https://www.udemy.com/course/python-network-pr...,4.7.,2021-08-08
55,IT & Software,Python Complete Course For Python Beginners IT...,https://www.udemy.com/course/python-complete-c...,4.2.,2021-08-08
61,IT & Software,Natural Language Processing Real-World Project...,https://www.udemy.com/course/natural-language-...,4.4.,2021-08-08
92,Development,Python for beginners – Learn all the basics of...,https://www.udemy.com/course/python-for-beginn...,4.4.,2021-08-08
105,IT & Software,Time Series Analysis Real World Projects in Py...,https://www.udemy.com/course/time-series-analy...,4.4.,2021-08-08


In [178]:
#Extracting URL of desired course
df2['url'][0]

'https://www.udemy.com/course/support-vector-machines-in-python/?couponCode=FREEAUG21'