In [5]:
import json
import logging

from bs4 import BeautifulSoup
import numpy as np
import requests
import pandas as pd
import seaborn as sns

import utils.logger
from utils.logger import LoggingMessage


In [6]:
from datetime import datetime

# Record when the notebook was last executed.
# `%b` (abbreviated month name) is used instead of `%h`: `%h` is a C-library
# alias that Python does not document and that is not accepted on every
# platform, whereas `%b` yields the same text everywhere.
time_stamp = f"Last time this notebook runs at {datetime.now():%d %b, %Y %H:%M:%S}."
logging.info(time_stamp)

# All courses dict

In [None]:
# Load the mapping stored in the courses JSON file. The `with` block closes
# the file handle as soon as parsing finishes (a bare `open()` passed to
# `json.load` is never closed explicitly), and the encoding is pinned to
# UTF-8 so the result does not depend on the platform default.
with open('data/courses/all_courses_dict.json', encoding='utf-8') as courses_file:
    courses_dict: dict = json.load(courses_file)

# Show the available keys so one can be picked below.
courses_dict.keys()

## Select course

In [None]:
# Choose the course to analyse: `course_name` must be one of the keys of
# `courses_dict` (copy it from the keys printed by the previous cell).
# The commented-out lines are alternative choices — presumably other valid keys.
# course_name = 'Java with DSA and system design'
# course_name = 'Data Science masters'
course_name = 'Full Stack web development'
# Value stored for the chosen course; it is embedded in the API URLs built later.
course_id = courses_dict[course_name]

def get_url(course_name):
    """Return the learn.pwskills.com page URL for ``course_name``.

    The id segment of the URL is looked up in the module-level
    ``courses_dict``; a name that is not a key of that dict raises ``KeyError``.
    """
    return f'https://learn.pwskills.com/course/{course_name}/{courses_dict[course_name]}'


In [None]:
def get_live_course_dict(url: str, timeout: float = 30):
    """Download a course page and return the ``pageProps`` data embedded in it.

    The page is fetched with a GET request, the
    ``<script id="__NEXT_DATA__">`` tag is located in the HTML, its text is
    parsed as JSON, and the value under ``['props']['pageProps']`` is returned.

    Args:
        url: Address of the course page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on an unresponsive host.

    Returns:
        The object stored under ``props -> pageProps`` in the embedded JSON.

    Raises:
        requests.HTTPError: If the response status code is not 200.
        TypeError: If the ``__NEXT_DATA__`` script tag is not present.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Get request with url; the timeout keeps a stalled connection from
    # blocking the notebook forever.
    r = requests.get(url, timeout=timeout)
    logging.info(LoggingMessage.get_request_log.format(url))
    logging.info(LoggingMessage.status_code_log.format(r.status_code))

    # Only a plain 200 response is accepted.
    if r.status_code != 200:
        raise requests.HTTPError('Status is not 200.')
    soup = BeautifulSoup(r.text, 'html.parser')

    # Find required script tag
    script = soup.find('script', {'id': '__NEXT_DATA__'})
    if not script:
        raise TypeError('Required script tag is not available.')

    # The tag's text is the JSON document; no extra dict() copy is needed.
    return json.loads(script.text)['props']['pageProps']


In [None]:
# Build the page URL for the selected course.
url = get_url(course_name)

# Download the page and keep the `pageProps` data embedded in it.
live_course = get_live_course_dict(url)

In [None]:
# List the top-level keys of the downloaded course data.
live_course.keys()

In [None]:
# Both tables live under the same 'sections' entry of the course data.
_sections_payload = live_course['sections']

# Per-section records
live_course_section = _sections_payload['sections']

# Per-lesson details
live_course_lessons = _sections_payload['lessonDetails']

# Import `sections` data


In [None]:
# Flatten the section records: one row per entry of each section's 'lessons'
# list, with the section's 'title' repeated alongside as metadata.
sections = pd.json_normalize(
    live_course_section,
    record_path='lessons',
    meta='title',
)
print(sections.shape)
sections.head()

### Extract `dates` from `title`


In [None]:
# Earlier regex-based attempt, kept for reference:
# sections['date'] = (sections['title']
#                     .str.extract(r"(\d{1,2}\w{0,2} \w{3}'23)")
#                     .astype('datetime64'))

# Derive a `date` column from the text of each section title.
# NOTE(review): the literal '23' hard-codes the year 2023 — titles from other
# years will not be parsed; confirm this is acceptable.
sections['date'] = pd.to_datetime((sections['title']
                             # keep everything up to and including the LAST '23' in the title
                             .str.rsplit('23', n=1).str.get(0).add('23')
                             # turn the apostrophe-year forms ("'23", "' 23", " ' 23") into " 2023"
                             .str.replace(r"' 23|'23| ' 23", ' 2023', regex=True)
                             # a value that is exactly '23' (the title was empty) carries no date
                             .str.replace(r'^(23)$', '', regex=True)
                             # drop a leading "<1-2 digits> - " prefix (presumably a section number)
                             .str.replace(r'^\d{1,2} - ', '', regex=True)
                             # unparseable strings become NaT and are then back-filled,
                             # i.e. they take the next valid date further down the frame
                             ), errors='coerce').bfill()

### Remove `dates` from `title`


In [None]:
def _text_after_last_23(raw_title):
    """Return what follows the last '23' in the title; falsy titles pass through unchanged."""
    if not raw_title:
        return raw_title
    return str(raw_title).rsplit('23', 1)[-1]


# Keep only the part of each title that follows the date, without surrounding whitespace.
_titles_without_date = sections['title'].apply(_text_after_last_23)
sections['title'] = _titles_without_date.str.strip()

### Rename columns


In [None]:
# Give the columns descriptive names (modifies `sections` in place).
# The integer label 0 is presumably the column `pd.json_normalize` created for
# the lesson records (verify); it becomes `_id`, the key later used to merge
# with `lessons`. `title` becomes `sectionsTitle`.
sections.rename(columns={0: '_id', 'title': 'sectionsTitle'}, inplace=True)

In [None]:
# Preview the first 28 rows of the cleaned `sections` frame.
sections.head(28)

# Import `lessons` data


In [None]:
# Build the lessons table from the `lessonDetails` part of the course data.
lessons = pd.DataFrame.from_dict(live_course_lessons)
lessons.head()

### Extract video `duration` from `data` column.

In [None]:
# Element-wise lookup of the 'duration' entry inside each row's `data` value
# (assumes `data` holds dict-like objects — TODO confirm).
lessons['duration'] = lessons['data'].str.get('duration')  # type: ignore

### Extract assignments `maxPoints` from `data` column.

In [None]:
# Element-wise lookup of the 'maxPoints' entry inside each row's `data` value,
# stored as the assignment's total points.
lessons['totalPointsInAssignment'] = lessons['data'].str.get('maxPoints')  # type: ignore

### Extract sectionResource's `resourceURL` from `data` column.

In [None]:
# Element-wise lookup of the 'resourceURL' entry inside each row's `data` value.
# The `url` column is later overwritten for videos, quizzes and assignments.
lessons['url'] = lessons['data'].str.get('resourceURL')  # type: ignore

In [None]:
# Column dtypes and non-null counts, including the newly extracted columns.
lessons.info()

### Check `quizQuestions` column


In [None]:
def _empty_list_as_nan(value):
    """Map an empty list to NaN; return every other value untouched."""
    if isinstance(value, list) and len(value) == 0:
        return np.nan
    return value


# Count entries that are missing or hold an empty list.
_quiz_questions = lessons['quizQuestions'].apply(_empty_list_as_nan)
no_of_na_vals = _quiz_questions.isnull().sum()

print(f"'quizQuestions' columns has {no_of_na_vals} null values out of {lessons.shape[0]} values.")
print('Thats why, need to drop it.')

### Drop Columns


In [None]:
# Remove `data` (the needed values were copied into their own columns) and
# `quizQuestions`. This modifies `lessons` in place, so re-running the cell
# on the same frame raises a KeyError because the columns are already gone.
lessons.drop(columns=['data', 'quizQuestions'], inplace=True)

### Rename columns


In [None]:
# Rename `title` to `lessonsTitle` (in place) so lesson titles are easy to
# tell apart from the sections' `sectionsTitle` once the frames are merged.
lessons.rename(columns={
    'title': 'lessonsTitle'
}, inplace=True)

In [None]:
# Re-check the schema after dropping and renaming columns.
lessons.info()

# Merge the datasets


In [None]:
# Join every section row with its lesson details on the shared `_id` key.
# An inner join keeps only the ids that appear in both frames.
df = sections.merge(lessons, on='_id', how='inner')
print(df.shape)
df.head()

# EDA ON DATA

### Count of different `type` in course


In [None]:
# Number of rows for each distinct value of `type`.
df['type'].value_counts()

In [None]:
# Share of each `type` as a percentage, rounded to one decimal place.
(df['type'].value_counts(normalize=True) * 100).round(1)

In [None]:
# The same per-type counts drawn as a bar chart (one bar per `type` value).
sns.countplot(data=df, x='type')

### What is _sectionResource_?

Initially, `sectionResource` contained the `resourceURL` for __Live classes, Doubt classes and the Feedback form__.  
Since **2 March, 2023**, a new `sectionResource` named **Python Practice** has been added to the course; it contains the G-Drive URLs of PDFs and Docs.

In [None]:
# Rows whose `type` is exactly 'sectionResource'.
_is_section_resource = df['type'] == 'sectionResource'
sec_resource = df.loc[_is_section_resource]

print(sec_resource.shape)
sec_resource.tail()

### Live Course Duration

In [None]:
# Course-wide totals; `.sum()` skips missing values, so rows of other lesson
# types do not affect each total.
total_quiz_ques = df['totalQuestionsInQuiz'].sum()
total_asgn_points = df['totalPointsInAssignment'].sum()
# Dividing by 3600 converts to hours — assumes `duration` is in seconds (TODO confirm).
total_video_duration = round(df['duration'].sum() / 3600, 2)

print(f'No. of questions in Quiz = {int(total_quiz_ques)} questions.')
print(f'Total assignments points = {int(total_asgn_points)} points.')
print(f'Total duration of videos in Course = {total_video_duration} hours.')

### How many days has this course lasted so far?


In [None]:
# Latest and earliest section dates; both names are reused by later cells.
max_date = df['date'].max()
min_date = df['date'].min()

# `.days` is the whole-day part of the difference between the two dates.
print(f'For now the course has runs for {(max_date-min_date).days} days.')

In [None]:
# Print the first and last section dates as "day month-abbreviation, year".
# `%b` replaces `%h`: both mean the abbreviated month name where `%h` is
# supported, but only `%b` is a documented, portable Python directive.
print(f"First update in course on {min_date:%d %b, %Y}.")
print(f"Last update in course on {max_date:%d %b, %Y}.")

### Most recent course Section and Lesson.

In [None]:
# All rows dated on the latest day of the course, selected once and reused.
_latest_rows = df.query('date==@max_date')

# Section title of the first such row, and the titles of every lesson on that day.
most_recent_section = _latest_rows['sectionsTitle'].values[0]
most_recent_lesson = _latest_rows['lessonsTitle'].values

print(f"Most recent course Section covered is {most_recent_section}.")
print(f"Most recent course Lesson covered are {', '.join(most_recent_lesson)}.")

### No. of Sections and Lessons covered

In [None]:
# Count distinct titles (not rows): sections or lessons that share a title
# are counted once, and missing titles are ignored by `nunique`.
no_of_sections = df['sectionsTitle'].nunique()
no_of_lessons = df['lessonsTitle'].nunique()

print(f"Till now, {no_of_sections} Sections covered in the course.")
print(f"And, {no_of_lessons} Lessons.")

# Make URLs

> Sample URL for videos: `https://api.pwskills.com/v1/learn/lesson/video-session/63a2eb428899436daf7eb489/lesson/63fa15eb182c67f36e4b90dd`

> Sample URL for Quiz and Assignment: `https://api.pwskills.com/v1/learn/lesson/course/63a2eb428899436daf7eb489/63fa2330182c6727c14b9189`


### Videos

In [None]:
# Base url for video: the course id is fixed, the lesson id is appended per row.
base_vid_url = f'https://api.pwskills.com/v1/learn/lesson/video-session/{course_id}/lesson/'
# Rows whose `type` is "video".
videos = df.query('type=="video"')

# Overwrite `url` for exactly those rows (matched by index) with base + lesson `_id`.
df.loc[videos.index, 'url'] = base_vid_url + videos['_id']

### Quiz and Assignments

In [None]:
# Base url for Quiz and Assignment; also reused by the assignment cell below.
base_url = f'https://api.pwskills.com/v1/learn/lesson/course/{course_id}/'
# Rows whose `type` is "quiz".
quizzes = df.query('type=="quiz"')

# Overwrite `url` for the quiz rows (matched by index) with base + lesson `_id`.
df.loc[quizzes.index, 'url'] = base_url + quizzes['_id']

In [None]:
# Assignment url: uses `base_url` defined in the quiz cell, so that cell must run first.
assignments = df.query('type=="assignment"')

# Overwrite `url` for the assignment rows (matched by index) with base + lesson `_id`.
df.loc[assignments.index, 'url'] = base_url + assignments['_id']

In [None]:
# Final schema check: dtypes and non-null counts after the URLs were filled in.
df.info()

In [None]:
# Show a random sample of up to 28 rows. Capping the size at `len(df)` avoids
# the ValueError that `sample` raises when asked for more rows than the frame
# has (e.g. for a course with fewer than 28 lessons).
df[['sectionsTitle', 'lessonsTitle', 'type', 'duration']].sample(min(28, len(df)))

# Export the dataset as CSV

In [None]:
# df.to_csv(f'../data/{course_id}.csv', index=False)