In [4]:
!pip install seaborn==0.11.1
!pip install wordcloud==1.8.1

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import ImageColorGenerator, STOPWORDS, WordCloud

import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accept any arguments, do nothing."""
    return None


# Silence warnings in two ways: install an 'ignore' filter, and swap the
# module-level warnings.warn for the no-op above so calls made through
# warnings.warn are dropped as well.
warnings.filterwarnings('ignore')
warnings.warn = warn



In [5]:
# Course catalogue: read the course/genre CSV straight from its hosted URL.
courses_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_genre.csv'
courses_df = pd.read_csv(courses_url)

# Enrolment records: read the ratings CSV from the same bucket.
ratings_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/ratings.csv'
ratings_df = pd.read_csv(ratings_url)

In [None]:
# COURSE CONTENT DATASET
# List the column labels of the course table.
courses_df.columns

In [None]:
# Number of rows in the course table (author's run: 307 unique courses).
len(courses_df)

In [None]:
# Preview the first rows of the course table.
courses_df.head()

In [None]:
# Show the dtype pandas inferred for each column.
courses_df.dtypes

In [None]:
# Inspect one full record: the row at integer position 1 (the second row).
courses_df.iloc[1]

In [None]:
# Merge every course title (cast to str) into one space-separated string,
# which is the text the word cloud is built from.
titles = ' '.join(courses_df['TITLE'].astype(str))
titles

In [None]:
# filter stopwords
# Start from a copy of wordcloud's built-in STOPWORDS so the extra words below
# are added to a local set rather than to the library's own collection.
stopwords = set(STOPWORDS)
# WordCloud checks stop words against individual tokens, so a phrase such as
# 'getting started' can never match; its words are listed separately instead.
stopwords.update(['getting', 'started', 'using', 'enabling', 'template', 'university', 'end', 'introduction', 'basic'])

In [None]:
# Configure an 800x400 word cloud on a white background that skips the
# stop words, then build it from the combined course titles.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords=stopwords,
)
wordcloud.generate(titles)

In [None]:
# visualise wordcloud
# Order matters with the pyplot state machine: plt.axis() and
# plt.tight_layout() act on the *current* figure/axes. The figure is therefore
# created and the image drawn first; calling them beforehand would style a
# throwaway figure and leave the word cloud with visible axes.
plt.figure(figsize=(40, 20))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')          # hide ticks and frame around the image
plt.tight_layout(pad=0)  # let the image fill the whole figure
plt.show()

In [None]:
# Keep only the courses whose MachineLearning flag equals 1 and show their titles.
ml_df = courses_df.loc[courses_df['MachineLearning'] == 1]
ml_df['TITLE']

In [None]:
# From the machine-learning subset, keep the courses whose BigData flag is
# also 1 and show their titles.
ml_bd_df = ml_df.loc[ml_df['BigData'] == 1]
ml_bd_df['TITLE']

In [None]:
# Genre columns: every column label from the third column onwards.
# NOTE(review): assumes the first two columns are the course id and title — confirm against the CSV.
genres = courses_df.columns[2:]
genres

In [None]:
# Frequency of each value in the Python genre column
# (author's run: 279 non-Python courses, 28 Python courses).
courses_df['Python'].value_counts()

In [None]:
# Column-wise totals over the genre columns: one total per genre.
genre_sums = courses_df.loc[:, genres].sum()
genre_sums  # author's observation: BackendDev most popular genre; Chatbot, Blockchain least

In [None]:
# Turn the per-genre totals into a one-column frame ('count') indexed by
# 'genre', ordered from the largest total to the smallest.
gs_df = (
    genre_sums
    .rename_axis('genre')
    .to_frame('count')
    .sort_values(by='count', ascending=False)
)
gs_df

In [None]:
# plot course genre counts
# One bar per genre, drawn on an explicitly created axes; tick labels are
# rotated and right-aligned so the genre names do not overlap.
fig, ax = plt.subplots()
sns.barplot(data=gs_df, x=gs_df.index, y='count', palette='crest', ax=ax)
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# COURSE ENROLMENTS DATASET
# Preview the first rows of the enrolment table.
ratings_df.head()

In [None]:
# Distinct values found in the rating column.
# Author's key — 2: user audited course without completing; 3: user completed course and earned certificate
ratings_df['rating'].unique()

In [None]:
# Every row of ratings_df is one enrolment, so the row count is the total.
total = len(ratings_df)
total  # author's run: 233,306 total enrolments

In [None]:
# Number of enrolment rows recorded for each user.
user_rating_counts = ratings_df.groupby('user')['rating'].size()
print(user_rating_counts)

In [None]:
# Number of entries in user_rating_counts, i.e. one per distinct user.
user_rating_counts.size  # author's run: 33,901 users account for the 233,306 enrolments

In [None]:
# How many users share each enrolments-per-user figure.
user_rating_counts.value_counts()

In [None]:
# Summary statistics of the enrolments-per-user figures.
user_rating_counts.describe()

In [None]:
# plot course enrolment counts
# Histogram of enrolments per user, with the bars shaded along the 'crest'
# palette.
plt.figure()
# No `palette=` here: histplot only applies a palette together with `hue`,
# so the bars are recoloured by hand below.
plot = sns.histplot(user_rating_counts, bins=75)
# Size the palette from the bars actually drawn. A fixed 30-colour palette
# zipped against 75 bins would stop after the first 30 bars and leave the
# remainder in the default colour.
cm = sns.color_palette('crest', len(plot.patches))
for bin_, colour in zip(plot.patches, cm):
    bin_.set_facecolor(colour)
plt.xlabel('enrolments')
plt.ylabel('count')
plt.show()

In [None]:
# Count the enrolment rows per course ('item'), name the two resulting columns
# 'course' and 'enrolments', and rank the courses from most to least enrolled
# with a fresh 0..n-1 index.
agg_df = (
    ratings_df.groupby(['item'])
    .size()
    .rename_axis('course')
    .reset_index(name='enrolments')
    .sort_values(by='enrolments', ascending=False)
    .reset_index(drop=True)
)
agg_df

In [None]:
# agg_df is sorted by enrolments in descending order, so its first 20 rows are
# the 20 most-enrolled courses.
agg_df.head(20)  # view enrolment counts for top 20 courses

In [None]:
# check course names against enrolment counts
# Attach each course's title to its enrolment count. The right join keeps
# every row of agg_df; the four merged columns are then relabelled by position
# so the title column becomes 'course' and the two id columns become the
# placeholders 'a' and 'b'.
course_titles = courses_df[['COURSE_ID', 'TITLE']]
merge_df = pd.merge(
    course_titles,
    agg_df,
    how='right',
    left_on='COURSE_ID',
    right_on='course',
)
merge_df.columns = ['a', 'course', 'b', 'enrolments']
top_courses = merge_df.loc[:, ['course', 'enrolments']].head(20)
top_courses

In [None]:
top = top_courses.Enrolments.values.sum()
top  # 147,688 enrolments across top 20 courses

In [None]:
# Report `top` as a percentage of `total`, rounded to two decimal places.
print(f'Top 20 courses account for {round((top * 100) / total, 2)}% of all enrolments')