## 1| Importing libraries

In [48]:
#importing libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

### 2 | Read Data

In [49]:
# read data

data = pd.read_csv('EdX.csv')
data.columns = map(str.lower, data.columns)
data.head()

Unnamed: 0,name,university,difficulty level,link,about,course description
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,"Designed for those who are new to elearning, t..."
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,"This is CS50x , Harvard University's introduct..."
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...","In the last decade, the amount of data availab..."
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...


In [50]:
#For the corpus we will utilise three columns name, about & course description

def show_course_id(id):
    print(f"Course Name:\n{data['name'][id]}",'\n')
    print(f"What is the course about?:\n{data['about'][id]}",'\n')
    print(f"Course Description:\n{data['course description'][id]}")
    
# Show example of course description contents
print('Sample from Dataset:\n')
show_course_id(0)

Sample from Dataset:

Course Name:
How to Learn Online 

What is the course about?:
Learn essential strategies for successful online learning 

Course Description:
Designed for those who are new to elearning, this course will prepare you with strategies to be a successful online learner.The edX learning design team has curated some of the most powerful, science-backed techniques which you can start using right away and on any learning platform.The Verified Certificate for this course is free. Use the following coupon code before September 1, 2020 to upgrade at no cost to you: Y5ZADM5NU2AN5JU7This course will help you answer the following questions: Education & Teacher Training


In [51]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   name                720 non-null    object
 1   university          720 non-null    object
 2   difficulty level    720 non-null    object
 3   link                720 non-null    object
 4   about               720 non-null    object
 5   course description  720 non-null    object
dtypes: object(6)
memory usage: 33.9+ KB


In [52]:
#Drop Duplicate Rows

print(f'number of duplicate rows: {data.duplicated().sum()}')
data.drop_duplicates(inplace = True)

data.reset_index(drop = True, inplace = True)
print(f'number of rows: {data.shape[0]}')

number of duplicate rows: 1
number of rows: 719


## <b>3 <span style='color:#15C3BA'>|</span> EXPLORATORY DATA ANALYSIS</b> 

In [53]:
#Unique institutions offering courses

print(f"Number of unique institutions offering courses: {len(list(data['university'].value_counts().index))}")

Number of unique institutions offering courses: 102


In [54]:
#institutes offering most and least courses

display(data['university'].value_counts().head())
display(data['university'].value_counts().tail())

Harvard University                       90
Massachusetts Institute of Technology    41
Delft University of Technology           28
The University of Queensland             25
IBM                                      21
Name: university, dtype: int64

University of Oxford                                                      1
University of Toronto                                                     1
University System of Maryland-The University of Maryland, College Park    1
Universidades Anáhuac                                                     1
University of Maryland Global Campus-University System of Maryland        1
Name: university, dtype: int64

In [55]:
#Count difficulty level of courses
data['difficulty level'].value_counts()

Beginner        437
Intermediate    204
Advanced         78
Name: difficulty level, dtype: int64

In [56]:
#group by university and difficulty level
ldf = data.groupby(['difficulty level','university'],as_index=False).size()
ldf = ldf.sort_values(by='size',ascending=False)
ldf

Unnamed: 0,difficulty level,university,size
48,Beginner,Harvard University,74
68,Beginner,Stanford University,16
123,Intermediate,Delft University of Technology,15
59,Beginner,Massachusetts Institute of Technology,15
83,Beginner,The University of Queensland,15
...,...,...,...
74,Beginner,The International Monetary Fund,1
70,Beginner,Tecnológico de Monterrey,1
67,Beginner,SchoolYourself,1
61,Beginner,National Research Nuclear University,1


In [57]:
fig = px.bar(ldf,y='university',x='size',color='difficulty level',height=900,template='plotly_white')
fig.show()

## 3.1 | n-gram of course description¶


In [58]:
# importing the dependencies needed for pre processing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [59]:
en_stopwords = stopwords.words("english") # stop words 
lemma = WordNetLemmatizer() # lemmatiser

In [60]:
# define a function for preprocessing
def clean(text):
    text = re.sub("[^A-Za-z1-9 ]", "", text) #removes punctuation marks
    text = text.lower() #changes to lower case
    tokens = word_tokenize(text) #tokenize the text
    clean_list = [] 
    for token in tokens:
        if token not in en_stopwords: #removes stopwords
            clean_list.append(lemma.lemmatize(token)) #lemmatizing and appends to clean_list
    return " ".join(clean_list)# joins the tokens

In [61]:
# applying the "clean" function on the text column
ldata = data['course description'].apply(clean)

In [62]:
import spacy   #analyzing and processing text data
from collections import Counter #it provides a way to count the frequency of elements in a list
import plotly.express as px  #creating interactive data visualizations.

In [63]:
nlp = spacy.load('en_core_web_sm')

dict_ngrams = {'unigram':[],'bigram':[],'trigram':[]}
for document in ldata:

    doc = nlp(document)
    tokens = [token.text for token in doc]

    def n_grams(tokens,n):
        lst_bigrams = [' '.join(i) for i in [tokens[i:i+n] for i in range(len(tokens)-n+1)]]
        return lst_bigrams

    dict_ngrams['unigram'].extend(n_grams(tokens,1))
    dict_ngrams['bigram'].extend(n_grams(tokens,2))
    dict_ngrams['trigram'].extend(n_grams(tokens,3))
    
print('unigrams',len(dict_ngrams['bigram']))
print('bigrams',len(dict_ngrams['unigram']))
print('trigrams',len(dict_ngrams['trigram']))

unigrams 76575
bigrams 77294
trigrams 75856


In [64]:
# plot ngrams
def plot_counter(counter,top,name):
    labels, values = zip(*counter.items())
    fig = px.bar(pd.Series(values,index=labels,name=name).sort_values(ascending=False)[:top],
                 template='plotly_white',orientation='h')
    fig.show()

In [65]:
plot_counter(Counter(dict_ngrams['unigram']),20,'unigram')

In [66]:
plot_counter(Counter(dict_ngrams['bigram']),20,'bigrams')

In [67]:
plot_counter(Counter(dict_ngrams['trigram']),20,'trigrams')

## 4 | NATURAL LANGUAGE PROCESSING

In [68]:
# Drop irrelavant rows(remove column data we will not utilise)
data.drop(columns = ["university", "difficulty level"], 
          axis =1, inplace = True)
df = data.copy()

In [69]:
# generate a corpus
data['text'] = data['name'] + ' ' + data['about'] + ' ' + data['course description']
data.head(5)

Unnamed: 0,name,link,about,course description,text
0,How to Learn Online,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,"Designed for those who are new to elearning, t...",How to Learn Online Learn essential strategies...
1,Programming for Everybody (Getting Started wit...,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...,Programming for Everybody (Getting Started wit...
2,CS50's Introduction to Computer Science,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,"This is CS50x , Harvard University's introduct...",CS50's Introduction to Computer Science An int...
3,The Analytics Edge,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...","In the last decade, the amount of data availab...",The Analytics Edge Through inspiring examples ...
4,Marketing Analytics: Marketing Measurement Str...,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...,Marketing Analytics: Marketing Measurement Str...


In [70]:
text_data = data[['name','about','course description','text']]
text_data.to_csv('text_data.csv',index=False)

In [71]:
#Text Cleaning / Stemming
#importing the dependencies needed for pre processing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [72]:
en_stopwords = stopwords.words("english") # stop words 
lemma = WordNetLemmatizer() # lemmatiser

In [73]:
# define a function for preprocessing
def clean(text):
    text = re.sub("[^A-Za-z1-9 ]", "", text) #removes punctuation marks
    text = text.lower() #changes to lower case
    tokens = word_tokenize(text) #tokenize the text
    clean_list = [] 
    for token in tokens:
        if token not in en_stopwords: #removes stopwords
            clean_list.append(lemma.lemmatize(token)) #lemmatizing and appends to clean_list
    return " ".join(clean_list)# joins the tokens

In [74]:
# applying the "clean" function on the text column
data.text = data.text.apply(clean)
data.text

0      learn online learn essential strategy successf...
1      programming everybody getting started python c...
2      cs5s introduction computer science introductio...
3      analytics edge inspiring example story discove...
4      marketing analytics marketing measurement stra...
                             ...                        
714    global china mongol ming explore impact conque...
715    leader citizen security justice management car...
716    computational neuroscience neuronal dynamic co...
717    city challenge sustainable development sustain...
718    mathtrackx special function understand trigono...
Name: text, Length: 719, dtype: object

In [75]:
# clean & stem the pandas data frame corpus text
# Preprocessing, returns list instead
def clean_for_word2vec(text):
    
    text = re.sub("[^A-Za-z1-9 ]", "", text) #removes punctuation marks
    text = text.lower() #changes to lower case
    tokens = word_tokenize(text) #tokenize the text
    clean_list = [] 
    for token in tokens:
        if token not in en_stopwords: #removes stopwords
            clean_list.append(lemma.lemmatize(token)) #lemmatizing and appends to clean_list
    return clean_list

In [76]:
#cleaning the documents
corpus_cleaned = data.text.apply(clean_for_word2vec)
lst_corpus = corpus_cleaned.tolist()

In [77]:
lst_corpus

[['learn',
  'online',
  'learn',
  'essential',
  'strategy',
  'successful',
  'online',
  'learning',
  'designed',
  'new',
  'elearning',
  'course',
  'prepare',
  'strategy',
  'successful',
  'online',
  'learnerthe',
  'edx',
  'learning',
  'design',
  'team',
  'curated',
  'powerful',
  'sciencebacked',
  'technique',
  'start',
  'using',
  'right',
  'away',
  'learning',
  'platformthe',
  'verified',
  'certificate',
  'course',
  'free',
  'use',
  'following',
  'coupon',
  'code',
  'september',
  '1',
  '22',
  'upgrade',
  'cost',
  'y5zadm5nu2an5ju7this',
  'course',
  'help',
  'answer',
  'following',
  'question',
  'education',
  'teacher',
  'training'],
 ['programming',
  'everybody',
  'getting',
  'started',
  'python',
  'course',
  'prerequisite',
  'introduction',
  'python',
  'programming',
  'learn',
  'variable',
  'conditional',
  'execution',
  'repeated',
  'execution',
  'use',
  'function',
  'homework',
  'done',
  'web',
  'browser',
  'progr

In [78]:
corpus = []
for words in data['text']:
    corpus.append(words.split())
    
len(f'corpus length: {corpus}')

1001337

## 5 | COURSE RECOMMENDATIONS

In [79]:
# course names
lst_names = list(data['name'])
lst_names[:3]

['How to Learn Online',
 'Programming for Everybody (Getting Started with Python)',
 "CS50's Introduction to Computer Science"]

In [80]:
#providing recommendations is based on cosine similarity 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [81]:
print(f"fitting data on:\n{data['text'][:3]} of type: {type(data['text'])}")

fitting data on:
0    learn online learn essential strategy successf...
1    programming everybody getting started python c...
2    cs5s introduction computer science introductio...
Name: text, dtype: object of type: <class 'pandas.core.series.Series'>


In [82]:
vectoriser = TfidfVectorizer()
test_matrix = vectoriser.fit_transform(data['text'])
print(f'\noutput matrix size: {test_matrix.shape}')
print(f'length of the vectoriser vocabulary: {len(vectoriser.vocabulary_)}')


output matrix size: (719, 11066)
length of the vectoriser vocabulary: 11066


In [83]:
# define a function that will return the first five recommended courses
def Recommendation_Cosine_similarity(matrix, name):
    
    # get its index from list
    row_num = lst_names.index(name)
     
    # cosine similarity matrix for each index in list (square matrix)
    similarity = cosine_similarity(test_matrix)
    
    # get similar courses by highest cosine similarity
    similar_courses = list(enumerate(similarity[row_num]))
    sorted_similar_courses = sorted(similar_courses, key=lambda x:x[1], reverse= True)[:6]
    
    print(f'recommended courses for {name}\n')
    # This part will return the description of the recommended courses
    i = 0
    for item in sorted_similar_courses:
        course_description = data[data.index == item[0]]["name"].values[0]
        recommendations = print(f"{i+1} {course_description}")
        i = i + 1
    return recommendations

Recommendation_Cosine_similarity(test_matrix,'MathTrackX: Differential Calculus')

recommended courses for MathTrackX: Differential Calculus

1 MathTrackX: Differential Calculus
2 MathTrackX: Integral Calculus
3 MathTrackX: Statistics
4 MathTrackX: Polynomials, Functions and Graphs
5 MathTrackX: Probability
6 MathTrackX: Special Functions


In [84]:
#Word2Vec w/ Gensim
import gensim.downloader as api
from gensim.models import KeyedVectors

In [85]:
from gensim.models import Word2Vec

model = Word2Vec(vector_size=100,min_count=1)
model.build_vocab(corpus)
print(f"words in corpus: {model.corpus_total_words}")
print(f'corpus count: {model.corpus_count}')
model.train(corpus, total_examples = model.corpus_count, epochs = 50)
model.save('embeddings')

words in corpus: 89579
corpus count: 719


In [86]:
#TESTING THE MODEL
vocab_len = len(model.wv)
print(f'Vocabulary size: {vocab_len}')

print('First 10 words in vocabulary:')
key_vocab = model.wv.index_to_key[:10]
print(key_vocab)

Vocabulary size: 11091
First 10 words in vocabulary:
['course', 'learn', 'data', 'business', 'science', 'management', 'skill', 'learning', 'also', 'program']


In [87]:
result = model.wv.similar_by_word('deep')
for i in result:
    print(i)

word1 = 'deep'; word2 = 'learning'
similarity = model.wv.similarity(word1,word2)
print(f'\nsimilarity b/w {word1} and {word2} {round(similarity,2)}\n')

# embedding vectors

vector = model.wv['computer']  # numpy vector of a word
print(f'computer word embedding')
print(f'first {10} components')
print(vector[:10])

('tensorflow', 0.7396812438964844)
('unsupervised', 0.7235405445098877)
('machine', 0.7154614329338074)
('backpropagation', 0.703831672668457)
('salary', 0.6875907778739929)
('multiclass', 0.6833214163780212)
('supervised', 0.6801084280014038)
('deploydeep', 0.6660279035568237)
('pretrained', 0.6632823348045349)
('curve', 0.6468446254730225)

similarity b/w deep and learning 0.5199999809265137

computer word embedding
first 10 components
[-0.650581    2.8528109   0.97926545 -0.0226705  -0.88035804  3.2470968
  2.574869   -1.9261923  -1.9006276   2.0990543 ]


In [88]:
# View similar words based on gensim's model
print('Similar Words')
similar_words = {search_term: [item[0] for item in model.wv.most_similar([search_term], topn=5)]
                  for search_term in key_vocab}
similar_words

Similar Words


{'course': ['mooc', 'studioxafter', 'seven', 'intonation', 'consecutive'],
 'learn': ['discover', 'examine', 'learnimportant', 'delve', 'explore'],
 'data': ['datasets', 'r', 'analytics', 'visualize', 'manipulate'],
 'business': ['strategic', 'peter', 'retail', 'corporate', 'innovator'],
 'science': ['distinguishes', 'groupnote', 'graduate', 'chip', 'inextricably'],
 'management': ['assuccessful',
  'customercentric',
  'managing',
  'operational',
  'monitoring'],
 'skill': ['confidence', 'ability', 'mend', 'mindset', 'sensemaking'],
 'learning': ['dig', 'pytorch', 'deep', 'salary', 'employ'],
 'also': ['modelled', 'finally', 'techniquesthis', 'purpose', 'vary'],
 'program': ['credential',
  'programmeyou',
  'programaudit',
  'functionalitywhat',
  'micromasters']}

In [89]:
#VISUALISATION EMBEDDING VECTORS
# Lower dimensionality visualisation of embeddings (100->2)
import plotly.express as px
from sklearn.manifold import TSNE
import warnings; warnings.filterwarnings('ignore')

words = sum([[k] + v for k, v in similar_words.items()], [])
wvs = model.wv[words]

tsne = TSNE(n_components=2, 
            random_state=0, 
            n_iter=10000)

X = tsne.fit_transform(wvs)
labels = words

px.scatter(X[:, 0], X[:, 1],text=labels,
           template='plotly_white',
           width=800,
           title='Word Embedding Visualisation')

In [90]:
# Get average embedding vector for each text
def doc_vectorizer(doc, model):
    
    doc_vector = []
    num_words = 0
    
    for word in doc:
        try:
            if num_words == 0:
                doc_vector = model.wv[word]
            else:
                doc_vector = np.add(doc_vector, model.wv[word])
            num_words += 1
        except:
            pass  # if embedding vector isn't found
        return np.asarray(doc_vector) / num_words

X = []
for doc in lst_corpus:
    X.append(doc_vectorizer(doc,model))
    
print(f'list of sentence vectors/sentences: {len(X)}')
print(f'each sentence has {X[0].shape} dimensions')

list of sentence vectors/sentences: 719
each sentence has (100,) dimensions


In [91]:
#Recommendation using cosine similarity
def course_recommender(X,course):

    # Finding cosine similarity for the vectors
    cosine_similarities = cosine_similarity(X,X)

    # Taking the Title and Movie Image Link and store in new dataframe called 'movies'
    courses = data[['name']]

    # Reverse mapping of the index
    indices = pd.Series(data.index, index = data['name']).drop_duplicates()
    idx = indices[course]
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    recommend = courses.iloc[movie_indices]

    for index, row in recommend.iterrows():
        print(row['name'])
        
course_recommender(X,'Data Science: Inference and Modeling')

Data Science: Computational Thinking with Python
The Data Science Method
Data Science: R Basics
Data Science: Machine Learning
Data Science: Visualization
