In [1]:
import pandas as pd
import neattext.functions as nfx

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

In [3]:
# Load the Udemy course catalogue; the path is relative to the notebook's
# working directory.
df = pd.read_csv("dataset/Udemy_Web_Dev.csv")

In [4]:
df.head()

Unnamed: 0,course_title,course_instructor,Description,Rating,User_vote,Total_hours,Lecture,Level
0,Modern JavaScript From The Beginning,Brad Traversy,Learn and build projects with pure JavaScript ...,4.7,23409,21.5,122,All Levels
1,Scrum Certification Prep PlusScrum MasterPlus ...,Paul Ashun,Overview of Scrum Agile project managementPlus...,4.4,22795,3.0,70,All Levels
2,NodeJS The Complete Guide MVC REST APIs Gr...,Academind by Maximilian Schwarzm ller Maximil...,Master Node JS and Deno js build REST APIs wi...,4.7,22601,40.5,541,All Levels
3,The Complete Angular Course Beginner to Advanced,Mosh Hamedani,The most comprehensive Angular 4 Angular 2Plu...,4.4,22477,29.5,376,All Levels
4,CHash Intermediate Classes Interfaces and OOP,Mosh Hamedani,An in depth step by step guide to classes in...,4.6,21967,6.0,45,Intermediate


In [5]:
df['course_title']

0                    Modern JavaScript From The Beginning
1       Scrum Certification Prep PlusScrum MasterPlus ...
2       NodeJS   The Complete Guide MVC  REST APIs  Gr...
3       The Complete Angular Course  Beginner to Advanced
4         CHash Intermediate  Classes  Interfaces and OOP
                              ...                        
9848           React Native  Tips  Tricks  and Techniques
9849           Type Safe Interfaces with Modern CPlusPlus
9850                     Modernise your code with CHash 8
9851             Terraform on AWS with Hands On July 2020
9852    Algorithms and Big O   101 Basics Course  CRUS...
Name: course_title, Length: 9853, dtype: object

In [6]:
# Clean Course Title: stopwords
# Each title is passed through neattext's stopword remover; the result is
# stored in a separate column so the original 'course_title' stays intact.
df['clean_course_title'] = df['course_title'].map(nfx.remove_stopwords)

In [7]:
# Clean Course Title: special characters
# Second cleaning pass, applied to the stopword-free titles and written back
# to the same column.
df['clean_course_title'] = df['clean_course_title'].map(nfx.remove_special_characters)

In [10]:
df[['course_title', 'clean_course_title']]

Unnamed: 0,course_title,clean_course_title
0,Modern JavaScript From The Beginning,Modern JavaScript Beginning
1,Scrum Certification Prep PlusScrum MasterPlus ...,Scrum Certification Prep PlusScrum MasterPlus ...
2,NodeJS The Complete Guide MVC REST APIs Gr...,NodeJS Complete Guide MVC REST APIs GraphQL Deno
3,The Complete Angular Course Beginner to Advanced,Complete Angular Course Beginner Advanced
4,CHash Intermediate Classes Interfaces and OOP,CHash Intermediate Classes Interfaces OOP
...,...,...
9848,React Native Tips Tricks and Techniques,React Native Tips Tricks Techniques
9849,Type Safe Interfaces with Modern CPlusPlus,Type Safe Interfaces Modern CPlusPlus
9850,Modernise your code with CHash 8,Modernise code CHash 8
9851,Terraform on AWS with Hands On July 2020,Terraform AWS Hands July 2020


In [11]:
# Vectorize our Text
# Bag-of-words: learn the vocabulary from the cleaned titles and build the
# sparse course-by-term count matrix in a single pass.
count_vect = CountVectorizer()
cv_mat = count_vect.fit_transform(df['clean_course_title'])

In [12]:
#Sparse
cv_mat

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 51623 stored elements and shape (9853, 4396)>

In [13]:
# Dense
cv_mat.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], shape=(9853, 4396))

In [16]:
# Label the term counts with their vocabulary words. The frame is built
# directly from the sparse matrix: densifying it first would allocate every
# zero of the full course-by-term grid (as a legacy np.matrix) just to preview
# a handful of rows. Columns are sparse-typed but display and index as usual.
df_cv_words = pd.DataFrame.sparse.from_spmatrix(cv_mat, columns = count_vect.get_feature_names_out())

In [17]:
df_cv_words.head()

Unnamed: 0,000plus,061,071,10,100,1000,1000plus,100plus,100pluschallenges,100pluspractice,...,zend,zenda,zeppelin,zero,zf2,zoho,zombie,zookeeper,zoom,zynq
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Cosine Similarity Matrix
# Pairwise cosine similarity between the title vectors: entry [i, j] compares
# course i with course j. The result is a dense square array with one row and
# one column per course.
cosine_sim_matrix = cosine_similarity(cv_mat)

In [19]:
cosine_sim_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], shape=(9853, 9853))

In [20]:
cosine_sim_matrix[0:10]

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(10, 9853))

In [21]:
# Get Course ID
# Map each course title to its row label in df.
# Series.drop_duplicates() compares *values*, and the values here are the
# already-unique row labels, so it never removed anything. Repeated titles
# have to be dropped through the index instead; keeping the first occurrence
# guarantees that looking up a title yields a single scalar.
course_indices = pd.Series(df.index, index = df['course_title'])
course_indices = course_indices[~course_indices.index.duplicated(keep = 'first')]

In [22]:
course_indices

course_title
Modern JavaScript From The Beginning                                     0
Scrum Certification Prep PlusScrum MasterPlus Agile Scrum Training       1
NodeJS   The Complete Guide MVC  REST APIs  GraphQL  Deno                2
The Complete Angular Course  Beginner to Advanced                        3
CHash Intermediate  Classes  Interfaces and OOP                          4
                                                                      ... 
React Native  Tips  Tricks  and Techniques                            9848
Type Safe Interfaces with Modern CPlusPlus                            9849
Modernise your code with CHash 8                                      9850
Terraform on AWS with Hands On July 2020                              9851
Algorithms and Big O   101 Basics Course  CRUSH The Interview         9852
Length: 9853, dtype: int64

In [28]:
# Look up the row label of one example course by its exact title.
idx = course_indices['Modern JavaScript From The Beginning']

In [29]:
idx

np.int64(0)

In [31]:
# Pair every similarity value in the chosen course's row with its position.
scores = [(position, similarity) for position, similarity in enumerate(cosine_sim_matrix[idx])]

In [32]:
scores

[(0, np.float64(1.0000000000000002)),
 (1, np.float64(0.0)),
 (2, np.float64(0.0)),
 (3, np.float64(0.0)),
 (4, np.float64(0.0)),
 (5, np.float64(0.3333333333333334)),
 (6, np.float64(0.0)),
 (7, np.float64(0.0)),
 (8, np.float64(0.0)),
 (9, np.float64(0.0)),
 (10, np.float64(0.0)),
 (11, np.float64(0.0)),
 (12, np.float64(0.0)),
 (13, np.float64(0.0)),
 (14, np.float64(0.0)),
 (15, np.float64(0.0)),
 (16, np.float64(0.0)),
 (17, np.float64(0.0)),
 (18, np.float64(0.0)),
 (19, np.float64(0.0)),
 (20, np.float64(0.0)),
 (21, np.float64(0.0)),
 (22, np.float64(0.0)),
 (23, np.float64(0.4714045207910318)),
 (24, np.float64(0.0)),
 (25, np.float64(0.0)),
 (26, np.float64(0.0)),
 (27, np.float64(0.0)),
 (28, np.float64(0.0)),
 (29, np.float64(0.0)),
 (30, np.float64(0.0)),
 (31, np.float64(0.0)),
 (32, np.float64(0.0)),
 (33, np.float64(0.0)),
 (34, np.float64(0.0)),
 (35, np.float64(0.0)),
 (36, np.float64(0.0)),
 (37, np.float64(0.0)),
 (38, np.float64(0.0)),
 (39, np.float64(0.0)),
 (40,

In [33]:
# Rank the (position, score) pairs from most to least similar.
# The sort is stable, so pairs with equal scores keep their original order.
sorted_scores = list(scores)
sorted_scores.sort(key = lambda pair: pair[1], reverse = True)

In [35]:
# Omit the first value
sorted_scores[1:]

[(75, np.float64(0.6666666666666669)),
 (2495, np.float64(0.6666666666666669)),
 (9797, np.float64(0.6666666666666669)),
 (4042, np.float64(0.6123724356957945)),
 (1560, np.float64(0.5773502691896258)),
 (2677, np.float64(0.5773502691896258)),
 (7421, np.float64(0.5773502691896258)),
 (8171, np.float64(0.5773502691896258)),
 (8821, np.float64(0.5773502691896258)),
 (1364, np.float64(0.5773502691896257)),
 (9527, np.float64(0.5773502691896257)),
 (147, np.float64(0.5163977794943223)),
 (1196, np.float64(0.5163977794943223)),
 (1413, np.float64(0.5163977794943223)),
 (1703, np.float64(0.5163977794943223)),
 (5803, np.float64(0.5163977794943223)),
 (5814, np.float64(0.5163977794943223)),
 (8625, np.float64(0.5163977794943223)),
 (9166, np.float64(0.5163977794943223)),
 (9182, np.float64(0.5163977794943223)),
 (9647, np.float64(0.5163977794943223)),
 (9739, np.float64(0.5163977794943223)),
 (23, np.float64(0.4714045207910318)),
 (158, np.float64(0.4714045207910318)),
 (1296, np.float64(0.4

In [36]:
# Positions of the ranked courses, skipping the top-ranked entry.
selected_course_indices = [position for position, _score in sorted_scores[1:]]

In [37]:
selected_course_indices

[75,
 2495,
 9797,
 4042,
 1560,
 2677,
 7421,
 8171,
 8821,
 1364,
 9527,
 147,
 1196,
 1413,
 1703,
 5803,
 5814,
 8625,
 9166,
 9182,
 9647,
 9739,
 23,
 158,
 1296,
 1480,
 1699,
 1756,
 2384,
 2786,
 3184,
 321,
 775,
 1546,
 1571,
 1789,
 1833,
 2520,
 4513,
 4568,
 5202,
 5203,
 5216,
 5699,
 6059,
 9005,
 9075,
 180,
 918,
 986,
 1634,
 2137,
 2873,
 3026,
 3176,
 3768,
 3879,
 4014,
 4204,
 4233,
 4618,
 4730,
 4872,
 5390,
 5497,
 6107,
 6627,
 6853,
 6907,
 6914,
 7447,
 7476,
 7520,
 7535,
 7847,
 8305,
 9223,
 4776,
 5485,
 6701,
 7170,
 8915,
 9073,
 9355,
 1464,
 2684,
 3436,
 4752,
 6795,
 7079,
 1755,
 5,
 111,
 122,
 219,
 420,
 556,
 887,
 937,
 1229,
 1251,
 1529,
 1553,
 1620,
 1628,
 2117,
 2156,
 2506,
 2976,
 3002,
 3070,
 3096,
 3395,
 3397,
 3428,
 3506,
 3618,
 3767,
 3881,
 3961,
 3969,
 3971,
 4094,
 4117,
 4269,
 4947,
 5022,
 5081,
 5098,
 5384,
 5394,
 5395,
 5400,
 5435,
 5547,
 5732,
 6112,
 6133,
 6259,
 6464,
 6527,
 6676,
 6717,
 7774,
 7812,
 7951,

In [38]:
# Similarity values of the ranked courses, skipping the top-ranked entry.
selected_course_scores = [score for _position, score in sorted_scores[1:]]

In [39]:
selected_course_scores

[np.float64(0.6666666666666669),
 np.float64(0.6666666666666669),
 np.float64(0.6666666666666669),
 np.float64(0.6123724356957945),
 np.float64(0.5773502691896258),
 np.float64(0.5773502691896258),
 np.float64(0.5773502691896258),
 np.float64(0.5773502691896258),
 np.float64(0.5773502691896258),
 np.float64(0.5773502691896257),
 np.float64(0.5773502691896257),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.5163977794943223),
 np.float64(0.4714045207910318),
 np.float64(0.4714045207910318),
 np.float64(0.4714045207910318),
 np.float64(0.4714045207910318),
 np.float64(0.4714045207910318),
 np.float64(0.4714045207910318),
 np.float64(0.4714045207910318),
 np.float64(0.4714045207910318),
 np.float6

In [42]:
# Fetch the titles at the ranked positions; .iloc returns them in the order
# the positions are listed, i.e. in ranking order.
recommended_result = df['course_title'].iloc[selected_course_indices]

In [43]:
# Promote the ranked titles to a one-column frame so more columns can be added.
recom = recommended_result.to_frame()

In [44]:
recom.head()

Unnamed: 0,course_title
75,The Modern JavaScript Bootcamp
2495,Using Modern JavaScript Today
9797,Modern JavaScript For Developers
4042,Modern JavaScript fundamentals for Beginners J...
1560,JavaScript for Beginning Web Developers


In [46]:
# Attach the scores. The list comes from the same sorted_scores[1:] slice as
# the rows of recom, so the values line up position by position.
recom['similarity scores'] = selected_course_scores

In [47]:
recom

Unnamed: 0,course_title,similarity scores
75,The Modern JavaScript Bootcamp,0.666667
2495,Using Modern JavaScript Today,0.666667
9797,Modern JavaScript For Developers,0.666667
4042,Modern JavaScript fundamentals for Beginners J...,0.612372
1560,JavaScript for Beginning Web Developers,0.577350
...,...,...
9847,Complete Coding Interview Course for Web Devel...,0.000000
9848,React Native Tips Tricks and Techniques,0.000000
9850,Modernise your code with CHash 8,0.000000
9851,Terraform on AWS with Hands On July 2020,0.000000


In [48]:
def recommend_course(title, num_of_rec = 10):
    """Return the courses whose titles are most similar to ``title``.

    Reads three notebook-level objects: ``course_indices`` (title -> row
    position), ``cosine_sim_matrix`` (pairwise similarity between rows) and
    ``df`` (the course catalogue).

    Parameters
    ----------
    title : str
        Exact ``course_title`` of the course to find neighbours for.
    num_of_rec : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``course_title`` and ``similarity scores``, indexed by the
        catalogue row, ordered from most to least similar. Courses with equal
        scores keep their catalogue order. The queried course is never part
        of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``course_indices``.
    """
    match = course_indices[title]
    # A title listed more than once comes back as a Series of positions;
    # use the first listing instead of feeding several rows to the ranking.
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    # Exclude the queried course by position. Dropping "whatever sorts first"
    # is wrong when another course ties for the top score, or when the query's
    # own row is all zeros (an empty cleaned title).
    ranked = [
        (position, score)
        for position, score in enumerate(cosine_sim_matrix[idx])
        if position != idx
    ]
    # list.sort is stable, so equal scores stay in catalogue order.
    ranked.sort(key = lambda pair: pair[1], reverse = True)

    # Trim before touching the DataFrame so only the requested rows are built.
    top = ranked[:num_of_rec]
    recom = pd.DataFrame(df['course_title'].iloc[[position for position, _score in top]])
    recom['similarity scores'] = [score for _position, score in top]
    return recom

In [49]:
recommend_course('JavaScript for Beginning Web Developers')

Unnamed: 0,course_title,similarity scores
9460,JavaScript from scratch for web developers,0.75
5854,HTML and CSS for Beginning Web Developers,0.67082
4468,HTML5 APIs For JavaScript A Course For Web D...,0.612372
4503,Ajax jQuery and JSON for Beginning Web Develo...,0.612372
0,Modern JavaScript From The Beginning,0.57735
6717,Web Development by Doing Javascript,0.57735
9797,Modern JavaScript For Developers,0.57735
329,Learn JavaScript for Web Development,0.5
2899,The Complete Javascript Course for Developers,0.5
3156,Web Development And Javascript Bootcamp,0.5


In [50]:
dir(nfx)

['BTC_ADDRESS_REGEX',
 'CURRENCY_REGEX',
 'CURRENCY_SYMB_REGEX',
 'Counter',
 'DATE_REGEX',
 'EMAIL_REGEX',
 'EMOJI_REGEX',
 'HASTAG_REGEX',
 'MASTERCard_REGEX',
 'MD5_SHA_REGEX',
 'MOST_COMMON_PUNCT_REGEX',
 'NUMBERS_REGEX',
 'PHONE_REGEX',
 'PoBOX_REGEX',
 'SPECIAL_CHARACTERS_REGEX',
 'STOPWORDS',
 'STOPWORDS_de',
 'STOPWORDS_en',
 'STOPWORDS_es',
 'STOPWORDS_fr',
 'STOPWORDS_ru',
 'STOPWORDS_yo',
 'STREET_ADDRESS_REGEX',
 'TextFrame',
 'URL_PATTERN',
 'USER_HANDLES_REGEX',
 'VISACard_REGEX',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__generate_text',
 '__loader__',
 '__name__',
 '__numbers_dict',
 '__package__',
 '__spec__',
 '_lex_richness_herdan',
 '_lex_richness_maas_ttr',
 'clean_text',
 'defaultdict',
 'digit2words',
 'extract_btc_address',
 'extract_currencies',
 'extract_currency_symbols',
 'extract_dates',
 'extract_emails',
 'extract_emojis',
 'extract_hashtags',
 'extract_html_tags',
 'extract_mastercard_addr',
 'extract_md5sha',
 'extract_numbers',
 'extr