In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import neattext.functions as nfx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("udemy_courses.csv")

In [2]:
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3678 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3678 non-null   int64  
 1   course_title         3678 non-null   object 
 2   url                  3678 non-null   object 
 3   is_paid              3678 non-null   bool   
 4   price                3678 non-null   int64  
 5   num_subscribers      3678 non-null   int64  
 6   num_reviews          3678 non-null   int64  
 7   num_lectures         3678 non-null   int64  
 8   level                3678 non-null   object 
 9   content_duration     3678 non-null   float64
 10  published_timestamp  3678 non-null   object 
 11  subject              3678 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.8+ KB


In [4]:
# Check the number of rows and columns
print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])

# Display the first 5 rows
print(df.head())

# Display the summary statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())

Number of rows:  3678
Number of columns:  12
   course_id                                       course_title  \
0    1070968                 Ultimate Investment Banking Course   
1    1113822  Complete GST Course & Certification - Grow You...   
2    1006314  Financial Modeling for Business Analysts and C...   
3    1210588  Beginner to Pro - Financial Analysis in Excel ...   
4    1011058       How To Maximize Your Profits Trading Options   

                                                 url  is_paid  price  \
0  https://www.udemy.com/ultimate-investment-bank...     True    200   
1      https://www.udemy.com/goods-and-services-tax/     True     75   
2  https://www.udemy.com/financial-modeling-for-b...     True     45   
3  https://www.udemy.com/complete-excel-finance-c...     True     95   
4  https://www.udemy.com/how-to-maximize-your-pro...     True    200   

   num_subscribers  num_reviews  num_lectures               level  \
0             2147           23            51     

In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

In [6]:
df['course_title']

0                      Ultimate Investment Banking Course
1       Complete GST Course & Certification - Grow You...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro - Financial Analysis in Excel ...
4            How To Maximize Your Profits Trading Options
                              ...                        
3673    Learn jQuery from Scratch - Master of JavaScri...
3674    How To Design A WordPress Website With No Codi...
3675                        Learn and Build using Polymer
3676    CSS Animations: Create Amazing Effects on Your...
3677    Using MODX CMS to Build Websites: A Beginner's...
Name: course_title, Length: 3678, dtype: object

In [7]:
dir(nfx)

['BTC_ADDRESS_REGEX',
 'CURRENCY_REGEX',
 'CURRENCY_SYMB_REGEX',
 'Counter',
 'DATE_REGEX',
 'EMAIL_REGEX',
 'EMOJI_REGEX',
 'HASTAG_REGEX',
 'MASTERCard_REGEX',
 'MD5_SHA_REGEX',
 'MOST_COMMON_PUNCT_REGEX',
 'NUMBERS_REGEX',
 'PHONE_REGEX',
 'PoBOX_REGEX',
 'SPECIAL_CHARACTERS_REGEX',
 'STOPWORDS',
 'STOPWORDS_de',
 'STOPWORDS_en',
 'STOPWORDS_es',
 'STOPWORDS_fr',
 'STOPWORDS_ru',
 'STOPWORDS_yo',
 'STREET_ADDRESS_REGEX',
 'TextFrame',
 'URL_PATTERN',
 'USER_HANDLES_REGEX',
 'VISACard_REGEX',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__generate_text',
 '__loader__',
 '__name__',
 '__numbers_dict',
 '__package__',
 '__spec__',
 '_lex_richness_herdan',
 '_lex_richness_maas_ttr',
 'clean_text',
 'defaultdict',
 'digit2words',
 'extract_btc_address',
 'extract_currencies',
 'extract_currency_symbols',
 'extract_dates',
 'extract_emails',
 'extract_emojis',
 'extract_hashtags',
 'extract_html_tags',
 'extract_mastercard_addr',
 'extract_md5sha',
 'extract_numbers',
 'extr

In [8]:
# Clean Text:stopwords,special charac
df['clean_course_title'] = df['course_title'].apply(nfx.remove_stopwords)

In [9]:
# Clean Text:stopwords,special charac
df['clean_course_title'] = df['clean_course_title'].apply(nfx.remove_special_characters)

In [10]:
df[['course_title','clean_course_title']]

Unnamed: 0,course_title,clean_course_title
0,Ultimate Investment Banking Course,Ultimate Investment Banking Course
1,Complete GST Course & Certification - Grow You...,Complete GST Course Certification Grow Practice
2,Financial Modeling for Business Analysts and C...,Financial Modeling Business Analysts Consultants
3,Beginner to Pro - Financial Analysis in Excel ...,Beginner Pro Financial Analysis Excel 2017
4,How To Maximize Your Profits Trading Options,Maximize Profits Trading Options
...,...,...
3673,Learn jQuery from Scratch - Master of JavaScri...,Learn jQuery Scratch Master JavaScript library
3674,How To Design A WordPress Website With No Codi...,Design WordPress Website Coding
3675,Learn and Build using Polymer,Learn Build Polymer
3676,CSS Animations: Create Amazing Effects on Your...,CSS Animations Create Amazing Effects Website


In [11]:
# Vectorize our Text
count_vect = CountVectorizer()
cv_mat = count_vect.fit_transform(df['clean_course_title'])
feature_names = count_vect.get_feature_names_out()

In [12]:
# Sparse
cv_mat

<3678x3559 sparse matrix of type '<class 'numpy.int64'>'
	with 18333 stored elements in Compressed Sparse Row format>

In [13]:
# Dense
cv_mat.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
df_cv_words = pd.DataFrame(cv_mat.todense(), columns=feature_names)

In [15]:
df_cv_words.head()

Unnamed: 0,000005,001,01,02,10,100,101,101master,102,10k,...,zend,zero,zerotohero,zf2,zinsen,zoho,zombie,zu,zuhause,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Cosine Similarity Matrix
cosine_sim_mat = cosine_similarity(cv_mat)

In [17]:
cosine_sim_mat

array([[1.        , 0.20412415, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20412415, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.23570226],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23570226, 0.        ,
        1.        ]])

In [18]:
print("We Have the Following Subject Options")
subject_counts = df['subject'].value_counts()
print(subject_counts)
sub=input("Enter your Preffered Subject:")

We Have the Following Subject Options
subject
Web Development        1200
Business Finance       1195
Musical Instruments     680
Graphic Design          603
Name: count, dtype: int64


In [19]:
#Making Separate DAta Frame for Each Subject
df_ch = df[df['subject'] == sub]
df_ch


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,clean_course_title
1195,880202,Anatomy and Figure Drawing for Beginners,https://www.udemy.com/anatomy-and-figure-drawi...,True,150,2252,240,111,Beginner Level,21.0,2016-06-20T15:31:48Z,Graphic Design,Anatomy Figure Drawing Beginners
1196,1197206,Illustrator CC MasterClass,https://www.udemy.com/illustrator-cc-masterclass/,True,95,462,50,86,All Levels,12.0,2017-05-02T16:41:21Z,Graphic Design,Illustrator CC MasterClass
1197,1117796,Typographic Logo Design in Illustrator - Begin...,https://www.udemy.com/typographical-logo-design/,True,150,1720,40,27,All Levels,1.5,2017-02-23T21:40:39Z,Graphic Design,Typographic Logo Design Illustrator Beginners
1198,1219520,Adobe Illustrator T-Shirt Design for Merch by ...,https://www.udemy.com/merchbyamazondesign/,True,20,390,44,15,All Levels,1.0,2017-06-13T20:41:14Z,Graphic Design,Adobe Illustrator TShirt Design Merch Amazon
1199,595876,Logo Design in Adobe Illustrator - for Beginne...,https://www.udemy.com/logodesign/,True,200,4297,337,110,All Levels,7.5,2015-10-01T21:40:39Z,Graphic Design,Logo Design Adobe Illustrator Beginners
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1793,456388,Adobe Photoshop CC | The Essential Guide,https://www.udemy.com/adobe-photoshop-cc-the-e...,True,20,254,5,81,All Levels,4.5,2015-05-12T00:19:27Z,Graphic Design,Adobe Photoshop CC Essential Guide
1794,955242,Learn to Composite a 2D Action Shot in Photoshop,https://www.udemy.com/3dmotive-learn-to-compos...,True,20,40,2,8,Beginner Level,1.5,2016-09-10T21:38:26Z,Graphic Design,Learn Composite 2D Action Shot Photoshop
1795,496430,Infographic Design: How To Create Your Own Inf...,https://www.udemy.com/infographic-design-how-t...,True,20,84,8,13,All Levels,2.0,2015-05-12T20:38:58Z,Graphic Design,Infographic Design Create Infographic
1796,793246,Autodesk Inventor 2016 : Complete Guide,https://www.udemy.com/learn-autodesk-inventor-...,True,20,23,4,83,Beginner Level,7.5,2016-03-22T20:35:56Z,Graphic Design,Autodesk Inventor 2016 Complete Guide


In [20]:
# Find the index of the row with the maximum value in the num_subscribers column
max_subscribers_index = df_ch['num_subscribers'].idxmax()

# Get the course title corresponding to the index
course_title_highest_subscribers = df_ch.loc[max_subscribers_index, 'course_title']

print("Course title with the highest number of subscribers:", course_title_highest_subscribers)

Course title with the highest number of subscribers: Photoshop In-Depth: Master all of Photoshop's Tools Easily


In [21]:
# Get Course ID/Index
course_indices = pd.Series(df.index,index=df['course_title']).drop_duplicates()

In [22]:
course_indices

course_title
Ultimate Investment Banking Course                                0
Complete GST Course & Certification - Grow Your CA Practice       1
Financial Modeling for Business Analysts and Consultants          2
Beginner to Pro - Financial Analysis in Excel 2017                3
How To Maximize Your Profits Trading Options                      4
                                                               ... 
Learn jQuery from Scratch - Master of JavaScript library       3673
How To Design A WordPress Website With No Coding At All        3674
Learn and Build using Polymer                                  3675
CSS Animations: Create Amazing Effects on Your Website         3676
Using MODX CMS to Build Websites: A Beginner's Guide           3677
Length: 3678, dtype: int64

In [23]:
course_indices[course_title_highest_subscribers]

1461

In [24]:
idx = course_indices[course_title_highest_subscribers]

In [25]:
idx

1461

In [26]:
scores = list(enumerate(cosine_sim_mat[idx]))

In [27]:
scores

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.0),
 (50, 0.0),
 (51, 0.0),
 (52, 0.1543033499620919),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0),
 (60, 0.13608276348795434),
 (61, 0.0),
 (62, 0.0),
 (63, 0.0),
 (64, 0.0),
 (65, 0.0),
 (66, 0.0),
 (67, 0.0),
 (68, 0.0),
 (69, 0.0),
 (70, 0.0),
 (71, 0.0),
 (72, 0.0),
 (73, 0.0),
 (74, 0.0),
 (75, 0.0),
 (76, 0.0),
 (77, 0.0),
 (78, 0.0),
 (79, 0.0),
 (80, 0.0),
 (81, 0

In [28]:
# Sort our scores per cosine score
sorted_scores = sorted(scores,key=lambda x:x[1],reverse=True)

In [29]:
# Omit the First Value/itself
sorted_scores[1:]

[(1651, 0.4714045207910318),
 (1387, 0.4082482904638631),
 (1389, 0.4082482904638631),
 (1727, 0.4082482904638631),
 (1358, 0.408248290463863),
 (1338, 0.36927447293799825),
 (1754, 0.36514837167011077),
 (1768, 0.36514837167011077),
 (1418, 0.3086066999241838),
 (1491, 0.3086066999241838),
 (1302, 0.2886751345948129),
 (1317, 0.2886751345948129),
 (1484, 0.2886751345948129),
 (1634, 0.2886751345948129),
 (1699, 0.2886751345948129),
 (1732, 0.2886751345948129),
 (1628, 0.2721655269759087),
 (343, 0.2357022603955159),
 (1254, 0.2357022603955159),
 (1263, 0.2357022603955159),
 (1296, 0.2357022603955159),
 (1468, 0.2357022603955159),
 (1503, 0.2357022603955159),
 (1610, 0.2357022603955159),
 (1623, 0.2357022603955159),
 (1637, 0.2357022603955159),
 (1746, 0.2357022603955159),
 (2123, 0.2357022603955159),
 (310, 0.20412414523193154),
 (430, 0.20412414523193154),
 (714, 0.20412414523193154),
 (1233, 0.20412414523193154),
 (1235, 0.20412414523193154),
 (1237, 0.20412414523193154),
 (1238, 0.

In [30]:
# Selected Courses Indices
selected_course_indices = [i[0] for i in sorted_scores[1:]]

In [31]:
selected_course_indices

[1651,
 1387,
 1389,
 1727,
 1358,
 1338,
 1754,
 1768,
 1418,
 1491,
 1302,
 1317,
 1484,
 1634,
 1699,
 1732,
 1628,
 343,
 1254,
 1263,
 1296,
 1468,
 1503,
 1610,
 1623,
 1637,
 1746,
 2123,
 310,
 430,
 714,
 1233,
 1235,
 1237,
 1238,
 1241,
 1253,
 1264,
 1267,
 1290,
 1313,
 1314,
 1330,
 1341,
 1402,
 1443,
 1496,
 1501,
 1514,
 1554,
 1565,
 1568,
 1580,
 1596,
 1652,
 1696,
 1720,
 1742,
 1750,
 1752,
 1756,
 1770,
 2171,
 2829,
 3170,
 3520,
 3521,
 3584,
 3665,
 302,
 1212,
 1240,
 1266,
 1268,
 1269,
 1271,
 1275,
 1277,
 1304,
 1309,
 1329,
 1332,
 1370,
 1380,
 1382,
 1388,
 1392,
 1396,
 1416,
 1441,
 1454,
 1465,
 1470,
 1472,
 1476,
 1481,
 1523,
 1538,
 1545,
 1546,
 1552,
 1553,
 1559,
 1594,
 1605,
 1617,
 1641,
 1664,
 1671,
 1672,
 1716,
 1718,
 1724,
 1763,
 1773,
 1791,
 1793,
 2172,
 2195,
 2199,
 2200,
 2215,
 2243,
 2298,
 2372,
 2546,
 2821,
 3082,
 3130,
 3178,
 3529,
 3562,
 400,
 404,
 700,
 729,
 998,
 1080,
 1213,
 1223,
 1226,
 1257,
 1284,
 1303,
 1

In [32]:
# Selected Courses Scores
selected_course_scores = [i[1] for i in sorted_scores[1:]]

In [33]:
recommended_result = df['course_title'].iloc[selected_course_indices]

In [34]:
rec_df = pd.DataFrame(recommended_result)

In [35]:
rec_df.head()

Unnamed: 0,course_title
1651,Photoshop Tools 101
1387,خطوتك الأولى لتعلم الفوتوشوبPhotoshop
1389,Photoshopマスターコース　基礎から上級まで　ステップバイステップでPhotoshop...
1727,Photoshop　中級者、上級者がプロになるために最後に学ぶ「超絶技巧」テクニック
1358,Photoshop CC 2017 for Beginners: Master Photos...


In [36]:
rec_df['similarity_scores'] = selected_course_scores

In [37]:
rec_df

Unnamed: 0,course_title,similarity_scores
1651,Photoshop Tools 101,0.471405
1387,خطوتك الأولى لتعلم الفوتوشوبPhotoshop,0.408248
1389,Photoshopマスターコース　基礎から上級まで　ステップバイステップでPhotoshop...,0.408248
1727,Photoshop　中級者、上級者がプロになるために最後に学ぶ「超絶技巧」テクニック,0.408248
1358,Photoshop CC 2017 for Beginners: Master Photos...,0.408248
...,...,...
3672,jQuery UI in Action: Build 5 jQuery UI Projects,0.000000
3674,How To Design A WordPress Website With No Codi...,0.000000
3675,Learn and Build using Polymer,0.000000
3676,CSS Animations: Create Amazing Effects on Your...,0.000000


In [55]:
def recommend_course(title=course_title_highest_subscribers, num_of_rec=10):
    # ID for title
    idx = course_indices[title]
    # Course Indice
    # Search inside cosine_sim_mat
    scores = list(enumerate(cosine_sim_mat[idx]))
    # Scores
    # Sort Scores
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    # Recomm
    selected_course_indices = [i[0] for i in sorted_scores[1:]]
    selected_course_scores = [i[1] for i in sorted_scores[1:]]
    result = df['course_title'].iloc[selected_course_indices]
    rec_df = pd.DataFrame(result)
    rec_df['similarity_scores'] = selected_course_scores
    return rec_df.head(num_of_rec)

In [58]:
recommend_course()

Unnamed: 0,course_title,similarity_scores
1651,Photoshop Tools 101,0.471405
1387,خطوتك الأولى لتعلم الفوتوشوبPhotoshop,0.408248
1389,Photoshopマスターコース　基礎から上級まで　ステップバイステップでPhotoshop...,0.408248
1727,Photoshop　中級者、上級者がプロになるために最後に学ぶ「超絶技巧」テクニック,0.408248
1358,Photoshop CC 2017 for Beginners: Master Photos...,0.408248
1338,Adobe Photoshop Essentials: Master Adobe Photo...,0.369274
1754,Master Graphic Design Using Photoshop with Rac...,0.365148
1768,Mind-Blowing Photoshop: Master Skills Season 1,0.365148
1418,Master Adobe Photoshop Lightroom CC - From Beg...,0.308607
1491,Photoshop Fantastic! - The Comprehensive Guide...,0.308607
