In [2]:
!python3 -m pip install sentence-transformers
!python3 -m pip install faiss-cpu

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd
# Load the unified course table. Later cells read its 'embedding_text' and
# 'course_title' columns and address rows by position, so the row order here
# must stay aligned with the embedding matrix that is built from it.
df = pd.read_csv("unified_courses_v1.csv")
# Report (rows, columns) as a quick sanity check on the load.
print(df.shape)

(26232, 16)


In [4]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model by name. The same `model` object is used
# to encode the course texts and, later, the free-text search queries.
model = SentenceTransformer('all-MiniLM-L6-v2')



In [5]:
# Encode every row's 'embedding_text' in batches of 64; row i of `embeddings`
# corresponds to row i of `df`. This is likely the slowest cell in the notebook.
# NOTE(review): assumes 'embedding_text' has no missing values — TODO confirm.
embeddings = model.encode(df['embedding_text'].tolist(), batch_size=64, show_progress_bar=True)

Batches:   0%|          | 0/410 [00:00<?, ?it/s]

In [6]:
import numpy as np

# Persist the embedding matrix to disk so it can be reloaded with np.load
# instead of re-encoding the whole course table.
np.save("course_embeddings.npy", embeddings)
print("Saved course_embeddings.npy")

Saved course_embeddings.npy


In [7]:
import faiss
# Vector width is taken from the embedding matrix itself (number of columns).
dimension = embeddings.shape[1]

# Flat L2 index over all course vectors. Vectors are added in `df` row order,
# so the ids returned by index.search() are used as row positions in `df`.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
# Number of vectors stored in the index; expected to equal the row count of `df`.
print("Index size:", index.ntotal)

Index size: 26232


In [8]:
import faiss

# Serialize the populated index to disk so it can be reloaded later
# without rebuilding it from the embeddings.
faiss.write_index(index, "faiss_index.bin")
print("Saved faiss_index.bin")

Saved faiss_index.bin


In [8]:
def recommend_similar(title_query, k=10):
    """Recommend courses similar to the first course whose title contains a query.

    The lookup is a case-insensitive, *literal* substring match on
    ``df['course_title']``. The first matching row is used as the anchor and
    its stored embedding is searched against the FAISS ``index``.

    Parameters
    ----------
    title_query : str
        Full or partial course title. Special characters (``+``, ``(``, ``.``
        ...) are matched literally, not as regular-expression syntax.
    k : int, default 10
        Maximum number of similar courses to return.

    Returns
    -------
    pandas.DataFrame or str
        Up to ``k`` neighbouring courses (anchor course excluded), nearest
        first, with the display columns only. If no title matches, the
        "Course not found" message string is returned instead.
    """
    display_columns = [
        'course_title', 'category', 'subcategory',
        'avg_rating_90d', 'num_rating_all_time', 'enrollments'
    ]

    query = title_query.lower()
    # regex=False: treat the query literally (titles such as "C++" would
    # otherwise be parsed as a regex). na=False: missing titles never match
    # instead of producing NaN entries that break boolean indexing.
    mask = df['course_title'].str.lower().str.contains(query, regex=False, na=False)

    if not mask.any():
        return "Course not found. Try using a shorter part of the course title."

    # Row *position* of the first match. `embeddings` and the FAISS ids are
    # positional, so an index label must not be used here.
    anchor_pos = int(mask.to_numpy(dtype=bool).argmax())
    query_vec = embeddings[anchor_pos].reshape(1, -1)

    # Ask for one extra hit because the anchor itself is normally among them.
    _, indices = index.search(query_vec, k + 1)

    # Remove the anchor explicitly rather than assuming it is ranked first:
    # duplicate rows with identical embeddings can outrank it. Also drop the
    # -1 ids FAISS uses as padding when fewer than k + 1 vectors exist.
    neighbour_positions = [
        int(pos) for pos in indices[0] if pos >= 0 and pos != anchor_pos
    ][:k]

    return df.iloc[neighbour_positions][display_columns]

In [9]:
# Smoke test: anchor on the first course whose title contains this phrase
# and list its 5 nearest neighbours in embedding space.
recommend_similar("Machine Learning for Data Analysis", k=5)

Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments
17188,Machine Learning for Data Analysis: Unsupervis...,Development,Database Design & Development,4.45,735,6542
1463,Machine Learning for Data Analysis: Data Profi...,Data Science,Machine Learning,4.54,3702,22615
18284,Machine Learning for Data Analysis: Data Profi...,Data Science,Machine Learning,4.57,3587,22676
1147,Machine Learning for Data Analysis: Classifica...,Data Science,Machine Learning,4.8,467,12121
17977,Machine Learning for Data Analysis: Classifica...,Data Science,Machine Learning,4.87,464,12169


In [10]:
# Shorter, partial title: still anchors on the first matching row only.
# NOTE(review): the saved output repeats the same title several times, which
# suggests duplicate catalogue rows — consider de-duplicating before indexing.
recommend_similar("Data Analysis", k=5)

Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments
16878,Data Analysis with Python: NumPy & Pandas Mast...,Data Science,Analytics,4.6,1231,19310
7247,Data Analysis with Pandas and Python,Data Science,Analytics,4.63,21610,201688
13194,Data Analysis with Pandas and Python,Data Science,Analytics,4.62,18666,496727
24122,Data Analysis with Pandas and Python,Data Science,Analytics,4.65,21345,306630
7248,Data Analysis with Pandas and Python,Data Science,Analytics,4.63,21610,201688


In [11]:
# A broad substring matches many titles; recommend_similar anchors on the
# first matching row, so the neighbours reflect that single course.
recommend_similar("Machine Learning", k=5)

Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments
16844,No-Code and No-Math Machine Learning,Data Science,Math & Statistics,4.85,86,1445
6131,Data Science : Master Machine Learning Without...,Data Science,Machine Learning,4.34,966,38675
23002,Data Science : Master Machine Learning Without...,Data Science,Machine Learning,4.4,951,38543
6130,Data Science : Master Machine Learning Without...,Data Science,Machine Learning,4.34,966,38675
23003,Data Science : Master Machine Learning Without...,Data Science,Machine Learning,4.4,951,38543


In [12]:
def embed_query(text):
    """Embed a single piece of text with the notebook-level ``model``.

    The text is wrapped in a one-element list for ``model.encode`` and the
    first (only) row of the encoded batch is returned.
    """
    encoded_batch = model.encode([text])
    first_vector = encoded_batch[0]
    return first_vector

In [13]:
def semantic_search(query_text, k=10):
    """Return the courses whose embeddings are closest to a free-text query.

    Parameters
    ----------
    query_text : str
        Natural-language query; embedded with ``embed_query``.
    k : int, default 10
        Maximum number of hits requested from the FAISS ``index``.

    Returns
    -------
    pandas.DataFrame
        Up to ``k`` rows of ``df`` (display columns only), in the order FAISS
        returned them, plus a ``score`` column holding the distance reported
        by the index for each hit (smaller means closer to the query).
    """
    # FAISS expects a 2-D batch of queries, here a batch of one.
    query_vec = embed_query(query_text).reshape(1, -1)

    distances, indices = index.search(query_vec, k)

    # When the index holds fewer than k vectors FAISS pads the ids with -1.
    # Passed to iloc, -1 would silently select the last row of df, so the
    # padding entries (and their distances) are dropped here.
    is_hit = indices[0] >= 0
    hit_positions = indices[0][is_hit]

    results = df.iloc[hit_positions][[
        'course_title', 'category', 'subcategory',
        'avg_rating_90d', 'num_rating_all_time', 'enrollments'
    ]].copy()

    results['score'] = distances[0][is_hit]
    return results


In [14]:
# Free-text query (no title lookup). `score` is the distance reported by the
# index, so smaller values are closer matches.
semantic_search("machine learning", 5)

Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments,score
1402,Complete Machine Learning 2024 A-Z™: 10 Real W...,Data Science,Machine Learning,4.33,437,29167,0.754888
18227,Complete Machine Learning 2023 A-Z™: 10 Real W...,Data Science,Machine Learning,4.55,429,29289,0.780812
10942,An Introduction to Machine Learning for Data E...,Development,Data Science,4.42,3058,88972,0.79448
5200,Feature Selection for Machine Learning,Data Science,Machine Learning,4.75,2143,62937,0.813747
22045,Feature Selection for Machine Learning,Data Science,Machine Learning,4.7,2117,63066,0.813747


In [15]:
# Multi-topic query: checks that hits combine both the ML and the finance theme.
semantic_search("machine learning for finance", 5)

Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments,score
12640,Python for Finance: Investment Fundamentals & ...,Data Science,Analytics,4.57,25562,292189,0.67368
2331,"Artificial Intelligence for Finance, Accountin...",Data Science,Machine Learning,4.52,1543,32189,0.685297
19152,"Artificial Intelligence for Finance, Accountin...",Data Science,Machine Learning,4.5,1397,32142,0.685297
3751,Manage Finance Data with Python & Pandas: Uniq...,Finance & Accounting,Financial Modeling & Analysis,4.5,840,31628,0.697649
20576,Manage Finance Data with Python & Pandas: Uniq...,Finance & Accounting,Financial Modeling & Analysis,4.53,822,31620,0.697649


In [16]:
# Query that closely mirrors existing course titles.
# NOTE(review): identical scores in the saved output point to rows with
# identical embedding text (likely duplicate catalogue entries) — verify.
semantic_search("python for beginners", 5)

Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments,score
25116,Python for Beginners with Examples,Development,Programming Languages,4.37,9662,10428,0.297954
8231,Python for Beginners with Examples,Development,Programming Languages,4.39,9676,10349,0.297954
14887,Python for Beginners with Examples,Development,Programming Languages,4.35,9424,11713,0.297954
5703,Python for beginners,Development,Programming Languages,4.55,24791,270150,0.373771
10547,Python for beginners,Development,Programming Languages,4.54,21628,267622,0.373771


In [17]:
# Non-technical query, to check the search outside the data/programming categories.
semantic_search("leadership skills for new managers", 5)

Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments,score
8611,New Manager: The Basics and More of Being a Gr...,Leadership & Management,Leadership,4.56,13200,320340,0.463874
15551,New Manager: The Basics and More of Being a Gr...,Leadership & Management,Leadership,4.57,9511,333293,0.463874
25505,New Manager: The Basics and More of Being a Gr...,Leadership & Management,Leadership,4.57,12854,324833,0.463874
8610,New Manager: The Basics and More of Being a Gr...,Leadership & Management,Leadership,4.56,13200,320340,0.510928
15550,New Manager: The Basics and More of Being a Gr...,Leadership & Management,Leadership,4.57,9511,333293,0.510928


In [18]:
# Keyword-style query (not a full sentence) for a certification topic.
semantic_search("AWS certification beginner", 5)

Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments,score
2462,AWS Certified Cloud Practitioner - Essentials ...,IT Operations,IT Certifications,4.47,1295,29952,0.466187
19283,AWS Certified Cloud Practitioner - Essentials ...,IT Operations,IT Certifications,4.5,1253,30062,0.466187
2463,AWS Certified Cloud Practitioner - Essentials ...,Cloud Computing,Cloud Certifications,4.47,1295,29952,0.480968
19282,AWS Certified Cloud Practitioner - Essentials ...,Cloud Computing,Cloud Certifications,4.5,1253,30062,0.480968
2644,AWS Certified Developer Associate Exam Trainin...,IT Operations,IT Certifications,4.61,3678,78867,0.507188


In [19]:
def pretty_search(query_text, k=10):
    """Run ``semantic_search`` and show the hits as a titled table.

    Prints a heading naming the query and ``k``, then displays the result
    rows without the ``score`` column. Returns nothing.
    """
    shown_columns = [
        'course_title', 'category', 'subcategory',
        'avg_rating_90d', 'num_rating_all_time', 'enrollments',
    ]

    hits = semantic_search(query_text, k)

    heading = f"\nTop {k} semantic matches for query: '{query_text}'\n"
    print(heading)

    display(hits[shown_columns])

In [20]:
# Run three unrelated queries in one cell; pretty_search prints a heading and
# displays a table for each, so all three results appear in the cell output.
pretty_search("python for data analysis", 5)
pretty_search("presentation skills for managers", 5)
pretty_search("sql analytics", 5)


Top 5 semantic matches for query: 'python for data analysis'



Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments
375,Python for Data Analysis & Visualization,Development,Programming Languages,4.56,1261,5345
17212,Python for Data Analysis & Visualization,Development,Programming Languages,4.55,1169,5445
8703,Learning Python for Data Analysis and Visualiz...,Development,Programming Languages,4.32,19558,46772
8702,Learning Python for Data Analysis and Visualiz...,Development,Programming Languages,4.32,19558,46772
1668,Data Analysis with Python,Data Science,Statistical Analysis,4.28,437,18984



Top 5 semantic matches for query: 'presentation skills for managers'



Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments
15941,Presentation Skills: Give a Great Team Present...,Leadership & Management,Communication,4.5,3107,38677
10652,"Presentation Skills: Give More Powerful, Memor...",Leadership & Management,Communication,4.63,6431,269736
10650,"Presentation Skills: Give More Powerful, Memor...",Leadership & Management,Communication,4.63,6431,269736
15942,Presentation Skills: Give a Great Team Present...,Business,Communication,4.5,3107,38677
10653,"Presentation Skills: Give More Powerful, Memor...",Business,Communication,4.63,6431,269736



Top 5 semantic matches for query: 'sql analytics'



Unnamed: 0,course_title,category,subcategory,avg_rating_90d,num_rating_all_time,enrollments
5652,SQL - MySQL for Data Analytics and Business In...,Development,Database Design & Development,4.58,56337,682390
10427,SQL - MySQL for Data Analytics and Business In...,Development,Database Design & Development,4.6,46532,680584
22506,SQL - MySQL for Data Analytics and Business In...,Development,Database Design & Development,4.59,55531,686285
3419,SQL for Data Analysis: Beginner MySQL Business...,Data Science,Analytics,4.66,8775,216082
20238,SQL for Data Analysis: Beginner MySQL Business...,Data Science,Analytics,4.64,8665,217066
