In [26]:
from google import genai
import pypdf
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.genai import types
import chromadb
import pandas as pd
from typing import List, Tuple, Dict, Any



In [None]:
# PLEASE ADD API_KEY HERE
api_key=""

In [None]:
client = genai.Client(api_key=api_key)

In [8]:
model = "models/gemini-embedding-001"

In [9]:
# Read the CSV dataset file (not the notebook file)
df = pd.read_csv("assignment2dataset.csv")


In [15]:
df.shape

(25, 3)

In [None]:
df.tail

Unnamed: 0,course_id,title,description
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...
2,C003,Natural Language Processing Fundamentals,Dive into NLP techniques for processing and un...
3,C004,Computer Vision and Image Processing,Learn the principles of computer vision and im...
4,C005,Reinforcement Learning Basics,Get introduced to reinforcement learning parad...
5,C006,Data Engineering on AWS,Build scalable data pipelines using AWS servic...
6,C007,Cloud Computing with Azure,Master Microsoft Azure’s core services: virtua...
7,C008,DevOps Practices and CI/CD,Adopt DevOps methodologies to accelerate softw...
8,C009,Containerization with Docker and Kubernetes,Learn container fundamentals with Docker: imag...
9,C010,APIs and Microservices Architecture,Design and implement RESTful and GraphQL APIs ...


In [19]:
class GeminiEmbeddingFunction(EmbeddingFunction):
  def __call__(self, input: Documents) -> Embeddings:
    EMBEDDING_MODEL_ID = "gemini-embedding-001"  # @param ["gemini-embedding-001", "text-embedding-004"] {"allow-input": true, "isTemplate": true}
    title = "Custom query"
    response = client.models.embed_content(
        model=EMBEDDING_MODEL_ID,
        contents=input,
        config=types.EmbedContentConfig(
          task_type="retrieval_document",
          title=title
        )
    )

    # Return all embeddings as a list, not just the first one
    return [embedding.values for embedding in response.embeddings]





In [21]:
def create_chroma_db(df, name):
    chroma_client = chromadb.Client()
    collection = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())
    collection.add(
        documents = df["description"].tolist(),
        metadatas = df[["title", "course_id"]].to_dict(orient="records"),
        ids = df["course_id"].tolist()
    )

    return collection
    

In [22]:
db = create_chroma_db(df, "courses2")

  collection = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())


In [27]:
def recommend_courses(profile: str, completed_ids: List[str]) -> List[Tuple[str, float]]:
    """
    Returns a list of (course_id, similarity_score) for the top-5 recommendations.
    
    Args:
        profile: User's learning profile/description
        completed_ids: List of course IDs the user has already completed
        
    Returns:
        List of tuples containing (course_id, similarity_score) for top 5 recommendations
    """
    # Create a query that combines user profile with their learning goals
    query_text = f"User profile: {profile}. Recommend courses that align with their interests and learning goals."
    
    # Query the vector database for similar courses
    results = db.query(
        query_texts=[query_text],
        n_results=10  # Get more results to filter out completed courses
    )
    
    # Extract course IDs and distances from results
    course_ids = results['ids'][0]
    distances = results['distances'][0]
    
    # Convert distances to similarity scores (1 - distance for cosine similarity)
    similarity_scores = [1 - distance for distance in distances]
    
    # Filter out completed courses
    recommendations = []
    for course_id, score in zip(course_ids, similarity_scores):
        if course_id not in completed_ids:
            recommendations.append((course_id, score))
    
    # Return top 5 recommendations
    return recommendations[:5]


In [None]:


# Test case 1: User interested in machine learning and data science
profile1 = "I'm a software developer interested in machine learning and data science. I want to build AI applications and work with big data."
completed1 = ["C001"]  # Already completed Foundations of ML

recommendations1 = recommend_courses(profile1, completed1)
print(f"\nRecommendations for ML/Data Science profile:")
for course_id, score in recommendations1:
    course_info = df[df['course_id'] == course_id]
    if not course_info.empty:
        title = course_info.iloc[0]['title']
        print(f"  {course_id}: {title} (Score: {score:.3f})")

# Test case 2: User interested in cloud and DevOps
profile2 = "I work in IT operations and want to learn cloud computing, containerization, and DevOps practices for better deployment and scaling."
completed2 = ["C006", "C007"]  # Already completed AWS and Azure courses

recommendations2 = recommend_courses(profile2, completed2)
print(f"\nRecommendations for Cloud/DevOps profile:")
for course_id, score in recommendations2:
    course_info = df[df['course_id'] == course_id]
    if not course_info.empty:
        title = course_info.iloc[0]['title']
        print(f"  {course_id}: {title} (Score: {score:.3f})")




Testing the recommendation system...

Recommendations for ML/Data Science profile:
  C011: Big Data Analytics with Spark (Score: 0.601)
  C002: Deep Learning with TensorFlow and Keras (Score: 0.593)
  C003: Natural Language Processing Fundamentals (Score: 0.588)
  C005: Reinforcement Learning Basics (Score: 0.587)
  C018: Product Management Essentials (Score: 0.579)

Recommendations for Cloud/DevOps profile:
  C009: Containerization with Docker and Kubernetes (Score: 0.678)
  C008: DevOps Practices and CI/CD (Score: 0.657)
  C022: Internet of Things (IoT) Development (Score: 0.612)
  C025: MLOps: Productionizing Machine Learning (Score: 0.601)
  C021: Cybersecurity Fundamentals (Score: 0.593)

Recommendation system test completed successfully!


In [30]:
profile3 = "I want to learn the scalability and reliability of ML Models."
completed3 = ["C006", "C007"]  # Already completed AWS and Azure courses

recommendations3 = recommend_courses(profile3, completed3)
print(f"\nRecommendations for Cloud/DevOps profile:")
for course_id, score in recommendations3:
    course_info = df[df['course_id'] == course_id]
    if not course_info.empty:
        title = course_info.iloc[0]['title']
        print(f"  {course_id}: {title} (Score: {score:.3f})")

print("\nRecommendation system test completed successfully!")


Recommendations for Cloud/DevOps profile:
  C025: MLOps: Productionizing Machine Learning (Score: 0.659)
  C001: Foundations of Machine Learning (Score: 0.631)
  C005: Reinforcement Learning Basics (Score: 0.609)
  C011: Big Data Analytics with Spark (Score: 0.608)
  C002: Deep Learning with TensorFlow and Keras (Score: 0.589)

Recommendation system test completed successfully!


In [33]:
profile4 = "I’m interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?"
completed4 = []  

recommendations4 = recommend_courses(profile4, completed4)
print(f"\nRecommendations for blockchain profile:")
for course_id, score in recommendations4:
    course_info = df[df['course_id'] == course_id]
    if not course_info.empty:
        title = course_info.iloc[0]['title']
        print(f"  {course_id}: {title} (Score: {score:.3f})")

print("\nRecommendation system test completed successfully!")


Recommendations for blockchain profile:
  C023: Blockchain Technology and Smart Contracts (Score: 0.685)
  C018: Product Management Essentials (Score: 0.583)
  C022: Internet of Things (IoT) Development (Score: 0.570)
  C009: Containerization with Docker and Kubernetes (Score: 0.567)
  C021: Cybersecurity Fundamentals (Score: 0.565)

Recommendation system test completed successfully!


In [35]:
profile5 = "I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?"
completed5 = ['C016']  

recommendations5 = recommend_courses(profile5, completed5)
print(f"\nRecommendations for Python programming profile:")
for course_id, score in recommendations5:
    course_info = df[df['course_id'] == course_id]
    if not course_info.empty:
        title = course_info.iloc[0]['title']
        print(f"  {course_id}: {title} (Score: {score:.3f})")

print("\nRecommendation system test completed successfully!")


Recommendations for Python programming profile:
  C014: Data Visualization with Tableau (Score: 0.580)
  C012: SQL for Data Analysis (Score: 0.576)
  C011: Big Data Analytics with Spark (Score: 0.576)
  C003: Natural Language Processing Fundamentals (Score: 0.575)
  C001: Foundations of Machine Learning (Score: 0.566)

Recommendation system test completed successfully!
