In [45]:
from bs4 import BeautifulSoup
import json
import requests
from dotenv import load_dotenv
import os
load_dotenv()

# OpenSearch credentials read from the environment (populated by load_dotenv()).
# os.getenv returns None for a variable that is not set.
es_username = os.getenv("ES_USERNAME")
es_password = os.getenv("ES_PASSWORD")


# URL of the course descriptions page
url = 'https://catalog.northeastern.edu/course-descriptions/info/'

# Fetch the HTML content from the URL.
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find all course blocks
course_blocks = soup.find_all('div', class_='courseblock')

# Extract course titles, descriptions and prerequisites
courses = []
for block in course_blocks:
    title_tag = block.find('p', class_='courseblocktitle')
    if title_tag is None:
        # Without a title the record cannot be identified downstream; skip it.
        continue
    course_title = title_tag.get_text(strip=True)

    # A block without a description paragraph yields an empty description
    # instead of raising AttributeError on None.
    description_tag = block.find('p', class_='cb_desc')
    course_description = description_tag.get_text(strip=True) if description_tag is not None else ""

    # The string "None" (not the None object) marks "no prerequisites";
    # later cells compare against this exact string.
    prerequisites = "None"

    # A block may carry several 'courseblockextra' paragraphs; take the first
    # one that mentions prerequisites rather than only looking at the first tag.
    prereq_tag = next(
        (
            extra_tag
            for extra_tag in block.find_all('p', class_='courseblockextra')
            if 'Prerequisite' in extra_tag.get_text()
        ),
        None,
    )
    if prereq_tag is not None:
        prerequisites = prereq_tag.get_text(strip=True)

    courses.append({
        'course': course_title,
        'description': course_description,
        'prerequisites': prerequisites
    })

# Print the extracted courses and descriptions
# print(json.dumps(courses, indent=4))

ModuleNotFoundError: No module named 'dotenv'

: 

In [7]:
# Persist the scraped course records to disk as indented JSON.
with open('courses.json', mode='w') as json_file:
    json.dump(obj=courses, fp=json_file, indent=4)

# Preprocess the text

In [4]:
def load_data(filename):
    """Read the JSON document stored at `filename` and return the decoded object."""
    with open(filename, mode='r') as source:
        parsed = json.load(source)
    return parsed

def clean_text(text):
    """Replace a fixed set of typographic Unicode characters with ASCII text.

    Curly quotes become straight quotes, the non-breaking space becomes a
    regular space, the trademark sign becomes 'TM' and the en dash becomes '-'.
    Returns the resulting string.
    """
    # Add more cleaning steps as needed
    replacements = (
        ('\u2019', "'"),
        ('\u201c', '"'),
        ('\u201d', '"'),
        ('\u00a0', ' '),
        ('\u2122', 'TM'),
        ('\u2013', '-'),
    )
    for fancy, plain in replacements:
        text = text.replace(fancy, plain)
    return text

def preprocess_course_data(course_list):
    """Run clean_text over the text fields of every course record.

    The 'course', 'description' and 'prerequisites' values of each dict are
    overwritten in place; the same list object is returned.
    """
    text_fields = ('course', 'description', 'prerequisites')
    for record in course_list:
        for field in text_fields:
            record[field] = clean_text(record[field])
    return course_list

def save_data(data, filename):
    """Serialize `data` as JSON (4-space indent) into `filename`, overwriting it."""
    with open(filename, mode='w') as sink:
        json.dump(obj=data, fp=sink, indent=4)

In [28]:
# Load the scraped records, normalise their text fields and write the result.
input_filename = 'courses.json'
# This must be 'cleaned_courses.json': that is the file the embedding step
# loads. The variable previously held 'cleaned_course.json' and was unused.
output_filename = 'cleaned_courses.json'
data = load_data(input_filename)
clean_data = preprocess_course_data(data)
save_data(clean_data, output_filename)


# Generate Embeddings

In [5]:
import json
import pickle
from sentence_transformers import SentenceTransformer

def save_cache(cache, filename):
    """Pickle `cache` into the binary file `filename`, overwriting it."""
    with open(filename, mode='wb') as sink:
        pickle.dump(cache, sink)

def load_cache(filename):
    """Unpickle and return the object stored in `filename`.

    Returns an empty dict when the file does not exist.
    """
    # NOTE: pickle.load can execute arbitrary code; only point this at cache
    # files you produced yourself.
    cache = {}
    try:
        with open(filename, mode='rb') as source:
            cache = pickle.load(source)
    except FileNotFoundError:
        pass
    return cache

def generate_embeddings(data, model_name='all-MiniLM-L6-v2'):
    """Embed the description and prerequisites of every course record.

    Args:
        data: Iterable of course dicts with 'course', 'description' and
            'prerequisites' keys. Each dict is modified in place.
        model_name: Name passed to SentenceTransformer to load the encoder.

    Returns:
        A tuple ``(data, embeddings)``. ``data`` is the input with
        'description_embedding' and 'pre_requisite_embedding' added to each
        record. ``embeddings`` maps the course name to a dict holding the same
        two vectors; records sharing a course name overwrite each other there.
    """
    encoder = SentenceTransformer(model_name)
    vectors_by_course = {}

    for record in data:
        name = record['course']
        description_vector = encoder.encode(record['description']).tolist()

        # The string "None" marks a course without prerequisites; those get an
        # empty vector instead of an encoded one.
        if record['prerequisites'] != "None":
            prerequisite_vector = encoder.encode(record['prerequisites']).tolist()
        else:
            prerequisite_vector = []

        vectors_by_course[name] = {
            'description_embedding': description_vector,
            'pre_requisite_embedding': prerequisite_vector
        }

        record['description_embedding'] = description_vector
        record['pre_requisite_embedding'] = prerequisite_vector

    return data, vectors_by_course


    # for website, faqs in data.items():
    #     for faq in faqs:
    #         question_embedding = model.encode(faq['question']).tolist()
    #         faq['question_embedding'] = question_embedding
    #         embeddings[faq['question']] = question_embedding
    # return data, embeddings

In [8]:
input_filename = 'cleaned_courses.json'
output_filename = 'vectorized_courses.json'
cache_filename = 'embeddings_cache_courses.pkl'

clean_data = load_data(input_filename)
vectorized_data, embeddings = generate_embeddings(clean_data)
save_data(vectorized_data, output_filename)
save_cache(embeddings, cache_filename)




# Create or replace embeddings and mapping

In [24]:
from opensearchpy import OpenSearch
from opensearchpy.helpers import bulk
import os

def create_index(os_client, index_name='faqs', dimension=384):
    """(Re)create a k-NN enabled index for course records.

    Any existing index called `index_name` is deleted first, then a new one is
    created with text fields for 'course', 'description' and 'prerequisites'
    and an HNSW `knn_vector` field named 'description_embedding'.

    Args:
        os_client: Client exposing ``indices.exists``, ``indices.delete`` and
            ``indices.create``.
        index_name: Name of the index to drop and recreate.
        dimension: Length of the vectors stored in 'description_embedding'.
            It must match the length of the embeddings being indexed.
            Defaults to 384, the value that was previously hard-coded.
    """
    settings = {
        "settings": {
            "index": {
                "knn": True,
                "number_of_shards": 1,
                "number_of_replicas": 0,
                "knn.algo_param.ef_search": 100  # Optional: Tune this parameter based on your needs
            },
            "analysis": {
                "analyzer": {
                    "default": {
                        "type": "standard"
                    }
                }
            }
        },
        "mappings": {
            "properties": {
                "course": {
                    "type": "text"
                },
                "description": {
                    "type": "text"
                },
                "prerequisites": {
                    "type": "text"
                },
                "description_embedding": {
                    "type": "knn_vector",
                    "dimension": dimension,
                    "method": {
                        "name": "hnsw",
                        "space_type": "l2",
                        "engine": "nmslib",
                        "parameters": {
                            "ef_construction": 128,
                            "m": 24
                        }
                    }
                }
            }
        }
    }
    # Delete the index if it already exists
    if os_client.indices.exists(index=index_name):
        os_client.indices.delete(index=index_name)

    # NOTE(review): ignore=400 suppresses HTTP 400 responses from the create
    # call. Because the index was just deleted, a 400 here most likely means
    # the settings/mapping were rejected -- confirm whether it should be kept.
    os_client.indices.create(index=index_name, body=settings, ignore=400)

def index_faq_embeddings(os_client, index_name, data):
    """Bulk-index course records into `index_name` and print the index mapping.

    Each record's position in `data` is used as its document id. Only the
    'course', 'description', 'prerequisites' and 'description_embedding'
    fields of a record are sent.
    """
    source_fields = ("course", "description", "prerequisites", "description_embedding")

    # Build one bulk action per record.
    actions = []
    for position, record in enumerate(data):
        actions.append({
            "_index": index_name,
            "_id": position,
            "_source": {field: record[field] for field in source_fields}
        })

    # Send everything in a single bulk helper call.
    bulk(os_client, actions)

    # Show the mapping of the target index after indexing.
    mapping_response = os_client.indices.get_mapping(index=index_name)
    print("\n\nIndex Mapping:")
    print(mapping_response)

In [1]:
# Index that holds the course catalog documents.
index_name = "info_catalog"

# Client for the hosted search domain, authenticated with the username and
# password held in es_username / es_password.
os_client = OpenSearch(
    http_auth=(es_username, es_password),
    hosts=['https://search-faq-chatbot-jzwpe6i7iz5elujpadeanj6fby.us-east-2.es.amazonaws.com'],
)

# create_index(os_client=os_client, index_name=index_name)
# data = load_data('')

NameError: name 'OpenSearch' is not defined

# Dump Vector Embeddings to OpenSearch

In [25]:
# Reload the vectorized course records from disk and bulk-index them.
data = load_data('vectorized_courses.json')
index_faq_embeddings(os_client, "info_catalog", data)




Index Mapping:
{'info_catalog': {'mappings': {'properties': {'course': {'type': 'text'}, 'description': {'type': 'text'}, 'description_embedding': {'type': 'knn_vector', 'dimension': 384, 'method': {'engine': 'nmslib', 'space_type': 'l2', 'name': 'hnsw', 'parameters': {'ef_construction': 128, 'm': 24}}}, 'prerequisites': {'type': 'text'}}}}}


# KNN Search

In [34]:
from opensearchpy import OpenSearch
from sentence_transformers import SentenceTransformer
import os
import json

def search_similar_questions(os_client, query, model, index_name='info_catalog', top_n=5):
    """Return the courses whose description embeddings are closest to `query`.

    Args:
        os_client: Client exposing ``search(index=..., body=...)``.
        query: Free-text query to embed.
        model: Encoder whose ``encode(query)`` result provides ``tolist()``.
        index_name: Index to search.
        top_n: Number of nearest neighbours requested (``k``) and the maximum
            number of hits returned (``size``). Previously this argument was
            ignored and both values were fixed at 5.

    Returns:
        A list of ``(course, description)`` tuples, in the order the hits are
        returned by the search.
    """
    # Encode the query to get the query vector
    query_vector = model.encode(query).tolist()

    knn_query = {
        "size": top_n,
        "query": {
            "knn": {
                "description_embedding": {
                    "vector": query_vector,
                    "k": top_n
                }
            }
        }
    }

    response = os_client.search(index=index_name, body=knn_query)
    # Raw hits are echoed for inspection, as before.
    print(response['hits'])
    return [(hit['_source']['course'], hit['_source']['description']) for hit in response['hits']['hits']]



In [44]:



# Client for the hosted search domain, authenticated with the username and
# password held in es_username / es_password.
os_client = OpenSearch(
    http_auth=(es_username, es_password),
    hosts=['https://search-faq-chatbot-jzwpe6i7iz5elujpadeanj6fby.us-east-2.es.amazonaws.com'],
)

# Encoder used to embed the query text.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Run one example query and print each returned (course, description) pair.
query = "where can I learn building software?"
similar_questions = search_similar_questions(os_client, query, model)

for course, description in similar_questions:
    print(f"Q: {course}\nA: {description}\n")


{'total': {'value': 5, 'relation': 'eq'}, 'max_score': 0.49554205, 'hits': [{'_index': 'info_catalog', '_type': '_doc', '_id': '2', '_score': 0.49554205, '_source': {'course': 'INFO 5100.  Application Engineering and Development.  (4 Hours)', 'description': 'Takes students in a step-by-step manner through the process of systematically combining UX techniques, business processes, and complex data models to assemble applications that are user-friendly and meet business requirements. Employs the object-oriented paradigm, visual user experience, and system design principles to put together complicated, powerful, real-world applications. The primary objective of this course is to practice social-technical software engineering methods and tools to solve real-world problems. Offers students an opportunity to learn innovative design and programming techniques to build significant business applications quickly; to practice simple and smart ways of making software construction enjoyable; and to 