# ITS Support Chatbot

This chatbot is an educational tool built to answer questions about CSUSB's [Information Technology Services](https://www.csusb.edu/its).


### Milvus Setup

In [None]:
# Import all required libraries for the chatbot
# Operating system interface for file/directory operations
import os
# Milvus database connection and utility functions
from pymilvus import connections, utility
# LangChain component for combining multiple documents into one context
from langchain.chains.combine_documents import create_stuff_documents_chain
# Base document class for storing text and metadata
from langchain.schema import Document
# Template system for creating chat prompts
from langchain_core.prompts import ChatPromptTemplate
# Groq's language model interface
from langchain_groq.chat_models import ChatGroq
# Milvus vector database integration for LangChain
from langchain_milvus import Milvus
from pymilvus import connections, utility, Collection, CollectionSchema, FieldSchema, DataType
# Tool for downloading web pages recursively
from langchain_community.document_loaders import RecursiveUrlLoader
# Library for parsing HTML content
from bs4 import BeautifulSoup
# Tool for splitting text into smaller chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Component for creating document retrieval systems
from langchain.chains import create_retrieval_chain
# Interface for HuggingFace's embedding models
from langchain_huggingface import HuggingFaceEmbeddings
import pymilvus;

# Configuration constants shared by the functions in this program
# Root page of the CSUSB ITS website intended as the chatbot's knowledge base
WEBSITE_URL = "https://www.csusb.edu/its"
# Location of the local Milvus database file that holds the vectors
DATABASE_PATH = "milvus/jupyter_milvus_vector3.db"
# HuggingFace embedding model that converts text into vectors
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L12-v2"


def vector_store_check(uri):
    """
    Report whether the "IT_support" collection exists in the Milvus database

    Makes sure the parent directory of the database path exists, opens the
    "default" Milvus connection against ``uri`` and then asks Milvus whether
    the collection is present.

    Args:
        uri (str): Path to the local milvus db

    Returns:
        boolean: True if the collection exists, False otherwise
    """
    # Create the parent directory if it does not exist. A bare file name has
    # no directory component, and os.makedirs("") raises FileNotFoundError,
    # so only create a directory when there actually is one.
    parent_dir = os.path.dirname(uri)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Connect to the Milvus database
    connections.connect("default", uri=uri)

    # Return True if exists, False otherwise
    return utility.has_collection("IT_support")

print("Function `vector_store_check` defined.")

def load_existing_db(uri=DATABASE_PATH):
    """
    Open the "IT_support" collection that already lives in the local Milvus database.

    Args:
        uri (str, optional): Path to the local milvus db. Defaults to DATABASE_PATH.

    Returns:
        vector_store: The Milvus vector store bound to the existing collection
    """
    # Index settings that override the default index
    index_settings = {
        "index_type": "IVF_FLAT",
        "metric_type": "L2",
        "params": {"nlist": 128},
    }

    # Attach to the existing collection using the project's embedding model
    store = Milvus(
        collection_name="IT_support",
        embedding_function=get_embedding_function(),
        connection_args={"uri": uri},
        index_params=index_settings,
    )
    print("Vector store loaded")
    return store

print("Function `load_existing_db` defined.")

def get_embedding_function():
    """
    Build the embedding function for the configured model

    Returns:
        HuggingFaceEmbeddings: embeddings object created for EMBEDDING_MODEL
    """
    # A new embeddings object is constructed on every call
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

print("Function `get_embedding_function` defined.")

def load_documents_from_web():
    """
    Load the documents from the web and store the cleaned page contents

    Crawls WEBSITE_URL recursively, then replaces each page's raw content
    with the output of clean_text_from_html while keeping its metadata.

    Returns:
        list: The cleaned documents loaded from the web
    """
    # Crawl the website itself. DATABASE_PATH is the location of the local
    # vector database file, not a page the loader can fetch, so the loader
    # must be pointed at WEBSITE_URL for both the start and the base URL.
    loader = RecursiveUrlLoader(
        url=WEBSITE_URL,
        prevent_outside=True,
        base_url=WEBSITE_URL,
    )
    raw_documents = loader.load()

    # Ensure documents are cleaned
    # NOTE(review): clean_text_from_html is not defined in the visible part
    # of this file - confirm it is available before this function runs.
    cleaned_documents = [
        Document(
            page_content=clean_text_from_html(doc.page_content),
            metadata=doc.metadata,
        )
        for doc in raw_documents
    ]

    return cleaned_documents

print("Function `load_documents_from_web` defined.")

def split_documents(documents):
    """
    Break the given documents into overlapping chunks

    Args:
        documents (list): The documents to split

    Returns:
        list: the chunks produced from the documents
    """
    # Chunks are at most 1000 characters long and overlap by 300 characters;
    # separators are treated as plain strings, not regular expressions
    splitter = RecursiveCharacterTextSplitter(
        is_separator_regex=False,
        chunk_overlap=300,
        chunk_size=1000,
    )

    chunks = splitter.split_documents(documents)
    print("Documents successfully split")
    return chunks

print("Function `split_documents` defined.")

def create_vector_store(docs, embeddings, uri):
    """
    Build a fresh "IT_support" vector store from the given documents.

    Args:
        docs (list): A list of documents to be stored in the vector store.
        embeddings : A function or model that generates embeddings for the documents.
        uri (str): Path to the local milvus db

    Returns:
        vector_store: The vector store created
    """
    # Index settings that override the default index
    index_settings = {
        "index_type": "IVF_FLAT",
        "metric_type": "L2",
        "params": {"nlist": 128},
    }

    # drop_old=True discards any existing collection before the new one is built
    store = Milvus.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name="IT_support",
        connection_args={"uri": uri},
        drop_old=True,
        index_params=index_settings,
    )

    print("Vector store created")
    return store

print("Function `create_vector_store` defined.")

def initialize_milvus(uri: str=DATABASE_PATH):
    """
    Initialize the vector store for the RAG model

    Loads the existing vector store when the collection is already present at
    ``uri``; otherwise loads the documents from the web, splits them into
    chunks and creates a new vector store from them.

    Args:
        uri (str, optional): Path to the local vector storage. Defaults to DATABASE_PATH.

    Returns:
        vector_store: The vector store created
    """
    # The docstring must be the first statement of the function to be picked
    # up as its documentation, so the version print comes after it.
    print(pymilvus.__version__)

    if vector_store_check(uri):
        vector_store = load_existing_db(uri)
        print("Embeddings loaded from existing storage")
    else:
        embeddings = get_embedding_function()
        print("Embeddings Loaded")
        documents = load_documents_from_web()
        print("Documents Loaded")

        # Split the documents into chunks
        docs = split_documents(documents=documents)
        print("Documents Splitting completed")

        vector_store = create_vector_store(docs, embeddings, uri)
    print("Milvus successfully initialized")
    return vector_store

print("Function `initialize_milvus` defined.")

In [None]:
def main():
    """Entry point: initialize the Milvus vector store with its default path."""
    initialize_milvus()

In [None]:
# Run the program only when this file is executed directly, not when imported
if __name__ == "__main__":
    # Hand control to the entry point
    main()