In [1]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader
from pathlib import Path
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from itertools import combinations
import numpy as np
from langchain.memory import ConversationSummaryBufferMemory,ConversationBufferMemory, ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain.chains import create_retrieval_chain, RetrievalQA, ConversationalRetrievalChain, RetrievalQAWithSourcesChain

from langchain_community.llms import HuggingFaceHub

import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Root directory under which the Chroma vector stores are kept
# (each store gets its own sub-folder).
LOCAL_VECTOR_STORE_DIR = Path('./data')
# The Hugging Face API key is not set here: it is read from the environment
# (populated from the .env file by load_dotenv) where the LLM is instantiated.

In [3]:
def langchain_document_loader(TMP_DIR):
    """
    Load every Word document found under the temporary directory (TMP_DIR).

    Only ``.docx`` files are loaded: they are searched recursively
    (``**/*.docx``) and parsed with ``Docx2txtLoader``. Other formats
    (txt, pdf, csv) are not handled; supporting one means adding another
    ``DirectoryLoader`` with the matching ``glob`` and ``loader_cls``.

    Parameters:
        TMP_DIR: directory to scan, given as a ``pathlib.Path`` or a plain string.

    Returns:
        A new list holding the documents produced by the loader.
    """
    # Path() lets a plain string work as well; as_posix() keeps forward slashes.
    directory = Path(TMP_DIR).as_posix()

    doc_loader = DirectoryLoader(
        directory,
        glob="**/*.docx",
        loader_cls=Docx2txtLoader,
        show_progress=True,
    )
    # Copy into a fresh list so callers never share the loader's own list object.
    return list(doc_loader.load())

In [4]:
# Folder holding the course review .docx files (a relative path).
directory_path = 'course reviews'
TMP_DIR = Path(directory_path)
# Load every .docx file found under that folder.
documents = langchain_document_loader(TMP_DIR)


100%|██████████| 295/295 [00:05<00:00, 54.36it/s]


In [5]:
# Show the first loaded document, then how many documents were loaded in total.
print(documents[0], len(documents), sep="\n")

page_content='Year of study: Junior\n\nEcon 100\nEcon 100 is one of the most fun, intuitive course I took. It gives insights into the world if economics without overwhelming the students. The course delved into some basic Economic models, their applications. The graded instruments were nicely segmented with a well defined outline. The instructor, though some times can feel very standoffish, is no doubt a great instructor if not great human.\n\nGpa: Not yet completed one.' metadata={'source': 'course reviews\\Student_10_Course_100.docx'}
295


In [6]:
def select_embedding_model():
    """Return an OllamaEmbeddings instance configured for the 'nomic-embed-text' model."""
    return OllamaEmbeddings(model='nomic-embed-text')

# Embedding model instance reused by the vector store and the similarity demos below.
embeddings_nomic = select_embedding_model()

In [7]:
def create_vectorstore(embeddings,documents,vectorstore_name):
    """Build a Chroma vector database from the given documents.

    The sub-folder of LOCAL_VECTOR_STORE_DIR named `vectorstore_name` is
    passed to Chroma as its persist directory. Returns the created store.
    """
    target_dir = "/".join([LOCAL_VECTOR_STORE_DIR.as_posix(), vectorstore_name])
    return Chroma.from_documents(
        persist_directory=target_dir,
        embedding=embeddings,
        documents=documents,
    )

In [8]:

sentences = [
    "I like pets.",
    "Pets bring joy to our lives.",
    "Langchain is a framework for developing applications powered by LLMs.",
]

# 1. Calculate embedding vectors (one embed_query call per sentence)
embedding_vectors = list(map(embeddings_nomic.embed_query, sentences))

# 2. Calculate the similarity of every unordered sentence pair using numpy's dot product.
#    This is the raw dot product: the vectors are not normalized first.
for combination in combinations(range(len(sentences)), 2):
    left_index, right_index = combination
    raw_score = np.dot(embedding_vectors[left_index], embedding_vectors[right_index])
    dot_prodduct = round(raw_score, 3)
    print(f"Similarty of sentences {combination}: {dot_prodduct}")

Similarty of sentences (0, 1): 331.425
Similarty of sentences (0, 2): 120.974
Similarty of sentences (1, 2): 170.213


In [9]:
# Switch guarding the (re)creation of the vector store from `documents`.
# NOTE(review): the same persist directory is reused on every run, so re-running
# with True may add the documents again rather than replace them — confirm.
create_vectorstores = False # change to True to create vectorstores

if create_vectorstores:
    vector_store_nomic = create_vectorstore(embeddings_nomic,documents,"vector_store_nomic")
    print("Vector store created")
    print("")

In [10]:
# Open the vector store kept under LOCAL_VECTOR_STORE_DIR and report its size.
nomic_store_path = "/".join([LOCAL_VECTOR_STORE_DIR.as_posix(), "vector_store_nomic"])
vector_store_nomic = Chroma(
    embedding_function=embeddings_nomic,
    persist_directory=nomic_store_path,
)
chunk_count = vector_store_nomic._collection.count()
print("vector_store_Ollama:", chunk_count, "chunks.")

vector_store_Ollama: 429 chunks.


In [11]:
def print_documents(docs,search_with_score=False):
    """Print the retrieved documents, separated by a line of 100 dashes.

    Parameters:
        docs: when `search_with_score` is falsy, a sequence of objects exposing
            `page_content` (as returned by similarity_search or
            max_marginal_relevance_search); otherwise a sequence whose items hold
            the document first and its score last (as returned by
            similarity_search_with_score).
        search_with_score: when truthy, each entry's score (rounded to 3 decimals)
            is printed after its text.
    """
    separator = "\n" + "-" * 100 + "\n"
    rendered = []
    for position, entry in enumerate(docs, start=1):
        header = f"Document {position}:\n\n"
        if search_with_score:
            # entry is (document, score)
            score_text = str(round(entry[-1], 3))
            rendered.append(header + entry[0].page_content + "\n\nscore:" + score_text + "\n")
        else:
            rendered.append(header + entry.page_content)
    print(separator.join(rendered))

In [12]:
# Get most similar documents - with scores
# The score returned here is a distance, so a lower score is better.
# NOTE(review): the scores printed below are in the hundreds, so this is not a cosine
# distance; it looks like Chroma's default squared-L2 distance computed on
# unnormalized embeddings — confirm against the collection's settings.

query = 'What are some difficult CS courses?'
docs_withScores = vector_store_nomic.similarity_search_with_score(query,k=4)

print_documents(docs_withScores,search_with_score=True)

Document 1:

Year of study: Junior

Review for 200 level course:

CS 202
Data Structures with Sir Ihsan was a challenging yet fun course. I learned a lot and the course challenges your ability to think and rationalize. Learning wise, this course is great. The outline is well defined and you already know the quiz schedule before the semester so that helps you set your schedule before hand. There is no midterm either which helps during the midweek by lessening the burden. The assignments are comparatively easier but still challenging enough. 

Gpa: 3.60-4.00

score:375.154

----------------------------------------------------------------------------------------------------
Document 2:

Year of study: Junior

Review for 200 level course:

CS 202
Data Structures with Sir Ihsan was a challenging yet fun course. I learned a lot and the course challenges your ability to think and rationalize. Learning wise, this course is great. The outline is well defined and you already know the quiz schedu

In [13]:
# Embed the query and the text of each document returned by the scored search.
query_embeddings = embeddings_nomic.embed_query(query)
retrieved_texts = [scored_doc[0].page_content for scored_doc in docs_withScores]
docs_embeddings = embeddings_nomic.embed_documents(retrieved_texts)

# Raw dot product of the query vector with each document vector.
for i, doc_embedding in enumerate(docs_embeddings):
    dot_product = round(np.dot(query_embeddings, doc_embedding), 4)
    print(f"Similarty of document_{i} to the query: {dot_product}")

Similarty of document_0 to the query: 190.1584
Similarty of document_1 to the query: 190.1584
Similarty of document_2 to the query: 209.8965
Similarty of document_3 to the query: 206.0805


# Retriever

In [14]:
def Vectorstore_backed_retriever(vectorstore,search_type="similarity",k=4,score_threshold=None):
    """Create a vectorstore-backed retriever.

    Parameters:
        vectorstore: the store whose `as_retriever` method builds the retriever.
        search_type: type of search the retriever should perform. Can be
            "similarity" (default), "mmr", or "similarity_score_threshold".
        k: number of documents to return (default: 4); left out of the search
            arguments when None.
        score_threshold: minimum relevance threshold for
            "similarity_score_threshold" (default: None); left out when None.

    Returns:
        Whatever `vectorstore.as_retriever` returns for those settings.
    """
    candidate_kwargs = {"k": k, "score_threshold": score_threshold}
    # Forward only the options that were actually provided.
    search_kwargs = {
        option: value for option, value in candidate_kwargs.items() if value is not None
    }
    return vectorstore.as_retriever(search_type=search_type, search_kwargs=search_kwargs)

In [15]:
# Similarity search: retriever returning the 4 most similar documents
retriever = Vectorstore_backed_retriever(vector_store_nomic,search_type="similarity",k=4)

# Get relevant documents
# (this rebinds `query`, replacing the question used in the scored search above)

query = 'What are some intellectually challenging and stimulating courses?'
relevant_docs = retriever.get_relevant_documents(query)

print_documents(relevant_docs)

Document 1:

Year of study: Junior

Review for 200 level course:

CS 202
Data Structures with Sir Ihsan was a challenging yet fun course. I learned a lot and the course challenges your ability to think and rationalize. Learning wise, this course is great. The outline is well defined and you already know the quiz schedule before the semester so that helps you set your schedule before hand. There is no midterm either which helps during the midweek by lessening the burden. The assignments are comparatively easier but still challenging enough. 

Gpa: 3.60-4.00
----------------------------------------------------------------------------------------------------
Document 2:

Year of study: Junior

Review for 200 level course:

CS 202
Data Structures with Sir Ihsan was a challenging yet fun course. I learned a lot and the course challenges your ability to think and rationalize. Learning wise, this course is great. The outline is well defined and you already know the quiz schedule before the se

# Instantiating LLM

In [16]:
def instantiate_LLM(api_key,temperature=0.5,top_p=0.95,model_name=None):
    """Instantiate a Hugging Face Hub LLM in Langchain.

    Parameters:
        api_key (str): huggingfacehub_api_token used to authenticate.
        temperature (float): sampling temperature; default = 0.5
        top_p (float): nucleus-sampling probability mass; default = 0.95
        model_name (str): Hugging Face repo id of the model to use, e.g.
            "meta-llama/Meta-Llama-3-8B-Instruct". When omitted (None or empty),
            "mistralai/Mistral-7B-Instruct-v0.2" is used.

    Returns:
        The configured HuggingFaceHub instance.
    """
    # Honour model_name when given; it used to be accepted but silently ignored.
    repo_id = model_name or "mistralai/Mistral-7B-Instruct-v0.2"

    llm = HuggingFaceHub(
        repo_id=repo_id,
        huggingfacehub_api_token=api_key,
        model_kwargs={
            "temperature": temperature,
            "top_p": top_p,
            "do_sample": True,
            "max_new_tokens": 1024,
        },
    )
    return llm

# Read the Hugging Face token from the environment (load_dotenv at the top of the
# notebook makes .env entries available); os.getenv returns None if it is not set.
HUGGINGFACE_API_KEY = os.getenv("HUGGING_FACE_API_KEY")
llm = instantiate_LLM(api_key=HUGGINGFACE_API_KEY)



  warn_deprecated(


# Memory Initialization

In [17]:
def create_memory():
    """Create the ConversationBufferWindowMemory used by the chatbot.

    The memory is configured with memory_key "history", input_key "question",
    return_messages=True and k=2.
    """
    window_settings = dict(
        memory_key="history",
        input_key="question",
        return_messages=True,
        k=2,
    )
    return ConversationBufferWindowMemory(**window_settings)

# Conversation memory instance; it is seeded and handed to the QA chain further down.
memory = create_memory()

# Creating the Context

In [18]:
# Seed the conversation memory with one example exchange.
seed_question = {"question": "What can you do?"}
seed_answer = {
    "output": "I can answer queries based on the past reviews and course outlines of various courses offered at LUMS."
}
memory.save_context(seed_question, seed_answer)

# Prompt Template

In [19]:
context_qa = """
You are a professional chatbot assistant for helping students at LUMS regarding course selection.

Please follow the following rules:

1. Answer the question in your own words from the context given to you.
2. If you don't know the answer, don't try to make up an answer.
3. If you don't have a course's review or outline, just say that you do not know about this course.
4. If a user enters a course code (e.g. ECON100 or CS370), match it with reviews with that course code. If the user enters a course name (e.g. Introduction to Economics or Database Systems), match it with reviews with that course name.
5. If the user is not asking about a course, ignore the context and answer the question based on your general knowledge.

Context: {context}

You are having a converation with a student at LUMS.

Chat History: {history}

Human: {question}

Assistant123:
"""

prompt = PromptTemplate(
    input_variables=["history", "context", "question"],
    template=context_qa  
)

# Putting it all together

In [20]:
# The custom prompt and the conversation memory are handed over through chain_type_kwargs.
qa_chain_kwargs = {"prompt": prompt, "memory": memory}

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs=qa_chain_kwargs,
    return_source_documents=False,
    verbose=False,
)


In [22]:
# Console chat loop: keeps asking for questions until the user types 'exit'.
while True:
    # Strip surrounding whitespace so inputs such as " exit " still quit.
    user_input = input("Enter your question here (or type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        print("Exiting ChatBot. Goodbye!")
        break
    if not user_input:
        # Nothing was typed: ask again instead of sending an empty query to the LLM.
        continue
    print("  .  .  .   ")

    result = qa({'query': user_input})
    answer = result['result']

    # only keep the part followed by 'Assistant123:' (the marker that ends the prompt)
    answer = answer.split('Assistant123:')[-1]
    print("Me:", user_input)
    print("Chatbot:", answer)



  .  .  .   


  warn_deprecated(


Me: what are good courses that are math extensive?
Chatbot: 
Based on the past reviews, courses that are math-extensive at LUMS include Introduction to Analysis I (MATH 205). Students who have taken this course have reported learning rigorous proofs behind calculus theory, covering sequences and series, continuity, and differentiability of functions. The course had quizzes, assignments, and both mid and final exams. The course instructor, Dr. Waqas, was praised for teaching effectively and providing notes on his website. If you enjoy proof-writing and want to build up your skills in rigorous mathematics, this course could be a good fit for you. It's also the basis for most advanced mathematics courses.
Exiting ChatBot. Goodbye!
