In [None]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
import numpy as np

In [None]:
import torch
import os
import shutil
#from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import plotly.graph_objects as go
from sklearn.manifold import TSNE
from langchain.prompts import ChatPromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import LLMChain

In [None]:
# Price is a factor for our company, so we use a low-cost chat model.
MODEL = "gpt-4o-mini"

# Directory passed to Chroma as its persist_directory.
db_name = "vector_db"

# Path to the folder containing the slide PDFs.
DATA_PATH = "slides_knowledge_base"

# Chunks (carrying their "id" metadata) that are already indexed. Starts empty
# and is replaced by the lists returned from add_to_chroma() / check_data().
# An empty list (rather than "") keeps the type consistent with those lists.
OLD_CHUNK_ID = []


In [None]:
# Read variables from a local .env file; the OpenAI key is expected to live there.
load_dotenv()

# An already-configured key is left as it is. Only when none is set does the
# placeholder below get installed -- it is not a real key and has to be
# replaced by one before any OpenAI call can succeed.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
# Load the PDF documents from the slides folder.
def load_documents():
    """Load the documents from the PDF directory configured in ``DATA_PATH``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(DATA_PATH).load()`` returns.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

In [None]:
# Load the slide PDFs; `documents` is the input handed to split_documents().
documents = load_documents()

In [None]:
# Split the loaded documents into overlapping chunks.
def split_documents(documents: list[Document]):
    """Split ``documents`` with a ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=1500`` and
    ``chunk_overlap=300``, both measured with ``len``; separators are treated
    as plain strings rather than regular expressions.

    Args:
        documents: The documents to split.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    splitter_settings = {
        "chunk_size": 1500,
        "chunk_overlap": 300,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_settings)
    return splitter.split_documents(documents)

In [None]:
# Split the loaded documents into the chunks that get embedded and indexed.
chunks =  split_documents(documents)

In [None]:
# Create the embedding model.
# The chunks are put into a vector store that associates a vector embedding with each chunk.
# Chroma is a popular open-source vector database based on SQLite.

embeddings = OpenAIEmbeddings()



In [None]:
# Tag every chunk with an ID so that later runs can detect new slides.
def calculate_chunk_ids(chunks):
    """Write an ``id`` of the form ``"<source>:<page>:<index>"`` into each chunk.

    ``source`` and ``page`` are read from the chunk's metadata (a missing key
    shows up as ``None`` in the ID). ``index`` counts consecutive chunks that
    share the same source and page, starting at 0, and restarts whenever the
    source/page pair changes from one chunk to the next.

    Example ID: ``"data/monopoly.pdf:6:2"``.

    Args:
        chunks: Objects exposing a mutable ``metadata`` dict.

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on each element.
    """
    previous_key = None
    position = 0

    for doc in chunks:
        meta = doc.metadata
        page_key = f"{meta.get('source')}:{meta.get('page')}"

        # Same page as the previous chunk -> next position; otherwise restart.
        position = position + 1 if page_key == previous_key else 0

        meta["id"] = f"{page_key}:{position}"
        previous_key = page_key

    return chunks

In [None]:
# Build the vector database from the chunks.
def add_to_chroma(chunks: list[Document]):
    """Embed ``chunks`` into the persisted Chroma store.

    Chunk IDs are computed *before* indexing, for two reasons:

    * the ``id`` metadata is then part of what gets stored with each chunk
      (previously it was added to the Python objects only after indexing);
    * the same deterministic IDs are handed to Chroma as record IDs, so
      indexing the same chunks again into an existing ``db_name`` directory
      reuses those IDs instead of storing a second copy under fresh random ones.

    Args:
        chunks: Split documents; each gets an ``id`` entry added to its metadata.

    Returns:
        A ``(vectorstore, chunks_with_ids)`` tuple: the Chroma store and the
        input chunks carrying their ``id`` metadata.
    """
    chunks_with_ids = calculate_chunk_ids(chunks)
    chunk_ids = [chunk.metadata["id"] for chunk in chunks_with_ids]

    vectorstore = Chroma.from_documents(
        documents=chunks_with_ids,
        embedding=embeddings,
        ids=chunk_ids,
        persist_directory=db_name,
    )
    print(f"Vectorstore created with {vectorstore._collection.count()} documents")
    return vectorstore, chunks_with_ids
        


In [None]:
# Build the vector store and keep the ID-tagged chunks as the "already indexed" baseline.
vectorstore , OLD_CHUNK_ID = add_to_chroma(chunks)

In [None]:
# Index only the chunks that are not in the vector store yet.
def check_data(new_chunks_id: list, old_chunks_id: list, store=None):
    """Add chunks whose ``id`` is not among the already-indexed ones.

    Previously, as soon as a single new chunk was detected, *every* chunk in
    ``new_chunks_id`` was added again, duplicating the whole knowledge base.
    Now only the genuinely new chunks are added, keyed by their chunk ``id``.

    Args:
        new_chunks_id: Current chunks, each with ``metadata['id']`` set.
        old_chunks_id: Chunks that are already indexed, each with
            ``metadata['id']`` set. May be empty.
        store: Object providing ``add_documents(documents=..., ids=...)``.
            Defaults to the notebook-level ``vectorstore``.

    Returns:
        ``new_chunks_id`` unchanged, so the caller can use it as the next
        "already indexed" baseline.
    """
    if store is None:
        store = vectorstore

    known_ids = {old_doc.metadata['id'] for old_doc in old_chunks_id}
    new_chunks = [doc for doc in new_chunks_id if doc.metadata['id'] not in known_ids]

    if new_chunks:
        print(f"👉 Adding new documents: {len(new_chunks)}")
        store.add_documents(
            documents=new_chunks,
            ids=[doc.metadata['id'] for doc in new_chunks],
        )
    else:
        print("✅ No new documents to add")

    return new_chunks_id

    

In [None]:
# Re-scan the slides folder and tag the resulting chunks with IDs.
def new_pdf():
    """Reload and re-split the slide PDFs, returning the chunks with ``id`` metadata.

    This function does not create the vector database; it only checks that the
    ``db_name`` directory exists before doing any work.

    Returns:
        The freshly split chunks with ``metadata["id"]`` set, or an empty list
        when the database directory does not exist. (Falling through and
        returning ``None`` made check_data() fail with a TypeError when it
        tried to iterate the result.)
    """
    if not os.path.exists(db_name):
        print("first create your database")
        return []

    return calculate_chunk_ids(split_documents(load_documents()))

In [None]:
# Re-scan the slides folder; the result is the current set of ID-tagged chunks.
getting_new_chunks_ids = new_pdf()

In [None]:
# Check whether the folder contains chunks that are not indexed yet;
# check_data() updates the vector store if it finds any.
new_chunks_id = check_data(getting_new_chunks_ids,OLD_CHUNK_ID)

In [None]:
# Remember the current chunks as the baseline for the next check_data() run.
OLD_CHUNK_ID = new_chunks_id

In [None]:
# Report how many records the vector store holds after the update check.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Get one vector and find how many dimensions it has.
# `collection` is the store's underlying collection (a private attribute); it is
# reused by the visualization cells that follow.
collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Get the embeddings, documents, and metadatas for everything in the collection.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` from the loaded Document objects to the stored
# chunk texts; the plotting code reads it in this form (it slices each entry).
documents = result['documents']
doc_types = [metadata['source'] for metadata in result['metadatas']]

# Named colors, used as long as there are enough of them for every source file.
base_colors = ['blue', 'green', 'red', 'orange', 'yellow', 'grey', 'indigo', 'olive', 'forestgreen', 'pink', 'violet', 'darkred', 'ivory']

# Sort the unique sources so that each one is given the same color on every run;
# iterating a bare set of strings can yield a different order per kernel session.
unique_doc_types = sorted(set(doc_types), key=str)
num_unique_docs = len(unique_doc_types)

if num_unique_docs > len(base_colors):
    # More sources than named colors: spread hues evenly around the color wheel.
    # The values are CSS-style "hsl(...)" strings, which Plotly accepts as marker
    # colors. (The earlier matplotlib colormap produced RGBA float tuples, which
    # Plotly does not accept, and relied on a colormap getter that newer
    # matplotlib releases no longer provide.)
    color_mapping = {
        doc: f"hsl({round(360 * i / num_unique_docs)},70%,50%)"
        for i, doc in enumerate(unique_doc_types)
    }
else:
    color_mapping = dict(zip(unique_doc_types, base_colors))

# One color per stored chunk, following the order of `doc_types`.
colors = [color_mapping[t] for t in doc_types]


In [None]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding).
# scikit-learn requires perplexity < n_samples, so the default of 30 is capped
# for small knowledge bases instead of failing with a ValueError.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(vectors) - 1))
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per chunk, colored by source file.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    # `scene` configures 3D plots only; for a 2D scatter the axis titles are
    # set directly on the layout.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Let's try 3D!
# scikit-learn requires perplexity < n_samples, so the default of 30 is capped
# for small knowledge bases instead of failing with a ValueError.
tsne = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors) - 1))
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot: one marker per chunk, colored by source file.
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Create the chat model; MODEL is the model name set in the configuration cell.
llm = ChatOpenAI(temperature=0.2, model_name=MODEL)

# Set up the conversation memory for the chat (stored under the 'chat_history' key).
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# The retriever is an abstraction over the vector store that is used during RAG;
# search_kwargs asks it for k=25 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})


# Putting it together: build the conversation chain from the chat model, the retriever and the memory.
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory ,verbose=True)

In [None]:
# Try the chain with a sample question and print the answer it returns.
# NOTE: this rebinds `result`, which the visualization cell also assigns.
query = "in bagging we need to train models on which version of dataset?"
result = conversation_chain.invoke({"question":query})
print(result["answer"])

In [None]:
# The same conversation chain can also be served through a Gradio chat UI.
def chat(question, history):
    """Answer ``question`` using the notebook-level ``conversation_chain``.

    Args:
        question: The user's message.
        history: Accepted as the second callback argument but not used; the
            chain is constructed with its own ``memory`` object.

    Returns:
        The ``"answer"`` entry of the chain's result.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [None]:
# Start the Gradio chat interface, using chat() as its callback.
view = gr.ChatInterface(chat).launch()