# RAG on my own Data

In [11]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_huggingface import HuggingFaceEmbeddings  # Updated
from langchain_community.retrievers import BM25Retriever  # Updated
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain_cohere import CohereRerank  # Updated

In [12]:
from langchain.document_loaders import (
    PyPDFLoader,
    TextLoader,
    UnstructuredMarkdownLoader,
    Docx2txtLoader,
    CSVLoader,
    UnstructuredExcelLoader,
    UnstructuredPowerPointLoader,
    JSONLoader,
    UnstructuredHTMLLoader,
)
import io
from pathlib import Path
import tempfile

In [24]:
!pip install google_auth_oauthlib
!pip install unstructured
!pip install docx2txt
!pip install -U langchain-huggingface
!pip install sentence-transformers
!pip install rank_bm25
!pip install -U langchain-huggingface langchain-community langchain-cohere

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-cohere
  Downloading langchain_cohere-0.4.6-py3-none-any.whl.metadata (6.6 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-text-splitters<2.0.0,>=1.0.0 (from langchain-classic<2.0.0,>=1.0.0->langchain-community)
  Downloading langchain_text_splitters-1.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting cohere<6.0,>=5.18.0 (from langchain-cohere)
  Downloading cohere-5.19.0-py3-none-any.whl.metadata (3.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
INFO: pip is looking at multiple versions of langchain-cohere to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-cohere
  Downloading langchain_cohere-0.4.5-py3-none-any.whl.met

In [13]:
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import pickle

In [14]:
class _TempFileContext:
    """Context manager for temporary files that auto-delete."""
    
    def __init__(self, file_path):
        self.file_path = file_path
    
    def __enter__(self):
        return self.file_path
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        # Delete the temp file
        try:
            Path(self.file_path).unlink(missing_ok=True)
        except:
            pass  # Ignore deletion errors
        return False

In [15]:
# Load environment variables in a file called .env

MODEL = "gpt-4o-mini"
db_name = "rag-perso-db"

load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['CO_API_KEY'] = os.getenv('CO_API_KEY', 'your-key-if-not-using-env')  # Cohere API Key

In [16]:
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

def authenticate_drive():
    """Authenticate and return Google Drive service."""
    creds = None
    
    # Token file stores access and refresh tokens
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    
    # If no valid credentials, authenticate
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                '.credentials.json', SCOPES)
            # This will open a browser for authentication
            creds = flow.run_local_server(port=0)
        
        # Save credentials for future runs
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    
    return build('drive', 'v3', credentials=creds)

service = authenticate_drive()

In [17]:
def list_files(service, query=None, folder_id=None):
    """List files from Google Drive with optional filtering."""
    q = []
    
    if folder_id:
        q.append(f"'{folder_id}' in parents")
    
    if query:
        q.append(query)
    
    # Only get non-trashed files
    q.append("trashed=false")
    
    query_string = " and ".join(q)
    
    results = service.files().list(
        q=query_string,
        pageSize=100,
        fields="files(id, name, mimeType, size)"
    ).execute()
    
    return results.get('files', [])

In [8]:
# ID Erebor - Private = 1SgEZopyErkjjJySNjzT3DdLW2S9bUgxy
list_files(service, folder_id="1SgEZopyErkjjJySNjzT3DdLW2S9bUgxy") 

[{'id': '1ml82VyxfORutmOrqsiTVY1KWDC956GY9',
  'name': 'signed_deloitte_contract',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'id': '1Ckr1AiFhTzLACuSY-5DeWZ60Edpvr9xW',
  'name': 'Certif',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'id': '15DmJW5vWlGnlT_gopjSvoinWW_b8t9HX',
  'name': 'Dividendes & AG & Bilans',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'id': '13rux-o4rIwwa_i8sVWgGSycqW5S1VRGq',
  'name': 'Assurances',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'id': '13LI61sCf8pBcz_aNGHmxQft5qTNenYDX',
  'name': 'IDS',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'id': '1-4J1Vtsv27vUNlFehhHDnjork2btD67W',
  'name': 'Contrats',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'id': '1Pa6pdTukckO5EIydSrk3FqpYJAq1KCu-',
  'name': 'Screencastify',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'id': '1hB_K7wGUGVn3w3biLQY63s-F57IRTnihneVMAiEZtPM',
  'name': 'Dev processes',
  'mimeType': 'application/vnd.google-apps.do

In [18]:
def download_file(service, file_id, file_name):
    """
    Download a regular file from Google Drive to a temporary file.
    
    Returns a context manager that provides the temp file path.
    The file is automatically deleted when the context exits.
    
    Usage:
        with download_file(service, file_id, file_name) as temp_path:
            # Use temp_path here
            loader = SomeLoader(temp_path)
            docs = loader.load()
        # Temp file automatically deleted here
    """
    # Get file extension from name
    extension = Path(file_name).suffix or '.tmp'
    
    request = service.files().get_media(fileId=file_id)
    
    # Create temp file that persists until context exits
    tmp_file = tempfile.NamedTemporaryFile(
        suffix=extension, 
        delete=False,  # We'll delete it manually in context manager
        mode='wb'
    )
    
    try:
        downloader = MediaIoBaseDownload(tmp_file, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
        
        tmp_file.flush()
        tmp_file.close()
        
        # Return context manager
        return _TempFileContext(tmp_file.name)
    except:
        # Clean up on error
        tmp_file.close()
        Path(tmp_file.name).unlink(missing_ok=True)
        raise


def export_google_doc(service, file_id, file_name, mime_type):
    print("coucou")
    """
    Export Google Docs/Sheets/Slides to a temporary file.
    
    Returns a context manager that provides the temp file path.
    The file is automatically deleted when the context exits.
    
    Usage:
        with export_google_doc(service, file_id, file_name, mime_type) as temp_path:
            # Use temp_path here
            loader = SomeLoader(temp_path)
            docs = loader.load()
        # Temp file automatically deleted here
    """
    export_map = {
        'application/vnd.google-apps.document': ('text/plain', '.txt'),
        'application/vnd.google-apps.spreadsheet': ('text/csv', '.csv'),
        'application/vnd.google-apps.presentation': ('text/plain', '.txt'),
    }
    
    if mime_type not in export_map:
        return None, None
    
    export_mime, extension = export_map[mime_type]
    request = service.files().export_media(fileId=file_id, mimeType=export_mime)
    
    # Create temp file
    tmp_file = tempfile.NamedTemporaryFile(
        suffix=extension,
        delete=False,
        mode='wb'
    )
    print(export_mime, extension)
    print(tmp_file.name)
    
    try:
        downloader = MediaIoBaseDownload(tmp_file, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
        
        tmp_file.flush()
        tmp_file.close()
        
        return _TempFileContext(tmp_file.name), export_mime
    except:
        tmp_file.close()
        Path(tmp_file.name).unlink(missing_ok=True)
        raise


In [19]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=[
    "\n\n",      # Paragraphs first
    "\n",        # Then lines
    ". ",        # Then sentences
    " ",         # Then words
    ""           # Then characters
]
)

LOADER_MAPPING = {
    'application/pdf': PyPDFLoader,
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': Docx2txtLoader,
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': UnstructuredExcelLoader,
    'application/vnd.ms-excel': UnstructuredExcelLoader,
    'text/csv': CSVLoader,
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': UnstructuredPowerPointLoader,
    'application/vnd.ms-powerpoint': UnstructuredPowerPointLoader,
    'text/plain': TextLoader,
    'text/markdown': UnstructuredMarkdownLoader,
    'text/html': UnstructuredHTMLLoader
}
file_types = [
    # Documents
    'application/pdf',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',  # .docx
    'application/vnd.google-apps.document',  # Google Docs
    
    # Spreadsheets
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',  # .xlsx
    'application/vnd.google-apps.spreadsheet',  # Google Sheets
    'text/csv',
    
    # Presentations
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',  # .pptx
    'application/vnd.google-apps.presentation',  # Google Slides
    
    # Text
    'text/plain',
    'text/markdown',
]

In [20]:
def get_files(service, folder_id, file_types):
    print("üîç Starting recursive folder scan...")
    query = None
    folders_to_process = [folder_id]
    all_files = []
    type_queries = [f"mimeType='{ft}'" for ft in file_types]
    query = "(" + " or ".join(type_queries) + ")"
    folder_count = 0
    while len(folders_to_process) > 0:
        current_folder = folders_to_process.pop()

        files = list_files(service, query=query, folder_id=current_folder)
        all_files.extend(files)

        print(f"Found {len(files)} files to download")
        subfolders = list_files(service, query="mimeType='application/vnd.google-apps.folder'", folder_id=current_folder)
        for subfolder in subfolders:
            folders_to_process.append(subfolder['id'])
        folder_count += 1
        print(f"  üìÅ Folder {folder_count}: {len(files)} files, {len(subfolders)} subfolders (Total: {len(all_files)} files so far)")

    return all_files

def load_with_langchain(file_info):
    """Load a file using appropriate LangChain loader."""
    file_path = file_info['path']
    mime_type = file_info['mime_type']
    
    # Map MIME type to loader
    loader_class = LOADER_MAPPING.get(mime_type)
    
    if loader_class:
        try:
            loader = loader_class(file_path)
            return loader.load()
        except Exception as e:
            print(f"  ‚ö†Ô∏è  Error loading {file_info['name']}: {e}")
            # Fallback to TextLoader
            try:
                loader = TextLoader(file_path, encoding='utf-8')
                return loader.load()
            except:
                return None
    else:
        # Try TextLoader as fallback
        try:
            loader = TextLoader(file_path, encoding='utf-8')
            return loader.load()
        except Exception as e:
            print(f"  ‚ö†Ô∏è  No loader for {mime_type}, skipping {file_info['name']}")
            return None

def process_file_content(service, file):
    print(f"  ‚¨áÔ∏è  {file['name']}")
    try:
        mime_type = file['mimeType']
        
        # Handle Google Workspace files (export)
        if mime_type.startswith('application/vnd.google-apps'):
            file_path, mime = export_google_doc(
                service, file['id'], file['name'], 
                mime_type
            )
            mime_type = mime
        # Handle regular files (download)
        else:
            file_path = download_file(service, file['id'], file['name'])
        if file_path:
            content = load_with_langchain({
                'path': file_path.file_path,
                'mime_type': mime_type,
                'name': file['name'],
                'id': file['id']
            })
            for doc in content:
                doc.metadata['source'] = file['name']
                doc.metadata["file_type"] = mime_type
            return content
        return None
            
    except Exception as e:
        print(f"  ‚ùå Error downloading {file['name']}: {e}")
        return None
    



In [21]:
def get_chunks(service, folder_id, text_splitter, file_types):
    files = get_files(service, folder_id, file_types)
    print(files)
    i = 0
    chunks_list = []
    for file in files:
        print(file["name"])
        content = process_file_content(service, file)
        print(content)
        if content:
            chunks = text_splitter.split_documents(content)
            chunks_list.extend(chunks)
            i += 1
    print(f"‚úÖ Downloaded {i} files\n")
    return chunks_list


In [22]:
chunks = get_chunks(service, "1SgEZopyErkjjJySNjzT3DdLW2S9bUgxy", text_splitter, file_types)

üîç Starting recursive folder scan...
Found 16 files to download
  üìÅ Folder 1: 16 files, 17 subfolders (Total: 16 files so far)
Found 8 files to download
  üìÅ Folder 2: 8 files, 0 subfolders (Total: 24 files so far)
Found 3 files to download
  üìÅ Folder 3: 3 files, 0 subfolders (Total: 27 files so far)
Found 6 files to download
  üìÅ Folder 4: 6 files, 0 subfolders (Total: 33 files so far)
Found 0 files to download
  üìÅ Folder 5: 0 files, 0 subfolders (Total: 33 files so far)
Found 11 files to download
  üìÅ Folder 6: 11 files, 0 subfolders (Total: 44 files so far)
Found 0 files to download
  üìÅ Folder 7: 0 files, 5 subfolders (Total: 44 files so far)
Found 0 files to download
  üìÅ Folder 8: 0 files, 0 subfolders (Total: 44 files so far)
Found 10 files to download
  üìÅ Folder 9: 10 files, 0 subfolders (Total: 54 files so far)
Found 0 files to download
  üìÅ Folder 10: 0 files, 0 subfolders (Total: 54 files so far)
Found 0 files to download
  üìÅ Folder 11: 0 files, 

Multiple definitions in dictionary at byte 0x520f5 for key /ExtGState


[Document(metadata={'producer': 'FPDF 1.53', 'creator': 'PyPDF', 'creationdate': 'D:20210208125422', 'source': 'devis_portail.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1', 'file_type': 'application/pdf'}, page_content="TORI PORTAILS\n472, Grand Route\nB1428 - LILLOIS\nTel : +32 (0)027 31 61 55\nWeb : www.toriportails.be\nDevis N¬∞ PT210113301\nDate : 10/01/2021\nVanhee Florian\n21 Kleinwaverstraat\n3040 - HULDENBERG\nInfos Client : T√©l. : +32 (0)474 71 34 00 - eMail : florian@istarii.com\nCommercial : PORTAILS TORI\nD√©signation Prix U. Qte Total H.T.\n    \nBonjour,    \n    \nVous trouverez ci-joint votre devis.    \n    \nNos atouts:    \n- Alu d'origine 100% fran√ßaise    \n- Garantie de 25 ans contre toute vice de fabrication ou d√©formation\nanormale\n   \n- Thermolaquage √† la norme Qualicoat Seaside - Qualit√© bord de mer -\nQualicoat.\n   \nCe Label europ√©en permet de vous garantir 25 ans le laquage de nos\nportails en aluminium et le laquage. Ainsi vous avez l'assur

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)


[Document(metadata={'producer': 'macOS Version 14.5 (assemblage 23F79) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20240615081646Z00'00'", 'moddate': "D:20240615081646Z00'00'", 'source': 'RapportsAGSRL2024-pdf.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1', 'file_type': 'application/pdf'}, page_content="ERIADOR SRL  Bilan au 31 d√©cembre 2023 \n \n¬´ EREBOR ¬ª Soci√©t√© √† Responsabilit√© Limit√©e Avenue Hermann-Debroux 40 1160 Auderghem Taxe sur la Valeur Ajout√©e BE 0778.335.918  RAPPORT DE L‚ÄôADMINISTRATEUR UNIQUE   Mesdames, Messieurs,  Nous avons l‚Äôhonneur de vous soumettre notre rapport sur la gestion exerc√©e au cours de l‚Äôexercice social cl√¥tur√© le 31 d√©cembre 2023.  1. Des comptes annuels :  L‚Äôexercice se cl√¥ture par un b√©n√©fice de 34.271,89 ‚Ç¨. L‚Äôadministrateur unique a d√©cid√© d‚Äôarr√™ter les comptes annuels dans la forme suivante.  a) Du bilan : ACTIF :  \n    PASSIF :  \n    \n31-12-23 31-12-22 Variations\nFrais d'√©tablissement 0,00 

In [23]:
print(len(chunks))

2460


In [24]:

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
# embeddings = HuggingFaceEmbeddings(
#     model_name="intfloat/multilingual-e5-large"
# )

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore

vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory=db_name
)

bm25_retriever = BM25Retriever.from_documents(chunks)

# Add documents in batches
batch_size = 500  # Adjust as needed
for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i+batch_size]
    vectorstore.add_documents(batch)
    print(f"Added batch {i//batch_size + 1}: {len(batch)} documents")

print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Added batch 1: 500 documents
Added batch 2: 500 documents
Added batch 3: 500 documents
Added batch 4: 500 documents
Added batch 5: 460 documents
Vectorstore created with 2460 documents


In [26]:
llm = ChatOpenAI(model=MODEL, temperature=0.7)

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 20}), bm25_retriever],
    weights=[0.7, 0.3]
)

compressor = CohereRerank(model="rerank-multilingual-v3.0")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, 
    base_retriever=ensemble_retriever
)

#retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=compression_retriever, memory=memory)

In [27]:
def chat(question, history):
    result = chain.invoke({"question": question})
    return result["answer"]


view = gr.ChatInterface(
    fn=chat,
    title="RAG Personal Assistant",
    type="messages",
    description="Ask questions about your personal documents stored in Google Drive.",
)
view.launch(inbrowser=True)



* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




In [74]:
collection = vectorstore._collection
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
print(result['metadatas'])
documents = result['documents']

[{'source': '/var/folders/_w/64bprpy559n47rwq3chplx600000gn/T/tmps5yl1ldm.txt'}, {'page': 0, 'source': '/var/folders/_w/64bprpy559n47rwq3chplx600000gn/T/tmpggk78eu9.pdf', 'creator': 'PyPDF', 'creationdate': 'D:20220325151200', 'producer': 'FPDF 1.53', 'total_pages': 3, 'page_label': '1'}, {'page': 0, 'creator': 'PyPDF', 'creationdate': 'D:20220325151200', 'page_label': '1', 'total_pages': 3, 'source': '/var/folders/_w/64bprpy559n47rwq3chplx600000gn/T/tmpggk78eu9.pdf', 'producer': 'FPDF 1.53'}, {'total_pages': 3, 'page': 1, 'page_label': '2', 'creationdate': 'D:20220325151200', 'source': '/var/folders/_w/64bprpy559n47rwq3chplx600000gn/T/tmpggk78eu9.pdf', 'producer': 'FPDF 1.53', 'creator': 'PyPDF'}, {'total_pages': 3, 'creator': 'PyPDF', 'page': 1, 'creationdate': 'D:20220325151200', 'source': '/var/folders/_w/64bprpy559n47rwq3chplx600000gn/T/tmpggk78eu9.pdf', 'producer': 'FPDF 1.53', 'page_label': '2'}, {'page_label': '2', 'producer': 'FPDF 1.53', 'source': '/var/folders/_w/64bprpy559n

In [None]:
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [76]:
from langchain_core.callbacks import StdOutCallbackHandler

llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

retriever = vectorstore.as_retriever()

conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])

query = "Y a t-il une CONVENTION DE RECONNAISSANCE DE DETTE ?"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print("\nAnswer:", answer)



[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
convention sp√©ciale n'a √©t√© conclue doit √™tre apur√©
imm√©diatement et sans mise en demeure. Sur ce solde
seront dus de plein droit des int√©r√™ts calcul√©s au taux
appliqu√© par la banque aux d√©passements non autoris√©s
et fix√© en tenant notamment compte des donn√©es du
march√©. Ce taux d'int√©r√™t est port√© √† la connaissance
5/19
Conditions g√©n√©rales de leasing et de renting
002809 / 420-5069113-94/001/004/004 / KC1342

leasings avec la banque.
Conform√©ment aux dispositions de la Loi portant
organisation d'un Registre des cr√©dits aux entreprises
(ci-apr√®s d√©nomm√©e la 'Loi'), la banque doit
tran