## Importation

In [4]:
import streamlit as st
import os
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import base64
from langchain_community.llms import Ollama
# from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import tempfile


### Configuration de Chromadb

In [5]:

# Persistent Chroma client: the vector store is kept on disk under "chromadb-vdb".
chroma_client = chromadb.PersistentClient(path="chromadb-vdb")

# Sentence-transformers model handed to the collection as its embedding function.
embedding_function = SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# Open the "cv_database" collection, creating it on first use.
chroma_vdb = chroma_client.get_or_create_collection(
    embedding_function=embedding_function,
    name="cv_database",
)


  from .autonotebook import tqdm as notebook_tqdm


# Configuration du modèle Ollama

In [4]:

# Ollama endpoint. It can be overridden with the OLLAMA_BASE_URL environment
# variable so the notebook is not tied to a single hard-coded host; when the
# variable is unset, the original server address is used.
llm = Ollama(
    model="llama2",
    base_url=os.environ.get("OLLAMA_BASE_URL", "http://18.130.190.88:11434"),
    timeout=120,  # raised timeout for slow generations
)


  llm = Ollama(


# Configuration du text splitter

In [5]:

# Splitter used to cut extracted document text into overlapping chunks before
# they are stored in the vector database.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,  # chunk length is measured with len(), i.e. in characters
    chunk_overlap=200,    # characters shared between neighbouring chunks
    chunk_size=1000,      # target upper bound for a chunk's length
)

# Template pour le prompt

In [8]:


# Raw prompt text. {context} receives the retrieved chunks and {question} the
# user's question; the instructions tell the model to answer from the context only.
prompt_template = """
Contexte: {context}

Question: {question}

Répondez à la question en utilisant uniquement le contexte fourni.
Si vous ne pouvez pas répondre à la question à partir du contexte, dites-le clairement.

Réponse:
"""

# Output parser that turns the model output into a plain string.
parser = StrOutputParser()

# Prompt object built from the template above.
prompt = ChatPromptTemplate.from_template(prompt_template)

# Processing pipeline: fill the prompt, call the LLM, parse the result to str.
chain = prompt | llm | parser

Traite un document (PDF ou Word) et retourne son contenu texte

In [9]:
def process_document(file):
    """Extract the text content of an uploaded PDF or Word document.

    Args:
        file: Upload-like object exposing ``name`` (the original file name) and
            ``getvalue()`` (the raw bytes of the file).

    Returns:
        str: The document text. For PDFs, the page texts are joined with a
        single space; for Word files, the content of the first loaded document.

    Raises:
        ValueError: If the file name does not end with ``.pdf``, ``.docx`` or
            ``.doc`` (compared case-insensitively).
    """
    supported_suffixes = ('.pdf', '.docx', '.doc')

    # Compare case-insensitively so that e.g. "CV.PDF" is accepted as a PDF.
    lowered_name = file.name.lower()
    suffix = next(
        (ext for ext in supported_suffixes if lowered_name.endswith(ext)),
        None,
    )
    if suffix is None:
        # Reject before reading the upload or touching the disk.
        raise ValueError("Format de fichier non supporté")

    # The loaders work on a file path, so the bytes are written to a temporary
    # file. delete=False keeps it on disk after the `with` block so the loader
    # can reopen it by name; it is removed in the `finally` clause below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(file.getvalue())
        file_path = temp_file.name

    try:
        if suffix == '.pdf':
            pages = PyPDFLoader(file_path).load()
            return ' '.join(page.page_content for page in pages)

        # NOTE(review): ".doc" is routed to the same loader as ".docx", as in
        # the original code — confirm that legacy binary .doc files are
        # actually readable by Docx2txtLoader.
        return Docx2txtLoader(file_path).load()[0].page_content
    finally:
        # Always clean up the temporary copy, even when loading fails.
        os.unlink(file_path)


## Ajoute un document à la base de données vectorielle à partir du texte extrait par `process_document` et de l'identifiant du document

In [10]:

def add_document_to_vectorstore(text, document_id):
    """Split a document's text into chunks and store them in the Chroma collection.

    Each chunk is stored under the id ``"{document_id}-chunk-{i}"`` with the
    metadata ``{"document_id": document_id}``.

    Args:
        text: Full text of the document.
        document_id: Identifier of the document, used to build the chunk ids
            and recorded in each chunk's metadata.

    Returns:
        None
    """
    chunks = text_splitter.split_text(text)
    if not chunks:
        # Nothing to index (e.g. empty text): avoid sending an empty batch
        # to the collection.
        return

    # upsert instead of add: submitting a document whose chunk ids already
    # exist updates those chunks rather than triggering "Add of existing
    # embedding ID" warnings and keeping the stale content.
    chroma_vdb.upsert(
        documents=chunks,
        ids=[f"{document_id}-chunk-{i}" for i in range(len(chunks))],
        metadatas=[{"document_id": document_id} for _ in chunks],
    )


## Recherche sémantique des documents pertinents

In [6]:

def search_documents(query, n_results=3):
    """Query the Chroma collection for the chunks closest to a text query.

    Args:
        query: Query text; it is sent as the single entry of ``query_texts``.
        n_results: Number of results requested from the collection
            (defaults to 3, the value previously hard-coded).

    Returns:
        The result object returned by ``chroma_vdb.query`` for this query.
    """
    return chroma_vdb.query(
        query_texts=[query],
        n_results=n_results,
    )


In [9]:
# Ad-hoc retrieval check against the collection. The query text is kept in its
# own variable so this cell does not rebind `prompt`, which already names the
# ChatPromptTemplate used to build `chain`.
query_text = "donne moi les documents les plus anciennes"
results = chroma_vdb.query(
    query_texts=[query_text],
    n_results=4,
    include=['metadatas', 'distances'],  # only metadata and distances are requested
)

Add of existing embedding ID: Professional Docs & Interview Prep Resume in Black White UConn Brand Style.pdf-chunk-0
Add of existing embedding ID: Professional Docs & Interview Prep Resume in Black White UConn Brand Style.pdf-chunk-1
Add of existing embedding ID: Professional Docs & Interview Prep Resume in Black White UConn Brand Style.pdf-chunk-2
Add of existing embedding ID: Black White Minimalist CV Resume.pdf-chunk-0
Add of existing embedding ID: Black White Minimalist CV Resume.pdf-chunk-1
Add of existing embedding ID: Black White Minimalist CV Resume.pdf-chunk-2
Add of existing embedding ID: Black White Minimalist CV Resume.pdf-chunk-3
Add of existing embedding ID: Moussa THIOR - CV 2.docx-chunk-0
Add of existing embedding ID: Moussa THIOR - CV 2.docx-chunk-1
Add of existing embedding ID: CV_Samba.pdf-chunk-0
Add of existing embedding ID: Grey White Black Minimalist CV Resume.pdf-chunk-0
Add of existing embedding ID: CV_TEST.pdf-chunk-0
Add of existing embedding ID: CV_Gnagna_Sa

In [16]:
# 'uris' is only filled in when it was requested through `include`; otherwise
# fall back to the ids of the matches so the cell still runs on a fresh kernel.
hits = results.get('uris') or results['ids']
for hit in hits[0]:  # index 0: results for the first (and only) query text
    display(hit)

None

None

None

None

In [12]:

def main():
    """Streamlit entry point: upload documents, index them, and answer questions.

    The page has two areas:
      * an upload area, where a PDF/Word file is converted to text with
        ``process_document`` and indexed with ``add_document_to_vectorstore``;
      * a chat area, where the question is used to retrieve chunks with
        ``search_documents`` and the answer is generated by ``chain``.
    """
    st.set_page_config(layout="wide")
    st.title("Assistant documentaire RAG avec Ollama")

    # Document upload area
    st.subheader("Téléchargement de documents")
    uploaded_file = st.file_uploader(
        "Téléchargez un document (PDF ou Word)",
        type=['pdf', 'docx', 'doc']
    )

    # Streamlit re-executes the whole script on every interaction, and the
    # uploader keeps returning the same file. Remember which uploads were
    # already indexed in this session so they are not re-processed each rerun.
    if "indexed_uploads" not in st.session_state:
        st.session_state["indexed_uploads"] = set()
    indexed_uploads = st.session_state["indexed_uploads"]

    if uploaded_file:
        document_id = uploaded_file.name
        upload_key = (uploaded_file.name, uploaded_file.size)

        if upload_key not in indexed_uploads:
            with st.spinner("Traitement du document..."):
                try:
                    text = process_document(uploaded_file)
                    add_document_to_vectorstore(text, document_id)
                    indexed_uploads.add(upload_key)
                except Exception as e:
                    st.error(f"Erreur lors du traitement du document: {str(e)}")

        # Keep showing the confirmation on reruns, without re-indexing.
        if upload_key in indexed_uploads:
            st.success(f"Document '{document_id}' ajouté avec succès!")

    # Chat area
    st.subheader("Zone de discussion")
    user_question = st.text_input("Posez votre question :")

    if user_question:
        with st.spinner("Recherche en cours..."):
            # Retrieve the chunks relevant to the question
            results = search_documents(user_question)

            # `documents` holds one list of hits per query text ([[...]]), so
            # the outer list is truthy even with zero hits: test the inner one.
            documents_per_query = results.get("documents") or [[]]
            matched_chunks = documents_per_query[0]

            if matched_chunks:
                # Build the context from the retrieved chunks
                context = "\n".join(matched_chunks)

                # Generate the answer
                response = chain.invoke({
                    "context": context,
                    "question": user_question
                })

                st.markdown("### Réponse:")
                st.markdown(response)
            else:
                st.warning("Aucun document pertinent trouvé pour votre question.")

if __name__ == "__main__":
    main()

2025-02-08 23:07:16.830 
  command:

    streamlit run /Users/mamadou.sene/Personnel/IRIS_RH/venv/lib/python3.13/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-02-08 23:07:16.834 Session state does not function when running a script without `streamlit run`
