In [1]:
import os

import pandas as pd
from langchain_community.vectorstores.faiss import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from mistralai import Mistral

  from .autonotebook import tqdm as notebook_tqdm
2025-10-28 08:25:10.992379: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="squad_2.0/train.csv")


In [3]:
# Convert every DataFrame row into a LangChain Document: title, context and
# answers form the text, while the row id and question go into the metadata.
documents = [
    Document(
        page_content=f"Title: {row['title']}\nContext: {row['context']}\nAnswer: {row['answers']}",
        metadata={"id": row['id'], "question": row['question']},
    )
    for _, row in df.iterrows()
]


In [4]:
# Split the documents into chunks of at most ~1000 characters where needed.
# RecursiveCharacterTextSplitter falls back through progressively finer
# separators ("\n\n", "\n", " ", ""), so an over-long text really is cut.
# CharacterTextSplitter only splits on its default "\n\n" separator; the page
# content joins its fields with single newlines, so oversized texts were
# emitted whole and the splitting step was effectively a no-op.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)


In [5]:

# Initialise the embedding model (HuggingFace, free to use).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)



In [8]:
# Create the FAISS vector store, reusing the on-disk index when one exists.
# Embedding every chunk is the expensive step of this notebook, so the index
# is built once, saved, and simply reloaded on later runs. Delete the index
# directory to force a rebuild after changing the data, the splitter or the
# embedding model.
FAISS_INDEX_DIR = "faiss_index"

if os.path.isdir(FAISS_INDEX_DIR):
    # NOTE(security): load_local unpickles the stored docstore, which is why
    # the explicit opt-in flag is required. Only load an index directory that
    # this notebook (or another trusted source) produced.
    vectorstore = FAISS.load_local(
        FAISS_INDEX_DIR,
        embeddings,
        allow_dangerous_deserialization=True,
    )
else:
    vectorstore = FAISS.from_documents(docs, embeddings)
    # Persist the freshly built index so a later run can load it from disk.
    vectorstore.save_local(FAISS_INDEX_DIR)

In [None]:
# vectorstore.save_local("faiss_index")

In [8]:
# Restore the persisted FAISS index from the "faiss_index" directory.
# NOTE(security): this deserializes pickled data, hence the explicit opt-in
# flag. Only load an index directory that comes from a trusted source.
vectorstore = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)


In [19]:
import os
from mistralai import Mistral
from dotenv import load_dotenv
# Load environment variables (via python-dotenv) before reading any of them.
load_dotenv()

# Chat model to query; overridable through the MISTRAL_MODEL_NAME variable.
MISTRAL_MODEL_NAME = os.environ.get("MISTRAL_MODEL_NAME", "mistral-tiny-2407")

# --- RAG function ---
def rag_query(query, k=5, verbose=True):
    """Answer ``query`` with Mistral, using retrieved chunks as context.

    The ``k`` chunks closest to ``query`` are fetched from the module-level
    ``vectorstore``, concatenated, and sent together with the question to the
    chat model named by ``MISTRAL_MODEL_NAME``.

    Args:
        query: Question text; used both for the similarity search and in the
            prompt sent to the model.
        k: Number of chunks to retrieve from the vector store.
        verbose: When true (the default), print the retrieved context before
            calling the model. Pass ``False`` to keep the output quiet.

    Returns:
        The response object returned by ``mistral.chat.complete``.

    Raises:
        ValueError: If ``MISTRAL_API_KEY`` is unset, empty or whitespace-only.
    """
    # Fail fast, before any retrieval work, when no usable API key is set.
    # The key is stripped so that the value that was validated is also the
    # value sent to the API (a padded key would otherwise pass the check and
    # then be transmitted verbatim).
    api_key = (os.getenv("MISTRAL_API_KEY") or "").strip()
    if not api_key:
        raise ValueError("MISTRAL_API_KEY is not set or is empty. Please set it in your environment variables.")

    # Retrieve the chunks closest to the query.
    results = vectorstore.similarity_search(query, k=k)

    # Concatenate the chunk contents into a single context string for Mistral.
    context = "\n\n".join(doc.page_content for doc in results)
    if verbose:
        print(context)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
    ]

    # Get the answer from Mistral; the client is scoped to this single request.
    with Mistral(api_key=api_key) as mistral:
        response = mistral.chat.complete(
            model=MISTRAL_MODEL_NAME,
            messages=messages,
            stream=False
        )

    return response

# Example query
try:
    query = "What photochromes are less stable than phyA?"
    answer = rag_query(query)
    # Show the generated text rather than the repr of the whole response
    # object (which also carries ids, usage counters and other metadata).
    print(answer.choices[0].message.content)
except ValueError as e:
    # rag_query raises ValueError when the API key is missing or blank.
    print(f"Error: {e}")
    print("Please set your MISTRAL_API_KEY environment variable:")
    print("export MISTRAL_API_KEY='your-api-key-here'")


Title: Green
Context: Green is common in nature, as many plants are green because of a complex chemical known as chlorophyll, which is involved in photosynthesis. Chlorophyll absorbs the long wavelengths of light (red) and short wavelengths of light (blue) much more efficiently than the wavelengths that appear green to the human eye, so light reflected by plants is enriched in green. Chlorophyll absorbs green light poorly because it first arose in organisms living in oceans where purple halobacteria were already exploiting photosynthesis. Their purple color arose because they extracted energy in the green portion of the spectrum using bacteriorhodopsin. The new organisms that then later came to dominate the extraction of light were selected to exploit those portions of the spectrum not used by the halobacteria.
Answer: {'text': array(['organisms living in oceans'], dtype=object), 'answer_start': array([430], dtype=int32)}

Title: Green
Context: Green is common in nature, as many plants