In [10]:
!pip install faiss-cpu
!pip install mistralai
!pip install datasets
!pip install sentence_transformers
!pip install langchain
!pip install python-dotenv

Collecting python-dotenv
  Using cached python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Using cached python_dotenv-1.2.1-py3-none-any.whl (21 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.2.1


In [1]:
import faiss
import numpy as np
import pickle
import json
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from huggingface_hub import login
from mistralai import Mistral
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

import warnings

# Pull variables from a local .env file (if any) into the process environment
# so that credentials never have to be written into the notebook itself.
load_dotenv()

token = os.getenv('HF_TOKEN')
MISTRAL_API_KEY = os.getenv('MISTRAL_API_KEY')

# Surface missing credentials right away instead of at first use, which for the
# Mistral key would otherwise be after the long-running embedding step.
# A warning (not an exception) keeps workflows that need only one of the two
# services usable.
for _name, _value in (("HF_TOKEN", token), ("MISTRAL_API_KEY", MISTRAL_API_KEY)):
    if not _value:
        warnings.warn(
            f"Environment variable {_name} is not set; "
            "calls that require it will fail or prompt for it."
        )

# Authenticate against the Hugging Face Hub with the token read above.
login(token=token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# Build (or reuse) the retrieval artifacts: text chunks, their embeddings and a
# FAISS index over them.
#
# Embedding the whole corpus is by far the slowest step of this notebook, so the
# artifacts are reused when they already exist on disk and were produced with
# the same settings. Delete the artifact files to force a rebuild.

# --- Settings ---------------------------------------------------------------
DATASET_NAME = "bigscience-data/roots_en_wikivoyage"
EMBEDDING_MODEL_NAME = "paraphrase-MiniLM-L6-v2"  # small sentence-embedding model
INDEX_PATH = "wikivoyage.index"
CHUNKS_PATH = "chunked_texts.pkl"
EMBEDDINGS_PATH = "embeddings.npy"
METADATA_PATH = "metadata.json"

chunk_size = 512
chunk_overlap = 128

# The encoder and splitter are cheap to create and are needed on both paths.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# --- Decide whether the saved artifacts can be reused -------------------------
# The metadata file is written last during a build, so its presence together
# with the other files indicates that a previous build ran to completion.
metadata = None
artifact_paths = (INDEX_PATH, CHUNKS_PATH, EMBEDDINGS_PATH, METADATA_PATH)
if all(os.path.exists(path) for path in artifact_paths):
    with open(METADATA_PATH, "r") as f:
        saved_metadata = json.load(f)
    # Metadata files without an "embedding_model" entry are accepted as-is,
    # because the model name was not always recorded.
    settings_match = (
        saved_metadata.get("chunk_size") == chunk_size
        and saved_metadata.get("chunk_overlap") == chunk_overlap
        and saved_metadata.get("embedding_model", EMBEDDING_MODEL_NAME) == EMBEDDING_MODEL_NAME
    )
    if settings_match:
        metadata = saved_metadata

if metadata is not None:
    # --- Fast path: load what a previous run saved ----------------------------
    # NOTE: `dataset` and `texts` are only defined when a rebuild happens.
    index = faiss.read_index(INDEX_PATH)
    # NOTE: pickle.load can execute arbitrary code; only load files this
    # notebook wrote itself.
    with open(CHUNKS_PATH, "rb") as f:
        chunked_texts = pickle.load(f)
    embeddings = np.load(EMBEDDINGS_PATH)
else:
    # --- Slow path: build everything from the dataset -------------------------
    dataset = load_dataset(DATASET_NAME, split="train")
    texts = dataset["text"]

    # Split every document into overlapping chunks and flatten into one list.
    chunked_texts = [chunk for text in texts for chunk in splitter.split_text(text)]
    if not chunked_texts:
        raise ValueError("The dataset produced no text chunks; nothing to index.")

    # FAISS needs a C-contiguous float32 matrix. `ascontiguousarray` returns its
    # input unchanged when it already has that layout and dtype, which avoids
    # duplicating the (large) embedding matrix in memory.
    embeddings = np.ascontiguousarray(
        model.encode(chunked_texts, show_progress_bar=True), dtype="float32"
    )

    # Exact (brute-force) L2 index over the chunk embeddings.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Persist the index, the chunks and the raw embeddings.
    faiss.write_index(index, INDEX_PATH)
    with open(CHUNKS_PATH, "wb") as f:
        pickle.dump(chunked_texts, f)
    np.save(EMBEDDINGS_PATH, embeddings)

    # Persist the settings last; they double as the "build completed" marker
    # and as the cache key checked above.
    metadata = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "total_chunks": len(chunked_texts),
        "embedding_dim": int(embeddings.shape[1]),
        "embedding_model": EMBEDDING_MODEL_NAME,
    }
    with open(METADATA_PATH, "w") as f:
        json.dump(metadata, f)

dimension = embeddings.shape[1]

Batches: 100%|██████████| 19542/19542 [42:02<00:00,  7.75it/s]  


In [None]:
# Load the retrieval artifacts from disk.
index = faiss.read_index("wikivoyage.index")

# NOTE: pickle.load can execute arbitrary code; only load a file that this
# notebook wrote itself.
with open("chunked_texts.pkl", "rb") as f:
    chunked_texts = pickle.load(f)

embeddings = np.load("embeddings.npy")

with open("metadata.json", "r") as f:
    metadata = json.load(f)

# The search below encodes the query with `model`. Load the encoder here when
# the kernel does not have it yet, so this cell also works after a restart
# without re-running the index build. It must be the same model that was used
# to embed the chunks.
if "model" not in globals():
    model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Index positions are used to look up chunks, so both must describe the same
# set of vectors.
if index.ntotal != len(chunked_texts):
    raise ValueError(
        f"Index holds {index.ntotal} vectors but {len(chunked_texts)} chunks were loaded; "
        "the saved artifacts are out of sync."
    )

# Search: embed the query and retrieve the nearest chunks.
query = "How can I entertain in Amsterdam"
top_k = 5  # number of chunks handed to the LLM as context

query_vec = model.encode([query]).astype("float32")
distances, indices = index.search(query_vec, k=top_k)

# FAISS pads the result with label -1 when fewer than `top_k` neighbours exist.
# Indexing the list with -1 would silently return the *last* chunk, so those
# placeholder labels are dropped.
rag_result = [chunked_texts[i] for i in indices[0] if i >= 0]

# Ask the LLM to answer the query using only the retrieved chunks.
client = Mistral(api_key=MISTRAL_API_KEY)
model_llm = "mistral-small-latest"

# JSON mode is requested below, so the prompt has to ask for JSON explicitly
# (instruction 10); otherwise the shape of the returned object is left to chance.
system_prompt = """
You are an expert assistant specialized in providing accurate, well-researched answers based on provided context. Your role is to carefully analyze the given information and formulate clear, coherent responses that directly address the user's question.

Instructions:
1. Read the question carefully
2. Read the context provided below carefully
3. Find all information related to cultural characteristics or manners and customs or attractions and sights
4. Identify the most relevant information that relates to the question
5. Provide a comprehensive answer that is directly supported by the context
6. If the context does not contain sufficient information to answer the question, clearly state this
7. Avoid making assumptions or providing information not found in the context
8. Structure your answer clearly with proper formatting if needed
9. Be concise but thorough in your explanation
10. Respond with a single JSON object of the form {"answer": "<your answer>"}
"""

# Present the retrieved chunks as numbered passages instead of a Python list
# repr (brackets, quotes and escaped newlines), which is harder to read.
context = "\n\n".join(
    f"[{rank}] {chunk}" for rank, chunk in enumerate(rag_result, start=1)
)

user_prompt = f"""
Context:
{context}

Question: {query}

Answer:
"""

resp = client.chat.complete(
    model=model_llm,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    response_format={"type": "json_object"},
)

# Parse the JSON reply and display it as the cell output.
content = resp.choices[0].message.content
data = json.loads(content)
data

: 