In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0


In [3]:
!pip install mistralai



In [4]:
import faiss
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from huggingface_hub import login
from mistralai import Mistral
import json

In [5]:
# Credentials are read from the Colab secret store rather than hard-coded.
from google.colab import userdata

# Hugging Face access token, used for the Hub login at the end of this cell.
token = userdata.get("HF_TOKEN")

# API key for the Mistral client that is created in a later cell.
MISTRAL_API_KEY = userdata.get("MISTRAL_API_KEY")

# Authenticate this session against the Hugging Face Hub.
login(token=token)

In [6]:
# Load the dataset
dataset = load_dataset("bigscience-data/roots_en_wikivoyage", split="train")

# Small sentence-embedding model (MiniLM)
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Number of documents to index; only a subset is used to keep the example fast.
N_DOCS = 10_000

# Get the texts. Slice the rows first and pick the column afterwards:
# indexing by column name first (`dataset["text"][:N]`) loads every value of
# the column before slicing, while this form only reads the first N rows and
# yields a plain list of strings.
texts = dataset[:N_DOCS]["text"]

# Vectorize. The index is fed float32 vectors; np.asarray converts without an
# extra copy when the encoder already returned a float32 array.
embeddings = model.encode(texts, show_progress_bar=True)
embeddings = np.asarray(embeddings, dtype="float32")

# Create the FAISS index (flat L2), sized to the embedding dimension
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save the index to disk
faiss.write_index(index, "wikivoyage.index")

README.md:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/937 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/149M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24838 [00:00<?, ? examples/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [7]:
# Search: embed the query and fetch the 5 nearest documents from the index
query = "How can I entertain in Amsterdam"
query_vec = model.encode([query]).astype("float32")
distances, indices = index.search(query_vec, k=5)
# FAISS fills missing result slots with -1 when fewer than k neighbours are
# available; skip them so that -1 never silently selects texts[-1].
rag_result = [texts[i] for i in indices[0] if i >= 0]

# Join the retrieved passages into plain text. Interpolating the list itself
# would put its Python repr (brackets, quotes, escaped newlines) in the prompt.
context = "\n\n---\n\n".join(rag_result)

# Client init
client = Mistral(api_key=MISTRAL_API_KEY)
model_llm = "mistral-medium-2505"

# The request below enables JSON mode, so the prompt must explicitly ask for
# JSON output (instruction 10).
system_prompt = """
You are an expert assistant specialized in providing accurate, well-researched answers based on provided context. Your role is to carefully analyze the given information and formulate clear, coherent responses that directly address the user's question.

Instructions:
1. Read the question carefully
2. Read the context provided below carefully
3. Find all information related to cultural characteristics or manners and customs or attractions and sights
4. Identify the most relevant information that relates to the question
5. Provide a comprehensive answer that is directly supported by the context
6. If the context does not contain sufficient information to answer the question, clearly state this
7. Avoid making assumptions or providing information not found in the context
8. Structure your answer clearly with proper formatting if needed
9. Be concise but thorough in your explanation
10. Return the answer as a single valid JSON object
"""

user_prompt = f"""
Context:\n {context}\n
Question: {query}\n
Answer:
"""

resp = client.chat.complete(
    model=model_llm,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    response_format={"type": "json_object"},
)

# Parse the JSON string returned by the model and display it as the cell result.
content = resp.choices[0].message.content
data = json.loads(content)
data

{'Entertainment Options in Amsterdam': {'Cultural Attractions': [{'Name': 'Rijksmuseum',
    'Description': 'The largest and most prestigious museum for art and history in the Netherlands, featuring works by Vermeer, Rembrandt, and other Dutch masters.'},
   {'Name': 'Van Gogh Museum',
    'Description': 'Dedicated to Vincent van Gogh, this museum houses the largest collection of his paintings and drawings.'},
   {'Name': 'Stedelijk Museum',
    'Description': 'A museum of modern art, contemporary art, and design with a rich history dating back to 1874.'},
   {'Name': 'Anne Frank House',
    'Description': 'A museum dedicated to Anne Frank, offering insights into her life and the history of World War II.'},
   {'Name': 'Concertgebouw',
    'Description': 'Famous for its orchestra and acoustics, this concert hall hosts classical music performances and other musical events.'}],
  'Parks and Outdoor Activities': [{'Name': 'Vondelpark',
    'Description': 'The most popular park in Amsterda