In [1]:
!pip install transformers sentence-transformers torch faiss-cpu nltk --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import os
import pickle
import nltk
from nltk.corpus import stopwords
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

In [8]:

# Télécharger les stopwords si ce n'est pas déjà fait
nltk.download('stopwords')

# Liste des stopwords
stop_words = set(stopwords.words('english'))

# Charger les documents depuis pickle
PICKLE_PATH = '/content/drive/MyDrive/RAG_Project/models/all_documents.pkl'
if not os.path.exists(PICKLE_PATH):
    print(f"❌ Fichier pickle non trouvé à: {PICKLE_PATH}")
    raise FileNotFoundError(f"Le fichier {PICKLE_PATH} est introuvable.")

with open(PICKLE_PATH, 'rb') as f:
    all_documents = pickle.load(f)
print(f"✅ all_documents chargé : {len(all_documents)} documents")

# Fonction de prétraitement

✅ all_documents chargé : 18576 documents


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def preprocess_text(text):
    """
    Prétraite le texte :
    - Mise en minuscules
    - Suppression des caractères spéciaux
    - Suppression des mots vides (stopwords)
    """
    # Mise en minuscule
    text = text.lower()

    # Retirer les caractères non alphanumériques
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenisation
    tokens = text.split()

    # Retirer les stopwords
    tokens = [t for t in tokens if t not in stop_words]

    return ' '.join(tokens)

In [10]:

# Prétraitement des documents
processed_texts = [preprocess_text(t) for t in all_documents]
print("✅ Prétraitement terminé (extraits) :")
for t in processed_texts[:3]:
    print('-', t[:100])  # Affiche les 100 premiers caractères

✅ Prétraitement terminé (extraits) :
- id name aerocaribbean airlines type nan country cuba role nan dob nan pob nan nationality nan passpo
- id name anglocaribbean co ltd type nan country cuba role nan dob nan pob nan nationality nan passpor
- id name banco nacional de cuba type nan country cuba role nan dob nan pob nan nationality nan passpo


In [11]:
# Charger le modèle Sentence-BERT (ici, on utilise 'paraphrase-MiniLM-L6-v2', mais tu peux essayer d'autres modèles)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Appliquer SBERT sur les documents prétraités
embeddings = model.encode(processed_texts, convert_to_tensor=True)
print(f"✅ Embeddings générés : {embeddings.shape}")

✅ Embeddings générés : torch.Size([18576, 384])


In [12]:
# Sauvegarder les embeddings
EMBEDDINGS_PATH = '/content/drive/MyDrive/RAG_Project/models/sbert_embeddings.pt'
torch.save(embeddings, EMBEDDINGS_PATH)
print(f"✅ Embeddings sauvegardés sous {EMBEDDINGS_PATH}")

✅ Embeddings sauvegardés sous /content/drive/MyDrive/RAG_Project/models/sbert_embeddings.pt


In [13]:
# Créer et sauvegarder l'index FAISS
embeddings_dense = embeddings.cpu().numpy().astype(np.float32)
dimension = embeddings_dense.shape[1]  # Nombre de dimensions des embeddings
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_dense)
INDEX_PATH = '/content/drive/MyDrive/RAG_Project/models/faiss_index.bin'
faiss.write_index(index, INDEX_PATH)
print(f"✅ Index FAISS sauvegardé sous {INDEX_PATH}")

✅ Index FAISS sauvegardé sous /content/drive/MyDrive/RAG_Project/models/faiss_index.bin


In [14]:
# Prétraiter et encoder la requête
query = "in which jurisdictions is the digital asset/cryptocurrency operating?"
processed_query = preprocess_text(query)
query_embedding = model.encode([processed_query]).astype(np.float32)

In [15]:
# Recherche dans l'index FAISS
k = 5  # Nombre de documents similaires à récupérer
distances, indices = index.search(query_embedding, k)
print(f"✅ {k} documents les plus similaires à la requête '{query}':")
for i in range(k):
    print(f"Document {i + 1}: Index {indices[0][i]}, Similarité : {distances[0][i]}")
    print("Extrait du document :")
    print(all_documents[indices[0][i]][:200])  # Affiche les 200 premiers caractères

✅ 5 documents les plus similaires à la requête 'in which jurisdictions is the digital asset/cryptocurrency operating?':
Document 1: Index 18211, Similarité : 31.821788787841797
Extrait du document :
ulatory
approaches between jurisdictions.
Digital assets continue to have a strong presence in Most are bringing forward bespoke regulation
the global economy, as evidenced by a significant (48%) beca
Document 2: Index 18334, Similarité : 32.97077941894531
Extrait du document :
id emergence of digital assets, underpinned by technological advancements such as blockchain,
distributed ledger technology (DLT), and smart contracts, has triggered a paradigm shift in the global
fin
Document 3: Index 18129, Similarité : 33.69208526611328
Extrait du document :
ryptographic keys are necessary to access
the assets and sign transactions to initiate the assets transfer;
c) compatibility: digital assets may, with the exception of artificial restrictions, freely

Document 4: Index 18096, Similarité : 34.3

In [17]:

# Préparer le contexte pour la génération de la réponse
context = "\n".join([f"Document {i + 1}: {all_documents[indices[0][i]][:500]}" for i in range(k)])

# Charger le modèle BART et son tokenizer
model_gen = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
tokenizer_gen = BartTokenizer.from_pretrained("facebook/bart-large")

# Préparer l'input pour le modèle BART
input_text = f"Question: {query}\nContext: {context}\nAnswer:"

# Tokenizer l'input
inputs = tokenizer_gen(input_text, return_tensors="pt", max_length=1024, truncation=True)

# Générer la réponse
with torch.no_grad():
    outputs = model_gen.generate(inputs['input_ids'], max_length=150, num_beams=4, early_stopping=True)

# Décoder la réponse générée
generated_answer = tokenizer_gen.decode(outputs[0], skip_special_tokens=True)

# Afficher la réponse générée
print("Generated Answer:", generated_answer)


Generated Answer: Question: in which jurisdictions is the digital asset/cryptocurrency operating?SourceFileContext: Document 1: ulatory fixmeapproaches between jurisdictions. Jurisdictions that                of cryptocurrencies varies significantly by country.2. have established or are developing a tailored fixmeAccording to an analysis by the Atlantic Council, regulatory f fixmeDocument 2: id emergence of digital assets, underpinned by technological advancements such as blockchain, PsyNetdistributed ledger technology (DLT), and smart contracts, has triggered a paradigm shift in the global fixmefinancialecosystem. Thesedigitalassets,whichencompasscryptocurrencies,tokenizedsecurities,stablecoins, PsyNetnon-fungible tokens (NFTs), and central bank digital currencies
