In [1]:
# --- 1. Installer les librairies nécessaires
!pip install nltk sentence-transformers faiss-cpu pdfplumber PyMuPDF --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m704.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install pdfplumber --quiet


In [3]:
# --- 2. Importer les librairies Python
import os
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pdfplumber
import fitz

In [4]:
# --- 2. Télécharger les ressources NLTK
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# --- 3. Charger `all_documents` pré-ingérés depuis 1_data_ingestion.ipynb
import os
import pickle
PICKLE_PATH = '/content/drive/MyDrive/RAG_Project/models/all_documents.pkl'
# Vérifier que le fichier pickle existe
if not os.path.exists(PICKLE_PATH):
    print(f"❌ Fichier pickle non trouvé à: {PICKLE_PATH}")
    print("Contenu du dossier /content/drive/MyDrive/RAG_Project/models/ :", os.listdir('/content/drive/MyDrive/RAG_Project/models/'))
    raise FileNotFoundError(f"Le fichier {PICKLE_PATH} est introuvable.")
with open(PICKLE_PATH, 'rb') as f:
    all_documents = pickle.load(f)
print(f"✅ all_documents chargé : {len(all_documents)} documents")


✅ all_documents chargé : 18576 documents


In [7]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


In [8]:
# --- 4. Prétraitement des textes
import re

def preprocess_text(text):
    """
    Fonction de prétraitement :
    - Met en minuscules
    - Retire ponctuation et caractères spéciaux
    - Retire les chiffres isolés
    - Retire les stopwords anglais
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Garde seulement lettres et espaces
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)

# Appliquer sur tous les documents
processed_texts = [preprocess_text(t) for t in all_documents]

print("✅ Prétraitement terminé (extraits) :")
for t in processed_texts[:3]:
    print('-', t[:300])


✅ Prétraitement terminé (extraits) :
- id name aerocaribbean airlines type nan country cuba role nan dob nan pob nan nationality nan passport nan national id nan sanctions risk nan additional information nan yeardob nan country name cuba sanctions nan sanction program nan
- id name anglocaribbean co ltd type nan country cuba role nan dob nan pob nan nationality nan passport nan national id nan sanctions risk nan additional information nan yeardob nan country name cuba sanctions nan sanction program nan
- id name banco nacional de cuba type nan country cuba role nan dob nan pob nan nationality nan passport nan national id nan sanctions risk nan additional information aka bnc yeardob nan country name cuba sanctions nan sanction program nan


In [9]:
# --- 5. Embedding avec TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

print("\n✨ Vectorisation TF-IDF en cours...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
embeddings = tfidf_vectorizer.fit_transform(processed_texts)

print(f"✅ TF-IDF terminé : matrice de taille {embeddings.shape}")



✨ Vectorisation TF-IDF en cours...
✅ TF-IDF terminé : matrice de taille (18576, 5000)


In [10]:
# Définir les chemins
EMBEDDINGS_PATH = '/content/drive/MyDrive/RAG_Project/models/embeddings.npz'
VECTORIZER_PATH = '/content/drive/MyDrive/RAG_Project/models/tfidf_vectorizer.pkl'

print("\n✨ Sauvegarde des embeddings et du vectorizer...")


✨ Sauvegarde des embeddings et du vectorizer...


In [11]:
from scipy import sparse

# Sauvegarder les embeddings sous forme de matrice creuse
sparse.save_npz(EMBEDDINGS_PATH, embeddings)

print(f"✅ Embeddings sauvegardés sous forme de matrice creuse à : {EMBEDDINGS_PATH}")


✅ Embeddings sauvegardés sous forme de matrice creuse à : /content/drive/MyDrive/RAG_Project/models/embeddings.npz


In [24]:
# Sauvegarder le vectorizer
with open(VECTORIZER_PATH, 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

print("✅ Sauvegarde terminée.")

✅ Sauvegarde terminée.


In [12]:
# --- 6. Créer l'index FAISS
INDEX_PATH = '/content/drive/MyDrive/RAG_Project/models/faiss_index.bin'

In [13]:
# Charger les embeddings en format sparse
embeddings_sparse = sparse.load_npz(EMBEDDINGS_PATH)
print(f"✅ Embeddings sparse chargés : shape = {embeddings_sparse.shape}")

✅ Embeddings sparse chargés : shape = (18576, 5000)


In [14]:
# Convertir les embeddings en format dense (FAISS ne supporte pas les matrices creuses)
embeddings_dense = embeddings_sparse.toarray().astype(np.float32)
print(f"✅ Embeddings convertis en dense : shape = {embeddings_dense.shape}")

✅ Embeddings convertis en dense : shape = (18576, 5000)


In [15]:
# Créer l'index FAISS (Flat L2)
dimension = embeddings_dense.shape[1]  # Nombre de dimensions des embeddings
index = faiss.IndexFlatL2(dimension)

In [16]:
# Ajouter les embeddings à l'index FAISS
index.add(embeddings_dense)
print(f"✅ Index FAISS créé avec {index.ntotal} vecteurs")

✅ Index FAISS créé avec 18576 vecteurs


In [17]:
# Sauvegarder l'index FAISS
faiss.write_index(index, INDEX_PATH)
print(f"✅ Index FAISS sauvegardé sous {INDEX_PATH}")

✅ Index FAISS sauvegardé sous /content/drive/MyDrive/RAG_Project/models/faiss_index.bin
