In [1]:
pip install faiss-cpu sentence-transformers


Note: you may need to restart the kernel to use updated packages.


    click (>=7.0<=8.1.*)
          ~~~~~~^


In [4]:
import re
import json
import os
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import unicodedata

def clean_text(text):
    """Normalize a raw text field into a single clean line.

    Steps, in order: Unicode NFKC normalization, whitespace collapsing,
    removal of non-printable characters, HTML-tag stripping, removal of runs
    of consecutive ASCII uppercase words, and a final whitespace collapse.

    Args:
        text: Raw value to clean. Anything that is not a ``str`` is treated
            as missing.

    Returns:
        The cleaned string, with single spaces between tokens and no leading
        or trailing whitespace. Returns ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # 1. Unicode normalization (NFKC): composes accents and folds
    #    compatibility characters into their canonical form.
    text = unicodedata.normalize("NFKC", text)

    # Optional (disabled): strip e-mail addresses and phone numbers.
    # text = re.sub(r"\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b", " ", text)
    # text = re.sub(r"\+?\d[\d\s\-]{6,}", " ", text)

    # 2. Turn every run of whitespace (newlines, tabs, ...) into one space.
    #    Done before the printable filter so line breaks become separators
    #    instead of being dropped.
    text = re.sub(r'\s+', ' ', text)

    # 3. Drop characters that str.isprintable() rejects.
    text = ''.join(c for c in text if c.isprintable())

    # 4. Strip HTML-like tags. Replace with a space so that the words on both
    #    sides of a tag (e.g. "a<br>b") are not glued together.
    text = re.sub(r'<[^>]+>', ' ', text)

    # 5. Remove runs of consecutive uppercase words (5+ characters, ASCII
    #    A-Z only). The run must start and end on an uppercase letter so the
    #    surrounding spaces are never consumed; the run is replaced with a
    #    space to keep its neighbours separated.
    text = re.sub(r'\b[A-Z][A-Z\s]{3,}[A-Z]\b', ' ', text)

    # Optional (disabled): lowercase everything.
    # text = text.lower()

    # 6. Steps 4 and 5 may leave consecutive spaces behind; collapse them.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def save_index_and_metadata(index, docs, index_path, metadata_path):
    """Write a FAISS index and its accompanying documents to disk.

    Args:
        index: FAISS index object, written with ``faiss.write_index``.
        docs: JSON-serializable documents stored as the metadata file.
        index_path: Destination path of the index file.
        metadata_path: Destination path of the UTF-8 JSON metadata file.
    """
    faiss.write_index(index, index_path)

    # Keep non-ASCII characters readable and pretty-print with 2 spaces.
    json_options = {"ensure_ascii": False, "indent": 2}
    with open(metadata_path, mode="w", encoding="utf-8") as metadata_file:
        json.dump(docs, metadata_file, **json_options)

def create_faiss_index(input_json, output_folder,
                       model_name='all-MiniLM-L6-v2', batch_size=64):
    """Embed the documents of a JSON file and store them in a FAISS L2 index.

    Each document's ``'text'`` field is cleaned with ``clean_text``; documents
    whose cleaned text is empty are skipped. The remaining documents receive a
    ``'text_clean'`` key and are written to ``metadata.json`` next to
    ``faiss.index`` inside ``output_folder``.

    Args:
        input_json: Path to a UTF-8 JSON file. Assumed to contain a list of
            objects carrying a ``'text'`` key -- TODO confirm against the
            producer of this file.
        output_folder: Directory receiving ``faiss.index`` and
            ``metadata.json``; created if missing.
        model_name: Name passed to ``SentenceTransformer``.
        batch_size: Batch size passed to ``model.encode``.

    Raises:
        ValueError: If no document yields a non-empty cleaned text, since an
            index cannot be sized without at least one embedding.
    """
    with open(input_json, "r", encoding="utf-8") as f:
        docs = json.load(f)

    # Cleaning: texts and cleaned_docs are appended in lockstep so that
    # metadata entry i describes text i.
    texts = []
    cleaned_docs = []
    for doc in docs:
        raw = doc.get('text', '')
        clean = clean_text(raw)
        if clean:
            doc['text_clean'] = clean
            texts.append(clean)
            cleaned_docs.append(doc)

    # Fail early with a clear message: encoding an empty list yields an array
    # without a second dimension, so embeddings.shape[1] would raise an
    # opaque IndexError further down.
    if not texts:
        raise ValueError(
            f"No non-empty 'text' entries found in {input_json}; nothing to index."
        )

    os.makedirs(output_folder, exist_ok=True)
    index_path = os.path.join(output_folder, "faiss.index")
    metadata_path = os.path.join(output_folder, "metadata.json")

    # Embedding
    model = SentenceTransformer(model_name)
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Index dimensionality is the embedding width (second axis).
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    save_index_and_metadata(index, cleaned_docs, index_path, metadata_path)
    print(f"✅ Index et métadonnées enregistrés dans {output_folder}")

# Entry point: index the documents of result.json into the faiss_data folder.
if __name__ == "__main__":
    create_faiss_index(input_json="result.json", output_folder="faiss_data")


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]

✅ Index et métadonnées enregistrés dans faiss_data



