In [5]:
# Install the dependencies used by this notebook cell.
# NOTE(review): google-colab is presumably already provided by the Colab runtime; confirm whether it needs to be listed here.
!pip install python-pptx langchain langchain-community pypdf sentence-transformers faiss-cpu transformers torch google-colab

from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor
from pptx.enum.shapes import MSO_SHAPE
import re
from pathlib import Path
from google.colab import drive
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import numpy as np
import os

# Mount Google Drive at /content/drive so the input and output paths used below resolve.
drive.mount("/content/drive")

# Set the CUDA allocator option intended to limit memory fragmentation.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

# Wrapper that makes SentenceTransformer usable as a LangChain embedding object
class SentenceTransformerEmbeddings:
    """Expose a SentenceTransformer model through ``embed_documents`` / ``embed_query``.

    The instance is also callable; calling it is the same as ``embed_query``.
    Encoding is always requested on the CPU (``device='cpu'``).
    """

    def __init__(self, model_name):
        """Load the SentenceTransformer model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def _encode_on_cpu(self, texts):
        """Encode a list of texts and return the resulting numpy array."""
        return self.model.encode(texts, convert_to_numpy=True, device='cpu')

    def embed_documents(self, texts):
        """Return the embeddings of ``texts`` as nested Python lists."""
        return self._encode_on_cpu(texts).tolist()

    def embed_query(self, text):
        """Return the embedding of a single ``text`` as a Python list."""
        # Encode as a one-element batch and keep its only row.
        only_row = self._encode_on_cpu([text])[0]
        return only_row.tolist()

    def __call__(self, text):
        """Embed a single query text (delegates to ``embed_query``)."""
        return self.embed_query(text)

# RAG initialisation
def initialize_rag():
    """Create the objects the RAG pipeline needs.

    Loads the 'all-MiniLM-L6-v2' embedder, the TinyLlama chat model with its
    tokenizer, and a recursive character splitter (chunks of 500 characters
    with an overlap of 100). The language model is moved to CUDA when
    ``torch.cuda.is_available()`` is true.

    Returns:
        tuple: ``(embedder, model, tokenizer, text_splitter)``.
    """
    llm_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    embedder = SentenceTransformerEmbeddings('all-MiniLM-L6-v2')
    tokenizer = AutoTokenizer.from_pretrained(llm_name)
    llm = AutoModelForCausalLM.from_pretrained(llm_name)
    # Only relocate the model when a CUDA device is reported as available.
    llm = llm.to('cuda') if torch.cuda.is_available() else llm

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    return embedder, llm, tokenizer, splitter

# FAISS index creation
def create_faiss_index(pdf_path, embedder, text_splitter):
    """Load a PDF, split it into chunks and index the chunks in FAISS.

    Args:
        pdf_path: Path of the PDF handed to ``PyPDFLoader``.
        embedder: Embedding object passed to ``FAISS.from_documents``.
        text_splitter: Object whose ``split_documents`` produces the chunks.

    Returns:
        The vector store built by ``FAISS.from_documents``.
    """
    pages = PyPDFLoader(pdf_path).load()
    chunks = text_splitter.split_documents(pages)
    return FAISS.from_documents(chunks, embedder)

# Simple prompt
def build_prompt(query, context):
    """Return the three-line prompt: question, context, then an open ``Answer:`` line."""
    template = "Question: {}\nContext: {}\nAnswer:"
    return template.format(query, context)

# RAG question answering
def get_answer_from_rag(question, vectorstore, model, tokenizer):
    """Answer ``question`` using the two most similar chunks of ``vectorstore``.

    The retrieved chunks are joined with blank lines, wrapped by
    ``build_prompt`` and passed to ``model.generate`` (sampling,
    temperature 0.7, top_p 0.9, at most 100 new tokens).

    Only the newly generated tokens are decoded. The sequence returned by
    ``generate`` starts with the prompt tokens, and recovering the answer by
    splitting the full text on "Answer:" breaks whenever the model itself
    emits "Answer:" in its continuation.

    Args:
        question: Question text, used both for retrieval and in the prompt.
        vectorstore: Object providing ``similarity_search(query, k=...)``.
        model: Causal language model providing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        str: The stripped generated answer.
    """
    docs = vectorstore.similarity_search(question, k=2)
    context = "\n\n".join(doc.page_content for doc in docs)
    full_prompt = build_prompt(question, context)

    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=2048)
    # Follow the model's actual device instead of assuming it is on CUDA
    # whenever CUDA happens to be available.
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )

    # Drop the echoed prompt: keep only the tokens produced after it.
    new_token_ids = generated_ids[0][prompt_length:]
    response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    # Keep the original handling of a literal chat marker in the output.
    if "<|assistant|>" in response:
        response = response.split("<|assistant|>")[-1].strip()

    return response

# Light-blue background helper
def set_slide_background(slide):
    """Apply a light-blue gradient background to ``slide`` and add a blue circle.

    The background fill becomes a gradient at 45 degrees whose first two
    stops are set to two light-blue shades. A 1 x 1 inch oval is then added
    at (8 in, 5 in) with a solid light-blue fill and a blue outline.
    """
    bg_fill = slide.background.fill
    bg_fill.gradient()
    bg_fill.gradient_angle = 45

    stop_colours = (
        RGBColor(173, 216, 230),  # light blue
        RGBColor(135, 206, 235),  # slightly darker, for the gradient
    )
    for position, colour in enumerate(stop_colours):
        bg_fill.gradient_stops[position].color.rgb = colour

    circle = slide.shapes.add_shape(
        MSO_SHAPE.OVAL, Inches(8), Inches(5), Inches(1), Inches(1)
    )
    circle.fill.solid()
    circle.fill.fore_color.rgb = RGBColor(135, 206, 250)  # light-blue circle
    circle.line.color.rgb = RGBColor(135, 206, 235)

# Question extraction
def extract_questions_from_md(md_path, target_pdf, max_questions=9):
    """Collect the questions listed for ``target_pdf`` in a Markdown file.

    Each line is stripped and interpreted as follows:

    * ``## <pdf> – Section <id>: <title>`` sets the current PDF name and the
      current section title.
    * ``### <category>`` sets the current category.
    * ``- <text>`` is kept as a question when the current PDF equals
      ``target_pdf``, the text ends with ``?`` and does not start with
      ``"Here are "``.

    Headings that do not follow the full pattern no longer raise
    ``IndexError``: without `` – Section `` the section becomes ``None``, and
    without ``": "`` the whole text after `` – Section `` is used as section.

    Args:
        md_path: Path of the UTF-8 Markdown file to read.
        target_pdf: PDF name that ``## `` headings are compared against.
        max_questions: The result is sliced to this many leading questions.

    Returns:
        list[dict]: Dicts with the keys ``'pdf'``, ``'section'``,
        ``'category'`` and ``'question'``, in file order.
    """
    questions = []
    current_pdf = None
    current_section = None
    current_category = None

    with open(md_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith('## '):
                heading_parts = line[3:].split(' – Section ')
                current_pdf = heading_parts[0]
                if len(heading_parts) > 1:
                    # Keep what follows the first ": "; fall back to the raw
                    # section text when the heading has no title separator.
                    _, separator, section_title = heading_parts[1].partition(': ')
                    current_section = section_title if separator else heading_parts[1]
                else:
                    current_section = None
            elif line.startswith('### '):
                current_category = line[4:]
            elif line.startswith('- ') and current_pdf == target_pdf:
                question = line[2:].strip()
                if not question.startswith("Here are ") and question.endswith("?"):
                    questions.append({
                        'pdf': current_pdf,
                        'section': current_section,
                        'category': current_category,
                        'question': question
                    })

    questions = questions[:max_questions]
    print(f"Débogage : {len(questions)} questions extraites de {md_path} pour {target_pdf}")
    return questions

# Slide creation with section and objective
def create_slide(prs, section_title, objective, question_data, answer, projet_logo_path, faculte_logo_path):
    """Append one question/answer slide to ``prs``.

    The slide uses layout 1 and the shared light-blue background. Its title
    is "<section_title>: <question>", its body holds the objective and the
    answer, both logos are placed along the top edge, and a small text box
    near the bottom cites the source PDF and section.

    Args:
        prs: Presentation receiving the new slide.
        section_title: Section name shown at the start of the title.
        objective: Objective text shown in the body.
        question_data: Dict read for the keys 'question', 'pdf' and 'section'.
        answer: Answer text shown in the body.
        projet_logo_path: Image path of the logo placed at the left.
        faculte_logo_path: Image path of the logo placed at the right.
    """
    slide = prs.slides.add_slide(prs.slide_layouts[1])
    set_slide_background(slide)

    # Title: dark blue, 28 pt Arial, left aligned.
    heading = slide.shapes.title
    heading.text = f"{section_title}: {question_data['question']}"
    heading_paragraph = heading.text_frame.paragraphs[0]
    heading_paragraph.font.size = Pt(28)
    heading_paragraph.font.color.rgb = RGBColor(0, 102, 204)
    heading_paragraph.font.name = 'Arial'
    heading_paragraph.alignment = PP_ALIGN.LEFT

    # Body: every paragraph gets the same 20 pt styling.
    body_frame = slide.placeholders[1].text_frame
    body_frame.text = f"Objective: {objective}\n\nAnswer: {answer}"
    for body_paragraph in body_frame.paragraphs:
        body_paragraph.font.size = Pt(20)
        body_paragraph.font.color.rgb = RGBColor(0, 102, 204)
        body_paragraph.font.name = 'Arial'
        body_paragraph.alignment = PP_ALIGN.LEFT

    # Logos: project logo on the left, faculty logo on the right.
    logo_positions = (
        (projet_logo_path, Inches(0.5)),
        (faculte_logo_path, Inches(8.5)),
    )
    for logo_path, left_offset in logo_positions:
        slide.shapes.add_picture(logo_path, left_offset, Inches(0.2), height=Inches(0.5))

    # Source note in a text box near the bottom of the slide.
    source_box = slide.shapes.add_textbox(Inches(0.5), Inches(5.5), Inches(9), Inches(0.5))
    source_frame = source_box.text_frame
    source_frame.text = f"Source: {question_data['pdf']} - Section: {question_data['section']}"
    source_paragraph = source_frame.paragraphs[0]
    source_paragraph.font.size = Pt(12)
    source_paragraph.font.color.rgb = RGBColor(0, 102, 153)
    source_paragraph.font.name = 'Arial'

# Recommendations slide creation
def create_recommendations_slide(prs, projet_logo_path, faculte_logo_path):
    """Append the fixed "Recommendations" slide to ``prs``.

    The slide uses layout 1 and the shared light-blue background, carries a
    hard-coded objective followed by three recommendation bullets, and shows
    both logos along the top edge.

    Args:
        prs: Presentation receiving the new slide.
        projet_logo_path: Image path of the logo placed at the left.
        faculte_logo_path: Image path of the logo placed at the right.
    """
    slide = prs.slides.add_slide(prs.slide_layouts[1])
    set_slide_background(slide)

    # Title: dark blue, 28 pt Arial, left aligned.
    heading = slide.shapes.title
    heading.text = "Recommendations"
    heading_paragraph = heading.text_frame.paragraphs[0]
    heading_paragraph.font.size = Pt(28)
    heading_paragraph.font.color.rgb = RGBColor(0, 102, 204)
    heading_paragraph.font.name = 'Arial'
    heading_paragraph.alignment = PP_ALIGN.LEFT

    # Body: static text, every paragraph styled identically.
    body_frame = slide.placeholders[1].text_frame
    body_frame.text = "Objective: Provide actionable recommendations based on the due diligence findings.\n\n- Further investigate regulatory compliance in key markets.\n- Assess the scalability of the compliance framework.\n- Engage with stakeholders to mitigate potential conflicts of interest."
    for body_paragraph in body_frame.paragraphs:
        body_paragraph.font.size = Pt(20)
        body_paragraph.font.color.rgb = RGBColor(0, 102, 204)
        body_paragraph.font.name = 'Arial'
        body_paragraph.alignment = PP_ALIGN.LEFT

    # Logos: project logo on the left, faculty logo on the right.
    logo_positions = (
        (projet_logo_path, Inches(0.5)),
        (faculte_logo_path, Inches(8.5)),
    )
    for logo_path, left_offset in logo_positions:
        slide.shapes.add_picture(logo_path, left_offset, Inches(0.2), height=Inches(0.5))

# Main entry point
def generate_crypto_ppt_report(md_path, pdf_path, logo_dir, output_ppt_path):
    """Generate the due-diligence PowerPoint report for one PDF.

    Steps: check that both logos exist, initialise the RAG components, index
    the PDF, extract up to nine questions for that PDF from the Markdown
    file, then build a title slide, one slide per question (grouped into five
    fixed sections) and a final recommendations slide, and save the deck.

    The section plan refers to questions by position. Positions beyond the
    number of extracted questions are skipped, so a Markdown file yielding
    fewer than nine questions produces a shorter deck instead of raising
    ``IndexError``, and the "no question" warning is reachable.

    Args:
        md_path: Markdown file containing the questions.
        pdf_path: PDF to index; its file name selects the questions and its
            stem is shown in the title slide.
        logo_dir: Directory containing ``projet.png`` and ``faculte.png``.
        output_ppt_path: Destination path of the generated ``.pptx`` file.

    Raises:
        FileNotFoundError: If one of the two logo files does not exist.
    """
    import gc  # local import: gc is not imported at module level in this file

    projet_logo_path = f"{logo_dir}/projet.png"
    faculte_logo_path = f"{logo_dir}/faculte.png"

    for logo in [projet_logo_path, faculte_logo_path]:
        if not Path(logo).exists():
            raise FileNotFoundError(f"Logo non trouvé : {logo}")

    # Initialise the RAG components
    embedder, model, tokenizer, text_splitter = initialize_rag()

    # Build the FAISS index
    vectorstore = create_faiss_index(pdf_path, embedder, text_splitter)

    # Extract the questions (limited to 9)
    target_pdf = Path(pdf_path).name
    questions = extract_questions_from_md(md_path, target_pdf, max_questions=9)

    # Initialise the presentation
    prs = Presentation()

    # Title slide
    slide_layout = prs.slide_layouts[0]
    slide = prs.slides.add_slide(slide_layout)
    set_slide_background(slide)

    title = slide.shapes.title
    # Derive the report name from the PDF instead of hard-coding one document.
    title.text = f"Rapport de Due Diligence - {Path(pdf_path).stem}"
    title.text_frame.paragraphs[0].font.size = Pt(28)
    title.text_frame.paragraphs[0].font.color.rgb = RGBColor(0, 102, 204)
    title.text_frame.paragraphs[0].font.name = 'Arial'

    slide.shapes.add_picture(projet_logo_path, Inches(0.5), Inches(0.5), height=Inches(1))
    slide.shapes.add_picture(faculte_logo_path, Inches(8.5), Inches(0.5), height=Inches(1))

    # Sections, their objectives, and the 0-based positions of their questions
    section_plan = [
        ("Regulatory Environment", "Understand the regulatory framework and legal implications", (3, 5, 6)),  # Q4, Q6, Q7
        ("Compliance Framework", "Evaluate the adequacy of internal processes against regulations", (4,)),  # Q5
        ("Risk Assessment", "Identify risks related to digital assets and their mitigations", (7, 8)),  # Q8, Q9
        ("Stakeholder Analysis", "Identify stakeholders and their influence", (0, 1)),  # Q1, Q2
        ("Methodology Review", "Verify the robustness of methods used in the report", (2,)),  # Q3
    ]
    expected_count = 1 + max(index for _, _, indices in section_plan for index in indices)

    # One slide per question, with its section and objective
    if not questions:
        print("Avertissement : Aucune question extraite. Vérifiez due_diligence_questions.md.")
    else:
        if len(questions) < expected_count:
            print(
                f"Avertissement : seulement {len(questions)} question(s) extraite(s) "
                f"sur {expected_count} attendues ; les positions manquantes sont ignorées."
            )
        for section_title, objective, indices in section_plan:
            for index in indices:
                if index >= len(questions):
                    # This position was not extracted; skip it rather than crash.
                    continue
                question_data = questions[index]
                print(f"Traitement de la question : {question_data['question']}")
                answer = get_answer_from_rag(question_data['question'], vectorstore, model, tokenizer)
                create_slide(prs, section_title, objective, question_data, answer, projet_logo_path, faculte_logo_path)
                # Release per-question memory before the next generation.
                del answer
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

    # Add the recommendations slide
    create_recommendations_slide(prs, projet_logo_path, faculte_logo_path)

    # Final cleanup
    del model, tokenizer, vectorstore
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Save
    prs.save(output_ppt_path)
    print(f"✅ Présentation générée : {output_ppt_path} avec {len(prs.slides)} diapositives")

# Example usage: generate the report for the WEF PDF stored on Google Drive
md_path = "/content/drive/MyDrive/due_diligence_questions.md"
pdf_path = "/content/drive/MyDrive/DueDil_documents/WEF_Digital_Assets_Regulation_2024.pdf"
logo_dir = "/content/drive/MyDrive/Logos"
output_ppt_path = "/content/drive/MyDrive/due_diligence_crypto_report1.pptx"

generate_crypto_ppt_report(
    md_path=md_path,
    pdf_path=pdf_path,
    logo_dir=logo_dir,
    output_ppt_path=output_ppt_path,
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Débogage : 9 questions extraites de /content/drive/MyDrive/due_diligence_questions.md pour WEF_Digital_Assets_Regulation_2024.pdf
Traitement de la question : **What methodologies and data sources were used to generate the findings, interpretations, and conclusions presented in the document?**?
Traitement de la question : **Does our current compliance framework adequately address the risks associated with digital assets, considering the varying regulatory landscapes globally?**?
Traitement de la question : **What are the potential implications of operating in jurisdictions with no existing regulatory framework for digital assets?**?
Traitement de la question : **What is the legal status of digital assets, specifically cryptocurrencies, in the jurisdictions where we operate or plan to operate?**?
Traitement de la question : **What specific regulations are being developed or implemented for stablecoins in key markets like the UK, Hong Kong, Singapore, and the EU?**?
Traitement de la quest