# In [3]:
from pptx import Presentation
from pptx.util import Inches

# Create the PowerPoint presentation object that every slide below is added to.
# Presentation() is called with no arguments, i.e. no template path is passed.
prs = Presentation()

# Function to add a slide with a title and content
def add_slide(title, content):
    """Append a "Title and Content" slide to the module-level presentation.

    Args:
        title: Text assigned to the slide's title placeholder.
        content: Text assigned to the placeholder with idx 1 (the body
            placeholder of this layout).

    Returns:
        The newly created slide, so a caller can attach further shapes
        (for example a chart or picture) to it. Existing callers that
        ignore the return value are unaffected.
    """
    slide_layout = prs.slide_layouts[1]  # Title and Content layout
    slide = prs.slides.add_slide(slide_layout)
    slide.shapes.title.text = title
    # Use the slide-level placeholder collection, the same accessor the
    # title slide uses for its subtitle, instead of going through shapes.
    slide.placeholders[1].text = content
    return slide

# Slide 1: title slide carrying the paper title plus authors, affiliation and venue
slide_layout = prs.slide_layouts[0]  # layout 0 = Title Slide
slide = prs.slides.add_slide(slide_layout)

title = slide.shapes.title
title.text = "Adapting Large Language Models for JSON Extraction from Text Corpora"

# Subtitle placeholder: one line each for authors, affiliation and venue
subtitle = slide.placeholders[1]
subtitle.text = "\n".join(
    [
        "Van-Tuan Tran, Chin-Shiuh Shieh, Ying-Chieh Chao, Casper Tsai, Mong-Fong Horng",
        "National Kaohsiung University of Science and Technology",
        "IWCE 2024",
    ]
)

# Slides 2-14: one (title, body lines) entry per slide, in presentation order.
# Body lines are joined with a newline before being handed to add_slide().
_content_slides = [
    # Slide 2: Introduction
    ("Introduction", [
        "- Extracting structured JSON data from unstructured text is a critical task.",
        "- JSON is a standard format for structured data.",
        "- Need for improved methods to automate JSON extraction using LLMs.",
    ]),
    # Slide 3: Problem Statement
    ("Problem Statement", [
        "- Processing large datasets with diverse formats (HTML tags, paragraphs, irregular text).",
        "- Traditional rule-based methods are inefficient.",
        "- Gap in literature for direct application of LLMs in JSON extraction.",
    ]),
    # Slide 4: Research Objective
    ("Research Objective", [
        "To fine-tune LLMs (Llama-2-7B, Llama-3-8B, Llama-3.1-8B) for accurate and efficient JSON extraction using QLoRA and Fully Sharded Data Parallel (FSDP).",
    ]),
    # Slide 5: Related Work
    ("Related Work", [
        "- Rule-based methods: Require manual effort, lack scalability.",
        "- Pre-trained transformers (BERT, GPT-3): Focus on other NLP tasks, not JSON extraction.",
        "- Need for specialized fine-tuning of LLMs.",
    ]),
    # Slide 6: Methodology
    ("Methodology", [
        "- Base models: Llama-2-7B, Llama-3-8B, Llama-3.1-8B.",
        "- Fine-tuning techniques: QLoRA, Fully Sharded Data Parallel (FSDP).",
        "- Multi-GPU setup with PyTorch and Hugging Face Transformers.",
    ]),
    # Slide 7: Dataset Preparation
    ("Dataset Preparation", [
        "- Custom dataset combining publicly available and manually annotated data.",
        "- Preprocessing: Normalization, tokenization, noise filtering.",
        "- Dataset split into training, validation, and test sets.",
    ]),
    # Slide 8: Evaluation Metrics
    ("Evaluation Metrics", [
        "- Accuracy (standard and normalized).",
        "- Loss reduction (cross-entropy loss).",
        "- Structural correctness of JSON outputs.",
        "- Tasks: ARC Challenge, HellaSwag, OpenBookQA, PIQA.",
    ]),
    # Slide 9: Results
    ("Results", [
        "- Significant loss reduction across all models.",
        "- Llama-3.1-8B achieved highest accuracy (92%) in JSON extraction.",
        "- Faster convergence and better scalability with larger datasets.",
    ]),
    # Slide 10: Quantitative Metrics (Graph Placeholder)
    ("Quantitative Metrics", [
        "- Loss curves for each model (Llama-2-7B, Llama-3-8B, Llama-3.1-8B).",
        "- Accuracy comparison across tasks (graph placeholder).",
    ]),
    # Slide 11: Comparative Study
    ("Comparative Study", [
        "- Rule-based methods vs. LLMs: Scalability and generalization.",
        "- BERT and GPT-3 for NER vs. Llama for JSON extraction.",
        "- Llama models show superior accuracy and efficiency.",
    ]),
    # Slide 12: Efficiency & Scalability
    ("Efficiency & Scalability", [
        "- QLoRA reduces memory usage by 30%.",
        "- FSDP enables efficient training on large datasets.",
        "- Better performance on larger datasets with minimal accuracy drop.",
    ]),
    # Slide 13: Conclusion
    ("Conclusion", [
        "- Improved JSON extraction with fine-tuned LLMs.",
        "- Scalability and efficiency in real-world applications.",
        "- Future work: Expand datasets, optimize fine-tuning techniques.",
    ]),
    # Slide 14: Acknowledgements
    ("Acknowledgements", [
        "Supported by National Science and Technology Council Taiwan, with grant numbers NSTC 112-2221-E-992-045, NSTC 112-2221-E-992-057-MY3, and NSTC 112-2622-8-992-009-TD1.",
    ]),
]

# Underscore-prefixed loop names avoid clobbering the module-level `title`.
for _heading, _body_lines in _content_slides:
    add_slide(_heading, "\n".join(_body_lines))

# Save the presentation under a relative filename (no directory component is given)
pptx_file = "Adapting_LLM_JSON_Extraction_Presentation.pptx"
prs.save(pptx_file)

# Bare expression: a no-op in a plain script; as the last expression of a
# notebook cell it echoes the saved filename.
pptx_file

# Output: 'Adapting_LLM_JSON_Extraction_Presentation.pptx'