# Objectif du notebook :
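Le code suivant a pour but de tester la lecture d'un fichier PDF et d'en extraire les questions et les propositions de réponses après sa conversion en images .jpeg.
Le modèle utilisé est llama-4-scout-17b-16e-instruct (Llama 4 Scout), interrogé sur les serveurs Groq ; un essai est également réalisé avec llama-4-maverick-17b-128e-instruct (Llama 4 Maverick).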

In [None]:
#Installation de pdf2Image
pip install pdf2image
#Installation de groq
pip install groq

In [2]:
from pdf2image import convert_from_path
from groq import Groq
import base64
import os

### 1 - On convertit le pdf en images plates.

In [7]:
# Create the dedicated folder that stores the temporary page images.
# os.makedirs also creates the missing parent folder 'data' (os.mkdir would
# raise FileNotFoundError in that case), and exist_ok=True makes the cell safe
# to re-run when the folder is already there.
import os
os.makedirs('data/images', exist_ok=True)

In [8]:
# Convert the PDF into a list of page images
nomfichier = "data/pu_p01_aap04.pdf"
images = convert_from_path(nomfichier)

# Save every page as a JPEG file named page<index>.jpg (index starts at 0)
for page_index, page_image in enumerate(images):
    page_image.save(f'data/images/page{page_index}.jpg', 'JPEG')

### 2 - On lit le document sur les serveurs Groq avec le LLM llama-4-scout-17b-16e-instruct

#### 1- Lecture avec deux questions simples, c'est-à-dire : "Describe this page" et "Extract the questions from this page"

In [None]:
# Groq API key.
# It is read from the GROQ_API_KEY environment variable so that the secret is
# never written into the notebook itself. When the variable is not set, KEY
# falls back to the empty string, which was the previous placeholder value.
KEY = os.environ.get("GROQ_API_KEY", "")

In [24]:
# Helper that turns an image file into base64 text
def encode_image(image_path):
    """Read the file at `image_path` in binary mode and return its content
    base64-encoded, as a UTF-8 decoded string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Collect the page images generated from the PDF.
# os.listdir returns its entries in arbitrary order, so the names are sorted by
# the page number embedded in the file name (page0.jpg, page1.jpg, ...,
# page10.jpg). Entries that are not .jpg files are ignored.
pages = sorted(
    (name for name in os.listdir('data/images/') if name.lower().endswith('.jpg')),
    key=lambda name: int(''.join(ch for ch in name if ch.isdigit()) or 0),
)

# The client does not depend on the page, so it is created once, outside the loop.
client = Groq(api_key=KEY)

# Send each page with two simple text instructions and print the model answer.
for page in pages:
    image_path = "data/images/" + page
    base64_image = encode_image(image_path)
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this page"},
                    {"type": "text", "text": "Extract the questions from this page"},
                    {
                        # The image is passed inline as a base64 data URL
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        model="meta-llama/llama-4-scout-17b-16e-instruct",
    )
    print("PAGE : " + page)
    print(chat_completion.choices[0].message.content)
    print()

PAGE : page0.jpg
**Page Description:**

The page appears to be a template for a project proposal, specifically for the BIMP-EAGA-ROK Cooperation Fund (BKCF). The template provides a structured format for applicants to submit their project proposals, including sections for proponent contact information, project information, and project details.

**Extracted Questions:**

Here are the questions extracted from the page:

1. What is the Organization Name?
2. What is the Organization Type (Public or Private)?
3. What is the Organization Address?
4. Who is the Focal Person (Name and Position)?
5. What is the Telephone Number?
6. What is the Email Address?
7. What is the Project Title?
8. Which Target Country(ies) does the project apply to? (Select from: Brunei Darussalam, Malaysia, Indonesia, Philippines)
9. What is the Target Geographic Location(s)? (Select from: BIMP-EAGA or Not within BIMP-EAGA)
10. What is the Project Duration? (Select from: 1 year or 2 years)
11. What is the Proposed Pr

#### 2- Essai avec une question plus "complexe"

In [23]:
# Second attempt: a single, more demanding instruction is sent with each page.
# Relies on `pages`, `encode_image` and `KEY` defined in earlier cells.
for page in pages:
    # Inline the page image as a base64 data URL
    data_url = "data:image/jpeg;base64," + encode_image("data/images/" + page)
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Extract all the questions of the page in a Python list format, including the answer choices when available."},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }

    client = Groq(api_key=KEY)
    chat_completion = client.chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=[user_message],
    )

    # Print the page name, the model answer, then a blank separator line
    print(f"PAGE : {page}")
    print(chat_completion.choices[0].message.content)
    print()

NameError: name 'encode_image' is not defined

#### Essai avec une question encore plus complexe

"text": "Extract all the questions from the page, including the answer choices when available.For each question, "
                                            "you must specify the thematic category: either 'organization' or 'project'. "
                                "The extraction should be in the following Python list format: '[
                                    {
                                        'question': '',
                                        'section': ''
                                    }"

In [None]:
# Third attempt: ask for the questions together with their answer choices and
# a thematic category, in a fixed list-of-dicts layout.
# Relies on `pages`, `encode_image` and `KEY` defined in earlier cells.
for page in pages:
    # The instruction is built from adjacent string literals (joined as-is by Python)
    instruction = (
        "Extract all the questions from the page with any additional details related to each question, if available. "
        "Each question must include its answer choices directly within the question text, if they exist. "
        "You must also specify the thematic category for each question: either 'organization' or 'project'. "
        "The extraction should be in the following Python list format: "
        "[{'question': '', 'section': ''},...,{'question': '', 'section': ''}]"
    )
    # Inline the page image as a base64 data URL
    data_url = "data:image/jpeg;base64," + encode_image("data/images/" + page)
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }

    client = Groq(api_key=KEY)
    chat_completion = client.chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=[user_message],
    )

    # Print the page name, the model answer, then a blank separator line
    print(f"PAGE : {page}")
    print(chat_completion.choices[0].message.content)
    print()

PAGE : page0.jpg
Here is the extracted data in the required Python list format:

```
[
    {'question': 'Organization Type (Please indicate the organization type)', 'section': 'organization', 'options': ['Public (e.g. NGOs/Research Institutions)', 'Private']},
    {'question': 'Target Country(ies)', 'section': 'project', 'options': ['Brunei Darussalam', 'Malaysia', 'Indonesia', 'Philippines']},
    {'question': 'Target Geographic Location(s) (Please tick the box and indicate the specific target location (province/city) accordingly.)', 'section': 'project', 'options': ['BIMP-EAGA', 'Not within BIMP-EAGA']},
    {'question': 'Project Duration', 'section': 'project', 'options': ['1 year', '2 years']}
]
```

PAGE : page1.jpg
## Step 1
To extract the questions from the given image, we first need to identify the sections that contain questions. The image appears to be a form with several sections, each potentially containing a question.

## Step 2
Upon closer inspection, we can see that ther

#### Essai quatre : (avec llama 4 maverick)
Remarque : Le traitement est bien plus long (x10)

In [67]:
# Fourth attempt: detailed extraction instructions sent to Llama 4 Maverick.
# Relies on `pages`, `encode_image` and `KEY` defined in earlier cells.

# The client does not depend on the page, so it is created once, outside the loop.
client = Groq(api_key=KEY)

# Python joins adjacent string literals with no separator, so each sentence
# carries its own trailing punctuation and space. Two separators were missing,
# which fused sentences together in the prompt that was actually sent.
extraction_prompt = (
    "Extract all the questions from the page with any additional details related to each question, if available. "
    "Make sure that each question includes its answer choices within the question text, if any. Avoid simplifying or dividing the questions. Submit the most detailed version available for each question but remove any unnecessary special characters from your text. "
    "Each question must include the conditions for how it should be answered, such as the maximum number of words, formatting, and so on within the question text, if available. "
    "You must also specify the thematic category for each question: either 'organization' or 'project'. "
    "The extraction should be in the following Python list format : "
    "[{'question': '', 'section': ''},...,{'question': '', 'section': ''}] "
    "Respect this format: for the available answer choices, you must include them directly within the question text"
)

for page in pages:
    image_path = "data/images/" + page
    base64_image = encode_image(image_path)
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": extraction_prompt},
                    {
                        # The image is passed inline as a base64 data URL
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        model="meta-llama/llama-4-maverick-17b-128e-instruct",
        # Alternative model used in the other attempts:
        #model="meta-llama/llama-4-scout-17b-16e-instruct",
    )
    print("PAGE : " + page)
    print(chat_completion.choices[0].message.content)
    print()

PAGE : page0.jpg
Here is the extracted data in the requested Python list format:

```python
[
    {'question': 'Organization Name', 'section': 'organization'},
    {'question': 'Organization Type (Please indicate the organization type) Public (e.g. NGOs/Research Institutions) or Private', 'section': 'organization'},
    {'question': 'Organization Address e.g. Address 1, Address 2, City, Country', 'section': 'organization'},
    {'question': 'Focal Person Name / Position (Please indicate the name and position of the focal person of the project.)', 'section': 'organization'},
    {'question': 'Telephone Number', 'section': 'organization'},
    {'question': 'Email Address', 'section': 'organization'},
    {'question': 'Project Title (Please keep the project title clear and concise)', 'section': 'project'},
    {'question': 'Target Country(ies) Brunei Darussalam, Malaysia, Indonesia, Philippines', 'section': 'project'},
    {'question': 'Target Geographic Location(s) (Please tick the box a

####  Un nouvel essai avec llama 4 Scout et le chargement de plusieurs images à la fois

Dans le code ci-dessous, deux images sont chargées à la fois afin de relier les questions qui peuvent être coupées entre deux pages de l'AAP.

In [None]:
# Dry run of the page pairing: each page is shown with the page that follows it
# in the listing, so that questions split across two pages can be handled.

pages = os.listdir('data/images/')
nbpages = len(pages)
print(nbpages)
# Walk through the listing: every page is paired with its successor, except
# the last one, which is printed alone.
for position, current_page in enumerate(pages):
    if position == nbpages - 1:
        print(current_page)
        continue
    next_page = pages[position + 1]
    print(current_page, next_page)

In [38]:
# List the page images in page order.
# os.listdir returns its entries in arbitrary order, and this cell pairs each
# page with the one that follows it, so the names are sorted by the page number
# embedded in the file name. Entries that are not .jpg files are ignored.
pages = sorted(
    (name for name in os.listdir('data/images/') if name.lower().endswith('.jpg')),
    key=lambda name: int(''.join(ch for ch in name if ch.isdigit()) or 0),
)
nbpages = len(pages)

# The client does not depend on the page, so it is created once, outside the loop.
client = Groq(api_key=KEY)

# Instructions common to both prompts. Python joins adjacent string literals
# with no separator, so each sentence carries its own trailing punctuation and
# space (several were missing, which fused sentences together in the prompt).
shared_instructions = (
    "Make sure that each question includes its answer choices within the question text, if any. Avoid simplifying or dividing the questions. Submit the most detailed version available for each question but remove any unnecessary special characters from your text. "
    "Each question must include the conditions for how it should be answered, such as the maximum number of words, formatting, and so on within the question text, if available. "
    "You must also specify the thematic category for each question: either 'organization' or 'project'. "
    "The extraction must be absolutly in the following Python list format : "
    "{\"question\": \"\", \"section\": \"\"},...,{\"question\": \"\", \"section\": \"\"} "
    "Respect this format: for the available answer choices, you must include them directly within the question text"
)

# Prompt used when the current page is sent together with the following page
two_page_prompt = (
    "Two pages are sent to you. Extract all the questions from only the first page you received with any additional details related to each question, if available. "
    "If the first question on the first page is cut off, do not extract it. However, if the last question on the first page is connected to the first question on the second page, link them together and extract the complete question. "
    + shared_instructions
    + ". Just give me the extracted data."
)

# Prompt used for the last page, which is sent alone
single_page_prompt = (
    "Extract all the questions from the page with any additional details related to each question, if available. "
    + shared_instructions
)

for i in range(nbpages):
    current_page = pages[i]
    if i != nbpages - 1:
        next_page = pages[i + 1]
        print("Pour la page", current_page, next_page)
        page_names = [current_page, next_page]
        prompt = two_page_prompt
    else:
        print("last page")
        page_names = [current_page]
        prompt = single_page_prompt

    # Encode every page sent in this request. The last page is encoded here as
    # well: previously its branch reused the encoding left over from the
    # previous iteration, so the wrong image was sent.
    image_parts = []
    for name in page_names:
        encoded_page = encode_image("data/images/" + name)
        image_parts.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encoded_page}",
                },
            }
        )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}] + image_parts,
            }
        ],
        model="meta-llama/llama-4-scout-17b-16e-instruct",
    )
    print("PAGE : " + current_page)
    print(chat_completion.choices[0].message.content)
    print()

Pour la page page0.jpg page1.jpg
PAGE : page0.jpg
[
    {"question": "Organization Name", "section": "organization"},
    {"question": "Organization Type (Please indicate the organization type) Public (e.g. NGOs/Research Institutions) Private", "section": "organization"},
    {"question": "Organization Address e.g. Address 1, Address 2, City, Country", "section": "organization"},
    {"question": "Focal Person Name / Position (Please indicate the name and position of the focal person of the project.)", "section": "organization"},
    {"question": "Telephone Number", "section": "organization"},
    {"question": "Email Address", "section": "organization"},
    {"question": "Project Title (Please keep the project title clear and concise)", "section": "project"},
    {"question": "Target Country(ies) Brunei Darussalam Malaysia Indonesia Philippines", "section": "project"},
    {"question": "Target Geographic Location(s) (Please tick the box and indicate the specific target location (provin