In [6]:
# Import libraries
import pandas as pd
import numpy as np
import pdfplumber
import sys
import os


In [11]:
## Define directories

# Project root directory. Every other path below is derived from it, so moving
# the project only requires changing this single line.
root_dir = '/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/'

# Folder holding the input files and the outputs sub-folder
data_dir = os.path.join(root_dir, 'Data')

# File paths
report1_dir = os.path.join(data_dir, 'report1.pdf')
report1_annex_dir = os.path.join(data_dir, 'report1_annex.pdf')
target_data_dir = os.path.join(data_dir, 'XLSX_target_data_v1.2_LLM.csv')

# LLM parameters
chunks = 10          # number of pieces the target list is split into (one request per piece)
seed = None          # passed to the API as-is; also appears in the output filenames
temperature = 0.2
model = "llama-3.3-70b-instruct"  # or "gpt-4o"
date = '0227'  # to indicate date in filenames

# The trailing '' keeps the final path separator, which the original path had
output_dir = os.path.join(data_dir, 'Outputs', date, '')



In [10]:
## FUNCTIONS

# Add the project root directory to the system path so that the local `API`
# module can be imported. The membership check keeps the cell idempotent:
# re-running it does not prepend the same entry to sys.path again.
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

from API import get_chat_response, num_tokens_from_string # Import the API functions

In [None]:
## Data

# Import all target data (target_code + target_content)
target_data = pd.read_csv(target_data_dir, sep=";") # note: the separator is a semicolon (;) and not a comma (,)


# Load NACE data (resolved through the project root that was put on sys.path)
from Data.NACEdata import NACElevel1, NACElevel2, NACElevel3


def pdf_to_plain_text(pdf_path):
    """Read a PDF with pdfplumber and return its text as one single-line string.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        Text of all pages, with surrounding whitespace stripped and every
        newline and tab replaced by a space.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against pages for which extract_text() yields nothing
        # (e.g. image-only pages; the exact return value is version-dependent).
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join pages with a newline so that the last word of a page is not glued to
    # the first word of the next one; the newline becomes a space just below.
    full_text = "\n".join(page_texts)

    full_text = full_text.strip()              # Remove leading and trailing whitespaces
    full_text = full_text.replace("\n", " ")   # Replace new lines with spaces
    full_text = full_text.replace("\t", " ")   # Replace tabs with spaces
    return full_text


# Import report1 and report1_annexes as pdf and convert to plain text
report1 = pdf_to_plain_text(report1_dir)
report1_annex = pdf_to_plain_text(report1_annex_dir)

In [None]:
# GENERATE ANSWERS
#
# Sends one request per chunk of targets to the model, stores every raw answer
# as its own csv file, and finally writes one metadata csv describing all calls.


# Model parameters: chunks, seed, temperature, model, date and output_dir are
# taken as defined in the configuration cell.

# Create the output folder up front: otherwise the first open() below fails
# only after a model call has already been made, and that answer is lost.
os.makedirs(output_dir, exist_ok=True)


# (Loop tools)
answers_content = [] # create empty list that will store the answers content
metadata_columns = ["chunk",       # columns of the metadata table, gathered to get a bit more data on the responses and ultimately try to assess consistency
                    "replicate",
                    "model",
                    "seed",
                    "temperature",
                    "system_fingerprint",
                    "prompt_tokens",
                    "completion_tokens"]
metadata_rows = [] # one tuple per request; turned into a DataFrame once, after the loop

# Split the target_data into chunks. The row *positions* are split and then used
# with .iloc, which yields the same partition as np.array_split(target_data, chunks)
# without going through the deprecated DataFrame.swapaxes.
chunk_positions = np.array_split(np.arange(len(target_data)), chunks)
targets_chunks = [target_data.iloc[positions] for positions in chunk_positions]


# Loop

for x, targets_subset in enumerate(targets_chunks):

    # Subset per chunk to avoid overloading the model: this iteration runs the request for chunk n°'x'
    targets_list = [f"{row['target_code']}: {row['target_content']}" for index, row in targets_subset.iterrows()] # Concatenate target_code and target_content into a list so that it can be added to the prompt as text


    # Define request
    # NOTE(review): the column names requested in the prompt are not consistent
    # ("NACE_level1" vs "NACE_level_3" vs "NACE level3_extra2"). The text is left
    # untouched so that outputs stay comparable with earlier runs; confirm what
    # the downstream parsing expects before harmonising.
    prompt = f'''Hello,

            Data input & Context:
            - NACE classification categories:  {NACElevel1} + {NACElevel2} + {NACElevel3}.
            - List of targets: {targets_list}.
            - Report n°1 about "DELIVERING THE EU GREEN DEAL Progress towards targets (2025)": {report1}
            
            
            Task: 
            - In the context of report n°1, for each target, analyse its content description and assign to each target a NACE category for each level (1,2,3). 

            Answer format: provide your answer as a table in csv format please (separator: ";"), with the following columns:
            - target_code (e.g., TA1.9)
            - target_content (e.g., The contribution of the sectors covered by the EU ETS with respect to the EU Climate ambition should be of -62 % compared to 2005 (increasing the linear emissions reduction factor from 2.2 % per year up to 4.4 %)) 
            - NACE_level1 (e.g., D - Electricity, Gas, Steam and Air Conditioning Supply)
            - NACE_level1_extra1 (e.g.if other categories overlap)
            - NACE_level1_extra2 (e.g.if other categories overlap)
            - NACE_level2 (e.g., D35 - Electricity, gas, steam and air conditioning supply)
            - NACE_level2_extra1 (e.g.if other categories overlap)
            - NACE_level2_extra2 (e.g.if other categories overlap)
            - NACE_level_3 (e.g., D35.1 - Electric power generation, transmission and distribution)
            - NACE_level_3_extra1 (e.g.if other categories overlap)
            - NACE level3_extra2 (e.g.if other categories overlap)
            - justification
            - confidence_score (e.g. confidence value from 0 to 10 about the assignation choices that are made)

            Specifications:
            - If there is some overlap, add the multiple possible fitting NACE categories (up to maximum 3 per NACE level). 
            - Include the name of the NACE categories.
            - Don't forget to provide the title of the target. 
            - If there is no target content, do not invent new content, just state it as empty.
            - For each target, write one to two sentences justifying your choice.
            - Output only the csv table and no additional commentary text.

            Thank you.'''


    # Print to double check amount of tokens in prompt (JRC llama have a max of 120k)
    tokens_per_chunk = f'''Chunk:{x+1}/{len(targets_chunks)} \nTargets per chunk: {len(targets_subset)} \nPrompt length: {len(prompt)} \nPrompt tokens (o200k_base encoding): {num_tokens_from_string(prompt, "o200k_base")} \nPrompt tokens (cl100k_base encoding): {num_tokens_from_string(prompt, "cl100k_base")} \n'''
    print(tokens_per_chunk)

    # Generate answer
    answer = get_chat_response(prompt=prompt,
                               seed=seed,
                               model=model,
                               temperature=temperature  # The temperature parameter influences the randomness of the generated responses. A higher value, such as 0.8, makes the answers more diverse, while a lower value, like 0.2, makes them more focused and deterministic.
                               )

    answers_content.append((f'chunk{x+1}', answer['response_content'])) # keep every raw answer together with its chunk label so the similarity can be analysed later
    metadata_rows.append((f'chunk{x+1}', # chunk label
                          '', # replicate nbr (not used in the split-by-chunks method)
                          model,
                          seed,
                          temperature,
                          answer["system_fingerprint"],
                          answer["prompt_tokens"],
                          answer["completion_tokens"]
                          ))


    # Save response as csv file
    output_name = f'{date}output_chk{x+1}_s{seed}_t{temperature}.csv'     # -> to split by chunks

    # Explicit UTF-8 so that non-ASCII characters in the answer are written the
    # same way whatever the platform's default encoding is.
    with open(os.path.join(output_dir, output_name), 'w', encoding='utf-8') as f:
        f.write(answer["response_content"])


# Build the metadata table in one go (cheaper than growing a DataFrame row by row)
answers_metadata = pd.DataFrame(metadata_rows, columns=metadata_columns)

# Save metadata as csv
metadata_name = f'{date}output_chk{len(targets_chunks)}_s{seed}_t{temperature}_metadata.csv'  # split by chunks method
answers_metadata.to_csv(path_or_buf=os.path.join(output_dir, metadata_name), sep=';', index=False)



In [None]:
# (Optional block)

# Save the prompt as a text file to view it in its entirety and double check its
# content. `prompt` is whatever was built last, i.e. the prompt of the final
# chunk once the generation loop has run.
# The `with` block closes the file even if the write raises, and the explicit
# encoding keeps non-ASCII characters (e.g. "n°1") intact on any platform.
with open("prompt.txt", "w", encoding="utf-8") as text_file:
    text_file.write(prompt)