In [2]:
# Import libraries
import sys
import os
import time

import pandas as pd
import numpy as np

import pdfplumber
import docx2txt

In [None]:
## DEFINE DIRECTORIES

# Project root directory. Defaults to the original checkout location; set the
# JRC_EGD_LLM_ROOT environment variable to run the notebook from another
# location without editing this cell.
root_dir = os.environ.get('JRC_EGD_LLM_ROOT', '/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/')

# File paths -- all derived from root_dir so the location is defined in one place only
target_data_dir = os.path.join(root_dir, 'Data')                                # folder with the target CSV files
report1_dir = os.path.join(target_data_dir, 'REPORT_1', 'report1_trimmed.pdf')  # PDF file of report 1
report2_dir = os.path.join(target_data_dir, 'REPORT_2', 'access_20250310')      # folder with the report 2 chapters (.docx)

# Put the project root first on the Python path so `Code.*` and `Data.*` can be imported.
# Guarded so that re-running this cell does not stack duplicate entries.
if sys.path[:1] != [root_dir]:
    sys.path.insert(0, root_dir)

In [3]:
# IMPORT FUNCTIONS & DATA


# Project-local API helpers used further down to query the model and count prompt tokens
from Code.API import get_chat_response, num_tokens_from_string


# Target tables (target_code + target_content), stored as ';'-separated CSV files
targets_250_csv = f'{target_data_dir}/targets_data_250.csv'  # extensive target list from target_NACE_classification.xlsx
targets_150_csv = f'{target_data_dir}/targets_data_150.csv'  # target list as in report 1

target_data_250 = pd.read_csv(targets_250_csv, sep=";")
target_data_150 = pd.read_csv(targets_150_csv, sep=";")


# Import & parse report1
with pdfplumber.open(report1_dir) as pdf:
    # Extract the text of every page. Depending on the pdfplumber version,
    # `extract_text()` may give None for a page without extractable text,
    # so fall back to an empty string instead of failing on concatenation.
    report1_pages = [page.extract_text() or "" for page in pdf.pages]

# Join the pages with a newline so that the last word of one page is not fused
# with the first word of the next one (the newline becomes a space just below).
report1 = "\n".join(report1_pages)

# Clean-up report1: trim both ends, then flatten newlines and tabs into spaces
report1 = report1.strip().replace("\n", " ").replace("\t", " ")

# Import & parse report2

# Source .docx file of each chapter of report2
report2_files = {
    'chapter1': f'{report2_dir}/NEW_Chapter1_CLEAN - Introduction & setting the scene_LM_trimmed.docx',
    'chapter2': f'{report2_dir}/NEW_Chapter2 (ex chp3) - Environmental impacts_ZOTERO_trimmed.docx',
    'chapter3': f'{report2_dir}/NEW_Chapter3 (ex chp4) with BIBLIO - Challenges and enablers for EGD objectives_trimmed.docx',
    'chapter4': f'{report2_dir}/NEW_Chapter4 (ex chp5) - Enabling the green transition_trimmed.docx',
    'chapter5': f'{report2_dir}/NEW_Chapter5 - Fair and just transition_trimmed.docx',
    'chapter6': f'{report2_dir}/NEW_Chapter6 - Financing the green transition.docx',
}

# Dictionary giving access to the different chapters of report2: each chapter is
# read from its .docx file, trimmed, and has its newlines and tabs turned into spaces
report2 = {
    chapter_name: docx2txt.process(docx_path).strip().replace("\n", " ").replace("\t", " ")
    for chapter_name, docx_path in report2_files.items()
}


# Import the list of subthemes that were manually selected based on my Obsidian Canvases and Report 1.
# (The import alone binds `subthemes_list` in the notebook namespace; no re-assignment is needed.)
from Data.subthemes import subthemes_list


In [11]:
## CHUNK GENERATION

# Chunking parameters
chunks = 10  # 10 chunks give prompts of approximately 85k tokens (based on trial & error)
data_to_split = subthemes_list
# NOTE(review): the answer-generation loop calls `.iterrows()` on each chunk and reads
# 'target_code', 'target_content' and 'target_assessment' -- confirm that `subthemes_list`
# is a DataFrame with these columns.

# Split the data into `chunks` consecutive parts of (nearly) equal size
target_data_chunks = np.array_split(data_to_split, chunks)

# Enumerate every ordered pair of distinct chunk indices: both [i, j] and [j, i] are kept,
# which gives chunks * (chunks - 1) pairs, keyed 0, 1, 2, ... in the order they are generated
chunk_list = list(range(chunks))
ordered_pairs = [
    [first, second]
    for first in chunk_list
    for second in chunk_list
    if first != second
]
chunk_pairs = dict(enumerate(ordered_pairs))
chunk_id = len(chunk_pairs)  # number of pairs generated (i.e. the next free pair id)


In [None]:
# GENERATE ANSWERS
#
# For every selected chunk pair: build the prompt, query the model, write the raw answer
# to '<date>_network_pair<x>.csv' and record the request metadata. Each pair is attempted
# up to `max_retries` times, and every attempt (successful or not) is followed by a
# 1-minute pause so the API is not hit again immediately.


# Set parameters
seed = None
temperature = 0.1
model = "llama-3.3-70b-instruct" #"llama-3.3-70b-instruct" "gpt-4o" "nous-hermes-2-mixtral-8x7b-dpo"
date = '0319'
output_directory = os.path.join(target_data_dir, 'Outputs', date)  # <Data folder>/Outputs/<date>
os.makedirs(output_directory, exist_ok=True) # Create the 'date' folder if it doesn't exist
metadata_path = os.path.join(output_directory, f'{date}_network_metadata.csv')

max_retries = 5  # number of attempts per chunk pair before giving up on it

# Pairs to process. To resume an interrupted run, restrict this to the missing pairs,
# e.g. range(34, 90) or [56, 57].
# NOTE: the metadata table below is re-created empty on every run of this cell, so a
# partial re-run overwrites the metadata CSV with the rows of that run only.
pairs_to_run = range(len(chunk_pairs))


# (loop tools and formatting)
# Empty dataframe gathering some data on the responses, to ultimately try to assess consistency
answers_metadata = pd.DataFrame(columns=["chunk_pairs_nbr",
                                         "chunk_pairs",
                                         "model",
                                         "seed",
                                         "temperature",
                                         "system_fingerprint",
                                         "prompt_tokens",
                                         "completion_tokens"])


def _format_targets(chunk):
    """Turn every row of `chunk` into one prompt-ready string.

    `chunk` must offer `.iterrows()` and its rows must have the keys 'target_code',
    'target_content' and 'target_assessment'. Returns a list with one string per row,
    formatted as '<target_code>: <target_content>. Assessment: <target_assessment>'.
    """
    return [f"{row['target_code']}: {row['target_content']}. Assessment: {row['target_assessment']}"
            for _, row in chunk.iterrows()]


# Loop
for x in pairs_to_run:

    success = False  # Flag tracking whether the answer for this pair was generated and saved
    retry_count = 0  # Number of failed attempts for this pair

    while not success and retry_count < max_retries:
        try:
            # Subset data to avoid overloading the model: the two chunks of pair 'x',
            # each rendered as a list of strings so it can be added to the prompt
            sub1 = _format_targets(target_data_chunks[chunk_pairs[x][0]])  # 1st element of the pair
            sub2 = _format_targets(target_data_chunks[chunk_pairs[x][1]])  # 2nd element of the pair


            # Define prompt
            prompt = f'''
                Data input & Context:
                - List of European Green Deal (EGD) targets A: {sub1}
                - List of European Green Deal (EGD) targets B: {sub2}
                - Report n°2: {report2['chapter1']} + {report2['chapter2']} + {report2['chapter3']} + {report2['chapter4']} + {report2['chapter5']}

                Task: 
                - Based on the content of Report n°2 and the content of EGD targets in list A and list B (i.e., their content and assessment), determine which targets in list A are likely to have a positive or negative impact on targets in list B, and reversely.
                - Determine which sub-themes in list A are likely to have a positive or negative impact on sub-themes in list B, and reversely.

                Answer format: provide your answer as a table in csv format please (separator: ";"), with the following columns:
                - source_subtheme (e.g., GHG Reduction)
                - source_subtheme_targets (e.g.,TA1.3,TA1.7,TA1.9,TA1.11,TA1.13,TA5.7) 
                - impact_subtheme (the name of the subtheme that is likely to be positively or negatively affected by the implementation and requirements of the subtheme in the 'source_subtheme' column)
                - impact_type (positive '+' or negative '-')
                - justification

                Specifications:
                - If some sub-themes do not have any connections at all (i.e., are isolated), still add them but add 'NA' to the impact_subtheme and impact_type columns.
                - One row per connection, if you deem that one sub-theme has an impact on multiple other sub-themes, add as many rows for a same subtheme as necessary.
                - Ensure that your analysis accounts for both directionalities.
                - It is critical that your analysis is based on the context of the report and not just on the semantics of the target contents.
                - Don't forget negative connections as well.
                - This is mandatory: for each sub-theme connection, write one to two sentences justifying your choice. 
                - Output only the CSV table. Do not include additional commentary.
            '''

            # Print pre-generation metadata (to double check amount of tokens in prompt, JRC llama3.3 should have a max of 120k)
            prompt_metadata = f'''Chunks pair: {x} \nChunks: {chunk_pairs[x]} \nPrompt length: {len(prompt)} \nPrompt tokens (o200k_base encoding): {num_tokens_from_string(prompt, "o200k_base")} \nPrompt tokens (cl100k_base encoding): {num_tokens_from_string(prompt, "cl100k_base")} \n'''
            print(prompt_metadata)

            # Generate answer
            answer = get_chat_response(prompt=prompt,
                                      seed=seed,
                                      model=model,
                                      temperature=temperature)

            # Print post-generation metadata
            print('Answer generated.')
            print(f'Prompt tokens: {answer["prompt_tokens"]} \nCompletion tokens: {answer["completion_tokens"]}')


            # Add the metadata of the generated answer to the dataframe
            answers_metadata.loc[x] = (x,
                                       chunk_pairs[x],
                                       model,
                                       seed,
                                       temperature,
                                       answer["system_fingerprint"],
                                       answer["prompt_tokens"],
                                       answer["completion_tokens"])

            # Save the generated answer as a CSV file. The encoding is set explicitly so
            # that non-ASCII characters in the answer are written the same way on every platform.
            output_name = f'{date}_network_pair{x}.csv'

            with open(os.path.join(output_directory, output_name), 'w', encoding='utf-8') as f:
                f.write(answer["response_content"])

            # Persist the metadata gathered so far, so an interrupted run does not lose it
            answers_metadata.to_csv(path_or_buf=metadata_path,
                                    sep=';',
                                    index=False)

            # Everything went through: stop retrying this pair
            success = True

        except Exception as e:

            retry_count += 1  # Increment the retry counter if an error occurs
            error_type = type(e).__name__  # Get the type of error that occurred
            error_message = str(e) # Get the error message
            print(f"An error occurred ({error_type}): {error_message}. Retrying ({retry_count}/{max_retries})")  # Print an error message with the type and message

        # 1-minute pause after every attempt to avoid RateLimitErrors. It also applies after a
        # failed attempt: retrying immediately would hit the same rate limit again.
        print("-- 1 min pause \n")
        time.sleep(60)


    # Print a message if the operation failed after the maximum number of retries
    if not success:
        print(f"Failed to generate answer for pair{x} ({chunk_pairs[x]}) after {max_retries} retries.")

# Save the metadata dataframe as a CSV file (also written when no answer succeeded,
# in which case the file only contains the header row)
answers_metadata.to_csv(path_or_buf=metadata_path,
                         sep=';',
                         index=False)
