In [None]:
# Import libraries
import sys
import os
import time
import glob


import pandas as pd
import numpy as np


import pdfplumber
import docx2txt


# Define absolute python path
# NOTE(review): this is a machine-specific absolute path. It is prepended to sys.path,
# presumably so that the project-local `Code` package imported below resolves - confirm
# and consider deriving it from a configurable project root.
sys.path.insert(0, '/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/') 


## FUNCTIONS

# Load API and import request function
# - get_chat_response: called later with prompt/seed/model/temperature; its result is
#   indexed with "response_content", "system_fingerprint", "prompt_tokens" and
#   "completion_tokens".
# - num_tokens_from_string: called later as num_tokens_from_string(prompt, <encoding name>)
#   to report the prompt size before a request is sent.
from Code.API import get_chat_response, num_tokens_from_string


In [None]:
# DATA
# Loads the target tables and both reports into the module-level names used by the
# following cells: target_data_250, target_data_150, report1 and report2.


def _flatten_whitespace(text):
    """Strip `text`, then replace every newline and tab with a space."""
    return text.strip().replace("\n", " ").replace("\t", " ")


target_data_directory = '/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/Data'
report1_directory = '/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/Data/REPORT_1/report1_trimmed.pdf'
report2_directory = '/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/Data/REPORT_2/access_20250310'

# Import all target data (target_code + target_content)
target_data_250 = pd.read_csv(f'{target_data_directory}/targets_data_250.csv', sep=";")  #extensive target list from target_NACE_classification.xlsx
target_data_150 = pd.read_csv(f'{target_data_directory}/targets_data_150.csv', sep=";")  #target list as in report 1


# Import & parse report1
with pdfplumber.open(report1_directory) as pdf:
    # `or ""` guards against pages for which extract_text() yields None
    # (e.g. pages without a text layer, depending on the pdfplumber version),
    # which would otherwise make the concatenation raise a TypeError.
    page_texts = [page.extract_text() or "" for page in pdf.pages]

# Join pages with a newline (turned into a space by the clean-up) so that the last
# word of one page is not fused with the first word of the next page.
report1 = _flatten_whitespace("\n".join(page_texts))


# Import & parse report2

# File name of each chapter inside report2_directory
# (note: the chapter 6 file has no '_trimmed' suffix)
report2_files = {
    'chapter1': 'NEW_Chapter1_CLEAN - Introduction & setting the scene_LM_trimmed.docx',
    'chapter2': 'NEW_Chapter2 (ex chp3) - Environmental impacts_ZOTERO_trimmed.docx',
    'chapter3': 'NEW_Chapter3 (ex chp4) with BIBLIO - Challenges and enablers for EGD objectives_trimmed.docx',
    'chapter4': 'NEW_Chapter4 (ex chp5) - Enabling the green transition_trimmed.docx',
    'chapter5': 'NEW_Chapter5 - Fair and just transition_trimmed.docx',
    'chapter6': 'NEW_Chapter6 - Financing the green transition.docx',
}

# create a dictionary to access different chapters of report2 (text already cleaned up)
report2 = {
    chapter: _flatten_whitespace(docx2txt.process(f'{report2_directory}/{filename}'))
    for chapter, filename in report2_files.items()
}



# Allowed sub-theme names per thematic area. The lists of the two thematic areas in a
# pair are injected into the prompt as the only sub-themes the model may use, so every
# entry must be its own, comma-terminated string: a missing comma makes Python silently
# concatenate two adjacent names into one bogus sub-theme.
subthemes_list = {
    'TA1': [
        "Climate Resilience",
        "GHG Reduction",
        "GHG Reduction - Buildings",
        "GHG Reduction - Transports",
        "GHG Removal",
    ],
    'TA2': [
        "Renewable Energy",
        "Renewable Energy - Heating & Cooling",
        "Renewable Energy - Hydrogen Production",
        "Renewable Energy - Ocean/Offshore",
        "Renewable Energy - Solar",
        "Energy Efficiency",
        "Energy Efficiency - Buildings",
        "Energy Infrastructure",
        "Social Security - Energy",
    ],
    'TA3': [
        "Waste Reduction",
        "Waste Reduction - Municipal Waste",
        "Waste Reduction - Food Waste",
        "Waste Reduction - Plastic & Packaging",
        "Circularity/Recycling",
        "Circularity/Recycling - Municipal Waste",
        "Circularity/Recycling - Textile Waste",
        "Circularity/Recycling - Plastic & Packaging",
        "Circularity/Recycling - Plastic & Packaging - Bio-based plastics",
        "Circularity/Recycling - Vehicle Circularity",
        "Circularity/Recycling - Critical Raw Materials - Batteries Recycling",
        "Critical Raw Materials - Extraction & Import",
        "Net-Zero Technology - Manufacturing",
    ],
    'TA4': [
        "Rail",
        "Net-Zero Technology - Road Vehicles",
        "Net-Zero Technology - Maritime Transport",
        "Net-Zero Technology - Aviation",
        "Biofuels",
        "Other Low-Carbon Fuels",
        "Hydrogen Distribution",
        "Urban Mobility",
        "Transport Logistics",
    ],
    'TA5': [
        "Food quality",
        "Food quality - Animal Welfare",
        "Food quality - Healthy Food",
        "Food affordability",
        "Pesticides Reduction",
        "Competitive Agriculture",
        "Social Security - Workers Protection",
    ],
    'TA6': [
        "Terrestrial Ecosystems Restoration",
        "Terrestrial Ecosystems Restoration - Rivers",
        "Terrestrial Ecosystems Restoration - Agricultural Ecosystems",
        "Terrestrial Ecosystems Restoration - Forests",
        "Marine Ecosystem Restoration",
        "Biodiversity Protection & Conservation",
        "Biodiversity Protection & Conservation - Fisheries",
        "Biodiversity Protection & Conservation - Monitoring",
        "Biodiversity Protection & Conservation - Urban Nature",
    ],
    'TA7': [
        "Forest Bioeconomy",
        "Improve Air Quality",
        "Improve Water Quality",
        "Improve Soils Health",
        "Noise Reduction",
        "Social Security - Sanitation",
    ],
}


# Meaning of each impact weight, interpolated as-is into the prompt so the model can
# grade the connections it reports.
# Source cited by the author: "Nilssen et al. (2016)" - presumably Nilsson, Griggs &
# Visbeck (2016); verify the reference.
# Built row by row; the resulting object keeps the JSON-like layout
# {"weights": [{"weight", "name", "explanation", "example"}, ...]}.
_weight_fields = ("weight", "name", "explanation", "example")

_weight_rows = [
    (
        "+3",
        "Indivisible",
        "Inextricably linked to the achievement of another target.",
        "Ending all forms of discrimination against women and girls is indivisible from ensuring women’s full and effective participation and equal opportunities for leadership.",
    ),
    (
        "+2",
        "Reinforcing",
        "Aids the achievement of another target.",
        "Providing access to electricity reinforces water‐pumping and irrigation systems. Strengthening the capacity to adapt to climate‐related hazards reduces losses caused by disasters.",
    ),
    (
        "+1",
        "Enabling",
        "Creates conditions that further another target.",
        "Providing electricity access in rural homes enables education, because it makes it possible to do homework at night with electric lighting.",
    ),
    (
        "-1",
        "Constraining",
        "Limits options on another target.",
        "Improved water efficiency can constrain agricultural irrigation. Reducing climate change can constrain the options for energy access.",
    ),
    (
        "-2",
        "Counteracting",
        "Clashes with another target.",
        "Boosting consumption for growth can counteract waste reduction and climate mitigation.",
    ),
    (
        "-3",
        "Cancelling",
        "Makes it impossible to reach another goal.",
        "Fully ensuring public transparency and democratic accountability cannot be combined with national‐security goals. Full protection of natural reserves excludes public access for recreation.",
    ),
]

impact_weight_meanings = {
    "weights": [dict(zip(_weight_fields, row)) for row in _weight_rows]
}

In [None]:
# Reshape and store all target data into one dictionary
#
# Resulting layout:
#   target_data_dict[thematic_area_code][sub_theme] -> DataFrame with the columns
#   target_code | target_content | target_assessment


# Indicate the main dataset to work with
data = target_data_250

# Columns kept for every sub-theme table
target_columns = ['target_code', 'target_content', 'target_assessment']

# 1) One DataFrame per sub-theme, holding every row of `data` with that sub-theme
subthemes_dict = {
    theme: data.loc[data['sub_theme'] == theme, target_columns]
    for theme in data['sub_theme'].unique()
}

# 2) Group the sub-theme DataFrames under the thematic area(s) in which they occur
target_data_dict = {}
for ta in data['thematic_area_code'].unique():
    # Sub-themes that appear on at least one row of the current thematic area
    themes_in_ta = data.loc[data['thematic_area_code'] == ta, 'sub_theme'].unique()
    target_data_dict[ta] = {theme: subthemes_dict[theme] for theme in themes_in_ta}


#print(target_data_dict['TA3']['Circularity/Recycling - Vehicle Circularity']) # example of how to access a specific sub-theme in the dictionary


In [10]:
# Generate all potential pairs of thematic areas (2x21= 42 pairs)
ta_list = ['TA1', 'TA2', 'TA3', 'TA4', 'TA5', 'TA6', 'TA7'] # list of thematic areas

# Dictionary of all ordered [source, target] pairs, keyed by a running index
# (0, 1, 2, ...). A thematic area is never paired with itself.
ta_pairs = dict(enumerate(
    [source_ta, target_ta]
    for source_ta in ta_list
    for target_ta in ta_list
    if source_ta != target_ta
))

pair_id = len(ta_pairs)  # number of generated pairs (= next free index)

In [11]:
# GENERATE ANSWERS
# For every thematic-area pair in `ta_pairs`: build the prompt, request an answer from
# the model, write the raw answer to '<date>_network_pair<x>.csv' and record request
# metadata. A failed request is retried up to `max_retries` times; a pair that still
# fails is reported and skipped so that the remaining pairs are processed.


# Set parameters
seed = None
temperature = 0.1
model = "llama-3.3-70b-instruct" #"llama-3.3-70b-instruct" "gpt-4o" "nous-hermes-2-mixtral-8x7b-dpo"
date = '0328'
output_directory = f'/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/Data/Outputs/{date}/'
os.makedirs(output_directory, exist_ok=True) # Create the 'date' folder if it doesn't exist

max_retries = 5     # maximum number of attempts per pair; adjust to set the desired number of retries
pause_seconds = 60  # wait after every successful request and before every retry (rate limiting)


# (loop tools and formatting)
# Empty dataframe gathering a bit more data on the responses, to ultimately try to
# assess consistency. One row per successfully answered pair.
answers_metadata = pd.DataFrame(columns=["ta_pairs_nbr",
                                         "ta_pairs_pairs",
                                         "model",
                                         "seed",
                                         "temperature",
                                         "system_fingerprint",
                                         "prompt_tokens",
                                         "completion_tokens"])


# Loop


for x in range(len(ta_pairs)):
#for x in range(8, 42):
#for x in list([12,13,14]):

    success = False  # Flag tracking whether the request for this pair succeeded
    retry_count = 0  # Number of failed attempts so far for this pair

    while not success and retry_count < max_retries:
        try:
            # Subset data to avoid overloading the model
            sub1 = [f"{target_data_dict[ta_pairs[x][0]]}"] # data stored in target_data_dict for the first thematic area of the pair (e.g., target_data_dict[ta_pairs[0][0]] <=> target_data_dict['TA1'])
            sub2 = [f"{target_data_dict[ta_pairs[x][1]]}"] # same thing here, but for the second element of ta_pairs[x]


            # Define prompt
            # NOTE(review): the prompt text is kept exactly as it was (including the
            # "positiviely" typo) so that model inputs stay comparable across runs;
            # report2['chapter6'] is loaded earlier but not part of the prompt.
            prompt = f'''
                Data input & Context:
                - List A: first list of European Green Deal (EGD) targets grouped by sub-themes:{sub1}.
                - List B: second list of EGD targets grouped by sub-themes: {sub2}.
                - Report n°2: [ {report2['chapter1']} + {report2['chapter2']} + {report2['chapter3']} + {report2['chapter4']} + {report2['chapter5']} ].

                Task: 
                - Determine how and how much sub-themes in List A may positiviely or negatively influence sub-themes in List B (i.e., determine potential synergies and/or trade-offs).
                - Take into account the context and information of Report n°2 as well as the information available about the targets in both lists.

                Answer format: provide your answer as a table in csv format please (separator: ";"), with the following columns:
                - source_subtheme (e.g., GHG Reduction).
                - source_subtheme_targets (e.g.,TA1.3,TA1.7,TA1.9,TA1.11,TA1.13,TA5.7) .
                - impact_subtheme (the name of the subtheme that is likely to be positively or negatively affected by the implementation and requirements of the sub-theme in the 'source_subtheme' column).
                - impact_type (positive '+' or negative '-').
                - impact_weight (-3,-2,-1,1,2,3).
                - justification.

                Specifications:
                - The impacts can have different weights, which have the following meanings: {impact_weight_meanings}
                - Only the following sub-themes can be added to the table: {subthemes_list[ta_pairs[x][0]]} and {subthemes_list[ta_pairs[x][1]]}.
                - This is crucial: do not invent new sub-themes.
                - Connections can only be made from sub-themes in List A to sub-themes in List B, not the contrary.
                - If some sub-themes do not have any connections at all (i.e., are isolated), do not add any row.
                - One row per connection, if you deem that one sub-theme has an impact on multiple other sub-themes, add as many rows for a same sub-theme as necessary.
                - It is critical that your analysis is based on the context of the report and not just on the semantics of the target contents.
                - This is mandatory: for each sub-theme connection, write 1-2 concise sentences justifying your choice. 
                - Output only the CSV table. Do not include additional commentary.
            '''

            # Print pre-generation metadata (to double check amount of tokens in prompt, JRC llama3.3 should have a max of 120k)
            prompt_metadata = f'''TA_pair: {x} - {ta_pairs[x]} \nPrompt length: {len(prompt)} \nPrompt tokens (o200k_base encoding): {num_tokens_from_string(prompt, "o200k_base")} \nPrompt tokens (cl100k_base encoding): {num_tokens_from_string(prompt, "cl100k_base")} \n'''
            print(prompt_metadata)

            # Generate answer
            answer = get_chat_response(prompt=prompt,
                                      seed=seed,
                                      model=model,
                                      temperature=temperature)

            # Print post-generation metadata
            print(f'Answer generated.')
            print(f'Prompt tokens: {answer["prompt_tokens"]} \nCompletion tokens: {answer["completion_tokens"]}')


            # Add the metadata of the generated answer to a dataframe
            answers_metadata.loc[x] = (x,
                                       ta_pairs[x],
                                       model,
                                       seed,
                                       temperature,
                                       answer["system_fingerprint"],
                                       answer["prompt_tokens"],
                                       answer["completion_tokens"])

            # Save the generated answer as a CSV file.
            # Explicit UTF-8 so that non-ASCII characters in the answer are written the
            # same way on every platform (pandas reads CSV files as UTF-8 by default).
            output_name = f'{date}_network_pair{x}.csv'

            with open((os.path.join(output_directory, output_name)), 'w', encoding='utf-8') as f:
                f.write(answer["response_content"])

            # If success, set the success flag to True
            success = True

            # If success, add a 1-minute pause between answer requests to avoid RateLimitErrors
            print(f"-- 1 min pause \n")
            time.sleep(pause_seconds)

        except Exception as e:

            retry_count += 1  # Increment the retry counter if an error occurs
            error_type = type(e).__name__  # Get the type of error that occurred
            error_message = str(e) # Get the error message
            print(f"An error occurred ({error_type}): {error_message}. Retrying ({retry_count}/{max_retries})")  # Print an error message with the type and message

            # Wait before the next attempt: retrying immediately is pointless when the
            # failure is a rate limit. No wait once the last attempt has been used up.
            if retry_count < max_retries:
                time.sleep(pause_seconds)


    # Print a message if the operation failed after the maximum number of retries
    if not success:
        print(f"Failed to generate answer for pair{x} ({ta_pairs[x]}) after {max_retries} retries.")

# Save the metadata dataframe as a CSV file
answers_metadata.to_csv(path_or_buf=os.path.join(output_directory, f'{date}_network_metadata.csv'),
                         sep=';',
                         index=False)


TA_pair: 0 - ['TA1', 'TA2'] 
Prompt length: 403387 
Prompt tokens (o200k_base encoding): 76669 
Prompt tokens (cl100k_base encoding): 77230 

Answer generated.
Prompt tokens: 77223 
Completion tokens: 534
-- 1 min pause 

TA_pair: 1 - ['TA1', 'TA3'] 
Prompt length: 405158 
Prompt tokens (o200k_base encoding): 77024 
Prompt tokens (cl100k_base encoding): 77571 

Answer generated.
Prompt tokens: 77564 
Completion tokens: 773
-- 1 min pause 

TA_pair: 2 - ['TA1', 'TA4'] 
Prompt length: 409722 
Prompt tokens (o200k_base encoding): 78122 
Prompt tokens (cl100k_base encoding): 78646 

Answer generated.
Prompt tokens: 78639 
Completion tokens: 568
-- 1 min pause 

TA_pair: 3 - ['TA1', 'TA5'] 
Prompt length: 412767 
Prompt tokens (o200k_base encoding): 78744 
Prompt tokens (cl100k_base encoding): 79277 

Answer generated.
Prompt tokens: 79270 
Completion tokens: 411
-- 1 min pause 

TA_pair: 4 - ['TA1', 'TA6'] 
Prompt length: 406199 
Prompt tokens (o200k_base encoding): 77256 
Prompt tokens (c

In [12]:
# Aggregate results chunks into a single file
# Reads every '<date>_network_pair*.csv' file of the output folder and writes their
# concatenation to '<date>_network_aggregated.csv' in the same folder.
# NOTE: relies on `date` being defined by the answer-generation cell.


# Specify the directory path and file pattern
output_directory = f'/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/Data/Outputs/{date}/'
file_pattern = f'{date}_network_pair*.csv'

# Get a list of all CSV files matching the pattern.
# Sorted because glob returns files in arbitrary order; this makes the row order of
# the aggregated file reproducible between runs.
csv_files = sorted(glob.glob(os.path.join(output_directory, file_pattern)))

# Initialize an empty list to store the dataframes
dataframes = []

# Iterate over each CSV file, read it into a dataframe, and append to the list
for file in csv_files:
    try:
        # Malformed rows (e.g. an extra ';' inside a justification) are still skipped,
        # but 'warn' reports them instead of dropping them silently.
        df = pd.read_csv(file, on_bad_lines='warn', sep=';')
        dataframes.append(df)
    except Exception as e:
        print(f"Error reading file {file}: {e}")

# Concatenate all dataframes into a single dataframe and write it to a new CSV file.
# The write happens only when something was read: otherwise `combined_df` would be
# undefined (NameError) or left over from a previous run of this cell.
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    combined_df.to_csv(os.path.join(output_directory, f'{date}_network_aggregated.csv'), index=True, sep=';')
else:
    print("No dataframes to concatenate.")
