In [None]:
# Import libraries
import sys
import os
import time
import glob

import pandas as pd
import numpy as np



In [None]:
## DEFINE DIRECTORIES

# Project root: absolute path of the LLM folder inside the local repository clone.
# NOTE(review): this path is machine-specific; adjust it when running on another computer.
root_dir = '/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/'

# Folder holding the target CSV files. It is derived from root_dir so that the base path
# is declared in exactly one place (resolves to '<root_dir>Data').
target_data_dir = os.path.join(root_dir, 'Data')

# Put the project root first on the module search path so that the project-local
# `Code` and `Data` packages imported further down can be resolved.
sys.path.insert(0, root_dir)

In [None]:
## IMPORT FUNCTIONS & DATA

# --- Project-local imports (resolved through the root_dir entry added to sys.path) ---

# Chat request helper and token counter
from Code.API import get_chat_response, num_tokens_from_string

# Experts of report 1 (indexed by TA code when the prompts are built)
from Data.REPORT_1.experts_report1 import experts_report1_list

# Thematic areas, plus the sub-themes determined manually in Obsidian Canvases
# (building up on the sub-themes used in report 1); both are indexed by TA code in the prompts
from Data.subthemes import thematic_areas, subthemes_perTA_list

# --- Target data (columns used later: target_code, target_content) ---

# Extensive target list from target_NACE_classification.xlsx
target_data_250 = pd.read_csv(
    f'{target_data_dir}/targets_data_250.csv',
    sep=";",
)

# Target list as in report 1 + assessments
target_data_150 = pd.read_csv(
    f'{target_data_dir}/targets_data_150.csv',
    sep=";",
)

# TA codes 'TA1' ... 'TA7' (iterated over by the generation loop)
TA = [f'TA{number}' for number in range(1, 8)]


In [None]:
## DEFINE LLM PARAMETERS

target_data = target_data_250      # target table used to build the prompts
seed = None                        # forwarded unchanged to get_chat_response
temperature = 0.1                  # forwarded unchanged to get_chat_response
model = "llama-3.3-70b-instruct"   # model identifier forwarded to get_chat_response
date = '0408'                      # run label: names the output folder and prefixes every output file

# Output folder of this run, derived from root_dir so the base path is not duplicated.
# The final '' keeps a trailing path separator, which later cells rely on when they
# concatenate a file name directly onto output_dir.
output_dir = os.path.join(root_dir, 'Data', 'Outputs', date, '')

In [None]:
# GENERATE ANSWERS
# For every thematic area (TA): build a prompt from its targets, sub-themes and experts,
# send it to the model, save the raw answer as '<date>_experts_to_subthemes_<TA>.csv' in
# output_dir, and finally write one metadata CSV describing all requests of the run.


# (Loop tools)
os.makedirs(output_dir, exist_ok=True) # Create the 'date' folder if it doesn't exist
answers_content = []  # kept from the original cell; it is not filled here

# Columns of the metadata table, gathered to later assess the consistency of the responses
metadata_columns = ["TA",
                    "seed",
                    "temperature",
                    "system_fingerprint",
                    "prompt_tokens",
                    "completion_tokens"]
metadata_records = []  # one dict per request; turned into a DataFrame once, after the loop

# Loop (to re-run only some thematic areas, iterate over a slice instead, e.g. TA[5:7])
for ta_code in TA:

    # Split the data per TA: sending every target in one go is too big for the model to process.
    # Plain substring match (regex=False); rows without a target_code are skipped (na=False)
    # instead of breaking the boolean mask.
    # NOTE(review): a substring match is fine for TA1-TA7 but 'TA1' would also match e.g. 'TA10'.
    target_subset = target_data[target_data['target_code'].str.contains(ta_code, regex=False, na=False)]

    # "<target_code>: <target_content>" strings, so the targets can be added to the prompt as text
    target_list = [f"{code}: {content}"
                   for code, content in zip(target_subset['target_code'], target_subset['target_content'])]


    # Define request
    prompt = f'''
            Data input & Context:
            - Thematic area (TA): {thematic_areas[ta_code]}
            - List of European Green Deal (EGD) targets: {target_list}.
            - List of sub-themes: {subthemes_perTA_list[ta_code]}.
            - List of experts: {experts_report1_list[ta_code]}.

            Task: 
            - For each subtheme, assign the best fitting experts based on their specialisation and the content of the targets of each subtheme.

            Answer format: provide your answer as a table in csv format (separator: ";"), with the following columns:
            - thematic_area
            - sub_theme 
            - experts (i.e., list of the most fitting expert names, maximum 3 experts per sub-theme)
            - justification

            Specifications:
            - For each subtheme, write 1-2 sentences to justify why one or more specific experts were assigned to a specific sub-theme.
            - Output only the CSV table. Do not include additional commentary.
            
            '''


    # Print pre-generation metadata (to double check amount of tokens in prompt, JRC llama3.3 should have a max of 120k)
    prompt_metadata = f'''{ta_code}:  \nPrompt length: {len(prompt)} \nPrompt tokens (o200k_base encoding): {num_tokens_from_string(prompt, "o200k_base")} \nPrompt tokens (cl100k_base encoding): {num_tokens_from_string(prompt, "cl100k_base")} \n'''
    print(prompt_metadata)

    # Generate answer
    # (temperature steers the randomness of the generation: higher values give more diverse
    #  answers, lower values more focused and deterministic ones)
    answer = get_chat_response(prompt=prompt,
                               seed=seed,
                               model=model,
                               temperature=temperature)

    # Print post-generation metadata
    print('Answer generated.\n')
    print(f'Prompt tokens: {answer["prompt_tokens"]} \nCompletion tokens: {answer["completion_tokens"]}')

    metadata_records.append({"TA": ta_code,
                             "seed": seed,
                             "temperature": temperature,
                             "system_fingerprint": answer["system_fingerprint"],
                             "prompt_tokens": answer["prompt_tokens"],
                             "completion_tokens": answer["completion_tokens"]})

    # Save response as csv file.
    # Explicit UTF-8 so that non-ASCII characters in the answer are written the same way on
    # every platform and match the default encoding pandas uses when reading the file back.
    output_name = f'{date}_experts_to_subthemes_{ta_code}.csv'

    with open(os.path.join(output_dir, output_name), 'w', encoding='utf-8') as f:
        f.write(answer["response_content"])


# Build the metadata table in one go (columns= fixes the column order, also for an empty run)
answers_metadata = pd.DataFrame(metadata_records, columns=metadata_columns)

# Save the run metadata as csv, in the same folder as the answers (output_dir)
metadata_name = f'{date}output_s{seed}_t{temperature}_metadata.csv'
answers_metadata.to_csv(path_or_buf=os.path.join(output_dir, metadata_name),
                        sep=';',
                        index=False)


In [10]:
# RESULTS AGGREGATION
# Read every per-TA answer file of the run and stack them into a single aggregated CSV.


# (If necessary, re-specify the directory path and file pattern)
date = date
output_dir = output_dir
file_pattern = f'{date}_experts_to_subthemes_*.csv' # pattern to match all files generated in the current session

# List of all CSV files matching the pattern.
# glob returns paths in arbitrary order, so sort them to keep the row order of the
# aggregated file reproducible between runs.
csv_files = sorted(glob.glob(os.path.join(output_dir, file_pattern)))

# Dataframes that were read successfully
dataframes = []

# Read each CSV file; a file that cannot be read is reported and left out (best effort)
for file in csv_files:
    try:
        # 'warn' still skips malformed rows (e.g. an answer line with extra ';'),
        # but reports them instead of dropping them silently
        df = pd.read_csv(file, on_bad_lines='warn', sep=';')
        dataframes.append(df)
    except Exception as e:
        print(f"Error reading file {file}: {e}")

# Concatenate all dataframes and write the result. The write happens only when something
# was read, so an empty run neither raises a NameError nor re-writes a stale combined_df
# left over from an earlier execution of this cell.
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    combined_df.to_csv(os.path.join(output_dir, f'{date}_network_aggregated.csv'), index=True, sep=';')
else:
    print("No dataframes to concatenate.")
