In [7]:
# Import libraries
import pandas as pd
import numpy as np
import openai
from openai import OpenAI
import os
 
from IPython.display import display, Markdown


# Define absolute python path: put the project root at the front of sys.path.
# Presumably this is what lets the project-local packages imported in later
# cells (`Code.API`, `Data.NACEdata`) resolve -- verify against the repo layout.
# NOTE(review): the path is machine-specific and must be adapted on other machines.
import sys
sys.path.insert(0, '/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/') 


In [8]:
from Code.API import get_chat_response, num_tokens_from_string


In [9]:
import pdfplumber


## DATA

# Load NACE data
from Data.NACEdata import NACElevel0, NACElevel1, NACElevel2, NACElevel3     


# Import report1 and report1_annex as PDF and convert them to plain text

def _pdf_to_text(pdf_path):
    """Return the text of every page of a PDF as one flat, single-line string.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The page texts joined together, stripped of leading/trailing
        whitespace, with every newline and tab replaced by a single space.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""`: depending on the pdfplumber version, extract_text() may give
        # None for a page without extractable text, which would break joining.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join pages with a newline (flattened to a space just below) so the last
    # word of one page is not fused with the first word of the next page.
    text = "\n".join(page_texts).strip()

    # Preprocess the text: flatten line breaks and tabs into spaces
    return text.replace("\n", " ").replace("\t", " ")


report1 = _pdf_to_text('/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/Data/report1.pdf')
report1_annex = _pdf_to_text('/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/Data/report1_annex.pdf')



In [11]:
# Import all target data (target_code + target_content)
targets_pd = pd.read_csv('/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/Data/XLSX_target_data_v1.2_LLM.csv', sep=";")

# Split targets list into smaller chunks so that each can be processed by the AI (which has a limit of 120k tokens per run)
chunks = 5  # number of slices the targets table is cut into

# Split the row positions rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame goes through the deprecated
# DataFrame.swapaxes and emits a FutureWarning on recent pandas versions.
# The resulting partition is the same: `chunks` consecutive, near-equal
# slices that keep the original index labels.
row_positions = np.array_split(np.arange(len(targets_pd)), chunks)
targets_chunks = [targets_pd.iloc[positions] for positions in row_positions]

# Only the size of the first chunk is reported; when the row count is not a
# multiple of `chunks`, the last chunks hold one row less.
print(f'Chunks: {len(targets_chunks)} \nTargets per chunk: {len(targets_chunks[0])}')

Chunks: 5 
Targets per chunk: 58


  return bound(*args, **kwds)


In [12]:
# Total number of target rows loaded (cell output)
targets_pd.shape[0]

287

In [8]:
# GENERATE ANSWERS
#
# For every chunk of targets: build the prompt, report its token count, query
# the model and write the raw response to its own file. Once all chunks are
# done, a metadata table describing the run is saved next to the responses.


# Select data to loop through


# TA = (['TA1','TA1','TA1',   # n=3xTA
#        'TA2','TA2','TA2',
#        'TA3','TA3','TA3',
#        'TA4','TA4','TA4',
#        'TA5','TA5','TA5',
#        'TA6','TA6','TA6',
#        'TA7','TA7','TA7',]) 
#TA = ['TA1','TA1','TA1'] # if want to test on only TA (smaller subset to go faster)
#TA = ['TA1','TA2','TA3','TA4','TA5','TA6','TA7'] # n= 1xTA
#TA = ['TA1'] # n=1xTA1


# Model parameters
# (`chunks` and `targets_chunks` come from the cell that loads and splits the targets)
seed = None 
temperature = 0.2
model = "llama-3.3-70b-instruct" #"gpt-4o"

date = '0218' # to indicate date in filenames
output_directory = f'/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/Data/Outputs/{date}/'
os.makedirs(output_directory, exist_ok=True)  # open() below does not create a missing folder


# (Loop tools)
loop_counter = 0
answers_content = []
metadata_columns = ["chunk",  # or "TA"
                    "replicate",
                    "model",
                    "seed",
                    "temperature",
                    "system_fingerprint", 
                    "prompt_tokens", 
                    "completion_tokens"]
metadata_rows = []  # one tuple per request; turned into a DataFrame once, after the loop

# Loop
for x, targets_subset in enumerate(targets_chunks):

    # Subset data to avoid overloading the GPT

    # Subset per TA
    #targets_subset = targets_pd[targets_pd['target_code'].str.contains(TA[x])]  # subset rows containing one of the characters in  TA[] (ie, select only a specific TA and its targets, because selecting everything in one go is too big for the AI to process)                                                                                                                                          # eg: TA[0] = 'TA1'
    #targets_list = [f"{row['target_code']}: {row['target_content']}" for index, row in targets_subset.iterrows()] # Concatenate target_code and target_content into a list so that it can be added to the prompt as text

    # Subset per chunk (targets_subset is the current element of targets_chunks)
    # Concatenate target_code and target_content into a list so that it can be added to the prompt as text
    targets_list = [f"{code}: {content}"
                    for code, content in zip(targets_subset['target_code'], targets_subset['target_content'])]


    # Define request
    prompt = f'''Hello, invent a short haiku based on this text: {targets_list}; '''


    # Print to double check amount of tokens in prompt (JRC llama have a max of 120k)
    print(f'''Chunk:{x+1}/{len(targets_chunks)} \nTokens with o200k_base encoding: {num_tokens_from_string(prompt, "o200k_base")}\nTokens with cl100k_base encoding: {num_tokens_from_string(prompt, "cl100k_base")}\n ''')

    # Generate answer
    answer = get_chat_response(prompt=prompt,
                               seed=seed,
                               model=model,
                               temperature=temperature  # The temperature parameter influences the randomness of the generated responses. A higher value, such as 0.8, makes the answers more diverse, while a lower value, like 0.2, makes them more focused and deterministic.
                               )

    # Keep every response in one list so that their similarity can be analysed later
    answers_content.append((f'chunk{x+1}', answer['response_content']))
    metadata_rows.append((f'{x+1}',  # chunk number (TA code in the per-TA mode)
                          '',        # replicate nbr (unused in the per-chunk mode)
                          model,
                          seed,
                          temperature,
                          answer["system_fingerprint"],
                          answer["prompt_tokens"],
                          answer["completion_tokens"]
                          ))


    # Save response as csv file
    #output_name = f'{date}output_{TA[x]}.{loop_counter+1}_s{seed}_t{temperature}.csv'    # -> to split by TA
    output_name = f'{date}output_chk{x+1}_s{seed}_t{temperature}.csv'     # -> to split by chunks

    # Explicit UTF-8 so that non-ASCII characters in the model output are
    # written the same way on every platform.
    with open(os.path.join(output_directory, output_name), 'w', encoding='utf-8') as f:
        f.write(answer["response_content"])

    # (Extra loop tools)
    # Cycles 0 -> 1 -> 2 -> 0 so that the per-TA mode can add the ".1,2,3" suffix to the triplicate file names
    loop_counter = (loop_counter + 1) % 3


# Gather a bit more data on the responses, to ultimately try to assess consistency
answers_metadata = pd.DataFrame(metadata_rows, columns=metadata_columns)

# Save triplicats metadata as csv
#answers_metadata.to_csv(path_or_buf= f'{output_directory}{date}output_s{seed}_t{temperature}_metadata.csv', sep=';', index=False)  # split by TA method
metadata_name = f'{date}output_chk{len(targets_chunks)}_s{seed}_t{temperature}_metadata.csv'  # split by chunks method
answers_metadata.to_csv(path_or_buf=os.path.join(output_directory, metadata_name), sep=';', index=False)



Chunk:1/5 
Tokens with o200k_base encoding: 105581
Tokens with cl100k_base encoding: 106285
 


KeyboardInterrupt: 

In [None]:

# prompt = f'''Hello,

#             Data input: get acquainted with the following data:
#             - NACE classification categories:  {NACElevel1} + {NACElevel2} +{NACElevel3}.
#             - List of targets: {target_list}.
            
#             Task: 
#             - For each target, analyse its content description and assign to each target a NACE category for each level (0,1,2,3). 

#             Answer format: provide your answer as a table in csv format please (separator: ";"), with the following columns:
#             - Target code (e.g., TA1.9)
#             - Target content (e.g., The contribution of the sectors covered by the EU ETS with respect to the EU Climate ambition should be of -62 % compared to 2005 (increasing the linear emissions reduction factor from 2.2 % per year up to 4.4 %)) 
#             - NACE_level1 (e.g., D - Electricity, Gas, Steam and Air Conditioning Supply)
#             - NACE_level1_extra1 (e.g.if other categories overlap)
#             - NACE_level1_extra2 (e.g.if other categories overlap)
#             - NACE_level2 (e.g., D35 - Electricity, gas, steam and air conditioning supply)
#             - NACE_level2_extra1 (e.g.if other categories overlap)
#             - NACE_level2_extra2 (e.g.if other categories overlap)
#             - NACE_level_3 (e.g., D35.1 - Electric power generation, transmission and distribution)
#             - NACE_level_3_extra1 (e.g. if other categories overlap)
#             - NACE_level_3_extra2 (e.g. if other categories overlap)
#             - Justification
#             - Confidence (e.g. confidence value from 0 to 10 about the assignation choices that are made)

#             Specifications:
#             - If there is some overlap, add the multiple possible categories (up to a maximum of 3 per NACE level).
#             - Include the name of the NACE categories.
#             - Don't forget to provide the title of the target. If there is no target content, do not invent new content, just state it as empty.
#             - For each target, write one or max 2 sentences justifying your choice.
#             - Output only the csv table and no additional commentary text.

#             Thank you.'''

# # generate answer
# answer= run_API_request(prompt)


In [None]:
#print(answer)

# Print answer in Markdown format.
# The other cells read the model text from answer['response_content'], whereas
# Markdown() needs the text itself; a plain string answer is passed through as is.
answer_text = answer if isinstance(answer, str) else answer['response_content']
display(Markdown(answer_text))


In [None]:
import pandas as pd


# Convert a tab-separated file into a semicolon-separated CSV stored in the
# same folder under the same base name (presumably a Eurostat extract, going
# by the folder and file names).
file_name = 'estat_nama_10_a64_e'
directory = '/Users/giorgiobolchi2/Documents/JRC/LLM/Data/ESTAT/'

tsv_path = directory + file_name + '.tsv'
csv_path = directory + file_name + '.csv'

tsv = pd.read_csv(tsv_path, sep='\t')
tsv.to_csv(csv_path, sep=';', index=True)  # the row index is written as the first column

In [3]:
# QUICK ANSWER
# Send one prompt to the model and print the text of its reply.

# Define absolute python path (project root goes first on sys.path)
import sys
sys.path.insert(0, '/Users/giorgiobolchi2/Documents/GitHub/jrc-egd/LLM/') 

from Code.API import get_chat_response


# Request settings
prompt = "what are your technical limitations as an API"
model = "llama-3.3-70b-instruct" 
temperature = 0.2
seed = None 

# Query the model
answer = get_chat_response(prompt=prompt, seed=seed, model=model, temperature=temperature)

# Show only the text of the reply
print(answer['response_content'])

I'm a large language model, I have several technical limitations as an API. Here are some of them:

1. **Request Size Limit**: The maximum size of a request is 2048 characters. This means that if you try to send a request that exceeds this limit, it will be truncated, and I may not be able to process it correctly.
2. **Response Size Limit**: The maximum size of a response is 2048 characters. If my response exceeds this limit, it will be truncated, and you may not receive the full response.
3. **Rate Limiting**: To prevent abuse and ensure fair usage, I have rate limits in place. These limits vary depending on the type of API key you have (e.g., free, paid, or enterprise). If you exceed these limits, you may receive an error response or be temporarily blocked.
4. **Concurrency Limit**: I can handle a limited number of concurrent requests. If you send too many requests at the same time, some of them may be delayed or rejected.
5. **Timeouts**: I have timeouts in place to prevent requests