In [1]:

import asyncio
import os
from openai import AzureOpenAI
from dotenv import load_dotenv
from structure import Structure
import json
from config import *
from tqdm import tqdm
from prompts import *
from pydantic import BaseModel
from transformers import GPT2TokenizerFast
from copy import deepcopy
from prompts import *

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Populate the process environment (python-dotenv) so the AZURE_OPENAI_*
# variables looked up below via os.getenv can be supplied from a .env file.
load_dotenv()

# Azure OpenAI client shared by the request cells further down.
client = AzureOpenAI(
    api_version="2024-10-21",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

# Tokenizer loaded from the 'Xenova/gpt-4o' checkpoint; later cells use it to
# turn JSON strings into token ids and to decode ids back into text.
tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/gpt-4o')

In [3]:

# Response schema for the translation smoke test: it is passed as
# `response_format` to the completion request so the reply is parsed into
# these four string fields.
# NOTE(review): notes are kept as `#` comments on purpose; pydantic may copy a
# class docstring into the generated JSON schema, which would change the
# request payload. Confirm before converting to a docstring.
class TestStructure(BaseModel):
    original_language: str    # e.g. "English" in the recorded output
    translated_language: str  # e.g. "Danish" in the recorded output
    original_text: str        # the user text echoed back by the model
    translated_text: str      # the model's translation of original_text


In [245]:
# Load the UTF-8 text file stored under ../data/pdf into `pdf`.
# The context manager closes the file handle as soon as the text is read
# instead of leaving it open until the object is garbage collected.
with open("../data/pdf/BAU-EPD_Knauf-2025-21-ecoinvent-Fireboard_12,5.txt", "r", encoding="utf-8") as pdf_file:
    pdf = pdf_file.read()

In [4]:
# Smoke test for structured output: ask the model to translate one sentence
# and let the SDK parse the reply against the TestStructure schema.
completion = client.beta.chat.completions.parse(
    model=os.getenv("AZURE_OPENAI_MODEL_NAME"),  # model/deployment name comes from the environment
    messages=[
        {"role": "system", "content": "Translate to danish"},
        {"role": "user", "content": "My name is Emil"}
    ],
    response_format=TestStructure,  # reply is parsed against this pydantic model
    temperature=0.0,
    logprobs=True  # ask for per-token log probabilities; later cells read completion.choices[0].logprobs
)

In [247]:
# Persist the token log-probabilities and the parsed message content of
# `completion` as pretty-printed JSON.
# Both payloads are built before any file is opened, so a failure while
# converting/parsing cannot leave a truncated output file behind.
logprobs_payload = completion.choices[0].logprobs.to_dict()
message_payload = json.loads(completion.choices[0].message.content)

# `with` guarantees each file is flushed and closed once written; the original
# bare open(...) calls left that to garbage collection.
with open("../data/logprobs/BAU-EPD_Knauf-2025-21-ecoinvent-Fireboard_12,5.json", "w") as logprobs_file:
    json.dump(logprobs_payload, logprobs_file, indent=4)

with open("../data/test_output/BAU-EPD_Knauf-2025-21-ecoinvent-Fireboard_12,5.json", "w") as output_file:
    json.dump(message_payload, output_file, indent=4)

In [5]:
# Show the raw JSON text returned by the model.
content = completion.choices[0].message.content
print(content)

# A stray bare identifier (`howar`) used to sit here; it raised NameError every
# time the cell ran and has been removed.

# Disabled experiment, kept for reference: re-tokenize the compact JSON and
# print it token by token.
# dud = json.loads(content)
# tokens = tokenizer(json.dumps(dud, separators=(",", ":")))
# for token in tokens['input_ids']:
#     print(tokenizer.decode([token]))


{"original_language":"English","translated_language":"Danish","original_text":"My name is Emil","translated_text":"Mit navn er Emil"}


In [12]:
# List every generated token next to the log probability the API reported for it.
token_logprobs = completion.choices[0].logprobs.content
for entry in token_logprobs:
    print(entry.token, "\t\t", entry.logprob)

{" 		 0.0
original 		 0.0
_language 		 -7.89631e-07
":" 		 0.0
English 		 -0.000181849
"," 		 0.0
translated 		 0.0
_language 		 0.0
":" 		 0.0
D 		 -1.147242e-06
anish 		 0.0
"," 		 0.0
original 		 0.0
_text 		 0.0
":" 		 0.0
My 		 0.0
 name 		 0.0
 is 		 0.0
 Emil 		 0.0
"," 		 -0.0019287518
translated 		 0.0
_text 		 0.0
":" 		 0.0
Mit 		 -0.004632688
 navn 		 -2.577686e-06
 er 		 -4.3202e-07
 Emil 		 0.0
"} 		 -1.0206721e-05


In [189]:
# Print the complete logprobs object as JSON text (one entry per token with
# its bytes, logprob and top_logprobs, as seen in the recorded output).
print(completion.choices[0].logprobs.to_json())

{
  "content": [
    {
      "token": "{\"",
      "bytes": [
        123,
        34
      ],
      "logprob": 0.0,
      "top_logprobs": []
    },
    {
      "token": "original",
      "bytes": [
        111,
        114,
        105,
        103,
        105,
        110,
        97,
        108
      ],
      "logprob": 0.0,
      "top_logprobs": []
    },
    {
      "token": "_language",
      "bytes": [
        95,
        108,
        97,
        110,
        103,
        117,
        97,
        103,
        101
      ],
      "logprob": -6.704273e-7,
      "top_logprobs": []
    },
    {
      "token": "\":\"",
      "bytes": [
        34,
        58,
        34
      ],
      "logprob": 0.0,
      "top_logprobs": []
    },
    {
      "token": "English",
      "bytes": [
        69,
        110,
        103,
        108,
        105,
        115,
        104
      ],
      "logprob": -0.00016825978,
      "top_logprobs": []
    },
    {
      "token": "\",\"",
      "bytes"

In [94]:
# Load the reference ("target") JSON document that the following cells slice
# into marker-terminated prefixes.
# The file is read as UTF-8 (matching the other data read in this notebook)
# inside a context manager, so the handle is closed right after parsing.
with open("../data/nice_epds/BAU-EPD-Mischek-2025-4-ecoinvent-Massivwand-5kg.json", "r", encoding="utf-8") as target_file:
    target = json.load(target_file)

In [179]:
# Keys of `target` whose rows are turned into marker-terminated prefixes.
tables = [
    'environmental_impact',
    'additional_environmental_impact',
    'resource_use',
    'end_of_life_waste',
    'end_of_life_flow',
]

# Module/column labels; not referenced by the code in this cell.
columns = [
    "A1", "A2", "A3", "A1-A3", "A4", "A5",
    "B1", "B2", "B3", "B4", "B5", "B6", "B7",
    "C1", "C2", "C3", "C4", "D",
]

# Placeholder written where the next value would go. Its first token id is
# looked up so the placeholder can be located in tokenized JSON later.
# NOTE(review): assumes "###" encodes to a single token and that the tokenizer
# prepends no special token - confirm.
MARKER = "###"
MARKER_TOKEN = tokenizer(MARKER)['input_ids'][0]

filtered_output_start = {}
filtered_output_end = {}

# Only the first value of the first row of the first table is processed; the
# [:1] slices make that restriction explicit.
for table in tables[:1]:
    filtered_output_start[table] = []
    filtered_output_end[table] = []
    rows = target[table]

    for row in rows[:1]:
        parameter = row['parameter']
        values_so_far = []  # values of this row already emitted

        for value in row['values'][:1]:
            # "start" snapshot: the marker sits where `value` is about to appear.
            filtered_output_start[table].append(
                deepcopy({'parameter': parameter, 'values': values_so_far + [MARKER]})
            )
            values_so_far.append(value)
            # "end" snapshot: the marker sits directly after `value`.
            filtered_output_end[table].append(
                deepcopy({'parameter': parameter, 'values': values_so_far + [MARKER]})
            )

print(filtered_output_start)
print(filtered_output_end)

{'environmental_impact': [{'parameter': 'GWP-total', 'values': ['###']}]}
{'environmental_impact': [{'parameter': 'GWP-total', 'values': [{'value': '7,28E+01', 'module': 'A1-A3', 'scenario': None}, '###']}]}


In [191]:
# Serialise the "start" snapshot with json.dumps' default separators.
start_string = json.dumps(filtered_output_start)

dict

In [226]:

# Compact JSON (no spaces after ',' or ':') for both snapshots.
filtered_string_start, filtered_string_end = (
    json.dumps(snapshot, separators=(',', ':'))
    for snapshot in (filtered_output_start, filtered_output_end)
)

# Token ids of each compact JSON string.
start_tokens, end_tokens = (
    tokenizer(text)['input_ids']
    for text in (filtered_string_start, filtered_string_end)
)

# Position of the first marker token in each sequence; list.index raises
# ValueError when the marker token id does not occur.
start_index, end_index = start_tokens.index(MARKER_TOKEN), end_tokens.index(MARKER_TOKEN)

# Everything that precedes the marker in each sequence.
start_tokens_prime, end_tokens_prime = start_tokens[:start_index], end_tokens[:end_index]

In [234]:
# Disabled earlier attempt that sliced the string instead of the token list:
#filtered_string_start[:filtered_string_start.index(MARKER_TOKEN)]

# Decode the full "start" token sequence back to text.
print(tokenizer.decode(start_tokens))
# Decode the part of the "end" sequence that begins where the start prefix
# stops and ends one token before the marker position. The extra `- 1` also
# drops the token immediately preceding the marker; in the recorded output the
# closing brace of the value is missing, presumably because it belongs to
# that token - TODO confirm.
print(tokenizer.decode(end_tokens[len(start_tokens_prime):len(end_tokens_prime) - 1]))

{"environmental_impact":[{"parameter":"GWP-total","values":["###"]}]}
{"value":"7,28E+01","module":"A1-A3","scenario":null


In [25]:
import math


# Scratch check: is negative infinity greater than -1000, i.e. is -1000 below
# negative infinity? The recorded output of the cell is False.
-math.inf > -1000

False