In [1]:
# get openai api key from openai_config.json file
import json
with open('../openai_config.json') as f:
    openai_api_key = json.load(f)['openai_api_key']

In [2]:
%%capture

%env OPENAI_API_KEY = {openai_api_key}

In [3]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")
llm.invoke("Hello, how are you?").content

"Hello! I'm just a computer program, so I don't have feelings, but I'm here to help you. How can I assist you today?"

In [58]:
# from langchain.document_loaders import CSVLoader
# input_loader_kor = CSVLoader(r"../../guidance_for_environmental_impact_factor_mapping_on_aws/assets/input/coupang_product_names_groceries_v3_kor.csv", encoding='utf-8-sig')
# input_doc_kor = input_loader_kor.load()

# input_doc_kor[100].page_content

'품목명: 아이시스 8.0 생수, 200ml, 40개'

In [5]:
from langchain.document_loaders import CSVLoader
input_loader_eng = CSVLoader(r"../../guidance_for_environmental_impact_factor_mapping_on_aws/assets/input/sample_amazon_product_names_groceries_eng.csv", encoding='utf-8-sig')
input_doc_eng = input_loader_eng.load()

input_doc_eng[0].page_content

'PRODUCT_NAME: Weikfield Spaghetti Pasta, 400g'

In [6]:
from langchain_core.prompts import PromptTemplate

prompt = """You are a Lifecycle Analysis expert matching grocery products to their Harmonized System (HS) Codes.

I want to do of LCA of grocery products based on Environmentally Extended Input Output (EEIO) Environmental Impact Factors (EIF). I am interested in the environmental impact associated with the materials and manufacturing phase of the product. I am given a grocery product and three possible corresponding HS codes and descriptions. 

I want to pick the HS code and description that best match the given product. Include justification for your choice.
Format the output in JSON with the keys BestHSCode, BestHSDescription, Justification.

Product:
{product_name}

What HS Code is the best match for the provided product? 

Make the most of the given information. DO NOT say that information is limited or ask for more information.
YOU MUST choose a best code and title. YOU MUST include a justification for your choice.
Avoid filler words such as "Based on the details" or "happy to assist", keep your response to the point.
Do not repeat the given instructions or information. 
DO NOT say you have insufficient information for an LCA.

Respond with the JSON output and nothing else."""
prompt = PromptTemplate.from_template(prompt)

In [7]:
# Initialize results storage
results_eng = []

# Process each row using the graph
for doc in input_doc_eng:
    # # Extract CommodityDescription from the page_content using regex
    # match = re.search(r"CommodityDescription: (.+)", doc.page_content)
    # activity = match.group(1).strip() if match else None  # Extract the description

    activity = doc.page_content # Extract the Product Name

    if activity:
        msg = prompt.invoke({"product_name": activity})
        result = json.loads(llm.invoke(msg).content.strip("```").lstrip("json").replace("\n", "").replace(" ", ""))
        result['PRODUCT_NAME'] = activity
        result["Ground Truth"] = result.get("BestHSCode").replace(".", "").rjust(6, "0")
        
        # Extract context and answer from the state after processing
        results_eng.append(result)

# Print the results for debugging
print("Final results:", results_eng[0])

Final results: {'BestHSCode': '190219', 'BestHSDescription': 'Pasta,whetherornotcookedorstuffed(withmeatorothersubstances)orotherwiseprepared,otherthanuncookedpasta,notstuffedorotherwiseprepared', 'Justification': 'WeikfieldSpaghettiPastaisanuncookedpastaproduct,whichalignswiththedescriptionofHScode190219,coveringpastaproductsthatarenotcooked,stuffed,orotherwiseprepared.', 'PRODUCT_NAME': 'PRODUCT_NAME: Weikfield Spaghetti Pasta, 400g', 'Ground Truth': '190219'}


In [8]:
import csv
to_csv = results_eng
keys = to_csv[0].keys()

with open("sample_amazon_gt", 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(to_csv)

In [9]:
import pandas as pd
df = pd.read_csv("sample_amazon_gt")
df.drop(columns=["BestHSCode", "BestHSDescription", "Justification"], inplace=True)
df.to_csv("sample_amazon_gt_clean.csv", index=False)