# Products classification through AI
In this notebook, I get the products information contained in the csv file and the intention is to classify each one as Hazmat (Hazardous Material) or not. 


## Classify products based on the data obtained (title and attributes from ML API)
Given that I am using Groq/Gemini for free tier, I'll classify the products in batches of 50 products per LLM call. The amount of products in the same batch must be optimized for improvement.

In [1]:
from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum

# Important definitions

class Confidence(Enum):
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"

class HazmatClassification(BaseModel):
    product_id: str = Field(..., description="The unique identifier of the product.")
    is_hazmat: bool = Field(..., description="Indicates whether the product is classified as a Hazmat.")
    reason: str = Field(None, description="The reason for the classification, if the product is a Hazmat.")
    confidence: Confidence = Field(None, description="The confidence level of the classification, if the product is a Hazmat.")

DATASET = 'dataset_1'

In [2]:
# Get hazmat definition from validated file
with open("data/hazmat-definition.md", "r", encoding='utf8') as f:
    hazmat_def = f.read()

# Get products information from csv file
import pandas as pd
products_df = pd.read_csv(f"data/{DATASET}/{DATASET}.csv") 

batch_size = 100

# # Review only products without classification
# products_df = products_df[products_df['IS_HAZMAT'].isna()].reset_index(drop=True)

# drop columns that are not needed for classification
products_df.drop(columns=['IS_HAZMAT', 'REASON', 'CONFIDENCE'], inplace=True, errors='ignore')

In [3]:
products_df

Unnamed: 0,PRODUCT_ID,TITLE,ATTRIBUTES,SOURCE_FILE
0,MLB29523289,Base Estrela Secretaria Em Aço C/ Rodízios E C...,"{'Marca': 'cadeira giratória diretora', 'Fabri...",info_products_10_escritório.json
1,MLB34230359,Escrivaninha Office Nt2105 Notável Móveis Cor ...,"{'Marca': 'Notável Móveis', 'Modelo': 'NT2105'...",info_products_10_escritório.json
2,MLB15904595,Cadeira de Escritório Mymax Gamer MX5 Cor Preto,"{'Marca': 'Mymax', 'Linha': 'Gamer', 'Modelo':...",info_products_10_escritório.json
3,MLB26801865,Marmita Dupla Camada Á Prova De Vazamento Imed...,"{'Marca': 'Marmita,Lancheira,Escritorio', 'Mod...",info_products_10_escritório.json
4,MLB22522670,Cadeira de escritório TGTTGT-OFC-001 TGT ergon...,"{'Marca': 'TGT', 'Linha': 'cadeira fixa escrit...",info_products_10_escritório.json
...,...,...,...,...
12643,MLB23573140,Jd Móveis eccos+genova gabinete para pia e esp...,"{'Marca': 'Jd Móveis', 'Linha': 'Banheiro', 'M...",info_products_9_banheiro.json
12644,MLB26685521,Prateleira Docol Square Níquel Escovado 388244,"{'Marca': 'Docol', 'Linha': 'Square', 'Modelo'...",info_products_9_banheiro.json
12645,MLB21906090,Gabinete Armário Banheiro C/ Rodízios Branco E...,"{'Marca': 'Tema', 'Modelo': 'Tekim', 'Cor do m...",info_products_9_banheiro.json
12646,MLB38278657,Suporte Duplo De Canto P/ Banheiro Aço Inox Re...,"{'Marca': 'EmbralumiEmbralumi', 'Linha': 'Prem...",info_products_9_banheiro.json


In [4]:
hazmat_classifier_system_msg = f"""
You are a domain-expert Hazmat classifier. Your task is to analyze the products below and determine, for each, if it is Hazmat or not, based on the definition provided between <hazmat_definition> tags.

You must base your analysis on the following JSON schema, which describes the required analysis for each product in the fields:
<json_schema>{HazmatClassification.model_json_schema()}</json_schema>

Before answering, you must output your detailed reasoning process.

Hazmat definition: <hazmat_definition>{hazmat_def}</hazmat_definition>

Guidelines:
- Always refer to the Hazmat definition to address the classification. Do not suppose anything. If not certain of the classification, output as hazmat with lower confidence.
- Only output a product as non-hazmat if you are absolutely certain that it is not a Hazmat according to the definition provided.
"""

hazmat_json_extractor_system_msg = f"""
You are a domain-expert Hazmat classifier. Based on the analysis below, extract and output the final answer as a jsonl structure, located between <jsonl> tags, with each line following this schema (one line per product): <json_schema>{HazmatClassification.model_json_schema()}</json_schema>.

Guidelines:
- For the tag <jsonl>: The final answer must be a valid jsonl structure, with each line following the schema provided.
- If not certain of the classification, output as hazmat with lower confidence.
- Only output a product as non-hazmat if you are absolutely certain that it is not a Hazmat according to the definition provided.
"""

In [5]:
from defs_and_tools import call_llm, extract_from_tag
from dotenv import load_dotenv

load_dotenv()

# json_extractor_models = ["groq/llama-3.3-70b-versatile",
#                         "groq/llama3-70b-8192",
#                         "gemini/gemini-2.0-flash"]
# json_extractor_model = "gemini/gemini-2.0-flash" # Did not create the tags correctly for output parsing
json_extractor_model = "gemini/gemini-2.5-flash"
hazmat_classifier_model = "gemini/gemini-2.5-flash"

In [40]:
def classify_products(products_df, batch_size=30, output_jsonl="classified_products.jsonl", log_file="log_file.txt"):
    """Classify products in batches and save results."""

    for i in range(0, len(products_df), batch_size):
        batch = products_df.iloc[i:i + batch_size]
        batch_list = batch.to_dict(orient="records")
        
        print(f"Processing batch {i//batch_size + 1} with {len(batch_list)} products...")
        raw_response = call_llm(
            system=hazmat_classifier_system_msg,
            prompt=f"Products to classify:\n{batch_list}",
            model=hazmat_classifier_model,
        )
        
        print("Raw response received, formatting to JSONL...")
        formatted_response = call_llm(
            system=hazmat_json_extractor_system_msg,
            prompt=raw_response,
            model=json_extractor_model,
        )
        
        # Save JSONL output
        jsonl_content = extract_from_tag(formatted_response, "jsonl")
        if jsonl_content:
            print(f"Batch {i//batch_size + 1} jsonl content extracted!")
            with open(output_jsonl, "a", encoding="utf-8") as f:
                f.write(jsonl_content + "\n")
        
        # Save raw log
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(f"Batch {i//batch_size + 1}:\n{raw_response}\n\n")
        
        print(f"Batch {i//batch_size + 1} processed and saved to {output_jsonl} and {log_file}!")
        print(40*"-")

classify_products(products_df, 
                  output_jsonl=f"data/{DATASET}/{DATASET}_classified_products.jsonl",
                  log_file=f"data/{DATASET}/{DATASET}_raw_log.txt",
                  batch_size=batch_size)

Processing batch 1 with 60 products...
Raw response received, formatting to JSONL...
Raw response received, formatting to JSONL...
Batch 1 jsonl content extracted!
Batch 1 processed and saved to data/dataset_1/dataset_1_classified_products.jsonl and data/dataset_1/dataset_1_raw_log.txt!
----------------------------------------
Batch 1 jsonl content extracted!
Batch 1 processed and saved to data/dataset_1/dataset_1_classified_products.jsonl and data/dataset_1/dataset_1_raw_log.txt!
----------------------------------------


In [6]:
import json

# Prompt: Open jsonl file and insert result into dataframe products_df. The json file may contain different encodings, so handle it properly.

def try_decode_line(line):
    for encoding in ["utf-8", "latin1", "utf-8-sig"]:
        try:
            return line.decode(encoding)
        except Exception:
            continue
    return None  # if all fail

jsonl_path = f"data/{DATASET}/{DATASET}_classified_products.jsonl"
classified_rows = []

with open(jsonl_path, "rb") as f:  # read as binary to handle mixed encoding
    for bline in f:
        decoded_line = try_decode_line(bline)
        if decoded_line and decoded_line.strip():
            try:
                row = json.loads(decoded_line)
                # Convert all keys to uppercase for consistency
                row = {k.upper(): v for k, v in row.items()}
                classified_rows.append(row)
            except json.JSONDecodeError:
                continue  # skip corrupted JSON lines

classified_df = pd.DataFrame(classified_rows)
classified_df.columns = [col.upper() for col in classified_df.columns]

# Convert all columns in products_df to uppercase for consistency
products_df.columns = [col.upper() for col in products_df.columns]

# Merge classified_df with products_df on 'PRODUCT_ID'
products_df = products_df.merge(
    classified_df,
    on='PRODUCT_ID',
    how='left',
    suffixes=("", "_CLASSIFIED")
)

products_df.head()
# Save the updated products_df with classifications
products_df.to_csv(f"data/{DATASET}/{DATASET}_classified_products.csv", index=False, encoding="utf-8")

In [7]:
products_df

Unnamed: 0,PRODUCT_ID,TITLE,ATTRIBUTES,SOURCE_FILE,IS_HAZMAT,REASON,CONFIDENCE
0,MLB29523289,Base Estrela Secretaria Em Aço C/ Rodízios E C...,"{'Marca': 'cadeira giratória diretora', 'Fabri...",info_products_10_escritório.json,False,,
1,MLB34230359,Escrivaninha Office Nt2105 Notável Móveis Cor ...,"{'Marca': 'Notável Móveis', 'Modelo': 'NT2105'...",info_products_10_escritório.json,False,,
2,MLB15904595,Cadeira de Escritório Mymax Gamer MX5 Cor Preto,"{'Marca': 'Mymax', 'Linha': 'Gamer', 'Modelo':...",info_products_10_escritório.json,False,,
3,MLB26801865,Marmita Dupla Camada Á Prova De Vazamento Imed...,"{'Marca': 'Marmita,Lancheira,Escritorio', 'Mod...",info_products_10_escritório.json,False,,
4,MLB22522670,Cadeira de escritório TGTTGT-OFC-001 TGT ergon...,"{'Marca': 'TGT', 'Linha': 'cadeira fixa escrit...",info_products_10_escritório.json,False,,
...,...,...,...,...,...,...,...
13681,MLB23573140,Jd Móveis eccos+genova gabinete para pia e esp...,"{'Marca': 'Jd Móveis', 'Linha': 'Banheiro', 'M...",info_products_9_banheiro.json,False,,
13682,MLB26685521,Prateleira Docol Square Níquel Escovado 388244,"{'Marca': 'Docol', 'Linha': 'Square', 'Modelo'...",info_products_9_banheiro.json,False,,
13683,MLB21906090,Gabinete Armário Banheiro C/ Rodízios Branco E...,"{'Marca': 'Tema', 'Modelo': 'Tekim', 'Cor do m...",info_products_9_banheiro.json,False,,
13684,MLB38278657,Suporte Duplo De Canto P/ Banheiro Aço Inox Re...,"{'Marca': 'EmbralumiEmbralumi', 'Linha': 'Prem...",info_products_9_banheiro.json,False,,


[]