# LLM Playground: Tagging items

# Set up

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported automatically when their source changes (handy while editing the
# project packages imported further down).
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from datetime import timedelta
from typing import List, Optional

import dill
import mlflow
import numpy as np
import pandas as pd
import plotly.express as px
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger
from openai import OpenAI
from pydantic import BaseModel
from tqdm.auto import tqdm

load_dotenv()

sys.path.insert(0, "..")

import src.viz
from src.ann import AnnIndex
from llm.tagger import PROMPT as ITEM_TAGGING_PROMPT

# Controller

In [3]:
class Args(BaseModel):
    """Run configuration for this notebook.

    All fields have defaults, so ``Args()`` is a valid configuration. Call
    :meth:`init` once after construction to resolve and create the directory
    this notebook persists its artifacts to.

    Fields that default to ``None`` are annotated ``Optional[str]`` so that an
    explicitly supplied ``None`` passes validation as well as the implicit
    default does.
    """

    testing: bool = False
    log_to_mlflow: bool = False
    run_name: str = "llm-tag-items"
    # Populated by init(); stays None until init() has been called.
    notebook_persist_dp: Optional[str] = None
    # Seed passed to the sampling calls in this notebook.
    random_seed: int = 41
    device: Optional[str] = None

    item_metadata_pipeline_fp: str = "../data/item_metadata_pipeline.dill"
    qdrant_url: Optional[str] = None
    qdrant_collection_name: str = "item_desc_sbert"

    # Feature flags
    use_sbert_features: bool = True

    # Column names of the interaction data.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # NOTE(review): two distinct cut-offs that differ only by letter case;
    # presumably a candidate-pool size (top_K) and a final list size (top_k) —
    # TODO confirm against the code that consumes them.
    top_K: int = 100
    top_k: int = 10

    mlf_model_name: str = "ranker"

    def init(self):
        """Resolve and create the notebook's persist directory.

        Sets ``notebook_persist_dp`` to the absolute form of
        ``data/<run_name>`` (resolved against the current working directory)
        and creates that directory if it does not exist yet.

        Returns:
            The same instance, so the call can be chained onto the constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the configuration from its defaults, let init() prepare the persist
# directory, then echo the resolved settings as indented JSON.
args = Args()
args = args.init()
print(args.model_dump_json(indent=2))

{
  "testing": false,
  "log_to_mlflow": false,
  "run_name": "llm-tag-items",
  "notebook_persist_dp": "/Users/dvq/frostmourne/recsys-mvp/notebooks/data/llm-tag-items",
  "random_seed": 41,
  "device": null,
  "item_metadata_pipeline_fp": "../data/item_metadata_pipeline.dill",
  "qdrant_url": null,
  "qdrant_collection_name": "item_desc_sbert",
  "use_sbert_features": true,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "mlf_model_name": "ranker"
}


# Load data

In [4]:
# Fetch the raw Video Games item metadata from the Amazon Reviews 2023 dataset.
# NOTE(review): trust_remote_code=True allows the dataset repository's own
# loading script to execute locally — keep only if that source is trusted.
metadata_raw = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name="raw_meta_Video_Games",
    trust_remote_code=True,
)

# Convert the "full" entry to a pandas DataFrame for the rest of the notebook.
metadata_df = metadata_raw["full"].to_pandas()
metadata_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Video Games,Dash 8-300 Professional Add-On,5.0,1,[Features Dash 8-300 and 8-Q300 ('Q' rollout l...,[The Dash 8-300 Professional Add-On lets you p...,,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Aerosoft,"[Video Games, PC, Games]","{""Pricing"": ""The strikethrough price is the Li...",B000FH0MHO,,,
1,Video Games,Phantasmagoria: A Puzzle of Flesh,4.1,18,[Windows 95],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Sierra,"[Video Games, PC, Games]","{""Best Sellers Rank"": {""Video Games"": 137612, ...",B00069EVOG,,,
2,Video Games,NBA 2K17 - Early Tip Off Edition - PlayStation 4,4.3,223,[The #1 rated NBA video game simulation series...,[Following the record-breaking launch of NBA 2...,58.0,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['NBA 2K17 - Kobe: Haters vs Players...,2K,"[Video Games, PlayStation 4, Games]","{""Release date"": ""September 16, 2016"", ""Best S...",B00Z9TLVK0,,,
3,Video Games,Nintendo Selects: The Legend of Zelda Ocarina ...,4.9,22,[Authentic Nintendo Selects: The Legend of Zel...,[],37.42,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Amazon Renewed,"[Video Games, Legacy Systems, Nintendo Systems...","{""Best Sellers Rank"": {""Video Games"": 51019, ""...",B07SZJZV88,,,
4,Video Games,Thrustmaster Elite Fitness Pack for Nintendo Wii,3.0,3,"[Includes (9) Total Accessories, Pedometer, Wi...",[The Thrustmaster Motion Plus Elite Fitness Pa...,,"{'hi_res': [None, None, None, None, None, None...","{'title': [], 'url': [], 'user_id': []}",THRUSTMASTER,"[Video Games, Legacy Systems, Nintendo Systems...","{""Release date"": ""November 1, 2009"", ""Pricing""...",B002WH4ZJG,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137264,,Story of Seasons: Pioneers Of Olive Town (Nint...,4.5,397,[A wild world of discovery - tame the wilderne...,"[Product Description, Inspired by Tales of you...",31.04,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Marvelous Europe,"[Video Games, Nintendo Switch, Games]","{""Release date"": ""March 26, 2021"", ""Best Selle...",B09XQJS4CZ,,,
137265,Video Games,MotoGP 18 (PC DVD) UK IMPORT REGION FREE,4.0,1,[Brand new game engine - MotoGP18 has been reb...,[Become the champion of the 2018 MotoGP Season...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Milestone,"[Video Games, Game Genre of the Month]","{""Pricing"": ""The strikethrough price is the Li...",B07DGPTGNV,,,
137266,Cell Phones & Accessories,Century Accessory Soft Silicone Protective Ski...,2.9,19,"[Easy access to all buttons, controls and port...",[This soft case cover will add a splash of col...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Century Accessory,"[Video Games, Legacy Systems, Xbox Systems, Xb...","{""Package Dimensions"": ""2.76 x 2.76 x 0.2 inch...",B00HUWCQBW,,,
137267,,Hasbro Interactive Mr. Potato Head Activity Pa...,3.9,5,[],"[Amazon.com, Everyone's favorite master-of-dis...",,"{'hi_res': [None, 'https://m.media-amazon.com/...","{'title': [], 'url': [], 'user_id': []}",Hasbro,"[Video Games, PC, Games]","{""Release date"": ""July 24, 1999"", ""Best Seller...",B00002S9MH,,,


In [5]:
# Preview ten item titles drawn with the configured seed, so the same rows come
# back on every run.
metadata_df.sample(n=10, random_state=args.random_seed)["title"].tolist()

['Gliging 120Pcs/Set MX Switch Films Mechanical Keyboard Switches stabilizer Switch Film Repair for Cherry MX kailh Gateron Switch',
 'NEW HOLLAND SKYLINE [Xbox 360]',
 'HORI Gaming Headset (Pikachu POP) for Nintendo Switch & Switch Lite - Officially Licensed by Nintendo & Pokemon Company International - Nintendo Switch',
 'Rampage 2: Universal Tour',
 'Warner Home Video Lego Jurassic World PS4',
 'Skulls of the Shogun: Bone-a-Fide Edition [Online Game Code]',
 'Mysteryville - PC (Jewel case)',
 'Wireless Controller Replacement for WII Controller,GLOWANT 2 Pack Wii Remote Controller Compatible with Wii/Wii U Console with Wrist Strap and Silicone Case',
 'Wolfenstein The Old Blood\u3000[ceroZ]',
 'Hard Drive Data Migration Transfer Cable Kit For XBOX 360 [Electronics]']

# Call OpenAI to get tags

In [6]:
# Seeded draw of ten item titles; this list is what gets tagged below.
sampled_items = (
    metadata_df
    .sample(n=10, random_state=args.random_seed)
    .loc[:, "title"]
    .tolist()
)

In [7]:
# Fill the `input_list` placeholder of the imported tagging prompt template with
# the sampled titles.
# NOTE(review): assumes ITEM_TAGGING_PROMPT is a plain str, in which case the
# Python list is interpolated via its str() form — TODO confirm in llm.tagger.
prompt = ITEM_TAGGING_PROMPT.format(input_list=sampled_items)
# Bare expression so the notebook displays the rendered prompt for inspection.
prompt

'\nFor each of the following items, generate a set of tags that capture the main features, target audience, platform compatibility, and item type (e.g., accessory, game, hardware).\n\nReturn the output as a list of JSON objects, where each object includes the original item title and a list of tags. Focus on adding tags that help shoppers easily find these items based on gaming platforms, brand associations, special edition details, and product functionality.\n\n<EXAMPLE>\n\nExample item titles with JSON input and output:\n\nInput:  \n[\n    { "title": "Gliging 120Pcs/Set MX Switch Films Mechanical Keyboard Switches stabilizer Switch Film Repair for Cherry MX kailh Gateron Switch" },\n    { "title": "NEW HOLLAND SKYLINE [Xbox 360]" },\n    { "title": "HORI Gaming Headset (Pikachu POP) for Nintendo Switch & Switch Lite - Officially Licensed by Nintendo & Pokemon Company International - Nintendo Switch" }\n]\n\nOutput:  \n[\n    {\n        "item_title": "Gliging 120Pcs/Set MX Switch Films

In [8]:
# Create the OpenAI client with no explicit arguments.
# NOTE(review): presumably it picks up its credentials from the environment
# populated by load_dotenv() in the imports cell — TODO confirm.
client = OpenAI()

# Define the structured output format using Pydantic for a single item:
# the item's title together with the list of tags generated for it.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — pydantic is understood to copy a model's docstring into its JSON
# schema, which would alter the schema passed as response_format. TODO confirm.
class ItemTag(BaseModel):
    item_title: str
    tags: List[str]

# Define the output format to wrap the list of items within an object, so the
# response has a single top-level "items" field holding one ItemTag per title.
# (Documented with `#` comments rather than a docstring so the model's JSON
# schema is left exactly as it was.)
class ItemTaggingOutput(BaseModel):
    items: List[ItemTag]  # Wrap list in an "items" field

# Make the API call with structured output: the reply is parsed against
# ItemTaggingOutput and exposed on the message's `parsed` attribute.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ],
    response_format=ItemTaggingOutput,  # Set response format to the Pydantic model
)

# The model can decline to answer, in which case there is no parsed payload.
# Fail here with a clear message instead of letting None reach later cells.
message = completion.choices[0].message
if message.parsed is None:
    raise ValueError(
        "Item tagging returned no parsed output "
        f"(refusal: {getattr(message, 'refusal', None)!r})"
    )

# Extract and print the parsed output
tagged_items = message.parsed
print(tagged_items)

items=[ItemTag(item_title='Gliging 120Pcs/Set MX Switch Films Mechanical Keyboard Switches stabilizer Switch Film Repair for Cherry MX kailh Gateron Switch', tags=['Keyboard Accessory', 'Cherry MX', 'Mechanical Keyboard', 'Switch Film', 'Gateron', 'Repair']), ItemTag(item_title='NEW HOLLAND SKYLINE [Xbox 360]', tags=['Xbox 360 Game', 'Racing', 'NEW HOLLAND', 'Skyline', 'Retro']), ItemTag(item_title='HORI Gaming Headset (Pikachu POP) for Nintendo Switch & Switch Lite - Officially Licensed by Nintendo & Pokemon Company International - Nintendo Switch', tags=['Headset', 'Nintendo Switch', 'Switch Lite', 'Pikachu', 'Pokemon', 'Gaming Accessory']), ItemTag(item_title='Rampage 2: Universal Tour', tags=['Game', 'PlayStation', 'Rampage Series', 'Arcade', 'Retro']), ItemTag(item_title='Warner Home Video Lego Jurassic World PS4', tags=['PS4 Game', 'Lego', 'Jurassic World', 'Action-Adventure', 'Family Friendly']), ItemTag(item_title='Skulls of the Shogun: Bone-a-Fide Edition [Online Game Code]', 