# Generate prompts for the variant `LLMSeqPromptGenItem` (Section 4.1)

In [None]:
import json
import random
from typing import Literal

import numpy as np
import pandas as pd
import tiktoken

from main.data.session_dataset import SessionDataset

In [None]:
# Name of the dataset to generate prompts for; both file names below are
# derived from it.
DATASET: Literal["beauty", "steam"] = "beauty"
# File name of the pickled dataset (read via `SessionDataset.from_pickle`).
TEST_DATA_PICKLE_NAME = f"{DATASET}_dataset.pickle"
# File name of the CSV with the item names (read via `pd.read_csv`, using the
# columns `ItemId` and `name`).
ITEM_NAMES_DF = f"{DATASET}_item_names.csv"

In [None]:
# Load the pickled session dataset. The context manager closes the file handle
# once loading is done instead of leaving it open for the rest of the session.
# NOTE(review): assumes `from_pickle` fully reads the file before returning.
with open(TEST_DATA_PICKLE_NAME, "rb") as pickle_file:
    dataset: SessionDataset = SessionDataset.from_pickle(pickle_file)

In [None]:
# Read the item table, keeping only the id and name columns.
item_df = pd.read_csv(
    ITEM_NAMES_DF,
    usecols=["ItemId", "name"],
)
item_df

In [None]:
# Show the rows of the item table that have no name.
has_no_name = item_df['name'].isna()
item_df[has_no_name]

In [None]:
# Unique ids of all items that have no name. Sessions containing any of these
# items are dropped when the train prompts are built.
# The mask is built from `item_df` itself; it previously referenced the
# undefined name `beauty_product_df`.
unnamed_item_ids = item_df.loc[item_df['name'].isna(), 'ItemId'].unique()
unnamed_item_ids

In [None]:
sessions = dataset.get_train_data().groupby('SessionId')

# Both dictionaries are keyed by session id.
train_prompts = {}
train_ground_truths = {}

for session_id, session_rows in sessions:
    session_item_ids = session_rows['ItemId'].to_numpy()

    # A single unnamed item is enough to drop the whole session.
    if np.isin(session_item_ids, unnamed_item_ids).any():
        print("Skip!")
        continue

    # In a session of n items, the first n-1 items form the prompt and the
    # n-th (last) item is the ground truth. Both are kept as array slices.
    prompt_item_ids = session_item_ids[:-1]
    ground_truth_item_ids = session_item_ids[-1:]
    train_prompts[session_id] = prompt_item_ids
    train_ground_truths[session_id] = ground_truth_item_ids

len(train_prompts)

In [None]:
# Lookup table from item id to item name.
name_by_item_id = item_df.set_index('ItemId')['name']
product_id_to_name = name_by_item_id.to_dict()
product_id_to_name

In [None]:
# For every session, replace the item ids of its prompt with the item names.
textified_train_prompts = {
    prompt_session_id: [
        product_id_to_name[item_id] for item_id in prompt_item_ids
    ]
    for prompt_session_id, prompt_item_ids in train_prompts.items()
}

textified_train_prompts

In [None]:
# Instruction shared by every example. The first line intentionally ends with
# a space before the line break; it is part of the prompt text.
SYSTEM_PROMPT = (
    "Provide a unique item recommendation that is complementary to the user's item list. \n"
    "Ensure the recommendation is from items included in the data you are fine-tuned with. "
    "List only the item name."
)

# User message: a header line followed by the item list.
USER_PROMPT_TEMPLATE = "The user's item list:\n{user_item_list}"

# Assistant message: just the ground truth.
ASSISTANT_PROMPT_TEMPLATE = "{ground_truth}"


def create_prompt(train_prompt, ground_truth):
    """Build one chat example made of a system, user and assistant message.

    Args:
        train_prompt: Iterable of strings; they are joined with newlines and
            inserted into the user message.
        ground_truth: Value formatted into the assistant message.

    Returns:
        A dict with a single key ``'messages'`` holding the three messages,
        each a dict with the keys ``"role"`` and ``"content"``.
    """
    user_content = USER_PROMPT_TEMPLATE.format(
        user_item_list='\n'.join(train_prompt)
    )
    assistant_content = ASSISTANT_PROMPT_TEMPLATE.format(
        ground_truth=ground_truth
    )

    return {
        'messages': [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ]
    }


In [None]:
# Build the fine-tuning cases, split them into a training and a validation
# set, write both as JSONL files and print a rough training cost estimate.

# Minimum number of prompt items (the ground-truth item is not counted).
min_session_length = 2
# Prompts with more tokens than this are dropped.
max_prompt_tokens = 4096
# Factor used for the cost estimate: USD per 1000 training tokens.
usd_per_1k_training_tokens = 0.008
num_tokens = 0
num_validation_cases = int(0.2 * len(train_prompts))
tokens_per_message = 3
tokens_per_name = 1

train_cases = []
validation_cases = []

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def _count_prompt_tokens(prompt, encoding):
    """Estimate the number of tokens of a chat prompt.

    Args:
        prompt: Dict with a ``'messages'`` list of dicts with string values.
        encoding: Tokenizer offering ``encode(text)``.

    Returns:
        Token count of all message values plus the fixed overhead per message,
        per ``"name"`` key and for priming the reply.
    """
    # Every reply is primed with <|start|>assistant<|message|>.
    total = 3
    for message in prompt['messages']:
        total += tokens_per_message
        for key, value in message.items():
            total += len(encoding.encode(value))
            if key == "name":
                total += tokens_per_name
    return total


# Shuffle session ids to get a random validation set.
random.seed(42)
session_ids = list(train_prompts.keys())
random.shuffle(session_ids)

# NOTE: `i` also counts sessions that are skipped below, so the validation set
# can end up smaller than `num_validation_cases`.
for i, session_id in enumerate(session_ids):
    # We skip sessions that are too short.
    if len(train_prompts[session_id]) < min_session_length:
        continue

    # The ground truth is stored as a one-element array of item ids. Map it to
    # the item name, just like the prompt items, so that the assistant message
    # contains the name rather than the textual form of an id array.
    ground_truth_id = train_ground_truths[session_id][-1]
    prompt = create_prompt(
        train_prompt=textified_train_prompts[session_id],
        ground_truth=product_id_to_name[ground_truth_id],
    )

    # We skip sessions that are too long.
    num_prompt_tokens = _count_prompt_tokens(prompt, encoding)
    if num_prompt_tokens > max_prompt_tokens:
        continue

    # Add to validation or training set. Only training tokens are counted
    # towards the cost estimate.
    if i < num_validation_cases:
        validation_cases.append(prompt)
    else:
        num_tokens += num_prompt_tokens
        train_cases.append(prompt)

# Convert training to JSONL (one JSON document per line).
train_cases = [json.dumps(train_case) for train_case in train_cases]
train_string = '\n'.join(train_cases)

# Convert validation to JSONL.
validation_cases = [
    json.dumps(validation_case) for validation_case in validation_cases
]
validation_string = '\n'.join(validation_cases)

with open("train_cases_llmseqprompt_genitem.jsonl", "w", encoding="utf-8") as f:
    f.write(train_string)

with open("validation_cases_llmseqprompt_genitem.jsonl", "w", encoding="utf-8") as f:
    f.write(validation_string)

print(f'Training cases: {len(train_cases)}')
print(f'Validation cases: {len(validation_cases)}')
print(f"Num tokens: {num_tokens}")
cost = num_tokens * (usd_per_1k_training_tokens / 1000)
print(f"Costs to train GPT-3 turbo one epoch, roughly: ${cost}")