# Generate prompts for the variant `LLMSeqPromptGenList` (Section 4.2)

In [None]:
import json
import random
from typing import Literal

import numpy as np
import pandas as pd
import tiktoken

from main.data.session_dataset import SessionDataset
from main.llm_based.similarity_model.llm_seq_sim import LLMSeqSim

In [None]:
# Name of the dataset to generate prompts for. The annotation needs
# `typing.Literal`, which is imported at the top of the file.
DATASET: Literal["beauty", "steam"] = "beauty"
# File names derived from the dataset name: the pickled session dataset and
# the CSV that holds the item names.
TEST_DATA_PICKLE_NAME = f"{DATASET}_dataset.pickle"
ITEM_NAMES_DF = f"{DATASET}_item_names.csv"

In [None]:
# Load the session dataset from its pickle file. The context manager closes
# the file handle as soon as loading has finished.
# NOTE(review): from_pickle presumably unpickles the file; unpickling can run
# arbitrary code, so only load dataset files from a trusted source.
with open(TEST_DATA_PICKLE_NAME, "rb") as dataset_file:
    dataset: SessionDataset = SessionDataset.from_pickle(dataset_file)

In [None]:
# Settings for the dimensionality reduction step of the model.
dim_reduction_config = {
    "reduction_config": {
        "config": {},
        "reduction_technique": "pca",
    },
    "normalize": True,
    "reduced_dim_size": 512,
}

# Keyword arguments that are unpacked into the LLMSeqSim constructor.
config = {
    "max_session_length_for_decay_precomputation": 500,
    "filter_prompt_items": True,
    "batch_size": 500,
    "dim_reduction_config": dim_reduction_config,
    "is_verbose": True,
    "cores": 15,
    "similarity_measure": "cosine",
    "embedding_combination_strategy": "mean",
    "combination_decay": "harmonic",
}

In [None]:
# Build the similarity model from the configuration dict.
model: LLMSeqSim = LLMSeqSim(**config)

# Train it on the dataset's training sessions together with its item data.
train_sessions = dataset.get_train_data()
item_data = dataset.get_item_data()
model.train(train_sessions, item_data)

## 4. Compute the Recommendations

In [None]:
# Only the item id and its name are needed from the item-names CSV.
name_columns = ["ItemId", "name"]
item_df = pd.read_csv(ITEM_NAMES_DF, usecols=name_columns)
item_df

In [None]:
# Show the rows whose item name is missing.
item_df.loc[item_df['name'].isna()]

In [None]:
# Ids of all items that have no name. The mask is built from `item_df`, the
# frame that is loaded from the item-names CSV.
missing_name_mask = item_df['name'].isna()
unnamed_item_ids = item_df.loc[missing_name_mask, 'ItemId'].unique()
unnamed_item_ids

In [None]:
sessions = dataset.get_train_data().groupby('SessionId')

train_prompts = {}
train_ground_truths = {}

# Walk over every session of the train data.
for session_id, session_data in sessions:
    session_items = session_data['ItemId'].to_numpy()

    # A session is dropped entirely as soon as one of its items has no name.
    has_unnamed_item = np.isin(session_items, unnamed_item_ids).any()
    if has_unnamed_item:
        print("Skip!")
        continue

    # A session of n items is split in two: the first n-1 items form the
    # prompt, the n-th (last) item is kept as the ground truth.
    train_prompts[session_id] = session_items[:-1]
    train_ground_truths[session_id] = session_items[-1:]

len(train_prompts)

In [None]:
# Number of recommendations requested from the model per prompt.
TOP_K = 20

# Ask the trained model for recommendations for every train prompt.
recommendations: dict[int, np.ndarray] = model.predict(train_prompts, top_k=TOP_K)

In [None]:
# Session id of the example to inspect; all three lookups below use it.
# NOTE(review): assumes this id is present in all three dicts; sessions that
# were skipped while building the train prompts are not.
i = 0
print(f"Training prompt {train_prompts[i]} with ground truth {train_ground_truths[i]}")
print(f"Recommended items for train prompt {recommendations[i]}")

In [None]:
# Number of sessions with recommendations next to the number with a ground truth.
(len(recommendations), len(train_ground_truths))

In [None]:
grounded_recommendations = {}
gt_not_in_recs = 0

# For each session's top-k recommendations by the model
for session_id, rec_items in recommendations.items():

    # Skip sessions whose recommendations contain an unnamed product; such a
    # session gets no entry in `grounded_recommendations`.
    if np.isin(rec_items, unnamed_item_ids).any():
        print(f"Skip session {session_id}!")
        continue

    # Get the ground truth (this is the last item of the original train session)
    ground_truth = train_ground_truths[session_id][0]

    if ground_truth not in rec_items:
        # The ground truth is missing: put it first and drop the last
        # recommendation so the list keeps its length.
        gt_not_in_recs += 1
        grounded_rec_items = [ground_truth] + rec_items[:-1].tolist()
    else:
        # The ground truth is already recommended: move it to the top.
        cleaned_recs = np.delete(rec_items, np.where(rec_items == ground_truth))
        grounded_rec_items = [ground_truth] + cleaned_recs.tolist()
    grounded_recommendations[session_id] = grounded_rec_items

# NOTE: both branches keep the list at len(rec_items) entries, provided the
# ground truth occurs at most once in rec_items; the list never gets longer.

# The share is computed over all sessions with recommendations, including the
# skipped ones. An empty result would otherwise divide by zero.
gt_missing_percentage = (
    (gt_not_in_recs / len(recommendations)) * 100 if recommendations else 0.0
)
print(f"For {gt_missing_percentage}% of sessions the ground truth was not part of the recommendations.")

In [None]:
# The first grounded recommendation should equal the session's ground truth.
(grounded_recommendations[0], train_ground_truths[0])

In [None]:
# Lookup table from item id to item name, built from the item frame.
names_by_item_id = item_df.set_index('ItemId')['name']
product_id_to_name = names_by_item_id.to_dict()
product_id_to_name

In [None]:
# Replace every item id in the train prompts by the item's name.
textified_train_prompts = {
    session: [product_id_to_name[product_id] for product_id in prompt_items]
    for session, prompt_items in train_prompts.items()
}

textified_train_prompts

In [None]:
# Replace every item id in the grounded recommendations by the item's name.
textified_recommendations = {
    session: [product_id_to_name[product_id] for product_id in grounded_items]
    for session, grounded_items in grounded_recommendations.items()
}

textified_recommendations

In [None]:
# System message placed first in every generated example. The text is written
# to the output files verbatim, so any change here changes the generated data.
SYSTEM_PROMPT = """You are a recommender system assistant.
Provide 20 unique item recommendations complementary to the user's item list, ordered by the confidence level of each recommendation.
Ensure all recommendations are from items included in the data you are fine-tuned with. List only the item names.
"""

# User message template. Its only placeholder is {user_item_list}, which is
# filled with the session's item names, one per line.
USER_PROMPT_TEMPLATE = """The user's item list are in the following lines
delimited by BEGIN and END. Each item is in a separate line:
BEGIN
{user_item_list}
END
"""

# Assistant message template. Its only placeholder is
# {ranked_recommendations}, which is filled with the numbered recommendation list.
ASSISTANT_PROMPT_TEMPLATE = """The recommendations are in the following
lines, in decreasing confidence order. The recommendations are delimited by
BEGIN and END. Each recommendation is in a separate line:
BEGIN
{ranked_recommendations}
END
"""

def stringify_ranked_list(list_of_items):
    """Render items as a numbered list with one '<rank>. <item>' line each.

    Ranks start at 1 and follow the order of `list_of_items`. Every line,
    including the last one, ends with a newline. An empty input yields an
    empty string.
    """
    # Build the result in one pass instead of growing a string with +=.
    return "".join(
        f"{rank}. {item}\n" for rank, item in enumerate(list_of_items, 1)
    )


def create_prompt(train_prompt, recommendations, ranked_recommendations):
    """Build one chat example made of a system, a user and an assistant message.

    Args:
        train_prompt: Item names of the session prompt. They are joined with
            newlines and inserted into USER_PROMPT_TEMPLATE.
        recommendations: Unused. USER_PROMPT_TEMPLATE has no placeholder for
            it, so it does not appear in the prompt. The parameter is kept so
            existing callers keep working.
        ranked_recommendations: Item names in rank order. They are rendered
            as a numbered list and inserted into ASSISTANT_PROMPT_TEMPLATE.

    Returns:
        A dict with the single key 'messages', holding the three
        role/content dicts in the order system, user, assistant.
    """
    user_content = USER_PROMPT_TEMPLATE.format(
        user_item_list='\n'.join(train_prompt)
    )
    assistant_content = ASSISTANT_PROMPT_TEMPLATE.format(
        ranked_recommendations=stringify_ranked_list(ranked_recommendations)
    )

    return {
        'messages': [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ]
    }

# Build one example prompt for session id 0 as a visual sanity check.
example_prompt_items = textified_train_prompts[0]
example_recommendations = textified_recommendations[0]
create_prompt(
    train_prompt=example_prompt_items,
    recommendations=example_recommendations,
    ranked_recommendations=example_recommendations,
)


In [None]:
# Sessions whose prompt has fewer items than this are left out.
min_session_length = 2
# Examples with more tokens than this are left out.
max_prompt_tokens = 4096
num_tokens = 0
# NOTE(review): the validation split below compares against the position in
# the shuffled id list, which also counts sessions that are skipped; the
# validation set can therefore hold fewer than 20% of the kept cases.
num_validation_cases = int(0.2 * len(train_prompts))
# Per-message and per-name token overheads used in the token estimate.
tokens_per_message = 3
tokens_per_name = 1

train_cases = []
validation_cases = []

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Shuffle session ids to get a random validation set.
random.seed(42)
session_ids = list(train_prompts.keys())
random.shuffle(session_ids)

for i, session_id in enumerate(session_ids):
    train_session = train_prompts[session_id]

    # We skip sessions that are too short.
    if len(train_session) < min_session_length:
        continue

    # We skip sessions without textified recommendations. Sessions dropped
    # while grounding the recommendations have no entry, and looking them up
    # would raise a KeyError.
    if session_id not in textified_recommendations:
        continue

    # Create prompt
    train_prompt = textified_train_prompts[session_id]
    session_recommendations = textified_recommendations[session_id]
    # USER_PROMPT_TEMPLATE has no placeholder for the shuffled list, so it
    # does not show up in the prompt text; create_prompt still requires the
    # argument.
    shuffled_session_recommendations = session_recommendations.copy()
    random.shuffle(shuffled_session_recommendations)
    print(session_id, train_prompt, session_recommendations)
    prompt = create_prompt(
        train_prompt=train_prompt,
        recommendations=shuffled_session_recommendations,
        ranked_recommendations=session_recommendations
    )

    # We skip sessions that are too long.
    num_prompt_tokens = 0
    for message in prompt['messages']:
        num_prompt_tokens += tokens_per_message
        for key, value in message.items():
            num_prompt_tokens += len(encoding.encode(value))
            if key == "name":
                num_prompt_tokens += tokens_per_name
    num_prompt_tokens += 3  # every reply is primed
                            # with <|start|>assistant<|message|>

    if num_prompt_tokens > max_prompt_tokens:
        continue

    # Add to validation or training set. Only training tokens count towards
    # the cost estimate.
    if i < num_validation_cases:
        validation_cases.append(prompt)
    else:
        num_tokens += num_prompt_tokens
        train_cases.append(prompt)

# Convert training to JSONL (one JSON document per line).
train_cases = [json.dumps(train_case) for train_case in train_cases]
train_string = '\n'.join(train_cases)

# Convert validation to JSONL.
validation_cases = [
    json.dumps(validation_case) for validation_case in validation_cases
]
validation_string = '\n'.join(validation_cases)

with open("train_cases_llmseqprompt_genlist.jsonl", "w", encoding="utf-8") as f:
    f.write(train_string)

with open("validation_cases_llmseqprompt_genlist.jsonl", "w", encoding="utf-8") as f:
    f.write(validation_string)

print(f'Training cases: {len(train_cases)}')
print(f'Validation cases: {len(validation_cases)}')
print(f"Num tokens: {num_tokens}")
# Rough cost of one epoch at 0.008 per 1000 training tokens.
cost = num_tokens * (0.008 / 1000)
print(f"Costs to train GPT-3.5 turbo one epoch, roughly: ${cost}")