# Generate prompts for the variant `LLMSeqPromptClassify` (Section 4.3)

In [None]:
import pickle
import pandas as pd
import numpy as np
from google.cloud import storage
import json
import random
import tiktoken
import math
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from main.data.session_dataset import SessionDataset
from main.popularity.session import SessionBasedPopular

In [None]:
# `Literal` is not imported anywhere else in this notebook, and module-level
# annotations are evaluated at runtime, so it has to be in scope here.
from typing import Literal

# Dataset to generate prompts for; every file name below is derived from it.
DATASET: Literal["beauty", "steam"] = "beauty"

# Name of the pickled SessionDataset for the selected dataset.
TEST_DATA_PICKLE_NAME = f"{DATASET}_dataset.pickle"

# File holding the ItemId -> name table for the selected dataset.
ITEM_NAMES_DF = f"{DATASET}_item_names.pickle"

# Gzipped CSV with the item embeddings for the selected dataset.
EMBEDDINGS_NAME = f"embeddings_{DATASET}.csv.gz"

# Number of k-means clusters the item embeddings are grouped into.
NUM_CLUSTERS = 200

In [None]:
# Load the pickled dataset. The context manager closes the file handle as soon
# as loading finishes instead of leaving it open for the rest of the session.
with open(TEST_DATA_PICKLE_NAME, "rb") as dataset_file:
    dataset: SessionDataset = SessionDataset.from_pickle(dataset_file)

In [None]:
# Read the gzipped CSV with the item embeddings and display it.
product_embeddings = pd.read_csv(EMBEDDINGS_NAME, compression="gzip")
product_embeddings

In [None]:
# Index the embeddings table by ItemId once; the id-keyed lookups share it.
embeddings_by_item_id = product_embeddings.set_index("ItemId")

# ItemId -> name and name -> ItemId lookups.
product_id_to_name = embeddings_by_item_id["name"].to_dict()
product_name_to_id = product_embeddings.set_index("name")["ItemId"].to_dict()

# Each embedding is stored as a JSON string. Going through a dict keyed by
# ItemId keeps exactly one embedding per id, in the same order as the keys of
# product_id_to_name, so row i of the matrix belongs to product_index_to_id[i].
product_index_to_embedding = np.array(
    [
        np.array(json.loads(raw_embedding))
        for raw_embedding in embeddings_by_item_id["embedding"].to_dict().values()
    ]
)

# Row index <-> ItemId translation for the embedding matrix.
product_index_to_id = list(product_id_to_name)
product_id_to_index = {
    item_id: row for row, item_id in enumerate(product_index_to_id)
}

In [None]:
# All distinct product names (the keys of the name -> id lookup).
product_names = list(product_name_to_id)

In [None]:
# Cluster the item embeddings; random_state is fixed so reruns use the same seed.
kmeans = KMeans(n_clusters=NUM_CLUSTERS, n_init="auto", random_state=0)
kmeans.fit(product_index_to_embedding)

# Cluster label of every row of the embedding matrix.
clustering = kmeans.labels_

In [None]:
# Map every item id to the cluster label of its embedding row.
global_product_id_to_cluster = {
    product_index_to_id[row]: label for row, label in enumerate(clustering)
}

# Count how many items landed in each cluster, starting every observed
# cluster at zero.
counter_per_cluster = dict.fromkeys(set(clustering), 0)
for label in clustering:
    counter_per_cluster[label] += 1
counter_per_cluster

In [None]:
# Train the SessionBasedPopular model on the training split of the dataset.
model: SessionBasedPopular = SessionBasedPopular()
popularity_train_data = dataset.get_train_data()
model.train(popularity_train_data)

In [None]:
# For every cluster, pick the first item in model.items that belongs to the
# cluster and has a usable name.
cluster_to_popular_item = {}
for cluster in range(NUM_CLUSTERS):
    # Iterate through items from most to least popular
    # (assumes model.items is ordered that way; confirm against SessionBasedPopular).
    for item in model.items:
        if global_product_id_to_cluster[item] != cluster:
            continue
        item_name = product_id_to_name[item]
        # Some product names are NaN (a float, not a str); skip those.
        if not isinstance(item_name, str) and math.isnan(item_name):
            continue
        cluster_to_popular_item[cluster] = item
        break

cluster_to_popular_item[0]

In [None]:
# Load the item table, keeping only the ItemId and name columns.
# NOTE(review): ITEM_NAMES_DF ends in ".pickle" but is parsed with read_csv;
# confirm the file really is CSV-formatted.
item_df = pd.read_csv(ITEM_NAMES_DF, usecols=["ItemId", "name"])
item_df

In [None]:
# Show the rows of the item table whose name is missing.
item_df.loc[item_df["name"].isna()]

In [None]:
# Distinct ids of all items in the item table that have no name.
unnamed_item_ids = item_df.loc[item_df["name"].isna(), "ItemId"].unique()
unnamed_item_ids

In [None]:
sessions = dataset.get_train_data().groupby("SessionId")

train_prompts = {}
train_ground_truths = {}

# Walk over every training session and split it into prompt and ground truth.
for session_id, session_rows in sessions:
    session_items = session_rows["ItemId"].to_numpy()

    # A session containing any unnamed product is dropped entirely.
    if np.isin(session_items, unnamed_item_ids).any():
        continue

    # In a session of n items, the first n-1 items form the prompt and the
    # n-th item (kept as a one-element array) is the ground truth.
    train_prompts[session_id] = session_items[:-1]
    train_ground_truths[session_id] = session_items[-1:]

len(train_prompts)

In [None]:
# Rebuild the ItemId -> name lookup, this time from the item table.
product_id_to_name = item_df.set_index("ItemId")["name"].to_dict()
product_id_to_name

In [None]:
# Replace the item ids of every session prompt with the matching item names.
textified_train_prompts = {
    session: [product_id_to_name[product_id] for product_id in prompt_items]
    for session, prompt_items in train_prompts.items()
}

textified_train_prompts

In [None]:
# Number of nearest clusters kept as the target ranking of each session; 20
# matches the "Provide 20 product recommendations" instruction of the prompt.
TOP_K = 20

textified_categories = {}
# Sessions whose nearest cluster centre is not the cluster that k-means
# assigned to the ground-truth item (expected to stay empty).
cluster_mismatch_sessions = []

# Map global product id to its row index in the embedding matrix.
product_id_to_index = {
    product_id: row for row, product_id in enumerate(product_index_to_id)
}

for session, ground_truth in train_ground_truths.items():
    # Ground truths are stored as one-element arrays; unwrap the item id.
    ground_truth = ground_truth[0]

    # Rank all cluster centres by Euclidean distance to the ground-truth
    # embedding and keep the TOP_K closest ones.
    gt_embedding = product_index_to_embedding[product_id_to_index[ground_truth]]
    gt_embedding = np.array([gt_embedding], dtype=np.float64)
    distances = euclidean_distances(kmeans.cluster_centers_, gt_embedding).T[0]
    nearest_clusters = distances.argsort()[:TOP_K]

    # Describe each of those clusters by the name of its representative item.
    textified_categories[session] = [
        product_id_to_name[cluster_to_popular_item[nn]] for nn in nearest_clusters
    ]

    # Sanity check: the closest centre should be the cluster assigned to the
    # ground truth. A mismatch is recorded rather than aborting the loop, so
    # that every remaining session still gets its categories.
    if nearest_clusters[0] != global_product_id_to_cluster[ground_truth]:
        cluster_mismatch_sessions.append(session)

if cluster_mismatch_sessions:
    print(
        f"Warning: {len(cluster_mismatch_sessions)} session(s) have a ground truth "
        "whose nearest cluster differs from its assigned cluster."
    )
textified_categories

In [None]:
# One label per cluster: the name of the cluster's representative item.
all_categories = [
    product_id_to_name[popular_item]
    for popular_item in cluster_to_popular_item.values()
]
all_categories[0]

In [None]:
# System message placed first in every generated example.
# NOTE(review): "availabe" below is a typo for "available". It is left as-is
# because the string is part of the emitted prompts; confirm before changing.
SYSTEM_PROMPT = """You are a recommender system assistant. You have access to the user's previous purchases and a list of availabe products.
Provide 20 product recommendations for this user, only select from the available products.
"""

# User message template. `user_item_list` and
# `potential_recommendation_categories` are filled in by create_prompt with
# newline-joined names.
USER_PROMPT_TEMPLATE = """
The user's previous purchases: 
{user_item_list}

Available products:
{potential_recommendation_categories}


Please remember to only select recommendations from the available products.
"""

# Assistant message template: only the ranked list of recommended categories.
ASSISTANT_PROMPT_TEMPLATE = """{top_recommendation_categories}"""

def stringify_ranked_list(list_of_items) -> str:
    """Render items as a numbered list, one "<rank>. <item>" line per item.

    Ranks start at 1 and every line, including the last one, ends with a
    newline. An empty input yields an empty string.
    """
    # Join once instead of growing the string with += on every iteration.
    return "".join(
        f"{rank}. {item}\n" for rank, item in enumerate(list_of_items, 1)
    )


def create_prompt(train_prompt, recommendation_categories, top_recommendation_categories):
    """Build one chat example with a system, a user and an assistant message.

    train_prompt: names of the items in the session prompt, one line each in
        the user message.
    recommendation_categories: names offered as available products, one line
        each in the user message.
    top_recommendation_categories: names rendered as a numbered list in the
        assistant message.

    Returns a dict with a single "messages" key holding the three messages.
    """
    user_content = USER_PROMPT_TEMPLATE.format(
        user_item_list='\n'.join(train_prompt),
        potential_recommendation_categories='\n'.join(recommendation_categories),
    )
    assistant_content = ASSISTANT_PROMPT_TEMPLATE.format(
        top_recommendation_categories=stringify_ranked_list(top_recommendation_categories)
    )

    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ]
    }
# Preview the prompt of one session. The dictionaries are keyed by session id,
# not by position, so take an id that actually exists instead of assuming that
# a session with id 0 survived the filtering.
example_session_id = next(iter(textified_categories))
create_prompt(train_prompt=textified_train_prompts[example_session_id],
              recommendation_categories=all_categories,
              top_recommendation_categories=textified_categories[example_session_id])

In [None]:
min_session_length = 2
num_tokens = 0
num_validation_cases = int(0.2 * len(train_prompts))
tokens_per_message = 3
tokens_per_name = 1

train_cases = []
validation_cases = []

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Shuffle the session ids with a fixed seed to get a random validation set.
random.seed(42)
session_ids = list(train_prompts.keys())
random.shuffle(session_ids)

for position, session_id in enumerate(session_ids):
    # Sessions whose prompt part is too short are skipped.
    if len(train_prompts[session_id]) < min_session_length:
        continue

    # Build the chat example of this session.
    prompt = create_prompt(
        train_prompt=textified_train_prompts[session_id],
        recommendation_categories=all_categories,
        top_recommendation_categories=textified_categories[session_id],
    )

    # Count the tokens of the example. Start at 3 because every reply is
    # primed with <|start|>assistant<|message|>.
    # The 4096-token length filter is disabled, so the count only feeds the
    # cost estimate printed at the end.
    num_prompt_tokens = 3
    for message in prompt["messages"]:
        num_prompt_tokens += tokens_per_message
        num_prompt_tokens += sum(
            len(encoding.encode(value)) for value in message.values()
        )
        if "name" in message:
            num_prompt_tokens += tokens_per_name

    # The first `num_validation_cases` shuffled positions go to validation;
    # everything after that is training data and counts towards the cost.
    if position >= num_validation_cases:
        num_tokens += num_prompt_tokens
        train_cases.append(prompt)
    else:
        validation_cases.append(prompt)

# Serialize both sets to JSONL: one JSON document per line.
train_cases = [json.dumps(case) for case in train_cases]
train_string = '\n'.join(train_cases)

validation_cases = [json.dumps(case) for case in validation_cases]
validation_string = '\n'.join(validation_cases)

for jsonl_path, jsonl_payload in (
    ("train_cases_llmseqprompt_classify.jsonl", train_string),
    ("validation_cases_llmseqprompt_classify.jsonl", validation_string),
):
    with open(jsonl_path, "w") as f:
        f.write(jsonl_payload)

print(f'Training cases: {len(train_cases)}')
print(f'Validation cases: {len(validation_cases)}')
print(f"Num tokens: {num_tokens}")
cost = num_tokens * (0.008 / 1000)
print(f"Costs to train GPT-3 turbo one epoch, roughly: ${cost}")

In [None]:
# Persist the cluster -> representative item mapping as a pickle file.
with open("cluster_to_popular_item.pkl", "wb") as outfile:
    outfile.write(pickle.dumps(cluster_to_popular_item))

In [None]:
# Persist the item id -> cluster mapping as JSON.
# json.dump writes str, so the file must be opened in text mode (binary mode
# raises TypeError). The cluster labels are numpy integers, which json cannot
# serialize, so they are converted to built-in ints first. Note that JSON
# object keys are always strings, so integer item ids come back as strings.
with open("global_product_id_to_cluster.json", "w") as outfile:
    json.dump(
        {
            item_id: int(cluster_label)
            for item_id, cluster_label in global_product_id_to_cluster.items()
        },
        outfile,
    )