In [None]:
import openai
import re
from typing import Literal
import pickle
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm
from openai import AsyncOpenAI
import vertexai
from vertexai.preview.language_models import TextGenerationModel

In [None]:
# Initialise the Vertex AI SDK. No project or location is passed here, so the
# SDK has to resolve them on its own -- presumably from the environment; confirm.
vertexai.init()

In [None]:
# Name of the fine-tuned model. Must be filled in before running: it is passed
# to get_tuned_model() below and becomes part of the output file names.
MODEL_NAME = ""

# Dataset to evaluate on; selects the test-data pickle and the embeddings file.
DATASET: Literal["beauty", "steam"] = "beauty"

# Name of the pickle with the test data for the selected DATASET.
TEST_DATA_PICKLE_NAME = f"test_data_{DATASET}.pickle"

# Name of the gzipped CSV with the item embeddings for the selected DATASET.
EMBEDDINGS_NAME = f"embeddings_{DATASET}.csv.gz"

# OpenAI API key, handed to AsyncOpenAI when embedding item names that have no
# exact match. Fill in locally; do not commit a real key with the notebook.
OPENAI_KEY = ""


# Hyperparameters
# TOP_K caps the number of recommendations kept per completion by the parser.
# TEMPERATURE and TOP_P are the sampling settings sent with every request.
TOP_K = 20
TEMPERATURE = 0
TOP_P = 1.0

# Load the tuned model (looked up by MODEL_NAME).
model = TextGenerationModel.get_tuned_model(MODEL_NAME)

# Model parameters bundled in a dict.
# NOTE(review): nothing in the cells visible here reads `parameters`;
# run_completions passes TEMPERATURE and TOP_P directly.
parameters = {
    "temperature": TEMPERATURE,
    "top_p": TOP_P,
}

# The variants correspond to 4.1 to 4.4 respectively (presumably sections of
# the accompanying write-up -- confirm).
# NOTE(review): in the cells visible here VARIANT only feeds TOTAL_MODEL_NAME;
# the prompts and parse_method are wired to the "genlist" format.
VARIANT: Literal["genitem", "genlist", "class", "rank"] = "genlist"

# Identifier embedded in the names of the pickles written by this notebook.
TOTAL_MODEL_NAME = f"{MODEL_NAME}_{VARIANT}_temp_{TEMPERATURE}_top_p_{TOP_P}"

In [None]:
def parse_completion_genlist(completion: str) -> list[str]:
    # Use regular expression to find the second occurrence of "BEGIN" and "END"
    begin_matches = [m.start() for m in re.finditer(r"BEGIN", completion)]
    end_matches = [m.start() for m in re.finditer(r"END", completion)]

    recommendations = []
    if len(begin_matches) >= 2 and len(end_matches) >= 2:
        recommendations_text = completion[
            begin_matches[1] + len("BEGIN") : end_matches[1]
        ]
        # Use regular expression to extract items from the recommendations text
        items = re.findall(r"\d+\.\s(.*?)\n", recommendations_text)

        for i, item in enumerate(items, start=1):
            recommendations.append(item)
    else:
        # Alternative parse strategy.
        # Sometimes we don't have an END token.
        recommendations_text = completion.replace(
            """The recommendations are in the following
lines, in decreasing confidence order. The recommendations are delimited by
BEGIN and END. Each recommendation is in a separate line:
BEGIN""",
            "",
        )
        # Use regular expression to extract items from the recommendations text
        items = re.findall(r"\d+\.\s(.*?)\n", recommendations_text)
        for i, item in enumerate(items, start=1):
            recommendations.append(item)

        if len(recommendations) == 0:
            # Alternative parse strategy.
            # Sometimes we don't have an END token and only newlines per product.
            recommendations = completion.split("\n")
    # Remove empty strings
    recommendations = [x for x in recommendations if x]
    return recommendations[:TOP_K]


In [None]:
# Chat-style prompt parts for the "genlist" variant. Both are dicts with a
# "role" and a "content" entry; "{ITEMS}" in the user message is a placeholder
# that is substituted with the session's item names later on.
system_message = dict(
    role="system",
    content=(
        "You are a recommender system assistant.\n"
        "Provide 20 unique item recommendations complementary to the user's item list, "
        "ordered by the confidence level of each recommendation.\n"
        "Ensure all recommendations are from items included in the data you are fine-tuned with. "
        "List only the item names.\n"
    ),
)
user_message = dict(
    role="user",
    content=(
        "The user's item list are in the following lines\n"
        "delimited by BEGIN and END. Each item is in a separate line:\n"
        "BEGIN\n"
        "{ITEMS}\n"
        "END\n"
    ),
)
# Parser applied to every raw completion further down; this is the one for the
# "genlist" output format.
parse_method = parse_completion_genlist

## Load test prompts

In [None]:
# The pickle holds a 2-tuple; only its first element (session id -> item ids
# of the prompt) is used here.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle, which a bare open() never does.
with open(TEST_DATA_PICKLE_NAME, "rb") as test_data_file:
    test_prompts, _ = pickle.load(test_data_file)
# Preview the prompt of the first session.
test_prompts[list(test_prompts.keys())[0]]

## Get embeddings and build lookup tables

In [None]:
# Item table read from the gzipped CSV; the cells below use its "ItemId",
# "name" and "embedding" columns.
product_embeddings = pd.read_csv(EMBEDDINGS_NAME, compression="gzip")
product_embeddings

In [None]:
# ItemId -> product name.
product_id_to_name = product_embeddings.set_index("ItemId")["name"].to_dict()

# Product name -> ItemId (for identical names the last row wins).
product_name_to_id = product_embeddings.set_index("name")["ItemId"].to_dict()

# Embedding matrix with one row per ItemId. The "embedding" column stores each
# vector as a JSON string, so every value is decoded first.
product_index_to_embedding = np.array(
    [
        np.array(json.loads(serialized_vector))
        for serialized_vector in product_embeddings.set_index("ItemId")["embedding"]
        .to_dict()
        .values()
    ]
)

# Row position <-> ItemId. Both this list and the matrix rows follow the
# ItemId order of the frame, so row i belongs to product_index_to_id[i].
product_index_to_id = list(product_id_to_name)
product_id_to_index = {
    item_id: row for row, item_id in enumerate(product_index_to_id)
}

## Compute test prompts

In [None]:
# One (session id, [system message, user message]) pair per test session.
test_messages: list[tuple[int, list[dict[str, str]]]] = []

for session_id, prompt in test_prompts.items():
    # One item name per line, substituted for the {ITEMS} placeholder.
    item_lines = "\n".join(product_id_to_name[item_id] for item_id in prompt)
    custom_user_message = {
        **user_message,
        "content": user_message["content"].replace("{ITEMS}", item_lines),
    }
    test_messages.append((session_id, [system_message, custom_user_message]))
test_messages[0]

In [None]:
# Flatten each (system, user) message pair into one text prompt: the system
# content, a newline, then the user content.
vertexai_prompts = [
    (session_id, f'{system_part["content"]}\n{user_part["content"]}')
    for session_id, (system_part, user_part) in test_messages
]
vertexai_prompts[0]

# Compute completions

In [None]:
import asyncio
import time
# (session id, raw completion text) pairs, filled by run_completions().
completions: list[tuple[int, str]] = []


# Use async API to get parallel requests.
# Make sure batch_size is not too high otherwise we might hit rate limits.
async def run_completions():
    """Request a completion for every prompt and append it to ``completions``.

    The prompts are processed in fixed-size batches. All requests of a batch
    are awaited together; the batches themselves run one after another.
    """
    batch_size = 250
    for start_batch in tqdm(range(0, len(vertexai_prompts), batch_size)):
        end_batch = start_batch + batch_size
        batch = vertexai_prompts[start_batch:end_batch]

        start_time = time.perf_counter()
        print(f"Completion batch {start_batch} - {end_batch}")

        responses = await asyncio.gather(
            *[
                model.predict_async(
                    prompt=prompt_text,
                    temperature=TEMPERATURE,
                    top_p=TOP_P,
                    # Presumably ~50 tokens for each of the 20 requested items -- confirm.
                    max_output_tokens=50 * 20,
                )
                for _, prompt_text in batch
            ]
        )
        # Responses come back in request order, so they line up with the batch.
        completions.extend(
            (session_id, response.candidates[0].text)
            for (session_id, _), response in zip(batch, responses)
        )

        print(f"Finished batch {start_batch} - {end_batch}. Took {time.perf_counter() - start_time} seconds.")


# Top-level await: works because the notebook cell is executed on an already
# running event loop; it is not valid syntax in a plain Python script.
await run_completions()

In [None]:
# Persist the raw completions. The context manager makes sure the file is
# flushed and closed; a bare open() left that to the garbage collector.
with open(f"completions_vertexai_{TOTAL_MODEL_NAME}.pkl", "wb") as completions_file:
    pickle.dump(completions, completions_file)

### Parse completions


In [None]:
# (session id, parsed item names) for every completion, in completion order.
parsed_completions: list[tuple[int, list[str]]] = []
for session_id, completion_text in tqdm(completions):
    item_names: list[str] = parse_method(completion_text)
    # NOTE(review): a None result stops the whole loop, so all later sessions
    # would be skipped. parse_completion_genlist always returns a list, so
    # with the current parse_method this guard never fires.
    if item_names is None:
        break
    parsed_completions.append((session_id, item_names))
parsed_completions[0]

# Completed product names to global product ids
First we try to map each completed name to an exact product name; if there is no exact match, we use embeddings to find the closest item.

In [None]:
# Per session: the product id where the completed name matches a known product
# exactly, otherwise the name itself (resolved to the closest id later on).
recommendations: dict[int, list[int | str]] = {}
# Every completed name without an exact match, across all sessions.
unmappable_items: set = set()
for session_id, items in tqdm(parsed_completions):
    recommendations[session_id] = [
        product_name_to_id.get(name, name) for name in items
    ]
    unmappable_items.update(
        name for name in items if name not in product_name_to_id
    )
print(f"No exact match for {len(unmappable_items)} items. Will use embedding based search to find closest item.")
len(recommendations), recommendations[list(recommendations.keys())[0]]

In [None]:
# Completed name -> embedding vector, filled by get_embeddings().
unmappable_items_embeddings: dict[str, list[float]] = {}
# Turn the set into a list so it can be sliced into batches.
unmappable_items: list[str] = list(unmappable_items)


async def get_embeddings():
    """Embed every unmappable item name and store it in ``unmappable_items_embeddings``.

    The names are sent in fixed-size batches, one request after another; the
    returned vectors are paired with the names by position.
    """
    batch_size = 2000
    client = AsyncOpenAI(api_key=OPENAI_KEY)

    for start_batch in tqdm(range(0, len(unmappable_items), batch_size)):
        end_batch = start_batch + batch_size
        batch = unmappable_items[start_batch:end_batch]

        start_time = time.perf_counter()
        print(f"Embeddings batch {start_batch} - {end_batch}")
        response = await client.embeddings.create(
            input=batch, model="text-embedding-ada-002"
        )
        unmappable_items_embeddings.update(
            (item, result.embedding) for item, result in zip(batch, response.data)
        )

        print(f"Finished batch {start_batch} - {end_batch}. Took {time.perf_counter() - start_time} seconds.")
# Top-level await: works because the notebook cell is executed on an already
# running event loop; it is not valid syntax in a plain Python script.
await get_embeddings()

Find the closest actual item (with a global product id). Try to prevent duplicates.

In [None]:
# Per session, the recommendation list with every remaining product name
# replaced by the id of the most similar catalogue item.
final_recommendations: dict[int, list[int]] = {}
for session_id, recs in tqdm(recommendations.items()):
    # Ids that are already taken: initially the exact-match entries (ints).
    taken_ids: set = {rec for rec in recs if isinstance(rec, int)}
    resolved: list[int] = []
    for rec in recs:
        if isinstance(rec, int):
            # Already a product id -- keep it as is.
            resolved.append(rec)
            continue

        query = np.array([unmappable_items_embeddings[rec]], dtype=np.float64)

        # Dot product is the same as cosine similarity of embeddings with length 1.
        scores = (product_index_to_embedding @ query.T).T[0]

        # Walk the catalogue from most to least similar and take the first
        # product that is not in this session's slate yet.
        ranked_ids = (product_index_to_id[row] for row in scores.argsort()[::-1])
        closest_free_id = next(
            (product_id for product_id in ranked_ids if product_id not in taken_ids),
            None,
        )
        if closest_free_id is not None:
            resolved.append(closest_free_id)
            taken_ids.add(closest_free_id)

    final_recommendations[session_id] = resolved

# Save file

In [None]:
# Save the resolved recommendations (product ids only). The intermediate
# `recommendations` dict still contains raw product names wherever there was
# no exact match, so `final_recommendations` is what has to be written.
# The context manager makes sure the file is flushed and closed.
with open(f"recs_vertexai_{TOTAL_MODEL_NAME}.pickle", "wb") as recs_file:
    pickle.dump(final_recommendations, recs_file)