In [None]:
!pip install wheel setuptools pip --upgrade
!pip install --upgrade openai

## Data Processing

In [None]:
import gzip
import os
import random
import re
import time

import numpy as np
import pandas as pd
from openai import OpenAI


def parse(path):
  """Yield one evaluated record per line of a gzip-compressed file.

  Every line of the file is evaluated as a Python expression and the
  resulting object is yielded.

  Args:
    path: Path of the gzip file to read.

  Yields:
    The object obtained by evaluating each line, in file order.
  """
  # The context manager guarantees the handle is closed once the generator is
  # exhausted (or closed early); previously the file was never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY NOTE: eval() executes whatever code the line contains. Only
      # run this on trusted data files; for untrusted input switch to a
      # literal-only parser after confirming the file format.
      yield eval(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file; that position
  becomes the row index of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Keep at most the first 20 whitespace-separated words of a string
def truncate_to_20_words(s):
    """Return ``s`` cut down to its first 20 words.

    Words are split on runs of whitespace and re-joined with single spaces.
    Any value that is not a ``str`` is handed back unchanged.
    """
    # Non-string values pass straight through
    if not isinstance(s, str):
        return s
    return ' '.join(s.split()[:20])

# Load the review table and the product-metadata table
Beautydf = getDF('reviews_Beauty_5.json.gz')
Beautymetadf = getDF('meta_Beauty.json.gz')

# Shorten every product title to at most 20 words
Beautymetadf['title'] = Beautymetadf['title'].apply(truncate_to_20_words)

# Left-join metadata onto the reviews, drop rows missing any required metadata
# field, then keep only titles that correspond to exactly one asin
merged_df = (
    Beautydf
    .merge(Beautymetadf, on='asin', how='left')
    .dropna(subset=['title', 'description', 'categories', 'brand'])
    .groupby('title')
    .filter(lambda title_rows: title_rows['asin'].nunique() == 1)
)

# Keep only users with >= 6 rows of purchase history
merged_df6 = merged_df.groupby('reviewerID').filter(lambda user_rows: len(user_rows) >= 6)

beauty_df = merged_df6.reset_index(drop=True)

# Distinct product titles; the candidate pool is a separate list with the same content
all_items = list(beauty_df['title'].unique())
all_cand_items = list(all_items)

# # Or, you may try with a smaller subset
# unique_users = beauty_df['reviewerID'].dropna().unique()[:10]
# beauty_df = beauty_df[beauty_df['reviewerID'].isin(unique_users)]
# all_items = list( beauty_df['title'].unique() )


In [None]:
# Read the OpenAI key from the environment instead of hardcoding it in the
# notebook (a saved notebook would otherwise leak the credential). Falls back
# to an empty string when OPENAI_API_KEY is unset.
API_KEY = os.environ.get('OPENAI_API_KEY', '')
client = OpenAI(api_key = API_KEY)
model_id = 'gpt-3.5-turbo-0125' # 'gpt-4-0125-preview'

# How many candidate reformulation algorithms to sample from the model
NUM_CANDIDATE_PROMPTS = 7

# Take the first user's history as the example embedded in the meta-prompt.
# The loop variable is named reviewer_id so the builtin id() is not shadowed.
for reviewer_id in beauty_df['reviewerID'].unique():

    user_df = beauty_df[ beauty_df['reviewerID'] == reviewer_id ]
    user_df = user_df.sort_values(by='unixReviewTime', ascending = True)

    user_items = list( user_df['title'].unique() )

    # keep last 15 items
    user_items_applied = user_items[-15:]
    break


# Meta-prompt asking the model to design an algorithm for reformulating a
# purchase history; the example history above is interpolated into it.
APE_prompt = f"""
We will employ GPT-3.5 to perform personalized recommendation for Beauty Products, in which we will feed GPT-3.5 with a user's prior purchase history as well as a set of candidate items to select.
The user's prior history consists a list of beauty products, each represented by its product title. The following can be an example for the user's prior purchase history: {user_items_applied}
The candidate items are a list of beauty products, each also represented by its product title.

Now, please propose a novel, detailed, and step-by-step algorithm to reformulate the user purchase history into a format that is most suitable for GPT-3.5.
"""

# prompts collects every sampled candidate; prompt_performances is created
# empty here and not filled in by this cell.
prompts = []
prompt_performances = {}
for _ in range(NUM_CANDIDATE_PROMPTS):

    # temperature 1.0 so repeated calls give different candidates
    completion = client.chat.completions.create(
        model = model_id, temperature = 1.0,

        messages=[{"role": "system", "content": 'Please come up with a very detailed, clear, and novel candidate prompt according to the narrative provided.'},
                    {"role": "user", "content": APE_prompt}],
        timeout = 1200)

    candidate_prompt = completion.choices[0].message.content
    prompts.append(candidate_prompt)
    print(candidate_prompt)
    print()


To formulate the user's prior purchase history into a format suitable for GPT-3.5, we can employ a step-by-step algorithm that preprocesses and structures the data effectively. Here is a detailed process to achieve this:

Step 1: Extract Keywords from Product Titles
- Remove special characters, numbers, and unnecessary symbols from the product titles.
- Tokenize the remaining words to extract meaningful keywords.
- Utilize techniques like stemming or lemmatization to standardize the keywords.

Step 2: Remove Stopwords and Irrelevant Terms
- Eliminate common stopwords (e.g., 'and', 'the', 'for') and irrelevant terms that do not contribute to the context.
- Exclude brand names, generic words like 'facial,' 'mask,' 'oil,' etc., as they might not be crucial for understanding preferences.

Step 3: Group Similar Products
- Cluster or group similar products together based on their keyword similarity.
- Use techniques such as cosine similarity or hierarchical clustering to categorize related i

## Performance Evaluation

In [None]:
# Define the system message
system_msg = "Please serve as a Recommender System on Beauty Products, based on user's prior purchase information provided."

NUM_NEGATIVES = 99    # negatives sampled per user; with the held-out item this gives 100 candidates
MAX_HISTORY = 15      # most recent distinct items kept per user (the last one is the target)
REPORT_EVERY = 20     # print running accuracy every this many users
SAMPLING_SEED = 42    # fixes the candidate sets so reruns evaluate the same item lists


def _chat(system_content, user_content, temperature):
    """Send one system + user exchange to the chat model and return the reply text.

    Args:
        system_content: Text of the system message.
        user_content: Text of the user message.
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first choice, or an empty string when the API
        returns no text content (so substring checks never hit None).
    """
    completion = client.chat.completions.create(
            model = model_id, temperature = temperature,
            messages=[{"role": "system", "content": system_content},
                        {"role": "user", "content": user_content}],
            timeout = 1200)
    return completion.choices[0].message.content or ''


# Dedicated generator: reproducible sampling without touching the global random state
rng = random.Random(SAMPLING_SEED)

# Hoisted out of the loop: the number of users does not change while evaluating
num_users = beauty_df['reviewerID'].nunique()

right_count = 0
compressed_right_count = 0
total = 0
# reviewer_id (rather than id) keeps the builtin id() usable afterwards
for reviewer_id in beauty_df['reviewerID'].unique():

    user_df = beauty_df[ beauty_df['reviewerID'] == reviewer_id ]
    user_df = user_df.sort_values(by='unixReviewTime', ascending = True)

    user_items = list( user_df['title'].unique() )

    # keep the last MAX_HISTORY items; the most recent one is held out as the target
    user_items_applied = user_items[-MAX_HISTORY:]
    target = user_items_applied[-1]
    history = user_items_applied[:-1]

    # Negatives are drawn from items the user never purchased, so the target
    # cannot appear among them; it is added explicitly afterwards.
    purchased = set(user_items)  # set membership instead of scanning a list per candidate
    filtered_list = [x for x in all_cand_items if x not in purchased]
    sampled_items = rng.sample(filtered_list, NUM_NEGATIVES)

    sampled_items.append(target)
    rng.shuffle(sampled_items)

    # Baseline: recommend directly from the raw title history
    augmented_prompt = (
            f"Given the user has purchased the following items in chronological order: "
            f"{history}; output a list of 10 items to recommend out of the following candidate items ONLY; do NOT explain anything, just output the items:"
            f"\n{sampled_items}"
        )

    pred = _chat(system_msg, augmented_prompt, 0)

    total += 1
    # A hit means the target title occurs anywhere in the model's reply text
    if target in pred:
        right_count += 1

    # Perform ADE: reformulate the history with the first generated algorithm
    reformulation_prompt = f'Please thoroughly reformulate the user purchase history based on the following algorithm:\n\n{prompts[0]}\n\nUser purchase history to reformulate: {history}\n\nReturn the reformulation of the user purchase history ONLY.'

    reformulated_history = _chat(
            'Please reformulate the user purchase history to be much more informative and detailed based on the narrative provided.',
            reformulation_prompt,
            1.0)

    # Same recommendation task, but driven by the reformulated history
    compressed_prompt = (
            f"Given the user has purchased the following items in chronological order:\n\n"
            f"{reformulated_history}\n\nOutput a list of 10 items to recommend out of the following 100 candidate items ONLY; do NOT explain anything, just output the items:"
            f"\n{sampled_items}"
        )

    compressed_pred = _chat(system_msg, compressed_prompt, 0)

    if target in compressed_pred:
        compressed_right_count += 1

    # Running report every REPORT_EVERY users and once more after the final user
    if total % REPORT_EVERY == 0 or total == num_users:
        print(f"Accuracy: {right_count/total}")
        print(f"Compressed Accuracy: {compressed_right_count/total}")
        print()

