In [None]:
!pip install openai==0.28

In [1]:
import os, warnings
import requests

import pandas as pd
import openai
import tiktoken
import utils

warnings.filterwarnings('ignore')

api_key = 'sk-KEY'

openai.api_key = api_key


In [2]:
token_hf = 'hf-KEY'
path='D:/Disco/Data/huggingface/'
reviews_path = 'D:/Disco/Data/datasets/amazon_us_reviews/'

In [3]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens


In [50]:
class Utils:
    def __init__(self) -> None:
        self.reviews = []

    def load_reviews(self, file_name):
        self.reviews = pd.read_parquet(reviews_path + file_name)

    def get_reviews_by_product_and_category(self, product, category):
        return self.reviews[(self.reviews['product_category'] == category) & (self.reviews['product_title'] == product)]

    def get_prompt(self, product):
        return f"Please analyze the provided reviews of the product '{product}'. \n  \
        Create a concise summary that encapsulates the key opinions and sentiments expressed in these reviews. \n \
        The summary should be structured as if it's a single comprehensive review of the product. \n \
        The summary should mimic the style and tone of a customer reviews, making it relatable and genuine. \n \
        Also, provide a list of 5 tags that represent what the customers are saying about the product, give the balance between positive and negative aspects about the product, the tags have this format: #TagName. \
        Format your response as follows: \n \
            Product: {product}\n \
            Summary: [Your summary here]\n \
            Tags: #tag1 #tag2 #tag3 #tag4 #tag5\n \
        List of reviews: \n"
    
    def build_summary(self, review_qty=50, model='gpt-4'):
        results = pd.DataFrame(columns=['category', 'product', 'prompt', 'reviews', 'review_qty', 'token_qty', 'response'])

        for row, qty in self.reviews[['product_category', 'product_title']].value_counts()[:10].items():
            try:
                print(row[0], row[1], qty)
                category = row[0]
                product = row[1]

                product_reviews = self.reviews[(self.reviews['product_category'] == category) & (self.reviews['product_title'] == product)][:review_qty]
                
                reviews = '\n'.join(product_reviews['review_body'])
                prompt = self.get_prompt(product)

                message_prompt=[{
                    "role": "user",
                    "content": f"{prompt + reviews}:{product_reviews['review_body']}"}]
                
                token_qty = num_tokens_from_messages(message_prompt, model=model)
                print('tokens: ', token_qty)

                chat_completion = openai.ChatCompletion.create(model=model, messages=message_prompt)
                response = chat_completion["choices"][0]["message"]["content"]

                results = results.append({'category': category, 'product': product, 'prompt': prompt, 'reviews': reviews, 'review_qty':len(product_reviews), 'token_qty': token_qty, 'response': response}, ignore_index=True)
            except Exception as e:
                print('error processing: ', row[0], row[1], qty)
                pass

        return results

In [51]:
utl = Utils()
utl.load_reviews('reviews_aws_sample.parquet')

In [55]:
results = utl.build_summary(70, 'gpt-3.5-turbo-1106')
results

Home Entertainment Google Chromecast HDMI Streaming Media Player 1656
tokens:  5567
PC Kindle Fire HDX 7", HDX Display (Previous Generation - 3rd) 1415
tokens:  3908
PC Fire HD 7, 7" HD Display, Wi-Fi, 8 GB 1353
tokens:  2547
Electronics Panasonic ErgoFit In-Ear Earbud Headphone 1128
tokens:  4024
Toys Cards Against Humanity 1111
tokens:  2310
PC Kindle Fire (Previous Generation - 1st) 1091
tokens:  10548
PC Kindle Paperwhite, 6" High-Resolution Display (212 ppi) with Built-in Light, Wi-Fi 1086
tokens:  3235
PC SanDisk Ultra microSDHC Card Plus Adapter 1018
tokens:  3883
PC Fire HD 6 966
tokens:  2795
PC Kindle Fire HD 7", Dolby Audio, Dual-Band Wi-Fi 897
tokens:  7437


Unnamed: 0,category,product,prompt,reviews,response,review_qty,token_qty
0,Home Entertainment,Google Chromecast HDMI Streaming Media Player,Please analyze the provided reviews of the pro...,I ordered the google Chromecast because of the...,Product: Google Chromecast HDMI Streaming Medi...,70.0,5567.0
1,PC,"Kindle Fire HDX 7"", HDX Display (Previous Gene...",Please analyze the provided reviews of the pro...,I've really enjoyed my kindle fire hdx from li...,"Product: Kindle Fire HDX 7"", HDX Display (Prev...",70.0,3908.0
2,PC,"Fire HD 7, 7"" HD Display, Wi-Fi, 8 GB",Please analyze the provided reviews of the pro...,"Easy to set up and use, love the little cover...","Product: Fire HD 7, 7"" HD Display, Wi-Fi, 8 GB...",70.0,2547.0
3,Electronics,Panasonic ErgoFit In-Ear Earbud Headphone,Please analyze the provided reviews of the pro...,"Great!\nI didnt care for these personally, (th...",Product: Panasonic ErgoFit In-Ear Earbud Headp...,70.0,4024.0
4,Toys,Cards Against Humanity,Please analyze the provided reviews of the pro...,"It's funny, it's dark, it'll make you realize ...",Product: Cards Against Humanity \nSummary: Car...,70.0,2310.0
5,PC,Kindle Fire (Previous Generation - 1st),Please analyze the provided reviews of the pro...,"Good battery life, bright screen. I read in a...",Product: Kindle Fire (Previous Generation - 1s...,70.0,10548.0
6,PC,"Kindle Paperwhite, 6"" High-Resolution Display ...",Please analyze the provided reviews of the pro...,Love it\nWonderful!!\nlove it &#62;Ihave macul...,"Product: Kindle Paperwhite, 6"" High-Resolution...",70.0,3235.0
7,PC,SanDisk Ultra microSDHC Card Plus Adapter,Please analyze the provided reviews of the pro...,It formats the exFAT and my computer will not ...,Product: SanDisk Ultra microSDHC Card Plus Ada...,70.0,3883.0
8,PC,Fire HD 6,Please analyze the provided reviews of the pro...,perfect....just what I needed!\nFantastic devi...,Product: Fire HD 6\nSummary: The Fire HD 6 see...,70.0,2795.0
9,PC,"Kindle Fire HD 7"", Dolby Audio, Dual-Band Wi-Fi",Please analyze the provided reviews of the pro...,it only worked for about 3 months do not buy u...,"Product: Kindle Fire HD 7"", Dolby Audio, Dual-...",70.0,7437.0


In [56]:
results.to_csv(reviews_path + 'results_gpt3.5.csv', index=False)

In [40]:
category = 'Watches'
product = 'Timex Women\'s Easy Reader Leather Strap Watch'

product_reviews = utl.reviews[(utl.reviews['product_category'] == category) & (utl.reviews['product_title'] == product)]

prompt = utl.get_prompt(product) + '\n'.join(product_reviews['review_body'][:])

message_prompt=[{
        "role": "user",
        "content": f"{prompt}:{product_reviews['review_body']}"}]

#"gpt-3.5-turbo"
chat_completion = openai.ChatCompletion.create(model="gpt-3.5-turbo-1106", messages=message_prompt)

print(chat_completion["choices"][0]["message"]["content"])

Product: Timex Women's Easy Reader Leather Strap Watch
Summary: I bought this watch for my elderly mom and she loved it. The wide variety of materials and colors made it difficult to choose but I'm happy with the great color and dimensions of this watch. The oversize face is great and while the band is a bit stiff, I'm happy with how it looks. It's easy to read, but it does run slow and it's a bit tricky to reset the time. The indigo light is great for nighttime. However, the leather band started losing color and fading within a couple of months and getting it wet will temporarily stop it from working. Overall, it's a good and affordable watch, but think twice before buying it with a particular band.
Tags: #ElderlyMom #ColorVariety #OversizedFace #FadingLeatherBand #RunsSlow


Product: Timex Women's Easy Reader Leather Strap Watch
Summary: The Timex Women's Easy Reader watch seems to be a hit among many for its oversized face, making it easy to read even without glasses - a feature greatly appreciated especially by older folks. Its Indigo light feature also adds to the ease of reading in the dark. The watch offers a variety in band materials, colors, and sizes which gives room for personal preference. Despite its slightly heavy feel and stiff band that takes a while to soften and mold to the wrist, the aesthetic appeal of the watch still shines through. However, there are some significant drawbacks to take into consideration. The watch seems to have a reputation for running slow and the time resetting function appears a bit tricky. Attention needs to be paid to the durability of the band as well, as it may show noticeable fading after extended use. The watch also may not perform well when in contact with water.      
Tags: #LargeDial #AestheticAppeal #Questi

In [None]:
product[['review_body']].to_csv('D:/Disco/Data/datasets/amazon_us_reviews/reviews_aws_product.csv', index=False)