In [5]:
!pip install openai==0.28 tiktoken

Collecting tiktoken
  Downloading tiktoken-0.5.1-cp310-cp310-win_amd64.whl (759 kB)
     ------------------------------------ 759.8/759.8 kB 448.3 kB/s eta 0:00:00
Collecting regex>=2022.1.18
  Using cached regex-2023.10.3-cp310-cp310-win_amd64.whl (269 kB)
Installing collected packages: regex, tiktoken
Successfully installed regex-2023.10.3 tiktoken-0.5.1


In [16]:
import os, warnings
import requests

import pandas as pd
import openai
import tiktoken
import utils

warnings.filterwarnings('ignore')

api_key = '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'

openai.api_key = api_key


In [17]:
token_hf = 'hf-KEY'
path='D:/Disco/Data/huggingface/'
reviews_path = 'C:/Disco/Data/datasets/amazon_us_reviews/'

In [18]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens


In [24]:
class Utils:
    def __init__(self) -> None:
        self.reviews = []

    def load_reviews(self, file_name):
        self.reviews = pd.read_parquet(reviews_path + file_name)

    def get_reviews_by_product_and_category(self, product, category):
        return self.reviews[(self.reviews['product_category'] == category) & (self.reviews['product_title'] == product)]

    def get_prompt(self, product):
        return f"Please analyze the provided reviews of the product '{product}'. \n  \
        Create a concise summary that encapsulates the key opinions and sentiments expressed in these reviews. \n \
        The summary should be structured as if it's a single comprehensive review of the product. \n \
        The summary should mimic the style and tone of a customer reviews, making it relatable and genuine. \n \
        Also, provide a list of 5 tags that represent what the customers are saying about the product, give the balance between positive and negative aspects about the product, the tags have this format: #TagName. \
        Format your response as follows: \n \
            Product: {product}\n \
            Summary: [Your summary here]\n \
            Tags: #tag1 #tag2 #tag3 #tag4 #tag5\n \
        List of reviews: \n"
    
    def build_summary(self, review_qty=50, model='gpt-4'):
        results = pd.DataFrame(columns=['category', 'product', 'prompt', 'reviews', 'review_qty', 'token_qty', 'response'])

        for row, qty in self.reviews[['product_category', 'product_title']].value_counts()[:10].items():
            try:
                print(row[0], row[1], qty)
                category = row[0]
                product = row[1]

                product_reviews = self.reviews[(self.reviews['product_category'] == category) & (self.reviews['product_title'] == product)][:review_qty]
                
                reviews = '\n'.join(product_reviews['review_body'])
                prompt = self.get_prompt(product)

                message_prompt=[{
                    "role": "user",
                    "content": f"{prompt + reviews}:{product_reviews['review_body']}"}]
                
                token_qty = num_tokens_from_messages(message_prompt, model=model)
                print('tokens: ', token_qty)

                chat_completion = openai.ChatCompletion.create(model=model, messages=message_prompt, ssl_check=False)
                response = chat_completion["choices"][0]["message"]["content"]

                results = results.append({'category': category, 'product': product, 'prompt': prompt, 'reviews': reviews, 'review_qty':len(product_reviews), 'token_qty': token_qty, 'response': response}, ignore_index=True)
            except Exception as e:
                print('error processing: ', row[0], row[1], qty, e)
                pass

        return results

In [25]:
utl = Utils()
utl.load_reviews('reviews_aws_sample.parquet')

In [26]:
results = utl.build_summary(8, 'gpt-4')
results

Home Entertainment Google Chromecast HDMI Streaming Media Player 1656
tokens:  843
error processing:  Home Entertainment Google Chromecast HDMI Streaming Media Player 1656 Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:997)')))
PC Kindle Fire HDX 7", HDX Display (Previous Generation - 3rd) 1415
tokens:  686
error processing:  PC Kindle Fire HDX 7", HDX Display (Previous Generation - 3rd) 1415 Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:997)')))
PC Fire HD 7, 7" HD Display, Wi-Fi, 8 GB

Unnamed: 0,category,product,prompt,reviews,review_qty,token_qty,response


In [15]:
results.to_csv(reviews_path + 'results_gpt4.csv', index=False)

In [40]:
category = 'Watches'
product = 'Timex Women\'s Easy Reader Leather Strap Watch'

product_reviews = utl.reviews[(utl.reviews['product_category'] == category) & (utl.reviews['product_title'] == product)]

prompt = utl.get_prompt(product) + '\n'.join(product_reviews['review_body'][:])

message_prompt=[{
        "role": "user",
        "content": f"{prompt}:{product_reviews['review_body']}"}]

#"gpt-3.5-turbo"
chat_completion = openai.ChatCompletion.create(model="gpt-3.5-turbo-1106", messages=message_prompt)

print(chat_completion["choices"][0]["message"]["content"])

Product: Timex Women's Easy Reader Leather Strap Watch
Summary: I bought this watch for my elderly mom and she loved it. The wide variety of materials and colors made it difficult to choose but I'm happy with the great color and dimensions of this watch. The oversize face is great and while the band is a bit stiff, I'm happy with how it looks. It's easy to read, but it does run slow and it's a bit tricky to reset the time. The indigo light is great for nighttime. However, the leather band started losing color and fading within a couple of months and getting it wet will temporarily stop it from working. Overall, it's a good and affordable watch, but think twice before buying it with a particular band.
Tags: #ElderlyMom #ColorVariety #OversizedFace #FadingLeatherBand #RunsSlow


Product: Timex Women's Easy Reader Leather Strap Watch
Summary: The Timex Women's Easy Reader watch seems to be a hit among many for its oversized face, making it easy to read even without glasses - a feature greatly appreciated especially by older folks. Its Indigo light feature also adds to the ease of reading in the dark. The watch offers a variety in band materials, colors, and sizes which gives room for personal preference. Despite its slightly heavy feel and stiff band that takes a while to soften and mold to the wrist, the aesthetic appeal of the watch still shines through. However, there are some significant drawbacks to take into consideration. The watch seems to have a reputation for running slow and the time resetting function appears a bit tricky. Attention needs to be paid to the durability of the band as well, as it may show noticeable fading after extended use. The watch also may not perform well when in contact with water.      
Tags: #LargeDial #AestheticAppeal #Questi

In [None]:
product[['review_body']].to_csv('D:/Disco/Data/datasets/amazon_us_reviews/reviews_aws_product.csv', index=False)