In [24]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.5.1-cp310-cp310-win_amd64.whl.metadata (6.8 kB)
Downloading tiktoken-0.5.1-cp310-cp310-win_amd64.whl (759 kB)
   ---------------------------------------- 0.0/759.8 kB ? eta -:--:--
   - ------------------------------------- 30.7/759.8 kB 660.6 kB/s eta 0:00:02
   ------ --------------------------------- 122.9/759.8 kB 1.4 MB/s eta 0:00:01
   ----------------------- ---------------- 450.6/759.8 kB 3.5 MB/s eta 0:00:01
   ---------------------------------------- 759.8/759.8 kB 4.8 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.5.1


In [25]:
import os
import transformers
import torch
import pandas as pd
import tiktoken
#import utils

#from google.colab import drive
from transformers import AutoTokenizer, pipeline, AutoModel, AutoModelForCausalLM

os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'

In [26]:
token_hf = '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'
path='D:/Disco/Data/huggingface/'
reviews_path = 'D:/Disco/Data/datasets/amazon_us_reviews/'

In [27]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens


In [22]:
class Utils:
    def __init__(self) -> None:
        self.reviews = []

    def load_reviews(self, file_name):
        self.reviews = pd.read_parquet(reviews_path + file_name)

    def get_reviews_by_product_and_category(self, product, category):
        return self.reviews[(self.reviews['product_category'] == category) & (self.reviews['product_title'] == product)]

    def get_prompt(self, product):
        return f"Please analyze the provided reviews of the product '{product}'. \n  \
        Create a concise summary that encapsulates the key opinions and sentiments expressed in these reviews. \n \
        The summary should be structured as if it's a single comprehensive review of the product. \n \
        The summary should mimic the style and tone of a customer reviews, making it relatable and genuine. \n \
        Also, provide a list of 5 tags that represent what the customers are saying about the product, give the balance between positive and negative aspects about the product, the tags have this format: #TagName. \
        Format your response as follows: \n \
            Product: {product}\n \
            Summary: [Your summary here]\n \
            Tags: #tag1 #tag2 #tag3 #tag4 #tag5\n \
        List of reviews: \n"
    
    def build_summary(self, review_qty=50, model='gpt-4'):
        results = pd.DataFrame(columns=['category', 'product', 'prompt', 'reviews', 'review_qty', 'token_qty', 'response'])

        for row, qty in self.reviews[['product_category', 'product_title']].value_counts()[:10].items():
            try:
                print(row[0], row[1], qty)
                category = row[0]
                product = row[1]

                product_reviews = self.reviews[(self.reviews['product_category'] == category) & (self.reviews['product_title'] == product)][:review_qty]
                
                reviews = '\n'.join(product_reviews['review_body'])
                prompt = self.get_prompt(product)

                message_prompt=[{
                    "role": "user",
                    "content": f"{prompt + reviews}:{product_reviews['review_body']}"}]
                
                token_qty = num_tokens_from_messages(message_prompt, model=model)
                print('tokens: ', token_qty)

                chat_completion = openai.ChatCompletion.create(model=model, messages=message_prompt)
                response = chat_completion["choices"][0]["message"]["content"]

                results = results.append({'category': category, 'product': product, 'prompt': prompt, 'reviews': reviews, 'review_qty':len(product_reviews), 'token_qty': token_qty, 'response': response}, ignore_index=True)
            except Exception as e:
                print('error processing: ', row[0], row[1], qty)
                pass

        return results

In [7]:
model_name = "ehartford_dolphin-2.2.1-mistral-7b"

tokenizer = AutoTokenizer.from_pretrained(path + model_name)
model = AutoModelForCausalLM.from_pretrained(path + model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [01:11<00:00, 35.71s/it]


In [16]:

pipeline_llm = transformers.pipeline(
    "text-generation",
    model=path + model_name,
    torch_dtype=torch.float32,#16 is GPU, 32 is CPU
    #device_map="auto"#, 
    device='cpu'
)

Loading checkpoint shards: 100%|██████████| 2/2 [01:50<00:00, 55.39s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
def prompt_llm(prompt_text):
  sequences = pipeline_llm(
      prompt_text,
      do_sample=False,
      top_k=0,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
      max_length=1000,
  )

  result = ('Result from llm:')
  
  for seq in sequences:
      result += (f"Result: {seq['generated_text']}")

  return result

In [2]:
utl = Utils()
utl.load_reviews('reviews_aws_sample.parquet')

In [3]:
category = 'Watches'
product = 'Timex Women\'s Easy Reader Leather Strap Watch'

product_reviews = utl.reviews[(utl.reviews['product_category'] == category) & (utl.reviews['product_title'] == product)]

prompt = utl.get_prompt(product) + '\n'.join(product_reviews['review_body'][:])
result = prompt_llm(prompt)

print(result)

In [4]:
print(utl.get_prompt(product) + '\n'.join(product_reviews['review_body'][:]))

Please analyze the provided reviews of the product 'Timex Women's Easy Reader Leather Strap Watch'. 
          Create a concise summary that encapsulates the key opinions and sentiments expressed in these reviews. 
         The summary should be structured as if it's a single comprehensive review of the product. 
         The summary should mimic the style and tone of a customer reviews, making it relatable and genuine. 
         Also, provide a list of 5 tags that represent what the customers are saying about the product, give the balance between positive and negative aspects about the product, the tags have this format: #TagName.         Format your response as follows: 
             Product: Timex Women's Easy Reader Leather Strap Watch
             Summary: [Your summary here]
             Tags: #tag1 #tag2 #tag3 #tag4 #tag5
         List of reviews: 
Purchased as a gift for my elderly Mom.  She loved it.  What more can I add.
Nice !
Timex offers a wide variety of materials, colors

In [18]:
prompt = utl.get_prompt(product) + '\n'.join(product_reviews['review_body'][:])
prompt_llm(prompt)

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


"Result from llm:Result: Please analyze the provided reviews of the product 'Timex Women's Easy Reader Leather Strap Watch'. \n          Create a concise summary that encapsulates the key opinions and sentiments expressed in these reviews. \n         The summary should be structured as if it's a single comprehensive review of the product. \n         The summary should mimic the style and tone of a customer reviews, making it relatable and genuine. \n         Also, provide a list of 5 tags that represent what the customers are saying about the product, give the balance between positive and negative aspects about the product, the tags have this format: #TagName.         Format your response as follows: \n             Product: Timex Women's Easy Reader Leather Strap Watch\n             Summary: [Your summary here]\n             Tags: #tag1 #tag2 #tag3 #tag4 #tag5\n         List of reviews: \nPurchased as a gift for my elderly Mom.  She loved it.  What more can I add.\nNice !\nTimex offers