In [None]:
!pip install tiktoken transformers accelerate sentencepiece



In [None]:
import os
import transformers
import torch
import pandas as pd
import tiktoken
import warnings

from google.colab import drive
from transformers import AutoTokenizer, pipeline, AutoModel, AutoModelForCausalLM

# Silence every warning category for the remainder of the session.
warnings.simplefilter('ignore')
# NOTE(review): 'TOKEN' looks like a placeholder value - confirm before relying on it.
os.environ.update(HUGGINGFACEHUB_API_TOKEN='TOKEN')

In [None]:
# Mount Google Drive under `path`, then make the reviews folder the working directory.
path = '/content/drive/'
drive.mount(path)

os.chdir(f'{path}MyDrive/Data/reviews/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# SECURITY: access tokens must never be written into the notebook. The literal that
# used to be assigned here has to be treated as leaked and revoked in the Hugging
# Face account settings. The token is now taken from the HF_TOKEN environment
# variable (empty string when it is not set).
token_hf = os.environ.get('HF_TOKEN', '')
# Root of the mounted Google Drive (local alternative: 'D:/Disco/Data/huggingface/').
path = '/content/drive/'
# Reviews folder relative to `path` (local alternative: 'D:/Disco/Data/datasets/amazon_us_reviews/').
reviews_path = 'MyDrive/Data/reviews/'

In [None]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` produces under the tiktoken encoding `encoding_name`."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Count the tokens a list of chat messages consumes for `model`.

    Each message is a mapping whose values are all encoded and counted; a
    per-message overhead, a per-"name" adjustment and a fixed 3-token reply
    primer are added on top. Unversioned "gpt-3.5-turbo*" / "gpt-4*" names are
    counted as their 0613 snapshots (with a printed warning).

    Raises:
        NotImplementedError: if `model` is not one of the supported families.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Snapshots that share the same per-message / per-name overhead.
    pinned_snapshots = frozenset((
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    ))

    if model in pinned_snapshots:
        per_message, per_name = 3, 1
    elif model == "gpt-3.5-turbo-0301":
        # every message follows <|start|>{role/name}\n{content}<|end|>\n
        # and, if there's a name, the role is omitted
        per_message, per_name = 4, -1
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    total = 0
    for message in messages:
        total += per_message
        total += sum(len(encoding.encode(text)) for text in message.values())
        if "name" in message:
            total += per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3


In [None]:
model_name = "ehartford/dolphin-2.2.1-mistral-7b"

# Load the checkpoint exactly once. Previously the model was loaded here with the
# default dtype and no device map, and then loaded a second time by name inside
# `transformers.pipeline`, doubling load time and memory use.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # 16 is GPU, 32 is CPU
    device_map="auto",
)

# Reuse the already-placed model and tokenizer; no device arguments are needed
# because the model was dispatched by `device_map="auto"` above.
pipeline_llm = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def prompt_llm(prompt_text, product):
    """Run `prompt_text` through `pipeline_llm` and extract the summary section.

    Args:
        prompt_text: Full prompt (instructions plus reviews) sent to the model.
        product: Product title; used to locate the "Product: ...\\nSummary:" marker.

    Returns:
        The generated text from the "Product: {product}" marker onwards, or the
        string "Resumo não encontrado." when no marker is found or the pipeline
        returns nothing.
    """
    import re

    not_found = "Resumo não encontrado."

    sequences = pipeline_llm(
        prompt_text,
        do_sample=False,
        top_k=0,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=2500,
    )

    # Only one sequence is requested; guard against an empty result instead of
    # silently returning None.
    if not sequences:
        return not_found
    generated_text = sequences[0]['generated_text']

    # Exact marker that opens the summary.
    start_pattern = f"Product: {product}\nSummary:"
    start_index = generated_text.find(start_pattern)
    # If the marker is found, return the text from that point on.
    if start_index != -1:
        return generated_text[start_index:]

    # Fallback: the model may reproduce the marker with extra spaces or
    # indentation. Search only the completion (the text after the echoed prompt),
    # so the response template inside the prompt itself is never matched.
    if generated_text.startswith(prompt_text):
        tolerant_marker = re.compile(
            r"Product:[ \t]*" + re.escape(product) + r"[ \t]*\n\s*Summary:"
        )
        match = tolerant_marker.search(generated_text, len(prompt_text))
        if match:
            return generated_text[match.start():]

    return not_found

In [None]:
# Summarization pipeline (facebook/bart-large-cnn) used to shorten each review
# before it is placed in the LLM prompt.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

In [None]:
class Utils:
    """Loads a reviews table and builds LLM summaries for the most-reviewed products.

    The reviews table is expected to expose the columns read below:
    'product_category', 'product_title' and 'review_body'.
    """

    def __init__(self) -> None:
        # Replaced by a DataFrame once load_reviews() is called.
        self.reviews = []

    def load_reviews(self, file_name):
        """Read the parquet file `file_name` from `path + reviews_path` into self.reviews."""
        self.reviews = pd.read_parquet(path+reviews_path + file_name)

    def get_reviews_by_product_and_category(self, product, category):
        """Return the rows of self.reviews matching both `category` and `product`."""
        return self.reviews[(self.reviews['product_category'] == category) & (self.reviews['product_title'] == product)]

    def get_prompt(self, product):
        """Return the instruction prompt for `product`; the review list is appended by the caller."""
        return f"Please analyze the provided reviews of the product '{product}'. \n  \
        Create a concise summary that encapsulates the key opinions and sentiments expressed in these reviews. \n \
        The summary should be structured as if it's a single comprehensive review of the product. \n \
        The summary should mimic the style and tone of a customer reviews, making it relatable and genuine. \n \
        Also, provide a list of 5 tags that represent what the customers are saying about the product, give the balance between positive and negative aspects about the product, the tags have this format: #TagName. \
        Format your response as follows: \n \
            response:[ \n \
            Product: {product}\n \
            Summary: [Your summary here]\n \
            Tags: #tag1 #tag2 #tag3 #tag4 #tag5 ] \n \
        List of reviews: \n"

    def build_summary(self, review_qty=50, model='gpt-4'):
        """Summarize the 10 most-reviewed (category, product) pairs.

        For each pair, the first `review_qty` reviews are shortened with
        `summarizer`, joined into a prompt and sent to `prompt_llm`.

        Args:
            review_qty: Maximum number of reviews used per product.
            model: Model name passed to `num_tokens_from_messages` for the
                token count only; generation itself is done by `prompt_llm`.

        Returns:
            DataFrame with columns 'category', 'product', 'prompt', 'reviews',
            'review_qty', 'token_qty' and 'response', one row per product that
            was processed without error.
        """
        columns = ['category', 'product', 'prompt', 'reviews', 'review_qty', 'token_qty', 'response']
        # Rows are collected in a list and turned into a frame once at the end:
        # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
        records = []

        top_products = self.reviews[['product_category', 'product_title']].value_counts()[:10]
        for (category, product), qty in top_products.items():
            try:
                print(category, ': ', product, qty)

                # Copy the slice so the per-review rewrite below never touches
                # (or warns about) the source table.
                product_reviews = self.get_reviews_by_product_and_category(product, category)[:review_qty].copy()

                # `review_row` is deliberately a distinct name: reusing the outer
                # loop variable used to corrupt the error message below.
                for index, review_row in product_reviews.iterrows():
                    try:
                        text_to_summarize = review_row['review_body']
                        summary = summarizer(text_to_summarize, max_length=10, min_length=1, do_sample=False)
                        product_reviews.at[index, 'review_body'] = summary[0]['summary_text']
                    except Exception as e:
                        # Best effort: keep the original review text, but report the failure.
                        print('error summarizing review: ', index, e)

                reviews = '\n'.join(product_reviews['review_body'])
                prompt = self.get_prompt(product)

                message_prompt=[{
                    "role": "user",
                    "content": f"{prompt}:{reviews}"}]

                token_qty = num_tokens_from_messages(message_prompt, model=model)
                print('tokens: ', token_qty)
                print(len(prompt + reviews))
                response = prompt_llm(prompt + reviews, product)

                records.append({'category': category, 'product': product, 'prompt': prompt, 'reviews': reviews, 'review_qty':len(product_reviews), 'token_qty': token_qty, 'response': response})
            except Exception as e:
                print('error processing: ', category, product, qty, e)

        return pd.DataFrame(records, columns=columns)

In [None]:
# Create the helper and load the sample reviews parquet into it.
utl = Utils()
utl.load_reviews(file_name='reviews_aws_sample.parquet')

In [None]:
# Build the summaries using at most 5 reviews per product, then display the table.
results = utl.build_summary(review_qty=5)
results

Home Entertainment :  Google Chromecast HDMI Streaming Media Player 1656


Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  223
1054
-1
PC :  Kindle Fire HDX 7", HDX Display (Previous Generation - 3rd) 1415


Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  244
1088
2647
PC :  Fire HD 7, 7" HD Display, Wi-Fi, 8 GB 1353


Your max_length is set to 10, but your input_length is only 3. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1)
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  241
1028
1221
Electronics :  Panasonic ErgoFit In-Ear Earbud Headphone 1128


Your max_length is set to 10, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  231
1054
-1
Toys :  Cards Against Humanity 1111


Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  212
1005
1047
PC :  Kindle Fire (Previous Generation - 1st) 1091


Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  229
1053
-1
PC :  Kindle Paperwhite, 6" High-Resolution Display (212 ppi) with Built-in Light, Wi-Fi 1086


Your max_length is set to 10, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Your max_length is set to 10, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  254
1121
-1
PC :  SanDisk Ultra microSDHC Card Plus Adapter 1018


Your max_length is set to 10, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  226
1027
-1
PC :  Fire HD 6 966


Your max_length is set to 10, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 10, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  213
963
-1
PC :  Kindle Fire HD 7", Dolby Audio, Dual-Band Wi-Fi 897


Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tokens:  240
1048
1575


Unnamed: 0,category,product,prompt,reviews,review_qty,token_qty,response
0,Home Entertainment,Google Chromecast HDMI Streaming Media Player,Please analyze the provided reviews of the pro...,Chromecast costs $29.\nThere is the occasional...,5,223,Resumo não encontrado.
1,PC,"Kindle Fire HDX 7"", HDX Display (Previous Gene...",Please analyze the provided reviews of the pro...,"""I've really enjoyed my kind\ni love my kindle...",5,244,"Product: Kindle Fire HDX 7"", HDX Display (Prev..."
2,PC,"Fire HD 7, 7"" HD Display, Wi-Fi, 8 GB",Please analyze the provided reviews of the pro...,"Easy to set up and use,\nBig enough screen, gr...",5,241,"Product: Fire HD 7, 7"" HD Display, Wi-Fi, 8 GB..."
3,Electronics,Panasonic ErgoFit In-Ear Earbud Headphone,Please analyze the provided reviews of the pro...,CNN.com will feature iRep\nI didn't care for t...,5,231,Resumo não encontrado.
4,Toys,Cards Against Humanity,Please analyze the provided reviews of the pro...,The only downside is that you'll\nFantastic ga...,5,212,Product: Cards Against Humanity\nSummary: Card...
5,PC,Kindle Fire (Previous Generation - 1st),Please analyze the provided reviews of the pro...,"Good battery life, bright screen.\nI love ever...",5,229,Resumo não encontrado.
6,PC,"Kindle Paperwhite, 6"" High-Resolution Display ...",Please analyze the provided reviews of the pro...,CNN.com will feature iRep\nMr. Wonderful has b...,5,254,Resumo não encontrado.
7,PC,SanDisk Ultra microSDHC Card Plus Adapter,Please analyze the provided reviews of the pro...,It formats the exFAT and\nThe Sandisk Sansa Cl...,5,226,Resumo não encontrado.
8,PC,Fire HD 6,Please analyze the provided reviews of the pro...,"""It was. perfect....\nThe kids free time app i...",5,213,Resumo não encontrado.
9,PC,"Kindle Fire HD 7"", Dolby Audio, Dual-Band Wi-Fi",Please analyze the provided reviews of the pro...,Do not buy used stuff from this\nI love my new...,5,240,"Product: Kindle Fire HD 7"", Dolby Audio, Dual-..."


In [None]:
# Persist the summary table (row index included) next to the input reviews.
results.to_csv(f'{path}{reviews_path}results_mistral_dolphin_2.csv', index=True)

In [None]:
# Display the summary table.
results