In [21]:
"""
!pip install nltk
!pip install pandas
"""




In [485]:
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
from reportlab.platypus import Paragraph
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.styles import ParagraphStyle
import nltk

from nltk.stem import WordNetLemmatizer
from string import digits
import re
nltk.download('wordnet')
nltk.download('stopwords')
import pandas as pd
import matplotlib.pyplot as plt

from transformers import pipeline

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jojoshulk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jojoshulk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [418]:

def generate_pdf(product: str,
                 review: str,
                 product_image: str,
                 output_path: str,
                 average_rating: float,
                 rating_frequencies: str,
                 rating_history: str):
    """
    Function that generates a product review pdf
    Args:
        product:
        review:
        product_image:
        output_path:
        average_rating:
        rating_frequencies:
        rating_history:

    Returns:

    """
    # Register the font with reportlab
    pdfmetrics.registerFont(TTFont('VeraBd', 'VeraBd.ttf'))

    # Create a PDF canvas
    c = canvas.Canvas(f"{output_path}", pagesize=A4)
    styles = getSampleStyleSheet()

    # Draw the product image
    c.drawImage(product_image, 50, 410, height=350, width=200, preserveAspectRatio=True)

    # Set the font for the product name
    # c.rect(0, 0, 1000, 1000, stroke=1, fill=0)
    c.setFont("VeraBd", 30)
    x = c._pagesize[0] / 2
    c.drawCentredString(x, 790, f"{product}")


    # --- Review text box --- #
    # Set the font for the review
    c.setFont("Arial", 30)

    # Draw a rectangle for the review text box
    style = ParagraphStyle(name='Normal_1', fontSize=12)
    c.rect(290, 590, 270, 160, stroke=1, fill=0)
    p = Paragraph(review, style=style)
    p.wrapOn(c, 250, 0)
    p.drawOn(c, 300, 600)


    # --- Average Rating --- #
    # Set the font for the product name
    c.setFont("VeraBd", 20)
    x = 300 + 250 /2
    c.drawCentredString(x, 550, f"Rating: {average_rating:.2f}/5")

    # Draw the average rating image
    c.drawImage(rating_frequencies, 290, 360, width=260,preserveAspectRatio=True)

    # Draw the average rating image
    c.drawImage(rating_history, 20, 150, width=300,preserveAspectRatio=True)

    # Save the PDF
    c.save()

counter += 1
output = f"review_{counter}.pdf"
generate_pdf("Beyond Order",
             "Liana Barrientos, 39, is charged with two counts of 'offering a false instrument for filing in the first degree' In total, she has been married 10 times, wiaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaath nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.",
             product_image="./beyond_order.png",
             output_path=output,
             average_rating=4.68,
             rating_frequencies=r"./rating_frequencies.png",
             rating_history=r"./rating_history.png")


In [438]:
class PlotGenerator:
    """class that generates and saves plots given a csv containing reviews"""

    RATING_FREQUENCY: str = "rating_frequencies.png"
    RATING_HISTORY: str= "rating_history.png"


    def __init__(self, filename: str):
        self.filename = filename
        self.df = self.read_csv(filename)

    @staticmethod
    def read_csv(filename: str) -> pd.DataFrame:
        """Read a csv file and returns the output in a Dataframe"""
        df = pd.read_csv(filename)
        df.sort_values(by='date', inplace=True)
        df.reset_index(inplace=True, drop=True)
        return df

    def generate_plots(self) -> None:
        self._plot_rating_frequencies(output_path=self.RATING_FREQUENCY)
        self._plot_mean_ratings(output_path=self.RATING_HISTORY)


    def _plot_rating_frequencies(self, output_path: str) -> None:
        """Plot a bar chart of the frequencies of the ratings in a pandas DataFrame."""

        # Count the frequencies of the ratings
        ratings = self.df['rating'].value_counts()

        # Get the labels (ratings) and values (frequencies)
        labels = ratings.index
        values = ratings.values

        # Convert the frequencies to percentages
        total_ratings = sum(values)
        values = [value / total_ratings * 100 for value in values]

        # Create the bar plot
        plt.barh(labels, values, height=0.6, color='#FF9900', edgecolor='none')

        # Add labels to the bars
        for index, value in enumerate(values):
            plt.text(value+3, labels[index], f"{value:.0f}" + '%', va='center',  fontsize=20)

        # Set the y-axis labels
        plt.yticks(labels, [str(label) + ' stars'
                            if str(label) != "1" else str(label) + ' star'
                            for label in labels],  fontsize=20)
        plt.box(False)
        plt.xticks([])

        # Save the plot as a PDF file
        plt.savefig(output_path, format='png', bbox_inches = 'tight')
        plt.close()

    def _plot_mean_ratings(self, output_path: str) -> None:
        """
        Plot a line chart of the mean ratings over the years in a pandas DataFrame.
        """

        df = self.df.copy()
        # Convert the date column to datetime
        df['date'] = pd.to_datetime(df['date'])

        # Convert the date column to a period with the frequency "Q" (quarters)
        df['period'] = df['date'].dt.to_period('Q')

        # Group the ratings by year and quarter and calculate the mean rating for each year and quarter
        mean_ratings = df.groupby('period')['rating'].mean()

        # Group the reviews by year and quarter and count the number of reviews with ratings between 1 and 2
        low_ratings = df[(df['rating'] >= 1) & (df['rating'] <= 2)].groupby('period')['rating'].count()/df.groupby('period')['rating'].count()

        # Get the years and quarters and mean ratings and number of reviews with ratings between 1 and 2
        periods = mean_ratings.index
        mean_ratings = mean_ratings.values
        low_ratings = low_ratings.values

        # Convert the periods to strings in the format "YYYY-QQ"
        labels = [period.strftime('%Y-%m') for period in periods]

        # Create the figure and axis with twin y-axis
        fig, ax = plt.subplots()
        ax2 = ax.twinx()

        # Create the line plots
        ax.plot(labels, mean_ratings, color='#FF9900', linewidth=4)
        ax2.bar(labels, low_ratings, color='#1f77b4', linewidth=2, alpha=0.8)

        # Rotate the x-axis labels and start the y-axis from 0
        ax.set_xticklabels(labels, rotation=45)
        ax.set_ylim(0, 5,2)

        # Add axis labels and title
        ax.set_xlabel('Year and quarter',  fontsize=20)
        ax.set_ylabel('Mean rating',  fontsize=20, color='#FF9900')
        ax2.set_ylabel('Negative vs Total \nRatings',  fontsize=20, color='#1f77b4')

        # Save the plot as a PDF file
        _ = plt.savefig(output_path, format='png', bbox_inches = 'tight')
        plt.close()

In [439]:
def summarize(csv_path: str,
              png_path: str = None) -> None:
    """Creates a one page pdf containing a summary review of the product"""

    # save plots
    plot_generator = PlotGenerator(csv_path)
    plot_generator.generate_plots()

    # pre-process text
    text: pd.core.series.Series = plot_generator.df["text"].apply(lambda row: preprocess(row) if isinstance(row, str) else '')
    reviews = ""
    for review in text:
        reviews += review

summarize(csv_path =r"./amazon_reviews_beyond_order.csv")



                                                  text  rating        date
0    This is exactly what we all needs to be readin...       5  2021/03/02
1    The catastrophic over-diagnosis of ‘neurosis’ ...       5  2021/03/02
2                                           Great book       5  2021/03/02
3    Already off to a great start, got it today on ...       5  2021/03/02
4    This man has put his life in full display for ...       5  2021/03/02
..                                                 ...     ...         ...
760  I purchased the paperback which is quite expen...       2  2022/12/09
761  Nothing against the book itself but the physic...       2  2022/12/14
762                                         Great read       5  2022/12/15
763  History will record Jordan Peterson as one of ...       5  2022/12/18
764  Our times have seen few individuals capable of...       5  2022/12/27

[765 rows x 3 columns]


In [472]:
"""
Pre-processes the text, using nltk, in order to use in language models
"""
def preprocess(sentence):
    """
    Function to process a given sentence.
    INPUT: raw string (tweet)
    OUTPUT: processed string
    """
    # initialize lemmatizer
    stemmer = WordNetLemmatizer()

    # define trans for digits
    remove_digits = str.maketrans('', '', digits)

    #convert the sentence to lower
    sentence = sentence.lower()

    # Remove underscores
    sentence = re.sub(r'[-_]', '', sentence)
    sentence = re.sub(r'_[A-Za-z0-9]+', ' ', sentence)
    sentence = re.sub(r'^ _\s+', ' ', sentence)

    # Remove websites
    sentence = re.sub('https?://[A-Za-z0-9./]+', ' ', sentence)

    #remove all non words
    sentence = re.sub(r'\W', ' ', sentence)

    #remove all single characters
    sentence = re.sub(r'\b\w\b', ' ', sentence)

    # Remove numbers
    sentence = re.sub(r'[0-9]', ' ', sentence)
    sentence = sentence.translate(remove_digits)

    #remove multiple whitespaces
    sentence = re.sub(' +', ' ', sentence)

    # Split the sentence based on whitespaces (--> List of words)
    sentence = sentence.split()

    # Lemmatization
    sentence = [stemmer.lemmatize(word) for word in sentence]

    # Reconstruct the sentence by joining the words on each whitespace
    sentence = ' '.join(sentence)
    return sentence

In [511]:
def split(list_a, chunk_size):

  for i in range(0, len(list_a), chunk_size):
    yield list_a[i:i + chunk_size]

# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
#
#
# ARTICLE = " ".join(reviews.split(" ")[:900])
#
# print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))

In [457]:
 import openai

# Replace YOUR_API_KEY with your OpenAI API key
openai.api_key = "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"

def summarize_text(text):
  # Use the OpenAI API to summarize the text
  response = openai.Completion.create(
    model="text-davinci-002",
    prompt=f"what are the good points of this product: {text}",
    temperature=0.5,
    max_tokens=50,
    top_p=1,
    frequency_penalty=1,
    presence_penalty=1
  )
  summary = response["choices"][0]["text"]

  return summary

summary = summarize_text(text)
print(summary)

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 46545 tokens (46495 in your prompt; 50 for the completion). Please reduce your prompt; or completion length.

In [461]:
  response = openai.Completion.create(
    model="text-davinci-002",
    prompt=f": Purchased this for my brother who like Jordan the author. Wanna sort yer life out. Listen to this man!!!!Easy to understand written well love itTo say Jordan Peterson is marmite is something of an understatement. He is both adored and hated in equal measures. Much of the hatred often, it seems to me, comes from people who don\'t seem to have read what he actually said and don\'t care. This volume experienced something of this in production from certain people at the publishers who wanted to cancel the book.",
    temperature=0.5,
    max_tokens=100,
    top_p=1,
    frequency_penalty=1,
    presence_penalty=1
  )
response["choices"][0]["text"]

" It's a shame they did not succeed as 12 Rules for Life is one of the most important books I've read in years. It should be given to every teenager on their 18th birthday."