In [38]:
import re
import pandas as pd
from dataclasses import dataclass
import xml.etree.ElementTree as ET

""" This script reads reviews from the Amazon Review Dataset and parses them into a list of Review objects. """

@dataclass
class Review:
    """One review record as produced by ``ReviewXMLParser.parse``.

    Apart from ``is_negative`` and ``rating``, every field holds the text of the
    XML element with the same name, or ``''`` when that element is missing.
    """
    # Label handed to ReviewXMLParser.parse by the caller; it is not read from the XML.
    # The loading code passes 1, 0 or None (unlabeled files), so None must be expected
    # here despite the `int` annotation.
    is_negative: int
    # Product identifier; the parser strips surrounding whitespace from this field only.
    asin: str
    product_name: str
    product_type: str
    # Raw helpfulness votes as text, e.g. "4 of 9"; converted to a ratio later in the notebook.
    helpful: str
    # Parsed with float() from the <rating> element.
    rating: float
    title: str
    # Kept as the raw text found in the file (e.g. "November 14, 2006"); not parsed into a date.
    date: str
    reviewer: str
    reviewer_location: str
    review_text: str


class ReviewXMLParser:
    """Turns review XML into a list of ``Review`` objects."""

    @staticmethod
    def get_element_text(element, tag, default=''):
        """Return the text of the first descendant of ``element`` named ``tag``.

        Args:
            element: ElementTree element to search below.
            tag: Tag name to look for. A ``ns:``-prefixed name is resolved against
                the namespace of ``element`` itself.
            default: Value returned when the tag is absent or has no text.

        Returns:
            The element text exactly as stored (not stripped), or ``default``.
        """
        # When the parent is namespaced its tag looks like "{uri}name"; expose that
        # URI under the "ns" prefix so it can be used in search paths.
        namespace_uri = element.tag.split('}')[0].strip('{') if '}' in element.tag else ''
        namespaces = {'ns': namespace_uri} if namespace_uri else {}

        found_element = element.find(f'.//{tag}', namespaces)
        if (found_element is None and namespace_uri
                and ':' not in tag and '{' not in tag):
            # A plain tag name never matches a namespaced child, so retry with the
            # parent's namespace before giving up.
            found_element = element.find(f'.//ns:{tag}', namespaces)

        # An empty element such as <title></title> has text None; treat it like a
        # missing element so callers always get a usable value back.
        if found_element is None or found_element.text is None:
            return default
        return found_element.text

    @staticmethod
    def parse(xml, negative=None) -> "list[Review]":
        """Parse an XML string and build one ``Review`` per ``<review>`` element.

        Args:
            xml: XML document as a string.
            negative: Label stored in ``is_negative`` of every returned review
                (the notebook passes 1, 0, or None for unlabeled data).

        Returns:
            Reviews in document order. A document whose root is itself a
            ``<review>`` element yields that review as well.

        Raises:
            xml.etree.ElementTree.ParseError: if ``xml`` is not well-formed.
            ValueError: if a review has a missing or non-numeric ``<rating>``.
        """
        root = ET.fromstring(xml)
        text_of = ReviewXMLParser.get_element_text
        reviews = []
        # iter() visits the root too, unlike findall(".//review").
        for review_element in root.iter("review"):
            reviews.append(Review(
                is_negative=negative,  # label supplied by the caller, not read from the XML
                asin=text_of(review_element, "asin").strip(),
                product_name=text_of(review_element, "product_name"),
                product_type=text_of(review_element, "product_type"),
                helpful=text_of(review_element, "helpful"),
                rating=float(text_of(review_element, "rating")),
                title=text_of(review_element, "title"),
                date=text_of(review_element, "date"),
                reviewer=text_of(review_element, "reviewer"),
                reviewer_location=text_of(review_element, "reviewer_location"),
                review_text=text_of(review_element, "review_text"),
            ))
        return reviews

# Accumulates the Review objects parsed from every file listed below.
reviews = []

base_path = "data"
# Each entry is (path, is_negative label): 1 = negative, 0 = positive, None = unlabeled.
review_files = [
    # Negative reviews (is_negative = 1)
    (f"{base_path}/books/negative.review", 1),
    (f"{base_path}/dvd/negative.review", 1),
    (f"{base_path}/electronics/negative.review", 1),
    (f"{base_path}/kitchen_housewares/negative.review", 1),
    
    # Positive reviews (is_negative = 0)
    (f"{base_path}/books/positive.review", 0),
    (f"{base_path}/dvd/positive.review", 0),
    (f"{base_path}/electronics/positive.review", 0),
    (f"{base_path}/kitchen_housewares/positive.review", 0),

    # Unlabeled reviews (is_negative = None)
    # NOTE(review): there is no books entry among the unlabeled files — confirm this is intentional.
    (f"{base_path}/dvd/unlabeled.review", None),
    (f"{base_path}/electronics/unlabeled.review", None),
    (f"{base_path}/kitchen_housewares/unlabeled.review", None),
]

def _normalize_review_xml(raw):
    """Clean raw review-file text so that ElementTree can parse it.

    The steps run in this order:
      1. drop ``&quot;`` entities entirely, including the trailing ``;``
         (removing only ``&quot`` left a stray ``;`` in the review text);
      2. drop every remaining ``&`` — a bare ampersand is not well-formed XML
         (other entities such as ``&amp;`` therefore survive as ``amp;``);
      3. drop control characters that XML 1.0 forbids (e.g. stray ``\\x1a``),
         keeping tab, newline and carriage return;
      4. drop every ``?``;
      5. turn newlines into spaces;
      6. turn double quotes into single quotes.
    """
    cleaned = re.sub(r'&quot;?', '', raw)
    cleaned = cleaned.replace('&', '')
    cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', cleaned)
    cleaned = cleaned.replace('?', '')
    cleaned = cleaned.replace('\n', ' ')
    return cleaned.replace('"', '\'')


for file, is_negative in review_files:
    # NOTE(review): open() uses the platform default encoding here; pin it once the
    # encoding of the .review files has been confirmed so runs are reproducible.
    with open(file) as f:
        print(f"Reading {file}")
        raw_xml = f.read()
    # Parsing needs only the text, so it happens after the file is closed.
    reviews += ReviewXMLParser.parse(_normalize_review_xml(raw_xml), is_negative)

# One row per Review, one column per dataclass field.
df = pd.DataFrame(reviews)

# Later cells write cleaned columns into this copy; `df` keeps the values as parsed.
df_transformed = df.copy()
df

Reading data/books/negative.review
Reading data/dvd/negative.review
Reading data/electronics/negative.review
Reading data/kitchen_housewares/negative.review
Reading data/books/positive.review
Reading data/dvd/positive.review
Reading data/electronics/positive.review
Reading data/kitchen_housewares/positive.review
Reading data/dvd/unlabeled.review
Reading data/electronics/unlabeled.review
Reading data/kitchen_housewares/unlabeled.review


Unnamed: 0,is_negative,asin,product_name,product_type,helpful,rating,title,date,reviewer,reviewer_location,review_text
0,1.0,0312355645,Running with Scissors: A Memoir: Books: Augus...,books,4 of 9,1.0,"Horrible book, horrible.","November 14, 2006",Mark Gospri,,THis book was horrible. If it was possible t...
1,1.0,1559278676,Running with Scissors: A Memoir: Books: Augus...,books,1 of 6,1.0,shallow self-indulgence,"November 15, 2006",Joseph S. Perrott 'avid reader',"philadelphia, pa. United States",I like to use the Amazon reviews when purchas...
2,1.0,1559278676,Running with Scissors: A Memoir: Books: Augus...,books,4 of 9,1.0,"Horrible book, horrible.","November 14, 2006",Mark Gospri,,THis book was horrible. If it was possible t...
3,1.0,0425193373,Fierce Conversations: Achieving Sucess at Wor...,books,10 of 13,1.0,Disappointment,"March 13, 2006",Reader 'Reader',,"I'm not sure who's writing these reviews, but..."
4,1.0,0142004030,Lost in a Good Book (Thursday Next Novels): B...,books,6 of 7,2.0,A Disappointing Mess,"May 17, 2006",A. Ross,"Washington, DC",I picked up the first book in this series (Th...
...,...,...,...,...,...,...,...,...,...,...,...
72674,,B00006IUU0,Mr. Coffee Cocomotion Hot Chocolate Maker: Ki...,kitchen housewares,,5.0,It's cold outside!,"September 29, 2006",Adam G,"Sonoma County, CA USA",What a perfect little machine! I love my Coco...
72675,,B00006IUU0,Mr. Coffee Cocomotion Hot Chocolate Maker: Ki...,kitchen housewares,,5.0,Great Gift!,"February 27, 2006",M. Strouse 'pezhead859',"Brooklyn, New York",I gave this as a gift and was told this makes...
72676,,B00006IUU0,Mr. Coffee Cocomotion Hot Chocolate Maker: Ki...,kitchen housewares,1 of 1,5.0,A must have for coffee and hot drink users al...,"February 25, 2006",C. Trentham,"Plano, Texas USA",We got this as a Christmas gift in December a...
72677,,B00006IUU0,Mr. Coffee Cocomotion Hot Chocolate Maker: Ki...,kitchen housewares,4 of 5,5.0,Great little work horse that Makes more thing...,"February 18, 2006",Dasher 'marbleann',"Huntington, NY United States",I got this little machine when it first came ...


In [39]:
# (rows, columns) of the frame built from all parsed review files.
df.shape

(72679, 11)

In [40]:
# Per-column dtype and non-null count; shows how many rows carry an is_negative label.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72679 entries, 0 to 72678
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   is_negative        8000 non-null   float64
 1   asin               72679 non-null  object 
 2   product_name       72679 non-null  object 
 3   product_type       72679 non-null  object 
 4   helpful            72679 non-null  object 
 5   rating             72679 non-null  float64
 6   title              72679 non-null  object 
 7   date               72679 non-null  object 
 8   reviewer           72679 non-null  object 
 9   reviewer_location  72679 non-null  object 
 10  review_text        72679 non-null  object 
dtypes: float64(2), object(9)
memory usage: 6.1+ MB


In [41]:
""" Normalize `helpful` column """

# `helpful` holds vote strings such as "4 of 9" (or no votes at all); the goal is a
# ratio between 0 and 1. This first step rewrites "x of y" as "x;y" and removes every
# space, which is the form transform_helpful splits on.
# regex=False: both patterns are plain literals, and the default for `regex` differs
# between pandas versions, so it is stated explicitly.
df_transformed['helpful'] = (
    df['helpful']
    .str.replace(' of ', ';', regex=False)
    .str.replace(' ', '', regex=False)
)

def transform_helpful(helpful):
    """Convert a helpfulness vote string into the fraction of helpful votes.

    Args:
        helpful: Votes as ``"x;y"`` (x helpful out of y total). The raw
            ``"x of y"`` form and surrounding spaces are accepted too. An empty
            string means no votes were recorded.

    Returns:
        ``x / y`` as a float, or ``0.0`` when there are no votes, when ``x`` is 0,
        or when ``y`` is 0 (instead of raising ZeroDivisionError).

    Raises:
        ValueError: if the string is not two integers separated as described.
    """
    votes = helpful.replace(' of ', ';').replace(' ', '')
    if votes == '':
        return 0.0

    x, y = votes.split(';')
    helpful_votes, total_votes = int(x), int(y)
    if helpful_votes == 0 or total_votes == 0:
        return 0.0

    return helpful_votes / total_votes

# Replace the "x;y" strings with their numeric ratio, row by row.
df_transformed['helpful'] = df_transformed['helpful'].apply(transform_helpful)
# Frequency of each distinct ratio, most common first.
df_transformed['helpful'].value_counts()

helpful
1.000000    29582
0.000000    17840
0.500000     3676
0.666667     2607
0.750000     1820
            ...  
0.564103        1
0.542857        1
0.952830        1
0.228571        1
0.981132        1
Name: count, Length: 723, dtype: int64

In [42]:
""" Transform text reviews to lowercase """

from textblob import TextBlob


def transform_text(helpful):
    """Return the given text converted to lowercase.

    Args:
        helpful: The text to lowercase. The parameter name is kept for
            compatibility; despite it, this is the review text, not the
            helpfulness votes.

    Returns:
        The lowercased string.

    Raises:
        TypeError: if the argument is not a string.
    """
    # Plain str.lower() gives the same result as wrapping the text in a TextBlob
    # and lowercasing that, without building one object per review.
    if not isinstance(helpful, str):
        raise TypeError(f"text must be a string, not {type(helpful).__name__}")
    return helpful.lower()

# Read from the untouched `df` and store the lowercased text in the working copy.
df_transformed['review_text'] = df['review_text'].apply(transform_text)

In [43]:
# Lowercased text next to its label; rows from the unlabeled files have no label (NaN).
df_transformed[['review_text', 'is_negative']]

Unnamed: 0,review_text,is_negative
0,this book was horrible. if it was possible t...,1.0
1,i like to use the amazon reviews when purchas...,1.0
2,this book was horrible. if it was possible t...,1.0
3,"i'm not sure who's writing these reviews, but...",1.0
4,i picked up the first book in this series (th...,1.0
...,...,...
72674,what a perfect little machine! i love my coco...,
72675,i gave this as a gift and was told this makes...,
72676,we got this as a christmas gift in december a...,
72677,i got this little machine when it first came ...,


In [44]:
# Keep only labeled reviews. `is_negative` is the only column with missing values
# (the unlabeled files are loaded with a None label), so the drop is restricted to it;
# a bare dropna() would also discard labeled rows if any other column ever held a NaN.
df_transformed = df_transformed.dropna(subset=['is_negative'])

In [45]:
""" Spell correction """

import pkg_resources
from symspellpy.symspellpy import SymSpell

# Initialize SymSpell. The dictionary edit distance of 2 matches the
# max_edit_distance=2 that correct_text passes to lookup_compound.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Resolve the English unigram and bigram frequency files shipped inside the symspellpy package.
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

# term_index / count_index select the columns holding the term and its frequency.
# NOTE(review): count_index=2 for the bigram file presumably reflects two-word terms
# occupying columns 0-1 — confirm against the symspellpy documentation.
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# Progress-report state used by correct_text: _total is the number of rows in the
# frame at the time this cell runs, _count is the number of correct_text calls so far.
_total = len(df_transformed['review_text'])
_count = 0
def correct_text(text):
    """Spell-correct ``text`` with SymSpell's compound lookup and report progress.

    Uses the module-level ``sym_spell`` instance and the ``_count`` / ``_total``
    progress counters. A progress line is printed every 100 calls and on the call
    that completes a full pass of ``_total`` items. The counter is then reset, so
    applying the function to a second column of the same frame reports 0-100%
    again instead of running past 100%.

    Args:
        text: The string to correct.

    Returns:
        The term of the first suggestion, or ``text`` unchanged when SymSpell
        returns no suggestion.
    """
    global _count
    # Correct the text using SymSpell
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)

    _count += 1
    pass_finished = _count == _total
    if _count % 100 == 0 or pass_finished:  # Print progress every 100 iterations or at the end
        print(f"Progress: {_count} / {_total} ({(_count / _total) * 100:.2f}%)")
    if pass_finished:
        # Start counting afresh for the next column.
        _count = 0
    return suggestions[0].term if suggestions else text

In [46]:
# Apply the spell correction function to the review text and to the title.
# Results go into new columns, so `review_text` and `title` are not modified by this cell.
df_transformed['corrected_text'] = df_transformed['review_text'].apply(correct_text)
df_transformed['corrected_title'] = df_transformed['title'].apply(correct_text)


Progress: 100 / 8000 (1.25%)
Progress: 200 / 8000 (2.50%)
Progress: 300 / 8000 (3.75%)
Progress: 400 / 8000 (5.00%)
Progress: 500 / 8000 (6.25%)
Progress: 600 / 8000 (7.50%)
Progress: 700 / 8000 (8.75%)
Progress: 800 / 8000 (10.00%)
Progress: 900 / 8000 (11.25%)
Progress: 1000 / 8000 (12.50%)
Progress: 1100 / 8000 (13.75%)
Progress: 1200 / 8000 (15.00%)
Progress: 1300 / 8000 (16.25%)
Progress: 1400 / 8000 (17.50%)
Progress: 1500 / 8000 (18.75%)
Progress: 1600 / 8000 (20.00%)
Progress: 1700 / 8000 (21.25%)
Progress: 1800 / 8000 (22.50%)
Progress: 1900 / 8000 (23.75%)
Progress: 2000 / 8000 (25.00%)
Progress: 2100 / 8000 (26.25%)
Progress: 2200 / 8000 (27.50%)
Progress: 2300 / 8000 (28.75%)
Progress: 2400 / 8000 (30.00%)
Progress: 2500 / 8000 (31.25%)
Progress: 2600 / 8000 (32.50%)
Progress: 2700 / 8000 (33.75%)
Progress: 2800 / 8000 (35.00%)
Progress: 2900 / 8000 (36.25%)
Progress: 3000 / 8000 (37.50%)
Progress: 3100 / 8000 (38.75%)
Progress: 3200 / 8000 (40.00%)
Progress: 3300 / 8000 (4

In [47]:
# Publish the corrected strings under the column names used in the export.
# Note that this overwrites `title` with its spell-corrected version.
df_transformed['text'] = df_transformed['corrected_text']
df_transformed['title'] = df_transformed['corrected_title']

# Write only title, text and label to reviews.csv, without the index column.
df_transformed[['title', 'text', 'is_negative']].to_csv('reviews.csv', index=False)


In [48]:
# Side-by-side check of the lowercased text and its spell-corrected version (first 20 rows).
df_transformed[['review_text', 'corrected_text']].head(20)

Unnamed: 0,review_text,corrected_text
0,this book was horrible. if it was possible t...,this book was horrible if it was possible to r...
1,i like to use the amazon reviews when purchas...,i like to use the amazon reviews when purchasi...
2,this book was horrible. if it was possible t...,this book was horrible if it was possible to r...
3,"i'm not sure who's writing these reviews, but...",i'm not sure who's writing these reviews but i...
4,i picked up the first book in this series (th...,i picked up the first book in this series the ...
5,"not only do i disagree with his opinions, but...",not only do i disagree with his opinions but s...
6,;i have received your new book against the hu...,i have received your new book against the huma...
7,this book was on somebody's amazon.com listma...,this book was on somebody's amazon com list ma...
8,i am not sure whatever possessed me to buy th...,i am not sure whatever possessed me to buy thi...
9,when professor polk describes the sweep of hi...,when professor polk describes the sweep of his...
