In [1]:
import pandas as pd
import gzip
import json
pd.set_option('display.max_colwidth', None)

In [2]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [3]:
meta_data = getDF('../data/sample_meta_Home_and_Kitchen.json.gz')
review_data = getDF('../data/sample_Home_and_Kitchen_5.json.gz')
meta_distinct = meta_data.drop_duplicates(subset=['asin'])
main_data = pd.merge(meta_distinct, review_data, on='asin', how='inner', suffixes=('_l', '_r'))
main_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41047 entries, 0 to 41046
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   category        41047 non-null  object 
 1   description     37212 non-null  object 
 2   title           41047 non-null  object 
 3   brand           40886 non-null  object 
 4   feature         39079 non-null  object 
 5   rank            40991 non-null  object 
 6   also_view       10057 non-null  object 
 7   main_cat        41047 non-null  object 
 8   price           28155 non-null  object 
 9   asin            41047 non-null  object 
 10  image_l         35162 non-null  object 
 11  similar_item    30053 non-null  object 
 12  also_buy        18586 non-null  object 
 13  date            2986 non-null   object 
 14  tech1           670 non-null    object 
 15  details         232 non-null    object 
 16  overall         41047 non-null  float64
 17  verified        41047 non-null 

In [4]:
reviews_clean_1 = main_data[['asin', 'brand', 'title', 'reviewText', 'summary', 'reviewerID', 'unixReviewTime']].copy()
reviews_clean_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41047 entries, 0 to 41046
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   asin            41047 non-null  object
 1   brand           40886 non-null  object
 2   title           41047 non-null  object
 3   reviewText      41042 non-null  object
 4   summary         41041 non-null  object
 5   reviewerID      41047 non-null  object
 6   unixReviewTime  41047 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 2.5+ MB


In [5]:
reviews_clean_2 = reviews_clean_1.dropna(how='any') # drop null values
reviews_clean_3 = reviews_clean_2.drop_duplicates(subset=['asin', 'reviewerID', 'unixReviewTime']) # drop duplicates
reviews_clean_4 = reviews_clean_3[~reviews_clean_3.title.str.contains('getTime')] # remove rows with unformatted title, some 'title' may still contain html style content
reviews_clean_4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37183 entries, 0 to 41046
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   asin            37183 non-null  object
 1   brand           37183 non-null  object
 2   title           37183 non-null  object
 3   reviewText      37183 non-null  object
 4   summary         37183 non-null  object
 5   reviewerID      37183 non-null  object
 6   unixReviewTime  37183 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 2.3+ MB


In [40]:
reviews = reviews_clean_4[['asin', 'brand', 'title', 'reviewText', 'summary']].copy()
reviews.head()

Unnamed: 0,asin,brand,title,reviewText,summary
0,B00002N62Y,Eureka,Eureka 54312-12 Vacuum Cleaner Belt,good product and price,Four Stars
1,B00002N62Y,Eureka,Eureka 54312-12 Vacuum Cleaner Belt,These belts are $2+ retail and some retailers/sellers use generic/non OEM specs sop the belts are loose...not the case with these!,Great price - OEM belts!
2,B00002N62Y,Eureka,Eureka 54312-12 Vacuum Cleaner Belt,These came in a 2 pk and are perfect fit for my good old vac. Seem strong and durable. Not available local so I hope they last.,2fer wow
3,B00002N62Y,Eureka,Eureka 54312-12 Vacuum Cleaner Belt,"So my super fancy vacuum cleaner, the one with the cold fusion reactor that picks up bowling balls and small furry critters was stumped by that small paper circle leftover from my three hole punch, the paper equivalent of the doughnut munchkin.\n\nI passed over the little bugger hundreds of time causing a mild brown out in southern California as I increased the power to the machine well past the safety protocols.. Yeah sorry that was me.. yet the small paper dot remained unmoving on the floor. Mocking me....I swear it flipped over showing me its back side as a further insult. I paid good money for this piece of modern technology and was not about to bend down and pick the dot up.\n\nNo way, never.\n\nAfter going to night school to lean vacuum cleaner maintenance, I felt ready to fix my machine and get that evil disc off my floor at last. It had since been joined by others of its kind as I like ventilating paper with my electric three hole punch machine, I do this continually actually using recycled paper of course, but that is another tale for another review.\nAnyway, I realized that there was a belt in the vacuum that had stopped doing the belt thing between the fusion reactor and the mach 5 brushes that would pulverize dirt unless it surrendered peacefully.\n\nMy quest began as not all belts are the same. No siree, they have many names and many numbers and most would not work for my machine, nor come in time to save my marriage.\n\nI called Jeff Bezos who told me that Amazon sold stuff like this and after he tried to sell me a Kindle, I went online and found this belt and I ordered it. It was indeed a joyous day. I did not buy a Kindle.\n\nThe belt arrived promptly and I implanted it into the unit. In the process, dirt from as far back as 1942 was released into my home causing near white out conditions but I managed to conclude the operation. Gasping I turned on the machine and in moments the near toxic environment was replaced with clean fresh air, dust free and smelling of pine. I proceeded to capture the paper circles and made life bearable once again. Eureka!!!!",WOWEE!!!
4,B00002N62Y,Eureka,Eureka 54312-12 Vacuum Cleaner Belt,Right one for my eureka.,Four Stars


In [39]:
reviews_head = reviews.head()
review_text = []
for line in reviews_head.reviewText:
    review_text += line.splitlines()

In [21]:
review_text

['good product and price',
 'These belts are $2+ retail and some retailers/sellers use generic/non OEM specs sop the belts are loose...not the case with these!',
 'These came in a 2 pk and are perfect fit for my good old vac. Seem strong and durable. Not available local so I hope they last.',
 'So my super fancy vacuum cleaner, the one with the cold fusion reactor that picks up bowling balls and small furry critters was stumped by that small paper circle leftover from my three hole punch, the paper equivalent of the doughnut munchkin.',
 '',
 'I passed over the little bugger hundreds of time causing a mild brown out in southern California as I increased the power to the machine well past the safety protocols.. Yeah sorry that was me.. yet the small paper dot remained unmoving on the floor.  Mocking me....I swear it flipped over showing me its back side as a further insult.  I paid good money for this piece of modern technology and was not about to bend down and pick the dot up.',
 '',


In [33]:
from nltk.tokenize import sent_tokenize
tokens = sent_tokenize(str(review_text))

In [38]:
tokens_normalized = [token.lower() for token in tokens]
tokens_normalized

["['good product and price', 'these belts are $2+ retail and some retailers/sellers use generic/non oem specs sop the belts are loose...not the case with these!",
 "', 'these came in a 2 pk and are perfect fit for my good old vac.",
 'seem strong and durable.',
 'not available local so i hope they last.',
 "', 'so my super fancy vacuum cleaner, the one with the cold fusion reactor that picks up bowling balls and small furry critters was stumped by that small paper circle leftover from my three hole punch, the paper equivalent of the doughnut munchkin.",
 "', '', 'i passed over the little bugger hundreds of time causing a mild brown out in southern california as i increased the power to the machine well past the safety protocols.. yeah sorry that was me.. yet the small paper dot remained unmoving on the floor.",
 'mocking me....i swear it flipped over showing me its back side as a further insult.',
 'i paid good money for this piece of modern technology and was not about to bend down an