In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pandas import read_csv
from numpy import set_printoptions

In [2]:
# Set Working directory
os.chdir('C:/Users/asus/Documents/GitHub/CMSC-197-Miniproject')

In [3]:
data = pd.read_csv("data/amazon_reviews.txt", delimiter = "\t")
data.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


In [4]:
# Preparing Data
# Getting only relevant dimensions (Label, Rating, Verified Purchase, Review title and Review text) necessary for data analysis
df = data[['LABEL','REVIEW_TITLE', 'REVIEW_TEXT']].copy()
df.rename(columns = {'LABEL': 'Label', 'REVIEW_TITLE': 'Review_Title', 'REVIEW_TEXT': 'Review_Text'}, inplace = True)
df['Review_Text'] = df['Review_Title'] + " " + df['Review_Text']
df=df.drop(['Review_Title'], axis=1)

In [5]:
# Remove N/A
for col in df.columns:
    print(col, df[col].isnull().sum())
    df = df.dropna()

Label 0
Review_Text 0


In [6]:
# Expansion literary shortcuts -> e.g., I've to I have
# !pip install contractions
import contractions
df['no_contract'] = df['Review_Text'].apply(lambda x: [contractions.fix(word) for word in x.split()]) 
df['Review_Text_str'] = [' '.join(map(str, l)) for l in df['no_contract']]

In [7]:
# Removing HTML tags
# !pip install beautifulsoup4
from bs4 import BeautifulSoup
def strip_html_tags(text):
    """remove html tags from text"""
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text(separator=" ")
    return stripped_text

new_list = []
for rows in df['Review_Text_str']:
    new_list.append(strip_html_tags(rows))
df['tag_removed'] = new_list

  soup = BeautifulSoup(text, "html.parser")


In [8]:
import re
# Removing Numbers
df['number_removed'] = df['tag_removed'].apply(lambda x:re.sub(r'\d+', '', x))

In [9]:
# Tokenization
# !pip install nltk
# nltk.download('all')
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
df['tokenized'] = df['number_removed'].apply(word_tokenize)

In [10]:
# Conversion to lowercase
df['lower'] = df['tokenized'].apply(lambda x: [word.lower() for word in x])

In [11]:
# Removing special characers
from nltk.corpus import words
words = set(words.words())

df['no_spec'] = df['lower'].apply(lambda x: [word for word in x if word in words])

In [12]:
# Removal of stop words
from nltk.corpus import stopwords, wordnet
stop_words = set(stopwords.words('english'))
df['stopwords_removed'] = df['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])

In [13]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
df['lemmatized'] = df['stopwords_removed'].apply(lambda x: [wnl.lemmatize(word) for word in x])

In [14]:
# Parts of speech tagging
df['pos_tags'] = df['lemmatized'].apply(nltk.tag.pos_tag)

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

df['wordnet_pos'] = df['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

In [15]:
df.loc[df["Label"] == "__label1__", "Label"] = 0
df.loc[df["Label"] == "__label2__", "Label"] = 1

In [16]:
df.head()

Unnamed: 0,Label,Review_Text,no_contract,Review_Text_str,tag_removed,number_removed,tokenized,lower,no_punc,stopwords_removed,lemmatized,pos_tags,wordnet_pos
0,0,"useful When least you think so, this product w...","[useful, When, least, you, think, so,, this, p...","useful When least you think so, this product w...","useful When least you think so, this product w...","useful When least you think so, this product w...","[useful, When, least, you, think, so, ,, this,...","[useful, when, least, you, think, so, ,, this,...","[useful, when, least, you, think, so, this, pr...","[useful, least, think, product, save, day, kee...","[useful, least, think, product, save, day, kee...","[(useful, JJ), (least, JJS), (think, JJ), (pro...","[(useful, a), (least, a), (think, a), (product..."
1,0,New era for batteries Lithium batteries are so...,"[New, era, for, batteries, Lithium, batteries,...",New era for batteries Lithium batteries are so...,New era for batteries Lithium batteries are so...,New era for batteries Lithium batteries are so...,"[New, era, for, batteries, Lithium, batteries,...","[new, era, for, batteries, lithium, batteries,...","[new, era, for, batteries, lithium, batteries,...","[new, era, batteries, lithium, batteries, some...","[new, era, battery, lithium, battery, somethin...","[(new, JJ), (era, NN), (battery, NN), (lithium...","[(new, a), (era, n), (battery, n), (lithium, n..."
2,0,doesn't swing very well. I purchased this swin...,"[does not, swing, very, well., I, purchased, t...",does not swing very well. I purchased this swi...,does not swing very well. I purchased this swi...,does not swing very well. I purchased this swi...,"[does, not, swing, very, well, ., I, purchased...","[does, not, swing, very, well, ., i, purchased...","[does, not, swing, very, well, i, purchased, t...","[swing, well, purchased, swing, baby, months, ...","[swing, well, purchased, swing, baby, month, p...","[(swing, VBG), (well, RB), (purchased, VBN), (...","[(swing, v), (well, r), (purchased, v), (swing..."
3,0,Great computing! I was looking for an inexpens...,"[Great, computing!, I, was, looking, for, an, ...",Great computing! I was looking for an inexpens...,Great computing! I was looking for an inexpens...,Great computing! I was looking for an inexpens...,"[Great, computing, !, I, was, looking, for, an...","[great, computing, !, i, was, looking, for, an...","[great, computing, i, was, looking, for, an, i...","[great, computing, looking, inexpensive, desk,...","[great, computing, looking, inexpensive, desk,...","[(great, JJ), (computing, VBG), (looking, VBG)...","[(great, a), (computing, v), (looking, v), (in..."
4,0,Only use twice a week I only use it twice a we...,"[Only, use, twice, a, week, I, only, use, it, ...",Only use twice a week I only use it twice a we...,Only use twice a week I only use it twice a we...,Only use twice a week I only use it twice a we...,"[Only, use, twice, a, week, I, only, use, it, ...","[only, use, twice, a, week, i, only, use, it, ...","[only, use, twice, a, week, i, only, use, it, ...","[use, twice, week, use, twice, week, results, ...","[use, twice, week, use, twice, week, result, g...","[(use, NN), (twice, RB), (week, NN), (use, NN)...","[(use, n), (twice, r), (week, n), (use, n), (t..."


In [17]:
# Save the DataFrame to a JSON file
df.to_json('data/amazon_data.json', orient='records')