In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

[nltk_data] Downloading package wordnet to /Users/aditya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz



## Read Data

## Keep Reviews and Ratings

In [3]:
# Read only the rating and review-text columns from the tab-separated dump,
# then show the first 5 rows as a sanity check.
data = pd.read_csv(
    "./data/amazon_reviews_us_Jewelry_v1_00.tsv",
    sep="\t",
    usecols=["star_rating", "review_body"],
)
data.head()

  data = pd.read_csv("./data/amazon_reviews_us_Jewelry_v1_00.tsv", sep="\t", usecols=["star_rating", "review_body"])


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


Understanding Data

In [4]:
# Summary statistics (count / unique / top / freq) of the raw columns.
data.describe()

Unnamed: 0,star_rating,review_body
count,1767042,1766807
unique,11,1618522
top,5,Love it
freq,1041056,4288


In [5]:
# List the distinct raw values stored in star_rating.
data.star_rating.unique()

array([5, 1, 4, 3, 2, nan, '5', '1', '3', '4', '2', '2012-12-21'],
      dtype=object)

In [6]:
# Number of non-null review bodies for each raw star_rating value.
data.groupby(["star_rating"]).count()

Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,150441
2,97259
3,153660
4,259019
5,1040896
1,4566
2,3541
2012-12-21,0
3,5999
4,11411


In [7]:
# Discard the malformed row(s) whose star_rating holds the date string "2012-12-21"
data = data.loc[data["star_rating"].ne("2012-12-21")]

In [8]:
# Keep only the rows that actually carry a star rating
data = data.dropna(subset=["star_rating"])

In [9]:
# Re-check the summary after removing the malformed and missing ratings.
data.describe()

Unnamed: 0,star_rating,review_body
count,1767041,1766807
unique,10,1618522
top,5,Love it
freq,1041056,4288


In [10]:
# Ratings arrive as a mix of ints and digit strings; cast the column to int
data = data.astype({"star_rating": int})

In [11]:
# Preview the frame after the integer conversion.
data.head()

Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [12]:
# Keep only the rows that have review text, then preview the result
data = data.dropna(subset=["review_body"])
data.head()

Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [13]:
# Count zero-length reviews; the result shows there are none
data["review_body"].str.len().le(0).sum()

0

Now we can continue with the process.

 ## We select 20000 reviews randomly from each rating class.



In [14]:
# Number of reviews to draw per star rating, and a fixed global NumPy seed
# so the random draw is repeatable.
N_SAMPLES = 20_000
np.random.seed(17)

In [15]:
# Draw N_SAMPLES reviews from every star rating. GroupBy.sample is used instead
# of groupby.apply(lambda ...) because apply's handling of the grouping column
# is deprecated in pandas 2.2 and, from pandas 3.0, drops star_rating from the
# result. With no random_state given, the draw uses NumPy's global random state,
# so the seed set earlier still governs it.
reduced_data = data.groupby('star_rating').sample(n=N_SAMPLES)

In [16]:
# Persist the balanced sample first; head() must be the last expression of the
# cell, otherwise its preview is never displayed.
reduced_data.to_csv("./data/training.csv")
reduced_data.head()

# Data Cleaning



In [123]:
# Scratch check: preview the lower-cased review text (displayed only, not stored).
reduced_data["review_body"].str.lower()

1386788    these are actually gorgeous....., when you fir...
1294909    i ordered this item because it has screw back ...
1027605    i bought this for my girlfriend as one of her ...
904792     i would never buy from here again or recommend...
1324213    this combe is beautiful! however, the first ti...
                                 ...                        
1740641    got this for my wife for her birthday.  it sur...
996092     initially, i was nervous about my selection. h...
459393                               love, love, love these!
909113     bought this for my aunt when my cousin passed ...
526485     the facts: i had been looking at this pendant ...
Name: review_body, Length: 100000, dtype: object

In [None]:
# Lower-case every review (optional according to study)
def to_lower(data: pd.Series):
  """Return a new Series with each string element of ``data`` in lower case."""
  lowered = data.str.lower()
  return lowered

In [None]:
# Test the HTML and URL replacement regex


In [None]:
# Remove HTML and URL from reviews
def remove_html_and_url(data: pd.Series):
  """Strip HTML tags and web addresses from every review.

  Each tag or URL is replaced by a single space so the words around it are
  not glued together; the surplus whitespace can be collapsed afterwards.

  Args:
    data: Series of review strings (missing values pass through unchanged).

  Returns:
    A new Series with the same index, with tags and URLs removed.
  """
  # Tags go first: a URL inside an attribute (href="...") disappears together
  # with its tag, and the URL pattern cannot swallow a closing tag glued to a
  # link. Requiring a letter right after "<" leaves text such as "<3" alone.
  without_tags = data.str.replace(r"</?[A-Za-z][^<>]*>", " ", regex=True)
  # A URL is "http://", "https://" or "www." followed by non-space characters.
  return without_tags.str.replace(r"(?i)(?:https?://|www\.)\S+", " ", regex=True)

In [None]:
# Test the non-alphabetical character replacement regex

In [None]:
# Remove non-alphabetical characters
def remove_non_alpa_characters(data: pd.Series):
  """Replace every run of characters that are neither ASCII letters nor whitespace.

  Each run (digits, punctuation, symbols, non-ASCII characters) becomes one
  space, so words separated only by punctuation stay separate.

  Args:
    data: Series of review strings (missing values pass through unchanged).

  Returns:
    A new Series with the same index containing only letters and whitespace.
  """
  return data.str.replace(r"[^A-Za-z\s]+", " ", regex=True)

In [None]:
# Remove extra spaces
def remove_extra_spaces(data: pd.Series):
  """Collapse whitespace runs to a single space and trim both ends.

  Args:
    data: Series of review strings (missing values pass through unchanged).

  Returns:
    A new Series with the same index and normalised whitespace.
  """
  # The former pattern r"^\s*|\s\s*" also matched the empty string at the start
  # of every review, which prepended a space to each one; match real runs only.
  collapsed = data.str.replace(r"\s+", " ", regex=True)
  return collapsed.str.strip()

In [None]:
# Expand contractions
def remove_contractions(data: pd.Series):
  """Expand common English contractions ("don't" -> "do not").

  Matching is case-insensitive and accepts both the straight (') and the
  typographic (’) apostrophe. A bare "'s" is only expanded after a fixed list
  of pronouns/adverbs, because elsewhere it is usually a possessive.

  Args:
    data: Series of review strings (missing values pass through unchanged).

  Returns:
    A new Series with the same index and the contractions expanded.
  """
  # Order matters: irregular forms must be rewritten before the generic
  # "n't" rule gets a chance to split them ("won't" -> "wo not").
  rules = (
    (r"\bwon['’]t\b", "will not"),
    (r"\bcan['’]t\b", "cannot"),
    (r"\bshan['’]t\b", "shall not"),
    (r"\blet['’]s\b", "let us"),
    (r"\b(it|he|she|that|what|there|here|who|where|how)['’]s\b", r"\1 is"),
    (r"n['’]t\b", " not"),
    (r"['’]re\b", " are"),
    (r"['’]ve\b", " have"),
    (r"['’]ll\b", " will"),
    (r"['’]d\b", " would"),
    (r"['’]m\b", " am"),
  )
  expanded = data
  for pattern, replacement in rules:
    expanded = expanded.str.replace(pattern, replacement, case=False, regex=True)
  return expanded

In [None]:
def do_cleaning(data: pd.DataFrame, col: str):
  """Run the text-cleaning steps over one column of a DataFrame.

  Steps, in order: lower-case, strip HTML/URLs, expand contractions, remove
  non-alphabetical characters, collapse whitespace. Contractions are expanded
  before non-alphabetical characters are removed, since removing them would
  delete the apostrophes the expansion needs. Whitespace is collapsed last so
  it also tidies any gaps left by the earlier steps.

  Args:
    data: Frame holding the text column.
    col: Name of the column to clean.

  Returns:
    A new DataFrame with ``col`` cleaned. The input frame is left untouched,
    so the calling cell can be re-run without cleaning already-cleaned text.
  """
  steps = (
    to_lower,
    remove_html_and_url,
    remove_contractions,
    remove_non_alpa_characters,
    remove_extra_spaces,
  )
  cleaned = data.copy()
  for step in steps:
    cleaned[col] = step(cleaned[col])
  return cleaned

In [None]:
# Run the cleaning steps over the review text of the sampled data.
cleaned_data = do_cleaning(reduced_data, col="review_body")

# Pre-processing

## remove the stop words 

In [None]:
from nltk.corpus import stopwords
 

## perform lemmatization  

In [None]:
from nltk.stem import WordNetLemmatizer


# TF-IDF Feature Extraction

# Perceptron

# SVM

# Logistic Regression

# Naive Bayes