In [1]:
# TODO: Remove this before submission
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


In [2]:
import pandas as pd
import numpy as np
import nltk

nltk.download("wordnet")
import re
from bs4 import BeautifulSoup


[nltk_data] Downloading package wordnet to /Users/aditya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz



## Read Data


## Keep Reviews and Ratings


In [4]:
# Load the tab separated data file, and print the first 5 rows for confirmation
data = pd.read_csv("./data/amazon_reviews_us_Jewelry_v1_00.tsv", sep="\t", usecols=["star_rating", "review_body"])
data.head()


  data = pd.read_csv("./data/amazon_reviews_us_Jewelry_v1_00.tsv", sep="\t", usecols=["star_rating", "review_body"])


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


Understanding Data


In [5]:
data.describe()


Unnamed: 0,star_rating,review_body
count,1767042,1766807
unique,11,1618522
top,5,Love it
freq,1041056,4288


In [6]:
data.star_rating.unique()


array([5, 1, 4, 3, 2, nan, '5', '1', '3', '4', '2', '2012-12-21'],
      dtype=object)

In [7]:
data.groupby(["star_rating"]).count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,150441
2,97259
3,153660
4,259019
5,1040896
1,4566
2,3541
2012-12-21,0
3,5999
4,11411


In [8]:
# Drop the outlier which is star_rating = "2012-12-21"
data = data[data.star_rating != "2012-12-21"]


In [9]:
# Remove nan valued rows
data = data[data.star_rating.notnull()]


In [10]:
data.describe()


Unnamed: 0,star_rating,review_body
count,1767041,1766807
unique,10,1618522
top,5,Love it
freq,1041056,4288


In [11]:
# Convert all star rating to integer
data["star_rating"] = data.star_rating.astype(int)


In [12]:
data.head()


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [13]:
data = data[data.review_body.notnull()]
data.head()


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [14]:
# There are no empty reviews
(data.review_body.str.len() <= 0).sum()


0

Now we can continue with the process.


## We select 20000 reviews randomly from each rating class.


In [15]:
# np.random.seed(101)
N_SAMPLES = 25000


In [16]:
sampled_data = data.groupby("star_rating", group_keys=False).apply(lambda x: x.sample(N_SAMPLES))


# Data Cleaning


In [17]:
avg_len_before_cleaning = sampled_data.review_body.str.len().mean()
f"Avg. length of reviews BEFORE CLEANING :: {avg_len_before_cleaning}"


'Avg. length of reviews BEFORE CLEANING :: 189.754576'

In [18]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: pd.Series):
    return data.str.lower()


In [19]:
def remove_accented_characters(data: pd.Series):
    import unicodedata

    """Removes accented characters from the Series

    Args:
        data (pd.Series): Series of string

    Returns:
        _type_: pd.Series
    """
    import unicodedata

    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))


In [20]:
def remove_html_and_url(data: pd.Series):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (pd.Series): A Pandas series of type string

    Returns:
        _type_: pd.Series
    """
    # Remove HTML encodings
    data.str.replace(r"&#\d+;", " ", regex=True)

    # Remove HTML tags (both open and closed)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

    # Remove URLs
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

    return data


In [21]:
# Remove non-alphabetical characters
def remove_non_alpa_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z\s]", " ", regex=True)


In [22]:
# Remove extra spaces
def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)


In [23]:
# Install contractions package, if you don't have it
! pip install contractions



In [24]:
# Expanding contractions
def fix_contractions(data: pd.Series):
    import contractions

    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    return data.apply(contraction_fixer)


In [25]:
# A dictionary containing the columns and a list of functions to perform on it in order
# TODO: Think about handling negation.
# TODO: Replace numbers with NUM tag.
data_cleaning_pipeline = {
    "review_body": [
        to_lower,
        remove_accented_characters,
        remove_html_and_url,
        fix_contractions,
        remove_non_alpa_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = sampled_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_and_url
Ended: remove_html_and_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpa_characters
Ended: remove_non_alpa_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [26]:
avg_len_after_cleaning = cleaned_data.review_body.str.len().mean()
f"Avg. length of reviews after cleaning :: {avg_len_after_cleaning}"


'Avg. length of reviews after cleaning :: 185.227792'

In [27]:
f"Before Cleaning: {avg_len_before_cleaning} ;; After Cleaning: {avg_len_after_cleaning}"


'Before Cleaning: 189.754576 ;; After Cleaning: 185.227792'

In [28]:
# TODO: Remove Test Block
# Checkpoint: Cleaned Data
cleaned_data.to_csv("./data/cleaned.tsv", sep="\t", index=False, encoding="UTF-8")


# Pre-processing


## remove the stop words


In [29]:
# TODO: Remove Test Block
cleaned_data = pd.read_csv("./data/cleaned.tsv", sep="\t")


In [30]:
avg_len_before_preprocessing = cleaned_data["review_body"].str.len().mean()
print(f"Avg. length of the reviews before preprocessing :: {avg_len_before_preprocessing}")


Avg. length of the reviews before preprocessing :: 185.227792


In [31]:
def tokenize(data: pd.Series):
    from nltk.tokenize import word_tokenize

    nltk.download("punkt")

    return data.apply(word_tokenize)


In [32]:
from typing import List, Set


def remove_stopwords(data: pd.Series):
    """Remove stop words using the NLTK stopwords dictionary

    Args:
        string (str): a document

    Returns:
        str: a document with stopwords removed
    """
    from nltk.corpus import stopwords

    nltk.download("stopwords")

    stopwords = set(stopwords.words())

    def remover(word_list: List[str], stopwords: Set[str]):
        return [word for word in word_list if not word in stopwords]

    return data.apply(lambda word_list: remover(word_list, stopwords))


## perform lemmatization


In [33]:
def lemmatize(data: pd.Series, consider_pos_tag: bool = True):
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    nltk.download("omw-1.4")

    # POS tagging
    def perform_nltk_pos_tag(data: pd.Series):
        from nltk import pos_tag

        nltk.download("averaged_perceptron_tagger")

        return data.apply(pos_tag)

    # Convert POS tag to wordnet pos tags
    def wordnet_pos_tagger(tag: str):
        if tag.startswith("J"):
            return wordnet.ADJ
        elif tag.startswith("V"):
            return wordnet.VERB
        elif tag.startswith("N"):
            return wordnet.NOUN
        elif tag.startswith("R"):
            return wordnet.ADV
        else:
            return None

    lemmatizer = WordNetLemmatizer()
    lemmatized = list()

    if consider_pos_tag:
        pos_tagged_data = data.copy()
        pos_tagged_data = perform_nltk_pos_tag(data)

        for row in pos_tagged_data:

            lemmatized_row = list()

            if consider_pos_tag:
                for word, tag in row:
                    wordnet_pos_tag = wordnet_pos_tagger(tag)

                    if wordnet_pos_tag is None:
                        lemmatized_row.append(word)
                    else:
                        result = lemmatizer.lemmatize(word, wordnet_pos_tag)
                        lemmatized_row.append(lemmatizer.lemmatize(word, wordnet_pos_tag))

            lemmatized.append(lemmatized_row)
    else:
        for row in data:
            lemmatized_row = list()

            for word in row:
                lemmatized_row.append(lemmatizer.lemmatize(word))

            lemmatized.append(lemmatized_row)

    return pd.Series(lemmatized)


In [34]:
# Concatenate lemmatized sentences back into one sentence
def concatenate(data: pd.Series):
    return data.apply(lambda words: " ".join(words))


In [35]:
preprocessing_pipeline = {"review_body": [tokenize, lemmatize, concatenate]}

# Run the pipeline
preprocessed_data = cleaned_data.copy()

# Process all the cleaning instructions
for col, pipeline in preprocessing_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = preprocessed_data[col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")

        if func.__name__ == "lemmatize":
            temp_data = func(temp_data, consider_pos_tag=False)
        else:
            temp_data = func(temp_data)

        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    preprocessed_data[col] = temp_data.copy()


Starting: tokenize


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Ended: tokenize
Starting: lemmatize


[nltk_data] Downloading package omw-1.4 to /Users/aditya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Ended: lemmatize
Starting: concatenate
Ended: concatenate


In [36]:
# TODO: Remove Test Block
# Save lemmatized data
import pickle as pkl

with open("./data/preprocessed.pkl", "wb") as file:
    pkl.dump(preprocessed_data, file)


In [37]:
avg_len_after_preprocessing = preprocessed_data["review_body"].str.len().mean()
print(f"Avg. length of the reviews after preprocessing :: {avg_len_after_preprocessing}")


Avg. length of the reviews after preprocessing :: 181.045968


In [38]:
f"Before Preprocessing: {avg_len_before_preprocessing} ;; After Preprocessing: {avg_len_after_preprocessing}"


'Before Preprocessing: 185.227792 ;; After Preprocessing: 181.045968'

# TF-IDF Feature Extraction


In [39]:
# TODO: Remove Test Block
# Load lemmatized data
# import pickle as pkl

# preprocessed_data = None
# with open("./data/preprocessed.pkl", "rb") as file:
#     lemmatized_data = pkl.load(file)


In [40]:
preprocessed_data[preprocessed_data["review_body"].str.len() == 0].groupby("star_rating").count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,14
2,2
3,1
4,12
5,25


In [41]:
preprocessed_data[preprocessed_data["review_body"].isnull()]


Unnamed: 0,star_rating,review_body


In [42]:
preprocessed_data["review_body"].isnull().values.any(), preprocessed_data["review_body"].isnull().sum()


(False, 0)

In [43]:
# # Drop empty strings
preprocessed_data["review_body"].replace("", np.nan, inplace=True)
preprocessed_data.dropna(subset=["review_body"], inplace=True)
preprocessed_data[preprocessed_data["review_body"].str.len() == 0].groupby("star_rating").count()
preprocessed_data["review_body"].isnull().values.any(), preprocessed_data["review_body"].isnull().sum()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1


(False, 0)

In [44]:
preprocessed_data.groupby(["star_rating"]).count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,24986
2,24998
3,24999
4,24988
5,24975


In [45]:
# TODO: Remove Checkpoint ...
preprocessed_data.to_csv("./data/data.tsv", sep="\t", index=False)


In [46]:
# TODO: Remove this block
# Retrieve checkpoint
# data = pd.read_csv("./data/data.tsv", sep="\t")


In [47]:
# Resample data.
data = preprocessed_data.groupby("star_rating", group_keys=False).apply(lambda x: x.sample(20000))


In [48]:
# Split the data 80-20 split
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, stratify=data["star_rating"])


In [49]:
(train["review_body"].str.len() == 0).sum()


0

In [50]:
data.groupby(["star_rating"]).count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,20000
2,20000
3,20000
4,20000
5,20000


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(train["review_body"])

X_tfidf_train = vectorizer.transform(train["review_body"])
X_tfidf_test = vectorizer.transform(test["review_body"])
y_train = train["star_rating"]
y_test = test["star_rating"]


In [52]:
vectorizer.get_feature_names_out().tolist()


['aa',
 'aaa',
 'aaaaa',
 'aaaaaaaaaa',
 'aaah',
 'aaahhhhh',
 'aaallll',
 'aadjustable',
 'aagaard',
 'aahhhs',
 'aahs',
 'aandi',
 'aarp',
 'ab',
 'aback',
 'abalone',
 'abandon',
 'abandoned',
 'abaout',
 'abase',
 'abbey',
 'abble',
 'abbott',
 'abc',
 'abd',
 'abdolutely',
 'abdomen',
 'abhor',
 'abide',
 'abilites',
 'ability',
 'abit',
 'able',
 'ablt',
 'abmber',
 'abnormal',
 'abnormally',
 'abnoxious',
 'aboard',
 'abodomen',
 'abolutely',
 'abominable',
 'abosolutely',
 'abot',
 'abou',
 'abound',
 'abour',
 'about',
 'aboutthe',
 'above',
 'abrade',
 'abraided',
 'abrasion',
 'abrasive',
 'abreast',
 'abrfrcrombie',
 'abrin',
 'abroad',
 'abrupt',
 'abruptly',
 'abrus',
 'absence',
 'absent',
 'abslolutly',
 'absoloute',
 'absoloutly',
 'absolute',
 'absolutely',
 'absolutelylove',
 'absolutemente',
 'absolutey',
 'absolutley',
 'absolutly',
 'absorb',
 'absorbed',
 'absorbent',
 'absorber',
 'absorbs',
 'absoulete',
 'absouletly',
 'absoultely',
 'absoulute',
 'absoulutely

### Helper Functions


In [53]:
def calc_metrics(y_true, y_pred):
    from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

    accuracy_score = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=None)
    recall = recall_score(y_true, y_pred, average=None)
    f1 = f1_score(y_true, y_pred, average=None)

    print(f"{accuracy_score}")

    for rating_precision, rating_recall, rating_f1 in zip(precision, recall, f1):
        print(f"{rating_precision},{rating_recall},{rating_f1}")

    print(f"{np.mean(precision)},{np.mean(recall)},{np.mean(f1)}")


# Perceptron


In [54]:
from sklearn.linear_model import Perceptron

clf = Perceptron()
clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

calc_metrics(y_test, y_pred)


0.4153
0.5993046501521078,0.34475,0.43770830026979846
0.3385942760942761,0.40225,0.3676873857404022
0.31728748806112705,0.41525,0.3597184623714131
0.3599188915174045,0.26625,0.30607845954878576
0.5453397853986955,0.648,0.5922540843139494
0.43208901824472223,0.41530000000000006,0.4126893384488698


# SVM


In [55]:
from sklearn.svm import LinearSVC

clf = LinearSVC(dual=True)
clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

calc_metrics(y_test, y_pred)


0.5071
0.5611404435058078,0.66425,0.6083571837435604
0.40142150803461063,0.32475,0.35903814262023215
0.4189002486874827,0.379,0.39795248720304505
0.4601041952289553,0.4195,0.43886491434549496
0.628175519630485,0.748,0.682871162843775
0.4939483830174683,0.5071,0.49741677815122154


# Logistic Regression


In [57]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver="lbfgs", max_iter=500, multi_class="multinomial")

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

calc_metrics(y_test, y_pred)


0.527
0.589334548769371,0.6465,0.6165951359084406
0.42097902097902096,0.37625,0.39735973597359736
0.4307848615697231,0.424,0.427365503338793
0.48879287064542265,0.4525,0.46994677398416207
0.6693199909028883,0.73575,0.7009646302250804
0.5198422585732853,0.5269999999999999,0.5224463558860146


# Naive Bayes


In [58]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

calc_metrics(y_test, y_pred)


0.5017
0.5915983097191151,0.595,0.5932942789480244
0.4109306522299562,0.3985,0.4046198756187334
0.40478446234012316,0.42725,0.4157139382145463
0.4404077365394668,0.42125,0.4306158957321748
0.6582716049382716,0.6665,0.662360248447205
0.5011985531533866,0.5017,0.5013208473921368
