In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import sklearn
from bs4 import BeautifulSoup

In [2]:
! pip install bs4 # in case you don't have it installed
# disable ssl verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('wordnet')
# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/waterdog/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Read Data

In [3]:
# Data Path: "./amazon_reviews_us_Office_Products_v1_00.tsv"
df = pd.read_csv("./amazon_reviews_us_Office_Products_v1_00.tsv", sep='\t', on_bad_lines='skip')
print(df.columns)

  df = pd.read_csv("./amazon_reviews_us_Office_Products_v1_00.tsv", sep='\t', on_bad_lines='skip')


Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')


## Keep Reviews and Ratings

In [4]:
df = df[['review_body', 'star_rating']]
df = df.dropna()
print(df.head())

                                         review_body star_rating
0                                     Great product.           5
1  What's to say about this commodity item except...           5
2    Haven't used yet, but I am sure I will like it.           5
3  Although this was labeled as &#34;new&#34; the...           1
4                    Gorgeous colors and easy to use           4


## Report the statistics of the data

In [5]:
# Filter out non-integer values
df = df[pd.to_numeric(df['star_rating'], errors='coerce').notnull()]
df['star_rating'] = df['star_rating'].astype(int)

def get_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

# Print the number of each star rating
print(df['star_rating'].value_counts())
    
# Print the number of positive (>=4), neutral (=3), and negative reviews (<=2)
print(df['star_rating'].apply(get_sentiment).value_counts())

# Print a sample for each sentiment 
print(df[df['star_rating'] == 5]['review_body'].iloc[0])
print(df[df['star_rating'] == 3]['review_body'].iloc[0])
print(df[df['star_rating'] == 1]['review_body'].iloc[0])

star_rating
5    1582704
4     418348
1     306967
3     193680
2     138381
Name: count, dtype: int64
star_rating
positive    2001052
negative     445348
neutral      193680
Name: count, dtype: int64
Great product.
Nice quality. Happy  with the item
Although this was labeled as &#34;new&#34; the one I received clearly had been used. The box had previously been opened., and the shredder was dirty and the bin was partially full of shredded paper. What was worse is that the unit will not work properly. It is not possible to insert the paper bin so as to enable the shredder to run. It will not operate if the bin is not in place, but I could never get the unit to recognize that the paper bin was actually fully inserted. After cleaning everything thoroughly and vacuuming the paper bin area, it worked ONCE! After that I was unable to get it work at all. I returned the unit immediately for a refund. I feel Amazon misrepresented the  unit as &#34;new&#34; when clearly it was not.


 ## We form three classes and select 100,000 reviews randomly from positive and negative classes.



In [6]:
sample_size = 100000

# set seed for reproducibility
np.random.seed(0)

positive = df[df['star_rating'] > 3].sample(sample_size)
negative = df[df['star_rating'] < 3].sample(sample_size)

df = pd.concat([positive, negative])

def get_sentiment_label(rating):
    if rating >= 4:
        return 1
    else:
        return 0

# replace star ratings with sentiment labels
# 1 - positive, 0 - negative
df['sentiment'] = df['star_rating'].apply(get_sentiment_label)

# remove star_rating column
df = df.drop(columns=['star_rating'])

samples = [567073,  # contraction
        2021257,
        2021081,
        ]

# randomly peek 5 rows
print(df.sample(5))

                                               review_body  sentiment
14057    Two out of four that I ordered came broken.  O...          0
567073   I've purchased these a few times now (2 or 3 c...          1
2605563  Let me start off by saying that I have owned b...          1
2021081  dont like it cant understand it, dont know why...          0
286020                                            I like:)          0


# Data Cleaning



## Convert the reviews to lowercase

In [7]:
# Convert the reviews to lowercase
df['review_body'] = df['review_body'].str.lower()
print(df.sample(5))

                                               review_body  sentiment
2157864  when i first installed the 2 ink cartridges th...          0
386445                                           wonderful          1
2303164  after a short time, these chalk holders do not...          0
2021257  my printer will not print black.  it was worki...          0
2370679  there have been some negative reviews of this ...          1


## Remove the HTML and URLs from the reviews

In [8]:
# Remove HTML tags
df['review_body'] = df['review_body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

# Remove URLs
df['review_body'] = df['review_body'].apply(lambda x: re.sub(r'http\S+', '', x) or re.sub(r'www\S+', '', x))

print(df.sample(5))

  df['review_body'] = df['review_body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
  df['review_body'] = df['review_body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())


                                               review_body  sentiment
666065   my key doesnt fit, but it feel like its a nice...          0
1535652  i bought these for my brother hl-2170w.  it us...          1
1744505  desktop publishing supplies 11 mil greeting ca...          0
2048041  i wanted red to accent my husbands office---it...          1
2594805  i first purchased a cx5200 epson all-in-one.  ...          0


## remove extra spaces

In [9]:
def remove_spaces(text):
    text = re.sub(r'\s+', ' ', text)
    return text

df['review_body'] = df['review_body'].apply(remove_spaces)
print(df.sample(5))

                                               review_body  sentiment
2027450  the ink was faulty i had to return but 2 stars...          0
741690   these are not too expensive and seem to write ...          1
1371358  my printer will not allow this cartridge to wo...          0
1359687  the "plain" book they have at the funeral home...          1
428969   truly a waste of time and money. purchased the...          0


## perform contractions on the reviews, e.g., won’t →will not.

In [10]:


# perform contractions on the reviews, e.g., won’t → will not. include as many contractions in English that you can think of
# Source: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is / how does",
    "i'd": "i had / i would",
    "i'd've": "i would have",
    "i'll": "i shall / i will",
    "i'll've": "i shall have / i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have"
}

# Remove multiple mappings of contractions
for key, value in contractions.items():
    contractions[key] = value.split('/')[0].strip()

df = df.replace(contractions, regex=True)

## remove non-alphabetical characters

In [11]:
# remove non-alphabetical characters
df['review_body'] = df['review_body'].apply(lambda x: re.sub(r'[^a-zA-Z]', ' ', x))
print(df.sample(5))

                                               review_body  sentiment
2193476   like i am going to get cheated again   after ...          0
1323775  the bulletin board met my expectations and ser...          1
2583511  this is a poorly designed device and it does n...          0
2137021  i like the camera itself  which comes with jus...          1
267796   this ink works very good with my printer  just...          1


# Pre-processing

## remove the stop words 

In [12]:
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

df['review_body'] = df['review_body'].apply(remove_stopwords)
print(df.sample(5))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/waterdog/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               review_body  sentiment
471326   using years one kind durable notebooks cheap t...          1
1396604  owned screen months could happier plenty mount...          1
1964696  well designed fully satisfied speed quality pr...          1
1221318                       item described fast delivery          1
583530   stick dollar tree foam board paper well even h...          0


## perform lemmatization  

In [13]:
from nltk.stem import WordNetLemmatizer

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

df['review_body'] = df['review_body'].apply(lemmatize)
print(df.sample(5))

                                               review_body  sentiment
1809821  flag curved rod till end keep curling top lett...          0
1357716  black ink cartridge empty disappointed itwas r...          0
2528099  idea well product work compatible digital isp ...          0
2015331  disappointed find make two size standard diary...          0
1318017  telephone handset arrived time advertised pric...          1


# TF-iDF Feature Extraction

In [14]:
# TF-iDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)

X = vectorizer.fit_transform(df['review_body']).toarray()
y = df['sentiment']

print(X.shape)

(200000, 1000)


# Perceptron

In [15]:
# Train a Perceptron model on your training dataset using the sklearn built-in
# implementation. Report Accuracy, Precision, Recall, and f1-score on both
# the training and testing split of your dataset.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = Perceptron()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.58      0.72     20087
           1       0.69      0.97      0.81     19913

    accuracy                           0.77     40000
   macro avg       0.82      0.77      0.76     40000
weighted avg       0.82      0.77      0.76     40000



# SVM

In [None]:
# Train an SVM model on your training dataset using the sklearn built-in
# implementation. Report Accuracy, Precision, Recall, and f1-score on both
# the training and testing split of your dataset.

from sklearn.svm import SVC

model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Logistic Regression

In [None]:
# Train a Logistic Regression model on your training dataset using the sklearn
# built-in implementation. Report Accuracy, Precision, Recall, and f1-score on
# both the training and testing split of your dataset.

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Naive Bayes