In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import demoji
import string
import nltk
from nltk.corpus import stopwords

In [2]:
# import dataset
amazonReviews = pd.read_csv('resources/Amazon Review Data Web Scrapping - Amazon Review Data Web Scrapping.csv')

# Data Exploration

In [3]:
# get some basic info about dataset
amazonReviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60889 entries, 0 to 60888
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unique_ID      60889 non-null  int64 
 1   Category       60889 non-null  object
 2   Review_Header  60884 non-null  object
 3   Review_text    60857 non-null  object
 4   Rating         60889 non-null  int64 
 5   Own_Rating     60889 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.8+ MB


In [4]:
# view a few rows
amazonReviews.head(3)

Unnamed: 0,Unique_ID,Category,Review_Header,Review_text,Rating,Own_Rating
0,136040,smartTv,Nice one,I liked it,5,Positive
1,134236,mobile,Huge battery life with amazing display,I bought the phone on Amazon and been using my...,5,Positive
2,113945,books,Four Stars,"Awesome book at reasonable price, must buy ......",4,Positive


## make a smaller sample of the main dataset (5000 rows)

In [5]:
# make a smaller sample of the data to test on
sampleAmazon1 = amazonReviews.sample(n=5000,random_state=1)

# Preprocess sample dataset

In [6]:
def preprocess_reviews(df, text_column='Review_text'):
    """
    preprocess texts within a dataFrame.

    parameters:
    - df: pandas dataframe containing the review texts.
    - text_column: the name of the column containing the review texts.

    returns:
    - dataFrame with an additional column 'preprocessed_Review_text' for the preprocessed texts.
    """
    preprocessed_texts = []

    for text in df[text_column]:
        # Ensure text is a string
        text = str(text)
        # Lowercase the text
        text = text.lower()
        # Remove emojis
        text = demoji.replace(text, "")
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Remove stopwords
        text = ' '.join([word for word in text.split() if word not in stop_words])
        preprocessed_texts.append(text)
    
    # Add the preprocessed texts as a new column in the DataFrame
    df['preprocessed_' + text_column] = preprocessed_texts

    return df

In [7]:
# Initialize stopwords set
stop_words = set(stopwords.words('english'))

In [8]:
# Remove emojis
demoji.download_codes()

  demoji.download_codes()


In [9]:
# Process the DataFrame
df_processed = preprocess_reviews(sampleAmazon1)

In [10]:
df_processed

Unnamed: 0,Unique_ID,Category,Review_Header,Review_text,Rating,Own_Rating,preprocessed_Review_text
50555,162609,mobile,iPhone XR just brilliant in every way !!!✌️✌️,"Very beautiful phone...very long battery life,...",5,Positive,beautiful phonevery long battery life superb c...
4164,110155,mobile accessories,Awesome smartphone😘😘,1. Bright & colourful display 👍 2. Super smoot...,4,Positive,1 bright colourful display 2 super smooth proc...
33565,165995,mobile,Worst Sensor & Micro phone,Worst Sensor & Micro phone....,1,Negative,worst sensor micro phone
58243,141410,refrigerator,Installation pending,LED TV RECEIVED. INSTALLATION PENDING. PLS ARR...,2,Negative,led tv received installation pending pls arrange
6670,144360,mobile,Very nice product,Nice,5,Positive,nice
...,...,...,...,...,...,...,...
41395,163934,smartTv,Super good,Very nice,5,Positive,nice
58350,122796,refrigerator,Excellent camera,Excellent product,5,Positive,excellent product
9598,139342,mobile accessories,Great phone!,It's a midrange smartphone. The screen is big ...,5,Positive,midrange smartphone screen big clear picture q...
1900,122341,books,Very informative. Awesome and simple to unders...,Excellent book by the author on the importance...,5,Positive,excellent book author importance financial lit...
