In [28]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix


# Data Preparation and Cleaning

In [29]:
data = pd.read_csv('amazon.csv')

In [30]:
#Check shape of dataframe, there are close to 1500 rows, and 16 columns- great for analysis
data.shape

(1465, 16)

In [31]:
# Check for missing values
print(data.isnull().sum()) # 2 missing values on rating_count
data[data.rating_count.isnull()] # Show which rows have missing values - need to remove!

product_id             0
product_name           0
category               0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           2
about_product          0
user_id                0
user_name              0
review_id              0
review_title           0
review_content         0
img_link               0
product_link           0
dtype: int64


Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link
282,B0B94JPY2N,Amazon Brand - Solimo 65W Fast Charging Braide...,Computers&Accessories|Accessories&Peripherals|...,₹199,₹999,80%,3.0,,USB C to C Cable: This cable has type C connec...,AE7CFHY23VAJT2FI4NZKKP6GS2UQ,Pranav,RUB7U91HVZ30,The cable works but is not 65W as advertised,I have a pd supported car charger and I bought...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Amazon-Brand-Charging-Su...
324,B0BQRJ3C47,"REDTECH USB-C to Lightning Cable 3.3FT, [Apple...",Computers&Accessories|Accessories&Peripherals|...,₹249,₹999,75%,5.0,,💎[The Fastest Charge] - This iPhone USB C cabl...,AGJC5O5H5BBXWUV7WRIEIOOR3TVQ,Abdul Gafur,RQXD5SAMMPC6L,Awesome Product,Quick delivery.Awesome ProductPacking was good...,https://m.media-amazon.com/images/I/31-q0xhaTA...,https://www.amazon.in/REDTECH-Lightning-Certif...


In [32]:
#Drop the missing values
data.dropna(subset=['rating_count'], inplace=True)
print(data.isnull().sum())

product_id             0
product_name           0
category               0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           0
about_product          0
user_id                0
user_name              0
review_id              0
review_title           0
review_content         0
img_link               0
product_link           0
dtype: int64


In [33]:
# Check for duplicate, none found, no actions needed!
print(data.duplicated().sum())

0


In [34]:
#Put the cleaned data into 2 dataframes for further analysis
cleanData1 = pd.DataFrame(data[['rating','review_title','review_content']])
cleanData2 = pd.DataFrame(data[['product_name', 'category', 'rating', 'rating_count', 'user_id', 'product_link']])

In [35]:
cleanData1.head()

Unnamed: 0,rating,review_title,review_content
0,4.2,"Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...
1,4.0,"A Good Braided Cable for Your Type C Device,Go...",I ordered this cable to connect my phone to An...
2,3.9,"Good speed for earlier versions,Good Product,W...","Not quite durable and sturdy,https://m.media-a..."
3,4.2,"Good product,Good one,Nice,Really nice product...","Good product,long wire,Charges good,Nice,I bou..."
4,4.2,"As good as original,Decent,Good one for second...","Bought this instead of original apple, does th..."


In [36]:
cleanData1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1463 entries, 0 to 1464
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   rating          1463 non-null   object
 1   review_title    1463 non-null   object
 2   review_content  1463 non-null   object
dtypes: object(3)
memory usage: 45.7+ KB


# Attempts at text classification

In [37]:
# We want to do text classification, but first, we do some preprocessing first


import nltk

nltk.download('all')




# create a list text

text = list(cleanData1['review_content'])




# preprocessing loop

import re

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()




corpus = []




for i in range(len(text)):

    r = re.sub('[^a-zA-Z]', ' ', text[i])

    r = r.lower()

    r = r.split()

    r = [word for word in r if word not in stopwords.words('english')]

    r = [lemmatizer.lemmatize(word) for word in r]

    r = ' '.join(r)

    corpus.append(r)




#assign corpus to data['text']

data['text'] = corpus

data.head()

[nltk_data] Error loading all: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/dinglinlee/nltk_data'
    - '/Users/dinglinlee/anaconda3/nltk_data'
    - '/Users/dinglinlee/anaconda3/share/nltk_data'
    - '/Users/dinglinlee/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [38]:
cleanData2.head()

Unnamed: 0,product_name,category,rating,rating_count,user_id,product_link
0,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,4.2,24269,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...",https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
1,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Computers&Accessories|Accessories&Peripherals|...,4.0,43994,"AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...",https://www.amazon.in/Ambrane-Unbreakable-Char...
2,Sounce Fast Phone Charging Cable & Data Sync U...,Computers&Accessories|Accessories&Peripherals|...,3.9,7928,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...",https://www.amazon.in/Sounce-iPhone-Charging-C...
3,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,Computers&Accessories|Accessories&Peripherals|...,4.2,94363,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...",https://www.amazon.in/Deuce-300-Resistant-Tang...
4,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,Computers&Accessories|Accessories&Peripherals|...,4.2,16905,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...",https://www.amazon.in/Portronics-Konnect-POR-1...


In [8]:
cleanData2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  1465 non-null   object
 1   category      1465 non-null   object
 2   rating        1465 non-null   object
 3   rating_count  1463 non-null   object
 4   user_id       1465 non-null   object
 5   product_link  1465 non-null   object
dtypes: object(6)
memory usage: 68.8+ KB


# Exploratory Data Analysis
