In [1]:
import pandas as pd

In [2]:
# Read data from xlsx file
df = pd.read_excel('shopee_reviews.xlsx', 'Sheet1')
df.head()

Unnamed: 0,shopid,itemid,ctime,author_username,comment,rating_star,template_tags
0,223946658,11315955990,1670429351,s*****h,Performance:good\nBest Feature(s):yet to try\n...,5,"['Performance', 'Best Feature(s)', 'Value For ..."
1,223946658,11315955990,1653742257,bananaphone77,Performance:ok\nBest Feature(s):ok\nValue For ...,5,"['Performance', 'Best Feature(s)', 'Value For ..."
2,223946658,11315955990,1659064971,jasperjane80,Performance:Excellent.\nBest Feature(s):Back l...,5,"['Performance', 'Best Feature(s)', 'Value For ..."
3,223946658,11315955990,1637408323,s*****4,Came less than a week ! Consider fast! Keyboar...,5,[]
4,223946658,11315955990,1639452456,reeveschiu97,Packaging was a bust and the product was ok .....,2,[]


In [3]:
# Print out stats of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30010 entries, 0 to 30009
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   shopid           30010 non-null  int64 
 1   itemid           30010 non-null  int64 
 2   ctime            30010 non-null  int64 
 3   author_username  29931 non-null  object
 4   comment          30010 non-null  object
 5   rating_star      30010 non-null  int64 
 6   template_tags    30010 non-null  object
dtypes: int64(4), object(3)
memory usage: 1.6+ MB


# Categorize data as "neutral" vs "opinionated"
### Define star rating of 1 and 2 as bad reviews, 3 as neutral, 4 and 5 as good reviews

In [4]:
# Polarity detection to differentiate the opinionated data as 'positive' vs 'negative'
def polarity_detection(value):
    if(value == 4):
        return 1
    elif(value == 5):
        return 1
    else:
        return 0

def organize_data(dataframe):
    # Remove rating of '3'
    dataframe = dataframe[dataframe['rating_star'] != 3]

    # Display the polarity
    dataframe['polarity'] = dataframe['rating_star'].apply(polarity_detection)
    
    dataframe = dataframe.loc[:,['itemid', 'comment', 'rating_star', 'polarity']]
    
    # Reset Index
    dataframe.reset_index(drop=True, inplace=True)

    return dataframe

In [5]:
clean_df = organize_data(df)
clean_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['polarity'] = dataframe['rating_star'].apply(polarity_detection)


Unnamed: 0,itemid,comment,rating_star,polarity
0,11315955990,Performance:good\nBest Feature(s):yet to try\n...,5,1
1,11315955990,Performance:ok\nBest Feature(s):ok\nValue For ...,5,1
2,11315955990,Performance:Excellent.\nBest Feature(s):Back l...,5,1
3,11315955990,Came less than a week ! Consider fast! Keyboar...,5,1
4,11315955990,Packaging was a bust and the product was ok .....,2,0
5,11315955990,Performance:many colours\nBest Feature(s):many...,5,1
6,11315955990,Fast delivery and is working fine. However my ...,5,1
7,11315955990,I purchased it during the flash sale! I'm not ...,5,1
8,11315955990,Performance:Very Good\nBest Feature(s):Beautif...,5,1
9,11315955990,Performance:ok\nBest Feature(s):light is quite...,5,1


In [6]:
# Count number of positive and negative reviews
tb_counts = clean_df.polarity.value_counts()
tb_counts

1    28806
0      578
Name: polarity, dtype: int64

# Preprocessing Data

In [7]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jezeb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jezeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jezeb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Remove stopwords
stopwords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocess_data(comment):
    
    # Remove unwanted characters and numbers
    comment = re.sub('[^A-Za-z0-9]+', ' ', comment)
    
    # Tokenization
    tokens = nltk.word_tokenize(comment)
    
    # Remove stopwords from the comment
    comment = [word for word in tokens if word not in stopwords]
    
    # Lemmatization
    comment = [lemmatizer.lemmatize(word) for word in comment]
    
    # Make entire comment to be lowercase
    comment = [c.lower() for c in comment]

    # Join words for preprocessed comment
    comment = ' '.join(str(comment))
    return comment

In [9]:
clean_df['preprocessed_comment'] = clean_df['comment'].apply(lambda comment: preprocess_data(comment))
clean_df.head()

Unnamed: 0,itemid,comment,rating_star,polarity,preprocessed_comment
0,11315955990,Performance:good\nBest Feature(s):yet to try\n...,5,1,"[ ' p e r f o r m a n c e ' , ' g o o d ' , ..."
1,11315955990,Performance:ok\nBest Feature(s):ok\nValue For ...,5,1,"[ ' p e r f o r m a n c e ' , ' o k ' , ' ..."
2,11315955990,Performance:Excellent.\nBest Feature(s):Back l...,5,1,"[ ' p e r f o r m a n c e ' , ' e x c e l l ..."
3,11315955990,Came less than a week ! Consider fast! Keyboar...,5,1,"[ ' c a m e ' , ' l e ' , ' w e e k ' , ..."
4,11315955990,Packaging was a bust and the product was ok .....,2,0,"[ ' p a c k a g i n g ' , ' b u s t ' , ' ..."
