In [1]:
import pandas as pd
from nltk import agreement

In [2]:
# Read data from xlsx file
df = pd.read_excel('shopee_reviews.xlsx', 'Sheet1')
df.head()

Unnamed: 0,shopid,itemid,ctime,author_username,comment,rating_star,template_tags
0,223946658,11315955990,1670429351,s*****h,Performance:good\nBest Feature(s):yet to try\n...,5,"['Performance', 'Best Feature(s)', 'Value For ..."
1,223946658,11315955990,1653742257,bananaphone77,Performance:ok\nBest Feature(s):ok\nValue For ...,5,"['Performance', 'Best Feature(s)', 'Value For ..."
2,223946658,11315955990,1659064971,jasperjane80,Performance:Excellent.\nBest Feature(s):Back l...,5,"['Performance', 'Best Feature(s)', 'Value For ..."
3,223946658,11315955990,1637408323,s*****4,Came less than a week ! Consider fast! Keyboar...,5,[]
4,223946658,11315955990,1639452456,reeveschiu97,Packaging was a bust and the product was ok .....,2,[]


In [3]:
# Print out stats of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30010 entries, 0 to 30009
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   shopid           30010 non-null  int64 
 1   itemid           30010 non-null  int64 
 2   ctime            30010 non-null  int64 
 3   author_username  29931 non-null  object
 4   comment          30010 non-null  object
 5   rating_star      30010 non-null  int64 
 6   template_tags    30010 non-null  object
dtypes: int64(4), object(3)
memory usage: 1.6+ MB


Categorize data as "neutral" vs "opinionated" 

Define star rating of 1 and 2 as bad reviews, 3 as neutral, 4 and 5 as good reviews

In [4]:
# Polarity detection to differentiate the opinionated data as 'positive' vs 'negative'
def polarity_detection(value):
    if(value == 4):
        return 1
    elif(value == 5):
        return 1
    else:
        return 0

def organize_data(dataframe):
    # Remove rating of '3'
    dataframe = dataframe[dataframe['rating_star'] != 3]

    # Display the polarity
    dataframe['polarity'] = dataframe['rating_star'].apply(polarity_detection)
    
    dataframe = dataframe.loc[:,['itemid', 'comment', 'rating_star', 'polarity']]
    
    # Reset Index
    dataframe.reset_index(drop=True, inplace=True)

    return dataframe

In [5]:
clean_df = organize_data(df)
clean_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['polarity'] = dataframe['rating_star'].apply(polarity_detection)


Unnamed: 0,itemid,comment,rating_star,polarity
0,11315955990,Performance:good\nBest Feature(s):yet to try\n...,5,1
1,11315955990,Performance:ok\nBest Feature(s):ok\nValue For ...,5,1
2,11315955990,Performance:Excellent.\nBest Feature(s):Back l...,5,1
3,11315955990,Came less than a week ! Consider fast! Keyboar...,5,1
4,11315955990,Packaging was a bust and the product was ok .....,2,0
5,11315955990,Performance:many colours\nBest Feature(s):many...,5,1
6,11315955990,Fast delivery and is working fine. However my ...,5,1
7,11315955990,I purchased it during the flash sale! I'm not ...,5,1
8,11315955990,Performance:Very Good\nBest Feature(s):Beautif...,5,1
9,11315955990,Performance:ok\nBest Feature(s):light is quite...,5,1


In [6]:
# Count number of positive and negative reviews
tb_counts = clean_df.polarity.value_counts()
tb_counts

1    28806
0      578
Name: polarity, dtype: int64

578 bad reviews, 28806 good reviews

Manual Labelling Data (1000 records)

In [8]:
# Read data from xlsx file
manually_labelled_df = pd.read_excel('manually_labelled_data.xlsx', 'Sheet1')

# Data Preprocessing

In [29]:
import random
import tensorflow as tf
import numpy as np

tf.keras.utils.set_random_seed(17)  # sets seeds for base-python, numpy and tf

tf.config.list_physical_devices('GPU')

[]

In [9]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\XPS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\XPS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\XPS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Remove stopwords
stopwords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocess_data(comment):
    
    # Remove unwanted characters and numbers
    comment = re.sub('[^A-Za-z0-9]+', ' ', comment)
    
    # Tokenization
    tokens = nltk.word_tokenize(comment)
    
    # Remove stopwords from the comment
    comment = [word for word in tokens if word not in stopwords]
    
    # Lemmatization
    comment = [lemmatizer.lemmatize(word) for word in comment]
    
    # Make entire comment to be lowercase
    comment = [c.lower() for c in comment]

    # Join words for preprocessed comment
    comment = ' '.join(str(comment))
    return comment

In [11]:
manually_labelled_df['preprocessed_comment'] = manually_labelled_df['Comment'].apply(lambda comment: preprocess_data(comment))
manually_labelled_df.head()

Unnamed: 0,Column,Comment,Rater_1,Rater_2,Rater_3,preprocessed_comment
0,11322,Quality is good but already indicated the font...,0,0,0,"[ ' q u a l i t y ' , ' g o o d ' , ' a l ..."
1,10707,Very nice quality stickers. Perfect for namin...,1,1,1,"[ ' v e r y ' , ' n i c e ' , ' q u a l i ..."
2,20077,"Beautiful fairy lights for artwork, gifts and ...",1,1,1,"[ ' b e a u t i f u l ' , ' f a i r y ' , ..."
3,5222,"Very nice and fixes my computer perfectly, wil...",1,1,1,"[ ' v e r y ' , ' n i c e ' , ' f i x ' , ..."
4,4952,Awesome product for the price. Good for printi...,1,1,1,"[ ' a w e s o m e ' , ' p r o d u c t ' , ..."


In [12]:
clean_df.head()

Unnamed: 0,itemid,comment,rating_star,polarity
0,11315955990,Performance:good\nBest Feature(s):yet to try\n...,5,1
1,11315955990,Performance:ok\nBest Feature(s):ok\nValue For ...,5,1
2,11315955990,Performance:Excellent.\nBest Feature(s):Back l...,5,1
3,11315955990,Came less than a week ! Consider fast! Keyboar...,5,1
4,11315955990,Packaging was a bust and the product was ok .....,2,0


In [32]:
clean_df = clean_df.drop(columns=['itemid', 'rating_star'])
clean_df.head()

Unnamed: 0,comment,polarity
0,Performance:good\nBest Feature(s):yet to try\n...,1
1,Performance:ok\nBest Feature(s):ok\nValue For ...,1
2,Performance:Excellent.\nBest Feature(s):Back l...,1
3,Came less than a week ! Consider fast! Keyboar...,1
4,Packaging was a bust and the product was ok .....,0


In [113]:
text_len = []
for text in clean_df.comment:
    tweet_len = len(text.split())
    text_len.append(tweet_len)

In [114]:
clean_df['text_len'] = text_len

In [115]:
token_lens = []

for txt in clean_df['comment'].values:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))
    
max_len=np.max(token_lens)

In [116]:
print(f"MAX TOKENIZED SENTENCE LENGTH: {max_len}")

MAX TOKENIZED SENTENCE LENGTH: 306


In [117]:
clean_df['token_lens'] = token_lens

In [118]:
clean_df.head()

Unnamed: 0,comment,polarity,text_len,token_lens
0,Performance:good\nBest Feature(s):yet to try\n...,1,10,21
1,Performance:ok\nBest Feature(s):ok\nValue For ...,1,20,31
2,Performance:Excellent.\nBest Feature(s):Back l...,1,13,30
3,Came less than a week ! Consider fast! Keyboar...,1,19,27
4,Packaging was a bust and the product was ok .....,0,35,41


In [33]:
from sklearn.model_selection import train_test_split
#split the data into train and test set

temp,test = train_test_split(clean_df, test_size=0.2, stratify=clean_df['polarity'])
train,val = train_test_split(temp, test_size=0.25, stratify=temp['polarity'])

In [34]:
#save the data
train.to_csv('comments_train.csv',index=False)
val.to_csv('comments_val.csv',index=False)
test.to_csv('comments_test.csv',index=False)

# BERT MODEL

In [239]:
import numpy as np
import seaborn as sns
import tensorflow as tf
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [240]:
train = pd.read_csv('comments_train.csv')
val = pd.read_csv('comments_val.csv')
test = pd.read_csv('comments_test.csv')

In [255]:
X_train = train['comment'].values
y_train = train['polarity'].values
X_val = val['comment'].values
y_val = val['polarity'].values
X_test = test['comment'].values
y_test = test['polarity'].values

In [256]:
tk = Tokenizer(num_words=NB_WORDS,
               split=" ")
tk.fit_on_texts(X_train)

In [257]:
ohe = preprocessing.OneHotEncoder()
y_train = ohe.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_val = ohe.fit_transform(np.array(y_val).reshape(-1, 1)).toarray()
y_test = ohe.fit_transform(np.array(y_test).reshape(-1, 1)).toarray()

In [258]:
print(f"TRAINING DATA: {X_train.shape[0]}\nVALIDATION DATA: {X_val.shape[0]}\nTESTING DATA: {X_test.shape[0]}" )

TRAINING DATA: 17630
VALIDATION DATA: 5877
TESTING DATA: 5877


In [259]:
MAX_LEN = 308 # Maximum number of tokens in a comment
EMB_DIM = 8 # Embedding dimension
NB_WORDS = 10000 # Number of words to include in our vocab

In [260]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [261]:
def tokenize(data,max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [262]:
train_input_ids, train_attention_masks = tokenize(X_train, MAX_LEN)
val_input_ids, val_attention_masks = tokenize(X_val, MAX_LEN)
test_input_ids, test_attention_masks = tokenize(X_test, MAX_LEN)

In [263]:
from transformers import TFBertModel

bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [264]:
def create_bert(bert_model, max_len=MAX_LEN):
    ##params###
    opt = tf.keras.optimizers.Adam(learning_rate=1e-, decay=1e-5)
    loss = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy()

    input_ids = tf.keras.Input(shape=(max_len,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,), dtype='int32')

    embeddings = bert_model([input_ids,attention_masks])[1]

    if len(embeddings.shape) == 2:
        embeddings = tf.keras.layers.Reshape((1, embeddings.shape[1]))(embeddings)
        embeddings = tf.keras.layers.Flatten()(embeddings)

    output = tf.keras.layers.Dense(2, activation="softmax")(embeddings)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)

    model.compile(opt, loss=loss, metrics=accuracy)

    return model


In [265]:
bert_model = create_bert(bert_model, MAX_LEN)
bert_model.summary()

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_33 (InputLayer)          [(None, 308)]        0           []                               
                                                                                                  
 input_34 (InputLayer)          [(None, 308)]        0           []                               
                                                                                                  
 tf_bert_model_11 (TFBertModel)  TFBaseModelOutputWi  109482240  ['input_33[0][0]',               
                                thPoolingAndCrossAt               'input_34[0][0]']               
                                tentions(last_hidde                                               
                                n_state=(None, 308,                                        

In [266]:
history_bert = bert_model.fit([train_input_ids,train_attention_masks], y_train, validation_data=([val_input_ids,val_attention_masks], y_val), epochs=1, batch_size=12)

   2/1470 [..............................] - ETA: 23:45:16 - loss: 1.1281 - categorical_accuracy: 0.5000    

KeyboardInterrupt: 