In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

import re
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

#data processing
import re, string
# import emoji
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

#keras
import tensorflow as tf
from tensorflow import keras

#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# Set seed for reproducibility.
# Defining the constant alone seeds nothing, so also seed NumPy's global
# generator, which pandas and scikit-learn style estimators fall back to when
# no random_state is passed.
# NOTE(review): TensorFlow keeps its own generator; seed it as well
# (tf.random.set_seed) if repeatable weight initialisation is required.
seed = 42
np.random.seed(seed)

In [None]:
data = pd.read_csv('/content/drive/MyDrive/rukmininfo/jeffin/flipkart-customer-review-and-rating.zip')


In [None]:
data[:5]

Unnamed: 0,review,rating
0,It was nice produt. I like it's design a lot. ...,5
1,awesome sound....very pretty to see this nd th...,5
2,awesome sound quality. pros 7-8 hrs of battery...,4
3,I think it is such a good product not only as ...,5
4,awesome bass sound quality very good bettary l...,5


In [None]:
data.shape

(9976, 2)

In [None]:
data.isnull().sum()

review    0
rating    0
dtype: int64

In [None]:
# Work on a 500-row sample so the notebook runs quickly. To check the real
# accuracy of the model, run on the whole dataset instead
# (replace "data_new" with "data").
# .copy() makes the sample an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
data_new = data[:500].copy()
data_new.shape

(500, 2)

In [None]:
import nltk
import re
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Fetch the NLTK stop-word corpus.
nltk.download('stopwords')

# Snowball stemmer for English words.
stemmer = nltk.SnowballStemmer("english")

# English stop words as a list, and as a set for fast membership tests.
stopword_list = nltk.corpus.stopwords.words('english')
stop_words = set(stopword_list)


def clean(text):
    """Normalise one review for bag-of-words modelling.

    The text is lower-cased; square-bracketed spans, URLs and punctuation are
    removed; stop words are dropped; every remaining word is stemmed.

    Args:
        text: The raw review. Any value is accepted and converted with ``str()``.

    Returns:
        The cleaned, stemmed words joined by single spaces.
    """
    text = str(text).lower()
    # These patterns must keep their backslashes: without them '[.*?]' is a
    # character class that only deletes '.', '*' and '?', and 'S+' matches a
    # literal "S", so bracketed spans and URLs would never be removed.
    text = re.sub(r'\[.*?\]', '', text)  # remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove links
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation

    # split() without an argument also discards the empty strings that runs of
    # whitespace would otherwise produce.
    words = [word for word in text.split() if word not in stop_words]  # remove stop words
    return " ".join(stemmer.stem(word) for word in words)  # apply stemmer
# Keep the raw text in 'review' and store its cleaned form in a new 'Review' column.
data_new["Review"] = data_new['review'].map(clean)


In [None]:
# Bag-of-words model
# The simplest vector-space representation of unstructured text: every document
# becomes a numeric vector, and each dimension of that vector is one specific
# feature/attribute (here: one vocabulary word).

from sklearn.feature_extraction.text import CountVectorizer

# min_df=0. and max_df=1. keep every term regardless of its document frequency.
cv = CountVectorizer(max_df=1., min_df=0.)

# Learn the vocabulary and get the bag-of-words features in sparse format.
sparse_counts = cv.fit_transform(data_new["Review"])
print(sparse_counts)

# Dense array, so the counts can be shown as an ordinary table.
cv_matrix = sparse_counts.toarray()


In [None]:

# Here we can say that documents have been converted into numeric vectors so that each document is represented by one vector (row) in the feature matrix and each column represents a unique word as a feature.

#get all unique words
vocab = cv.get_feature_names_out()
# show document feature vectors
pd.DataFrame(cv_matrix, columns=vocab)

Find the count of reviews for each rating (1, 2, 3, 4, 5).

# Number of reviews in the sample for each star rating, in rating order.
# reindex() keeps a row (with count 0) for any rating that has no reviews.
data_new['rating'].value_counts().reindex(range(1, 6), fill_value=0)


In [None]:
# Bar chart of how many reviews the full dataset holds for each star rating.
ratings = ['1star', '2star', '3star', '4star', '5star']
# One count per star value 1..5, in the same order as the bar labels above.
numRatings = [len(data[data['rating'] == stars]) for stars in range(1, 6)]

plt.figure(figsize=(7, 7))
plt.title('Biased Dataset Alert!!')
plt.grid(True)
plt.bar(ratings, numRatings)
plt.show()

In [None]:
# Keras text utilities used by the following cells: Tokenizer turns words into
# integer ids and pad_sequences brings all sequences to the same length.
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Model inputs (review text) and targets (star rating) as NumPy arrays.
text = np.array(data['review'])
label = np.array(data['rating'])


In [None]:
# Ratings run from 1 to 5, but TensorFlow expects class ids 0 to 4.
# The ids are derived from the source column rather than from `label` itself,
# so re-running this cell cannot shift the labels a second time.
label = np.array(data['rating']) - 1
label

In [None]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(text, label, test_size = 0.2, random_state = 42)

# Build the word index from the training reviews only.
# num_words caps how many of the most frequent words are kept when texts are converted to sequences.
tokenizer = Tokenizer(num_words = 25000)
tokenizer.fit_on_texts(X_train)


 fit_on_texts() Method :

fit_on_texts updates the tokenizer's internal vocabulary from a list of texts. If the texts contain lists, each entry of a list is assumed to be a token. It must be called before using texts_to_sequences or texts_to_matrix.

The fit_on_texts method is part of the Keras Tokenizer class.

In [None]:
# Replace every review with the list of integer word ids known to the fitted tokenizer.
# NOTE(review): X_train / X_test are overwritten here, so this cell cannot be
# re-run without first re-running the train/test split.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# texts_to_sequences (previous cell)

# Transforms each text in texts to a sequence of integers.

# Only the top num_words-1 most frequent words are taken into account, and only words known by the tokenizer.

# pad_sequences brings every sequence to exactly 50 ids; by default Keras pads and truncates at the front.
X_train = pad_sequences(X_train, maxlen = 50)
X_test = pad_sequences(X_test, maxlen = 50)


# X_train.shape
X_test.shape

## LSTM model

   #Terms

-- Word embedding

  Keras provides an embedding layer that converts each word into a fixed-length vector of defined size.

  Here the words, that are close in meaning are grouped near to one another in vector space.

  -- Example:

  While representing a word such as frog, the nearest neighbour of a frog would be frogs, toads, Litoria.

What are the advantages of embedding layer?

  The two main advantages of an Embedding layer over Dense layers are reduced input size and reduced computational complexity, both of which speed up training.

In [None]:
# Size the embedding table from the fitted tokenizer instead of hard-coding it.
# texts_to_sequences only emits ids below num_words, and id 0 is reserved for
# padding, so this input_dim covers every id the model can receive.
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words)

Mymodel = tf.keras.models.Sequential([

    # Input produces a symbolic tensor-like object (i.e. a placeholder).
    # Each sample is one review padded to 50 word ids.
    tf.keras.layers.Input(shape=(50,)),

    # Embedding(vocab_size, embedding_dim): learn a 20-dimensional vector per word id.
    tf.keras.layers.Embedding(vocab_size, 20),

    # Read the sequence forwards and backwards; return_sequences=True keeps the
    # per-step outputs so the next LSTM still receives a sequence.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(70, return_sequences=True)),

    # 140 memory units within the layer; only the final output is passed on.
    tf.keras.layers.LSTM(140),

    # One probability per star rating (5 classes), hence softmax.
    # A single sigmoid unit would only suit a binary problem.
    tf.keras.layers.Dense(5, activation='softmax'),
])

# The targets are integer class ids (0-4), hence the sparse categorical loss.
Mymodel.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

Keras tensor: a symbolic tensor-like object augmented with certain attributes that allow a Keras model to be built just by knowing the model's inputs and outputs.

Input() is used to instantiate a Keras tensor.

shape: A shape tuple (integers), shape=(32,) indicates that the expected input will be batches of 32-dimensional vectors.
    

In [None]:
# Model fitting: train the model on the training data and track how well it generalizes to data it was not trained on.

#  compile (Configures the model for training); fit (Trains the model for a fixed number of epochs);
#  evaluate (Returns the loss value & metrics values for the model in test mode);
#  predict (Generates output predictions for the input samples)

# NOTE(review): the test split is passed as validation data here and is evaluated again
# in the next cell, so there is no separate untouched test set.
retVal = Mymodel.fit(X_train, y_train, batch_size=128,validation_data = (X_test,y_test), epochs = 5 )


In [None]:
# evaluate() returns the loss followed by the compiled metrics, so despite its
# name `acc` holds [loss, accuracy] for this model.
acc = Mymodel.evaluate(X_test,y_test)

acc

In [None]:

# Score one unseen sentence: it goes through the same tokenizer and the same
# padding length (50) as the training data before being passed to the model.
twt = ['Meetings: Because none of us is as dumb as all of us.']
twt_text = tokenizer.texts_to_sequences(twt)
twt_text = pad_sequences(twt_text, maxlen = 50)
# One row of 5 class probabilities per input sentence (softmax output layer).
pred_value = Mymodel.predict(twt_text)

pred_value

In [None]:
# Most probable class id (0-4) for each input row.
predict_classes = np.argmax(pred_value, axis=1)
print(predict_classes)
# The labels were shifted down by one before training, so add 1 to get the star rating.
predicted_rating = predict_classes + 1
print(predicted_rating)

In [None]:
# Least probable class id (0-4) for each input row, for contrast with the
# argmax prediction. It gets its own name so that `predict_classes` keeps
# holding the actual prediction.
least_likely_class = np.argmin(pred_value, axis=1)
print(least_likely_class)

#Transformers
 Transformers are designed to handle sequential data, such as natural language, for tasks such as translation and text summarization.  

Text classification with BERT

In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
data_new1 = data[:500]
data_new1.shape

(500, 2)

In [None]:
data_new1.columns

Index(['review', 'rating'], dtype='object')

In [None]:
# Independent copy of the two columns used for BERT, so the columns added to
# df_brt below do not raise SettingWithCopyWarning.
df_brt = data_new1[['review', 'rating']].copy()

In [None]:
# Custom functions to clean the review text

# NOTE: emoji cleaning is not implemented here (the `emoji` import at the top of the notebook is commented out).

# Remove punctuation, links, mentions and \r\n new line characters
def strip_all_entities(text):
    """Lower-case `text` and strip links, mentions, non-ASCII characters and punctuation.

    Carriage returns are deleted and newlines become spaces first. Whitespace
    left behind by the removals is not collapsed.
    """
    flattened = text.replace('\r', '').replace('\n', ' ').lower()
    # Drop '@...' mentions and 'http(s)://...' links up to the next whitespace.
    without_links = re.sub(r"(?:\@|https?\://)\S+", "", flattened)
    # Drop everything outside the ASCII range, such as '\x9a\x91\x97\x9a\x97'.
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', without_links)
    # All ASCII punctuation plus a few extra non-ASCII symbols.
    banned_list = string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    return ascii_only.translate(str.maketrans('', '', banned_list))

# Clean hashtags at the end of the sentence, and keep those in the middle of
# the sentence by removing just the # symbol.
def clean_hashtags(tweet):
    """Remove trailing hashtags and un-tag hashtags inside the sentence.

    A run of hashtags that closes the text is deleted completely, except the
    literal tag ``#hashtag``, which the pattern deliberately skips. Every
    remaining '#' and every '_' is then replaced by a space-joined split, so
    ``#word`` in the middle becomes ``word`` and ``a_b`` becomes ``a b``.

    Args:
        tweet: The text to clean.

    Returns:
        The text without trailing hashtags and without '#' / '_' characters.
    """
    # Raw strings are required: in a normal string literal '\b' is a backspace
    # character, so the "(?!(?:hashtag)\b)" guard would never see a word
    # boundary, and '\w' / '\s' are invalid escape sequences.
    trailing_tags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    without_trailing = " ".join(part.strip() for part in re.split(trailing_tags, tweet))  # remove last hashtags
    # Remove the hashtag symbol from words in the middle of the sentence.
    return " ".join(part.strip() for part in re.split(r'#|_', without_trailing))

# Filter special characters such as & and $ present in some words
def filter_chars(a):
    """Blank out every space-separated word of `a` that contains '$' or '&'.

    Such a word is replaced by an empty string rather than removed, so the
    spaces around it stay in the result.
    """
    kept = ['' if ('$' in word or '&' in word) else word for word in a.split(' ')]
    return ' '.join(kept)

def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone tab or newline) is left
    untouched.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence
    # that newer Python versions warn about.
    return re.sub(r"\s\s+", " ", text)

In [None]:
# Run every review through the whole cleaning chain (innermost call first).
# NOTE(review): strip_all_entities already deletes all ASCII punctuation,
# including '#', '_', '$' and '&', so the hashtag and special-character steps
# that follow it presumably have nothing left to act on — confirm the intended order.
texts_new = [
    remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(review))))
    for review in df_brt.review
]


In [None]:
df_brt['text_clean'] = texts_new

In [None]:
df_brt

Unnamed: 0,review,rating,text_clean
0,It was nice produt. I like it's design a lot. ...,5,it was nice produt i like its design a lot its...
1,awesome sound....very pretty to see this nd th...,5,awesome soundvery pretty to see this nd the so...
2,awesome sound quality. pros 7-8 hrs of battery...,4,awesome sound quality pros 78 hrs of battery l...
3,I think it is such a good product not only as ...,5,i think it is such a good product not only as ...
4,awesome bass sound quality very good bettary l...,5,awesome bass sound quality very good bettary l...
...,...,...,...
495,good product.... you can go for it.initially i...,4,good product you can go for itinitially i felt...
496,This is a wonderful companion for both my phon...,4,this is a wonderful companion for both my phon...
497,The right and left distribution is not okay......,4,the right and left distribution is not okayoth...
498,"nice Bluetooth headphone, I am pleased with it...",5,nice bluetooth headphone i am pleased with its...


In [None]:
df_brt['text_clean'][1:8].values

array(['awesome soundvery pretty to see this nd the sound quality was too good i wish to take this product loved this product read more',
       'awesome sound quality pros 78 hrs of battery life including 45 mins approx call timeawesome sound output bass and treble are really very clear without equaliser with equaliser sound wary depends on the handset sound qualityweightless to carry and in head toomic is good but in traffic it is not too good 325535mm option is really important to mention really expecting other leading brands to implement thisconsvery tight in ears adjusters are ok this ll be very tightread more',
       'i think it is such a good product not only as per the quality but also the design is quite good i m using this product from january in this pandamic situation it has became the most useful and helpful overall the bass and the sound quality is pretty good and another thing that will give you such a sigh of relief that it will provide a wire that will help you in cas

In [None]:
# Number of whitespace-separated words in every cleaned review.
# A comprehension keeps the loop variable local: a plain `for text in ...` loop
# here would rebind the notebook-level `text` array that holds the raw reviews.
text_len = [len(review.split()) for review in df_brt.text_clean]

In [None]:
df_brt['text_len'] = text_len

In [None]:
print(f" DF SHAPE: {df_brt.shape}")

 DF SHAPE: (500, 4)


In [None]:
# Keep only reviews with more than 4 words. .copy() detaches the filtered rows
# so that adding columns to df_brt afterwards does not raise SettingWithCopyWarning.
df_brt = df_brt[df_brt['text_len'] > 4].copy()

In [None]:
print(f" DF SHAPE: {df_brt.shape}")

 DF SHAPE: (500, 4)


In [None]:
# Pretrained tokenizer for the 'bert-base-uncased' checkpoint.
# NOTE(review): this rebinds `tokenizer`, which held the Keras Tokenizer used by
# the LSTM cells above; those cells cannot be re-run after this one without
# re-fitting the Keras tokenizer first.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Token count of every cleaned review, with encoding truncated at 512 tokens.
token_lens = [
    len(tokenizer.encode(review, max_length=512, truncation=True))
    for review in df_brt['text_clean'].values
]

# Length of the longest tokenized review.
max_len = np.max(token_lens)

In [None]:
print(f"MAX TOKENIZED SENTENCE LENGTH: {max_len}")

MAX TOKENIZED SENTENCE LENGTH: 120


In [None]:
# Let's check the long tokenized sentences (with more than 80 tokens):
# the token counts are rebuilt, and every review above the threshold is
# printed with its position in the frame.
token_lens = []
for i, txt in enumerate(df_brt['text_clean'].values):
    n_tokens = len(tokenizer.encode(txt, max_length=512, truncation=True))
    token_lens.append(n_tokens)
    if n_tokens > 80:
        print(f"INDEX: {i}, TEXT: {txt}")


INDEX: 2, TEXT: awesome sound quality pros 78 hrs of battery life including 45 mins approx call timeawesome sound output bass and treble are really very clear without equaliser with equaliser sound wary depends on the handset sound qualityweightless to carry and in head toomic is good but in traffic it is not too good 325535mm option is really important to mention really expecting other leading brands to implement thisconsvery tight in ears adjusters are ok this ll be very tightread more
INDEX: 3, TEXT: i think it is such a good product not only as per the quality but also the design is quite good i m using this product from january in this pandamic situation it has became the most useful and helpful overall the bass and the sound quality is pretty good and another thing that will give you such a sigh of relief that it will provide a wire that will help you in case of lacking chargesread more
INDEX: 5, TEXT: awsome sound powerful bass battery backup is also excellent and i loved bass t

In [None]:
df_brt['token_lens'] = token_lens

In [None]:
df_brt = df_brt.sort_values(by='token_lens', ascending=False)
df_brt.head(20)

Unnamed: 0,review,rating,text_clean,text_len,token_lens
143,The reason for choosing this one over all othe...,5,the reason for choosing this one over all othe...,91,120
296,The bass the sound is aswsome. You will not fa...,5,the bass the sound is aswsome you will not fac...,101,119
472,i am writing this review aftet using it 1 week...,5,i am writing this review aftet using it 1 week...,95,119
175,these are the best headphones...thanks to Flip...,5,these are the best headphonesthanks to flipkar...,78,119
482,I have seen this in Local Shop in October and ...,3,i have seen this in local shop in october and ...,94,117
89,Bought for Rs.999 on Big Billion days(Supercom...,4,bought for rs999 on big billion dayssupercomne...,85,117
7,Should u buy this---Pros:-1. Sound quality and...,4,should u buy thispros1 sound quality and build...,91,116
137,"Best headphones at this price, i got it for 85...",5,best headphones at this price i got it for 850...,101,114
201,Delivered in a day as promised by flipkart.. B...,5,delivered in a day as promised by flipkart big...,90,113
107,"Helo Friends,Since Boat is trending now in the...",4,helo friendssince boat is trending now in the ...,88,112


In [None]:
# The dataset looks cleaner now. Shuffle it and reset the index.
# random_state makes the shuffle, and everything derived from the row order,
# repeatable across runs.
df_brt = df_brt.sample(frac=1, random_state=seed).reset_index(drop=True)


Sentiment column analysis


In [None]:
df_brt['rating'].value_counts()

5    283
4    152
3     44
1     15
2      6
Name: rating, dtype: int64

In [None]:
# The five rating classes are heavily imbalanced, so the minority classes are
# randomly over-sampled until every class is as large as the biggest one,
# removing the bias towards the majority classes.
# NOTE(review): this runs before the train/validation split, so copies of the
# same review can land on both sides of it and inflate the validation scores.
# Consider splitting first and over-sampling the training part only.
ros = RandomOverSampler(random_state=seed)  # seeded so the resampled set is repeatable
train_x, train_y = ros.fit_resample(
    df_brt[['text_clean']].to_numpy(),  # 2-D (n_samples, 1) feature matrix
    df_brt['rating'].to_numpy(),        # 1-D target vector
)
train_os = pd.DataFrame({'text_clean': train_x[:, 0], 'rating': np.ravel(train_y)})

In [None]:
train_os['rating'].value_counts()

5    283
4    283
3    283
2    283
1    283
Name: rating, dtype: int64

In [None]:
X = train_os['text_clean'].values
y = train_os['rating'].values

In [None]:
X.shape

(1415,)

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, stratify=y, random_state=seed)

In [None]:
y_train.shape, y_valid.shape,X_valid.shape,X_train.shape

((1273,), (142,), (142,), (1273,))

In [None]:
# One hot encoding. by using one hot encoding on the target variable we achieved higher accuracy.

y_train_le = y_train.copy()
y_valid_le = y_valid.copy()
# y_test_le = y_test.copy()

In [None]:
# Fit the encoder on the training labels only and reuse that fitted encoder for
# the validation labels: both splits then share one column order, even if a
# class were missing from the validation split.
ohe = preprocessing.OneHotEncoder()
y_train = ohe.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_valid = ohe.transform(np.array(y_valid).reshape(-1, 1)).toarray()
# y_test = ohe.transform(np.array(y_test).reshape(-1, 1)).toarray()

In [None]:
print(f"TRAINING DATA: {X_train.shape[0]}\nVALIDATION DATA: {X_valid.shape[0]}\n" )

TRAINING DATA: 1273
VALIDATION DATA: 142



In [None]:
MAX_LEN = 128


def tokenize(data, max_len=MAX_LEN):
    """Encode texts into fixed-length BERT input ids and attention masks.

    Uses the notebook-level `tokenizer`. Every text is padded to `max_len`
    tokens and truncated if it is longer, so each row has the same length and
    the results stack into rectangular arrays.

    Args:
        data: Iterable of strings to encode.
        max_len: Number of tokens per encoded text (special tokens included).

    Returns:
        A tuple ``(input_ids, attention_masks)`` of NumPy arrays, each with one
        row of `max_len` entries per text.
    """
    input_ids = []
    attention_masks = []
    for text in data:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,  # honour the parameter, not the module constant
            padding='max_length',
            truncation=True,  # without this, texts longer than max_len keep their full length
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [None]:
train_input_ids, train_attention_masks = tokenize(X_train, MAX_LEN)
val_input_ids, val_attention_masks = tokenize(X_valid, MAX_LEN)
# test_input_ids, test_attention_masks = tokenize(X_test, MAX_LEN)

In [None]:
val_input_ids.shape

(142, 128)

In [None]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
def create_model(bert_model, max_len=MAX_LEN, num_classes=5):
    """Build and compile a classifier on top of a BERT encoder.

    Args:
        bert_model: Encoder that is called on ``[input_ids, attention_masks]``.
            The element at index 1 of its output is used as the sentence
            representation (presumably the pooled output of a TFBertModel —
            verify for other encoders).
        max_len: Length of the padded token sequences fed to the model.
        num_classes: Number of output classes; defaults to the 5 star ratings.

    Returns:
        A compiled Keras model that takes ``[input_ids, attention_masks]`` and
        returns `num_classes` softmax probabilities per sample. It is compiled
        with categorical cross-entropy, so it expects one-hot encoded targets.
    """
    input_ids = tf.keras.Input(shape=(max_len,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,), dtype='int32')

    # Index 1 selects the second element of the encoder output (see Args).
    embeddings = bert_model([input_ids, attention_masks])[1]
    output = tf.keras.layers.Dense(num_classes, activation="softmax")(embeddings)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)

    # A small learning rate, since the pretrained encoder weights are fine-tuned too.
    model.compile(
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-7),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.CategoricalAccuracy()],  # Keras documents metrics as a list
    )

    return model



In [None]:
model1 = create_model(bert_model, MAX_LEN)
model1.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                           

In [None]:
train_input_ids.shape

In [None]:
y_train.shape, y_valid.shape

((1273, 5), (142, 5))

In [None]:
history_bert = model1.fit([train_input_ids,train_attention_masks], y_train, validation_data=([val_input_ids, val_attention_masks],y_valid), epochs=2, batch_size=32)

# history = model.fit(X_train, y_train, epochs=30,validation_data=(X_valid,y_valid),batch_size=100,verbose=1,shuffle=1)

Epoch 1/2
 9/40 [=====>........................] - ETA: 29:46 - loss: 1.6310 - categorical_accuracy: 0.2431

In [None]:
# BERT results

result_bert1 = model1.predict([val_input_ids, val_attention_masks])

In [None]:
y_pred_bert1 =  np.zeros_like(result_bert1)
y_pred_bert1[np.arange(len(y_pred_bert1)), result_bert1.argmax(1)] = 1

In [None]:
#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
def conf_matrix(y, y_pred, title):
    """Plot the confusion matrix of `y` against `y_pred` as an annotated heatmap.

    Args:
        y: True class of every sample (rows of the matrix).
        y_pred: Predicted class of every sample (columns of the matrix).
        title: Text shown above the plot.
    """
    star_ratings = [1, 2, 3, 4, 5]
    fig, ax = plt.subplots(figsize=(5, 5))
    counts = confusion_matrix(y, y_pred)
    ax = sns.heatmap(counts, annot=True, cmap="Blues", fmt='g', cbar=False, annot_kws={"size": 25})
    ax.set_title(title, fontsize=20)
    # NOTE(review): five tick labels are set on both axes, which presumably
    # requires all five classes to occur in y / y_pred — confirm.
    ax.set_xticklabels(star_ratings, fontsize=17)
    ax.set_yticklabels(star_ratings, fontsize=17)
    ax.set_xlabel('Predicted', fontsize=20)
    ax.set_ylabel('Test', fontsize=20)
    plt.show()

In [None]:
conf_matrix(y_valid.argmax(1), y_pred_bert1.argmax(1),'BERT Sentiment Analysis\nConfusion Matrix')

In [None]:
print('\tClassification Report for BERT:\n\n',classification_report(y_valid,y_pred_bert1))

In [None]:
accuracy = accuracy_score(y_valid, y_pred_bert1)
print('Accuracy: %.3f' % (accuracy * 100))

In [None]:
# ..............END.......................