In [74]:
import sqlite3
import pandas as pd

import tqdm

import re

# !pip install contractions
import contractions

from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

import gensim

import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Import dataset
db_connect = sqlite3.connect("/content/database.sqlite")
db_connect

<sqlite3.Connection at 0x7e0131787040>

In [12]:
df = pd.read_sql_query("select * from reviews LIMIT 10000", db_connect)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [13]:
df.shape

(10000, 10)

In [14]:
df['Score'].value_counts()

Unnamed: 0_level_0,count
Score,Unnamed: 1_level_1
5,6183
4,1433
1,932
3,862
2,590


In [15]:
new_df = df[['Text', 'Score']]
new_df

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5
...,...,...
9995,we switched from the advance similac to the or...,1
9996,"Like the bad reviews say, the organic formula ...",5
9997,I wanted to solely breastfeed but was unable t...,5
9998,i love the fact that i can get this delieved t...,5


In [16]:
new_df['Text'][0]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

In [17]:
def preprocess(sentence):
  sentence = re.sub("http\S+","", sentence)
  sentence = contractions.fix(sentence)
  sentence = BeautifulSoup(sentence, 'lxml').get_text()
  sentence = re.sub("\S*\d\S*", "", sentence).strip()
  sentence = re.sub("[^a-zA-Z]+", " ", sentence)
  sentence = ' '.join([word.lower() for word in sentence.split() if word.lower() not in set(stopwords.words('english'))])
  sentence = ' '.join([WordNetLemmatizer().lemmatize(word) for word in sentence.split()])
  return sentence

In [18]:
new_df['Text'] = new_df['Text'].apply(lambda x: preprocess(x))
new_df.head()

  sentence = BeautifulSoup(sentence, 'lxml').get_text()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Text'] = new_df['Text'].apply(lambda x: preprocess(x))


Unnamed: 0,Text,Score
0,bought several vitality canned dog food produc...,5
1,product arrived labeled jumbo salted peanut pe...,1
2,confection around century light pillowy citrus...,4
3,looking secret ingredient robitussin believe f...,2
4,great taffy great price wide assortment yummy ...,5


In [19]:
new_df['Text'][10]

'know cactus tequila unique combination ingredient flavour hot sauce make one kind picked bottle trip brought back home u totally blown away realized simply could find anywhere city bummed magic internet case sauce ecstatic love hot sauce mean really love hot sauce want sauce tastelessly burn throat grab bottle tequila picante gourmet de inclan realize taste never want use sauce thank personal incredible service'

## TF-IDF

In [20]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,1))
tf_idf_vect.fit(new_df['Text'])
features = tf_idf_vect.get_feature_names_out()
print("First 20 features: ", features[0:20])
print()

counts_tf_idf = tf_idf_vect.transform(new_df['Text'])
print(type(counts_tf_idf))
print("Shape of TF-idf vector: ", counts_tf_idf.get_shape())
print("Number of unique words is: ", counts_tf_idf.get_shape()[1])

First 20 features:  ['aa' 'aaaa' 'aadmit' 'aahhhs' 'ab' 'aback' 'abandon' 'abates' 'abberline'
 'abbott' 'abby' 'abdominal' 'abiding' 'ability' 'able' 'ablution'
 'abnormality' 'abnormally' 'aboiut' 'abor']

<class 'scipy.sparse._csr.csr_matrix'>
Shape of TF-idf vector:  (10000, 16176)
Number of unique words is:  16176


In [21]:
count_tf_idf_df = pd.DataFrame(counts_tf_idf.toarray(), columns = features)

In [22]:
count_tf_idf_df.head()

Unnamed: 0,aa,aaaa,aadmit,aahhhs,ab,aback,abandon,abates,abberline,abbott,...,zomg,zon,zoo,zoom,zotz,zucchini,zuke,zukes,zupas,zuppa
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
count_tf_idf_df.shape

(10000, 16176)

In [24]:
y = new_df.iloc[:10000,-1]
y

Unnamed: 0,Score
0,5
1,1
2,4
3,2
4,5
...,...
9995,1
9996,5
9997,5
9998,5


In [25]:
 x_train, x_test, y_train, y_test = train_test_split(count_tf_idf_df, y, test_size=0.2, stratify=y)

## Build ML model

In [26]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

In [27]:
y_pred_train = rf.predict(x_train)
y_pred_test = rf.predict(x_test)

In [28]:
print("Training accuracy: ", accuracy_score(y_train, y_pred_train))
print("Test accuracy: ", accuracy_score(y_test, y_pred_test))

Training accuracy:  0.99975
Test accuracy:  0.665


In [29]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           1       0.82      0.17      0.28       186
           2       1.00      0.14      0.25       118
           3       0.82      0.13      0.23       172
           4       0.74      0.09      0.16       287
           5       0.65      1.00      0.79      1237

    accuracy                           0.67      2000
   macro avg       0.81      0.31      0.34      2000
weighted avg       0.72      0.67      0.57      2000



## Feed Forward NN (MLP)

In [46]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=(16176)))
model.add(Dense(64, activation ='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='softmax'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=64, epochs=100)

Epoch 1/100




[1m118/125[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 7ms/step - accuracy: 0.0920 - loss: -24.1502



[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.0920 - loss: -29.1279 - val_accuracy: 0.0930 - val_loss: -449.3940
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.0913 - loss: -1035.7218 - val_accuracy: 0.0930 - val_loss: -4299.1577
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0923 - loss: -6517.4111 - val_accuracy: 0.0930 - val_loss: -15885.7959
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0979 - loss: -20400.7715 - val_accuracy: 0.0930 - val_loss: -39054.6016
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.0970 - loss: -47476.6953 - val_accuracy: 0.0930 - val_loss: -77623.0078
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0

<keras.src.callbacks.history.History at 0x7e0010737e80>

## Word2Vec - self trained model

In [62]:
reviews_lst = df['Text'].apply(gensim.utils.simple_preprocess)
reviews_lst

Unnamed: 0,Text
0,"[have, bought, several, of, the, vitality, can..."
1,"[product, arrived, labeled, as, jumbo, salted,..."
2,"[this, is, confection, that, has, been, around..."
3,"[if, you, are, looking, for, the, secret, ingr..."
4,"[great, taffy, at, great, price, there, was, w..."
...,...
9995,"[we, switched, from, the, advance, similac, to..."
9996,"[like, the, bad, reviews, say, the, organic, f..."
9997,"[wanted, to, solely, breastfeed, but, was, una..."
9998,"[love, the, fact, that, can, get, this, deliev..."


In [63]:
model = gensim.models.Word2Vec(window=10, min_count=2, sg=0) #cbow
model.build_vocab(reviews_lst, progress_per=1000)
model.train(reviews_lst, total_examples=model.corpus_count, epochs=model.epochs)

(2690343, 3596220)

In [65]:
model.wv['item']

array([-0.06210685,  0.56773037,  0.13227408, -0.61843103,  0.77730083,
       -0.56733483,  0.6671407 ,  0.8232023 , -0.41982484,  0.06605626,
        0.6377985 , -1.453561  ,  0.46971485,  0.04754182, -0.38840675,
       -1.0441966 ,  0.10648589, -0.13428003,  0.03143347, -0.10548433,
        0.10473711, -1.406326  , -1.3048992 ,  0.891976  ,  0.2894523 ,
       -0.54041106,  1.6259476 , -1.2091594 , -0.38302642,  0.02287482,
       -0.2687503 , -0.51750726,  0.36921033, -0.8713948 , -0.01038562,
        0.6381769 , -0.12228192,  0.04626131,  0.02597979, -0.8328892 ,
        0.39307752, -0.12652461,  1.0482672 , -0.13521297,  0.05782198,
       -0.47305566,  0.42109329, -0.39122564, -0.0262981 ,  0.7277673 ,
        0.6185984 , -1.013238  ,  0.32142434, -0.0102145 , -0.6586755 ,
       -0.4091888 ,  1.1433427 ,  1.1321338 , -1.3683751 ,  0.01182164,
       -0.55112433, -0.6793649 ,  0.25626883, -0.2646807 ,  0.0227079 ,
       -0.1547642 ,  0.60703486,  0.44370925, -0.7640492 ,  0.06

In [68]:
model.wv.most_similar('product')

[('item', 0.7757201790809631),
 ('company', 0.7247100472450256),
 ('service', 0.7075461745262146),
 ('deal', 0.6621077656745911),
 ('customer', 0.661376953125),
 ('amazon', 0.6537706255912781),
 ('suburb', 0.6454508900642395),
 ('thanks', 0.6423604488372803),
 ('seller', 0.6403212547302246),
 ('dot', 0.6397699117660522)]

In [69]:
model.wv.most_similar('cost')

[('costs', 0.9005500078201294),
 ('dollar', 0.849270224571228),
 ('prices', 0.8433094024658203),
 ('dollars', 0.8160195350646973),
 ('charges', 0.8077109456062317),
 ('paid', 0.8062180280685425),
 ('retail', 0.8042997121810913),
 ('paying', 0.8041025400161743),
 ('price', 0.8002727627754211),
 ('wegman', 0.7951760292053223)]

In [72]:
model.wv.similarity('good','bad')

0.64111245

In [73]:
model.wv.doesnt_match(['good', 'bad', 'like', 'love', 'price'])

'price'

In [77]:
# Save the model using pickle
with open('word2vec_model.pkl', 'wb') as f:
    pickle.dump(model, f)