In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import bz2
from transformers import T5ForConditionalGeneration, T5Tokenizer, BartForConditionalGeneration, BartTokenizer, pipeline
import torch
import copy
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dropout, Dense, Input, Embedding, GRU
from tensorflow.keras.models import Sequential, save_model, load_model
from keras.callbacks import ModelCheckpoint

# Training

In [2]:
train_df = pd.read_csv("/kaggle/working/amazon_reviews_train_cleaned.csv")
test_df = pd.read_csv("/kaggle/working/amazon_reviews_test_cleaned.csv")

In [3]:
train_df['label'] = train_df['label'] - 1
test_df['label'] = test_df['label'] - 1

In [4]:
train_df['text'] = train_df['title'] + " " + train_df['description']
train_df.head()

Unnamed: 0.1,Unnamed: 0,title,description,label,text
0,3028528,thanks lb,the listening sleepers chitlin circuit 15 conn...,1,thanks lb the listening sleepers chitlin circu...
1,2404118,came for the cello stayed for the oboe,i was looking for an allbritten cd of chamber ...,1,came for the cello stayed for the oboe i was l...
2,3534001,beauty is skin deep,hands down the most stylish toaster out there ...,0,beauty is skin deep hands down the most stylis...
3,3390257,wasnt drawn in by the characters,as others said typical romance mediocre underw...,0,wasnt drawn in by the characters as others sai...
4,60541,incorrect diagrams no scripts on web site,this is bad news lots of info left out pics in...,0,incorrect diagrams no scripts on web site thi...


In [5]:
test_df['text'] = test_df['title'] + " " + test_df['description']
test_df.head()

Unnamed: 0.1,Unnamed: 0,title,description,label,text
0,0,great cd,my lovely pat has one of the great voices of h...,1,great cd my lovely pat has one of the great vo...
1,1,one of the best game music soundtracks for a ...,despite the fact that i have only played a sma...,1,one of the best game music soundtracks for a ...
2,2,batteries died within a year,i bought this charger in jul 2003 and it worke...,0,batteries died within a year i bought this cha...
3,3,works fine but maha energy is better,check out maha energys website their powerex m...,1,works fine but maha energy is better check out...
4,4,great for the nonaudiophile,reviewed quite a bit of the combo players and ...,1,great for the nonaudiophile reviewed quite a b...


In [6]:
train_df.drop(["Unnamed: 0"], axis=1, inplace=True)
train_df.drop(['title', 'description'], axis=1, inplace=True)

In [7]:
test_df.drop(["Unnamed: 0"], axis=1, inplace=True)
test_df.drop(['title', 'description'], axis=1, inplace=True)

In [8]:
train_df = train_df.dropna()
test_df = test_df.dropna()

In [9]:
train_df = train_df.sample(frac=1)

In [10]:
X, y = train_df.drop(["label"], axis=1), train_df["label"]

In [11]:
X_test, y_test = test_df.drop(["label"], axis=1), test_df["label"]

In [12]:
X.head()

Unnamed: 0,text
2894765,a powerful book chuck colsons life is one of h...
1246880,warning warranty does not cover physical defec...
3431345,bible study i have a hard copy of the bible an...
2278068,what i needed not sure why this says chromed t...
273436,percolator the first time i tried it it leaked...


In [13]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train.shape

(2890438, 1)

In [15]:
y_train.shape

(2890438,)

In [16]:
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()
y_test = y_test.to_numpy()


In [17]:
MAX_FEATURES = 200000
MAX_LENGTH = 256

In [18]:
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(X_train["text"])

In [19]:
X_train = tokenizer.texts_to_sequences(X_train['text'])
X_val = tokenizer.texts_to_sequences(X_val['text'])
X_test = tokenizer.texts_to_sequences(X_test['text'])

In [20]:
X_train = pad_sequences(X_train, maxlen=MAX_LENGTH, padding='post', truncating='post')
X_val = pad_sequences(X_val, maxlen=MAX_LENGTH, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=MAX_LENGTH, padding='post', truncating='post')

In [21]:
X_train[250]

array([2077, 6076,  259,    8,   19,   13,   64,   28,   34,    4,  683,
         51,   81,   15,   58,   50,   26,  464,   10,    1,   19,   27,
       2845,  873,   48,   79,    7,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [22]:
checkpoint_filepath = '/kaggle/working/best_model.keras'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,   
    monitor='val_loss',             
    save_best_only=True,            
    mode='min',                     
    verbose=1                       
)


In [23]:
model = Sequential([
    Input(shape=(MAX_LENGTH,)),
    Embedding(MAX_FEATURES, 128),
    GRU(128, return_sequences=True, recurrent_activation='sigmoid'),
    GRU(128, recurrent_activation='sigmoid'),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [26]:
model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

In [27]:
model.fit(X_train, y_train, batch_size=128, epochs = 5, validation_data=(X_val, y_val), callbacks=[checkpoint])

Epoch 1/5
[1m22582/22582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.5005 - loss: 0.6932
Epoch 1: val_loss improved from inf to 0.69315, saving model to /kaggle/working/best_model.keras
[1m22582/22582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1101s[0m 49ms/step - accuracy: 0.5005 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 2/5
[1m22581/22582[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 45ms/step - accuracy: 0.5000 - loss: 0.6932
Epoch 2: val_loss did not improve from 0.69315
[1m22582/22582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1091s[0m 48ms/step - accuracy: 0.5000 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 3/5
[1m22581/22582[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 45ms/step - accuracy: 0.6880 - loss: 0.4753
Epoch 3: val_loss improved from 0.69315 to 0.14648, saving model to /kaggle/working/best_model.keras
[1m22582/22582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10

<keras.src.callbacks.history.History at 0x7fa064983160>

In [28]:
model.evaluate(X_test, y_test)

[1m12491/12491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 13ms/step - accuracy: 0.9499 - loss: 0.1409


[0.13810664415359497, 0.951320230960846]

In [54]:
def predict_sentence(sentence):
    sentence = preprocess(sentence)
    sentence = [sentence]
    tokenized_sentence = tokenizer.texts_to_sequences(sentence)
    padded_sentence = pad_sequences(tokenized_sentence, maxlen=MAX_LENGTH, padding='post', truncating='post')
    pred = model.predict(padded_sentence)[0][0] 
    if (pred > 0.5):
        print("Positive")
    else:
        print("Negative")

In [130]:
predict_sentence("artifical intelligence")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Negative
