In [45]:
import pandas as pd

In [46]:
data = pd.read_csv(r"/content/tripadvisor_hotel_reviews.csv")

In [47]:
data

Unnamed: 0,Review,Rating,Label
0,nice hotel expensive parking got good deal sta...,4,Positive
1,ok nothing special charge diamond member hilto...,2,Negative
2,nice rooms not 4* experience hotel monaco seat...,3,Negative
3,"unique, great stay, wonderful time hotel monac...",5,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive
...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5,Positive
20487,great location price view hotel great quick pl...,4,Positive
20488,"ok just looks nice modern outside, desk staff ...",2,Negative
20489,hotel theft ruined vacation hotel opened sept ...,1,Negative


In [48]:
X = data["Review"].astype(str)

In [49]:
import re

In [50]:
def preprocess(text):
  # Lowercasing
  lower_text = text.lower()
  lower_text = re.sub("\W"," ",lower_text)
  lower_text = re.sub("\s+"," ",lower_text)
  return lower_text

In [51]:
X_clean = X.apply(preprocess)

In [52]:
X_clean

0        nice hotel expensive parking got good deal sta...
1        ok nothing special charge diamond member hilto...
2        nice rooms not 4 experience hotel monaco seatt...
3        unique great stay wonderful time hotel monaco ...
4        great stay great stay went seahawk game awesom...
                               ...                        
20486    best kept secret 3rd time staying charm not 5 ...
20487    great location price view hotel great quick pl...
20488    ok just looks nice modern outside desk staff n...
20489    hotel theft ruined vacation hotel opened sept ...
20490    people talking ca n t believe excellent rating...
Name: Review, Length: 20491, dtype: object

In [53]:
# Stemmatization :

from nltk.stem import PorterStemmer

In [54]:
stemmer = PorterStemmer()

In [55]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [56]:
def Stem_Text(text):
   words = nltk.word_tokenize(text)
   stemmed_words = [stemmer.stem(word) for word in words]
   return " ".join(stemmed_words)

In [57]:
X_clean_Stem = X_clean.apply(Stem_Text)

In [58]:
X_clean_Stem

0        nice hotel expens park got good deal stay hote...
1        ok noth special charg diamond member hilton de...
2        nice room not 4 experi hotel monaco seattl goo...
3        uniqu great stay wonder time hotel monaco loca...
4        great stay great stay went seahawk game awesom...
                               ...                        
20486    best kept secret 3rd time stay charm not 5 sta...
20487    great locat price view hotel great quick place...
20488    ok just look nice modern outsid desk staff n t...
20489    hotel theft ruin vacat hotel open sept 17 2007...
20490    peopl talk ca n t believ excel rate hotel just...
Name: Review, Length: 20491, dtype: object

In [59]:
# Tokenization

from keras.preprocessing.text import Tokenizer

In [60]:
tokenize = Tokenizer(num_words=10000)

In [61]:
tokenize.fit_on_texts(X_clean_Stem)

In [62]:
data_sequences = tokenize.texts_to_sequences(X_clean_Stem)

In [63]:
from keras.preprocessing.sequence import pad_sequences

In [64]:
padded_sequences = pad_sequences(data_sequences,maxlen=100,padding="post",truncating="post")

In [65]:
from sklearn.preprocessing import LabelEncoder

In [66]:
le = LabelEncoder()

In [67]:
y = le.fit_transform(data["Label"])

In [68]:
y

array([1, 0, 0, ..., 0, 0, 0])

In [69]:
# Build the Architecture of RNN


from keras.models import Sequential

from keras.layers import Embedding,SimpleRNN,Dense

In [71]:
model=Sequential()
model.add(Embedding(10000,32,input_length=100))
model.add(SimpleRNN(100))
model.add(Dense(units=1,activation="sigmoid"))

In [72]:
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=["accuracy"])

In [73]:
model.fit(padded_sequences,y,epochs=5,validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7947b15e7b80>

In [77]:
# Testing the model

sentence = ['The staff was very rude and misbehaved also. Worst hotel to stay.']

sen = tokenize.texts_to_sequences(sentence)

padded_sen = pad_sequences(sen,maxlen=100,padding="post",truncating="post")

model.predict(padded_sen)



array([[0.86777085]], dtype=float32)

In [78]:
# Testing the model

sentence = ['The hotel was very bad.']

sen = tokenize.texts_to_sequences(sentence)

padded_sen = pad_sequences(sen,maxlen=100,padding="post",truncating="post")

model.predict(padded_sen)



array([[0.86776745]], dtype=float32)

In [82]:
le.inverse_transform([0])

array(['Negative'], dtype=object)