In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
import string
import pickle

# Function to clean text
def clean_text(text):
    text = text.lower()  # Lowercase the text
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    text = ''.join([char for char in text if not char.isdigit()])  # Remove digits
    text = ' '.join(text.split())  # Remove extra whitespace
    return text

# Read the data
reviews_df = pd.read_csv("Hotel_Reviews.csv")

# Create the review text and label
reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df["Positive_Review"]
reviews_df["sentiment"] = np.where(reviews_df["Reviewer_Score"] < 5, 1, 0)  # 1 for negative, 0 for positive

# Clean the text data
reviews_df["review_clean"] = reviews_df["review"].apply(clean_text)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(reviews_df["review_clean"], reviews_df["sentiment"], test_size=0.2, random_state=42)

# Tokenize the text data
max_features = 10000  # Maximum number of words to keep based on frequency
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences
maxlen = 100  # Maximum length of sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# Define the model
embedding_dim = 100
model = Sequential([
    Embedding(input_dim=max_features, output_dim=embedding_dim),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
epochs = 10
batch_size = 64
history = model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Save the model
model.save("sentiment_model.h5")

# Save the tokenizer
with open("tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


Epoch 1/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m712s[0m 110ms/step - accuracy: 0.9571 - loss: 0.1600 - val_accuracy: 0.9635 - val_loss: 0.1013
Epoch 2/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m473s[0m 73ms/step - accuracy: 0.9649 - loss: 0.0961 - val_accuracy: 0.9647 - val_loss: 0.0969
Epoch 3/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m489s[0m 76ms/step - accuracy: 0.9671 - loss: 0.0897 - val_accuracy: 0.9646 - val_loss: 0.0972
Epoch 4/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m446s[0m 69ms/step - accuracy: 0.9686 - loss: 0.0848 - val_accuracy: 0.9643 - val_loss: 0.0990
Epoch 5/10
[1m6447/6447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m476s[0m 74ms/step - accuracy: 0.9705 - loss: 0.0802 - val_accuracy: 0.9635 - val_loss: 0.1025
[1m3224/3224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 11ms/step - accuracy: 0.9643 - loss: 0.0966




Test Loss: 0.0969063937664032
Test Accuracy: 0.9646721482276917


In [4]:
print(reviews_df.head(10))

                                       Hotel_Address  \
0   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
1   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
2   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
3   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
4   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
5   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
6   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
7   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
8   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
9   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   

   Additional_Number_of_Scoring Review_Date  Average_Score   Hotel_Name  \
0                           194    8/3/2017            7.7  Hotel Arena   
1                           194    8/3/2017            7.7  Hotel Arena   
2                           194   7/31/2017            7.7  Hotel Arena   
3                           194   7/31/2017            7.7  Hotel Arena   
4       

In [5]:
# Input review
input_review = "This hotel was fantastic! The staff were incredibly friendly and helpful. The room was worst and ucomfortable."

# Clean the input review
cleaned_review = clean_text(input_review)

# Tokenize and pad the cleaned review text
review_seq = tokenizer.texts_to_sequences([cleaned_review])
review_pad = pad_sequences(review_seq, maxlen=maxlen, padding='post')

# Predict sentiment
prediction = model.predict(review_pad)[0][0]

# Output result
if prediction >= 0.5:
    print("The review is predicted to be negative.")
else:
    print("The review is predicted to be positive.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step
The review is predicted to be positive.
