# NLP u ugostiteljskoj industriji

## Potrebni moduli

In [40]:
import pandas as pd
import numpy as np

import re

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, GRU, Dense, Dropout

## Učitavanje i pretprocesiranje podataka

In [41]:
df = pd.read_csv('https://drive.google.com/uc?export=download&id=1Eb4KOL-0jIr-DXu0eqaIzKQWrO8d0Vzo')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [42]:
# Pretprocesiranje teksta
df['Review'] = df['Review'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x).lower())

In [43]:
# Kategoriziranje ocjena
def label_rating(rating):
    if rating <= 2:
        return 'Negative'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Positive'

df['Label'] = df['Rating'].apply(label_rating)

In [44]:
# Tokenizacija
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['Review'])
sequences = tokenizer.texts_to_sequences(df['Review'])

In [45]:
# Padding
max_len = max([len(x) for x in sequences])
X = pad_sequences(sequences, maxlen=max_len)

In [46]:
# One-hot kodiranje oznaka
labels = pd.get_dummies(df['Label']).values

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)

// TODO
- reduce vocab size
- reduce epochs
- try out single layer RNN
- try out GRU
- implement GRU with GloVe/Word2Vec

In [48]:
model1 = Sequential()
model1.add(Embedding(4000, 128, input_length=max_len))  # 3000 je veličina vokabulara, 128 je dimenzija embeddinga
model1.add(SimpleRNN(32))  # jedinice u RNN sloju
model1.add(Dropout(0.5))   # Dropout za smanjenje preprilagođenosti
model1.add(Dense(3, activation='softmax'))  # Izlazni sloj s tri klase

model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=2)

history = model1.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_val, y_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [49]:
model1.evaluate(X_test, y_test)



[0.5030768513679504, 0.824591338634491]

In [50]:
# Definiranje modela
model2 = Sequential()
model2.add(Embedding(5000, 128, input_length=max_len))  # 5000 je veličina vokabulara, 128 je dimenzija embeddinga
model2.add(GRU(64))  # 64 jedinice u GRU sloju
model2.add(Dropout(0.5))  # Dropout za smanjenje preprilagođenosti
model2.add(Dense(3, activation='softmax'))  # 3 izlazne jedinice za tri klase sentimenta

# Kompilacija modela
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Postavljanje rano zaustavljanja
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Treniranje modela
history = model2.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_val, y_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


---

In [18]:
model = Sequential()
model.add(Embedding(5000, 128, input_length=max_len))
model.add(SimpleRNN(32, return_sequences=True))
model.add(Dropout(0.6))
model.add(SimpleRNN(32))
model.add(Dropout(0.6))
model.add(Dense(3, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=2)

history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_val, y_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [39]:
model.evaluate(X_test, y_test)



[0.5174793601036072, 0.8216637969017029]