In [2]:
import pandas as pd

In [10]:
# Load the review dataset first, then summarise it. Calling data.info()
# ahead of the read relies on `data` left over from an earlier kernel session
# and raises NameError when the notebook is run top to bottom.
data = pd.read_csv("data/gpt_3.5_reviews.csv")
data.info()

In [12]:
# Normalise the review text and the labels. Written as one chained expression
# that rebinds `data`, so re-running the cell yields the same frame and no
# assignment is made onto a filtered slice.
data = (
    data
    # Rows without review text carry nothing to learn from; if kept, a later
    # astype(str) cast would turn them into the literal text "nan".
    .dropna(subset=["review_text"])
    .assign(
        # Lower-case and remove every character that is neither a word
        # character nor whitespace.
        review_text=lambda df: (
            df["review_text"]
            .astype(str)
            .str.lower()
            .str.replace(r"[^\w\s]", "", regex=True)
        )
    )
    # Drop reviews that are empty or whitespace-only after cleaning.
    .loc[lambda df: df["review_text"].str.strip() != ""]
    # assign() replaces the column, so the int8 dtype actually takes effect;
    # `data.loc[:, col] = ...` writes into the existing column and can keep
    # its previous dtype.
    .assign(sentiment=lambda df: df["sentiment"].astype("int8"))
    .reset_index(drop=True)
)

In [14]:
# Cast the review_text column to str so the tokenizer below is fed strings.
data = data.astype({"review_text": str})

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Vocabulary cap handed to the tokenizer (num_words) and reused as the
# Embedding input dimension when the model is built.
max_features = 5000

# Fit the tokenizer on every review, then encode the reviews as integer
# sequences and pad them into the 2-D array X.
tokenizer = Tokenizer(split=' ', num_words=max_features)
tokenizer.fit_on_texts(data['review_text'].values)
X = pad_sequences(tokenizer.texts_to_sequences(data['review_text'].values))

In [16]:
# Label vector taken from the sentiment column, in the same row order as X.
Y = data["sentiment"].to_numpy()

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [18]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.models import Sequential
# Layer sizes for the embedding vectors and the LSTM state.
embed_dim = 128
lstm_out = 196

# Embedding -> spatial dropout -> single LSTM -> 2-unit softmax classifier.
model = Sequential([
    Embedding(max_features, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])

# Sparse categorical cross-entropy: labels are passed as integer class ids
# rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
batch_size = 32
epochs = 5

# Stop once val_loss has not improved for 3 epochs and roll back to the best
# weights seen so far.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# Validate on a slice of the training data instead of (X_test, Y_test):
# early stopping picks the weights by validation loss, so validating on the
# test split would make the test metrics reported afterwards optimistic.
# Keras takes the validation slice from the end of the arrays it is given.
with tf.device('/CPU:0'):
    model.fit(
        X_train,
        Y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=[early_stopping],
    )

In [22]:
import numpy as np
def predict_sentiment(text):
    """Classify one piece of text as 'Positive' or 'Negative'.

    The text is lower-cased and stripped of every character that is neither a
    word character nor whitespace (the same normalisation applied to the
    training reviews), encoded with the fitted ``tokenizer``, padded to the
    training sequence length (``X.shape[1]``) and passed to ``model``.

    Args:
        text: Raw input string.

    Returns:
        'Positive' when the highest-scoring class index is 1, otherwise
        'Negative'.
    """
    import re  # local import keeps this cell self-contained

    # str.replace() treats its first argument as a literal substring, so a
    # regex pattern passed to it never matches; re.sub applies the pattern.
    text = re.sub(r'[^\w\s]', '', text.lower())
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=X.shape[1])
    pred = model.predict(padded)
    return 'Positive' if np.argmax(pred) == 1 else 'Negative'

In [23]:
# Hand-written sample sentences used as a quick manual check of the classifier.
texts = [
    "Akokoľvek sa na to pozriem tak to nevyzerá dobre.",
    "Tento deň je skvelý",
    "Neviem či som správne pochopil.",
    "To si vážne urobila?!",
    "To myslíš vážne?",
    "Ja som veľmi rád, že si sa to konečne naučila.",
    "Som pekný.",
    "Zajtra si kúpim pivo.",
    "Tak teraz neviem čo ďalej.",
    "Prehrali sme ale vidím na tom aj pozitívum.",
    "Tak som rozmýšlal, že to asi aj výjde.",
]

# Echo each sentence, then the label predicted for it.
for sentence in texts:
    print(sentence)
    label = predict_sentiment(sentence)
    print(label)

In [23]:
# Evaluate on the test split; the two returned values are unpacked as the
# loss ("score") and the accuracy metric the model was compiled with.
score, acc = model.evaluate(
    X_test,
    Y_test,
    batch_size=batch_size,
    verbose=2,
)
print(f"Test score: {score}")
print(f"Test accuracy: {acc}")

In [24]:
# Write the trained model to "lstm_gpt.keras" in the working directory.
model.save("lstm_gpt.keras")

In [25]:
import pickle
# Persist the fitted tokenizer with the highest pickle protocol, so the same
# word-to-index mapping can be loaded back later.
with open("lstm_gpt_tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
# Display the second dimension of X: the padded sequence length, which
# predict_sentiment passes to pad_sequences as maxlen.
X.shape[1]