In [3]:
import pandas as pd
import numpy as np
import os


In [4]:
tweets = pd.read_csv("data/translated_tweets.csv.gz")
tweets['SentimentText'] = tweets['SentimentText'].str.lower().str.replace(r'[^\w\s]', '')

In [5]:
tweets.drop("SentimentSource", axis="columns", inplace=True)
tweets.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,je to tak smutné pre môjho apl priateľa..........
1,2,0,chýbal mi nový trailer...
2,3,1,omg je už 7:30 :o
3,4,0,.. omgaga. im sooo im gunna cry. som bol u toh...
4,5,0,"myslím si, že mi bf podvádza na mňa!!!"


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(tweets['SentimentText'].values)
X = tokenizer.texts_to_sequences(tweets['SentimentText'].values)
X = pad_sequences(X)

In [9]:
Y = tweets['Sentiment'].values

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [11]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.models import Sequential
embed_dim = 128
lstm_out = 196
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [22]:
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
batch_size = 32
epochs = 5

early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)
with tf.device('/GPU:0'):
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, Y_test), callbacks=[early_stopping])
# model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, Y_test), callbacks=[early_stopping])

Epoch 1/5
[1m28453/28453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2996s[0m 105ms/step - accuracy: 0.7676 - loss: 0.4813 - val_accuracy: 0.7797 - val_loss: 0.4604
Epoch 2/5
[1m28453/28453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2972s[0m 104ms/step - accuracy: 0.7825 - loss: 0.4571 - val_accuracy: 0.7839 - val_loss: 0.4546
Epoch 3/5
[1m28453/28453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2966s[0m 104ms/step - accuracy: 0.7875 - loss: 0.4493 - val_accuracy: 0.7841 - val_loss: 0.4526
Epoch 4/5
[1m28453/28453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4432s[0m 156ms/step - accuracy: 0.7894 - loss: 0.4454 - val_accuracy: 0.7853 - val_loss: 0.4531
Epoch 5/5
[1m28453/28453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4365s[0m 153ms/step - accuracy: 0.7917 - loss: 0.4424 - val_accuracy: 0.7866 - val_loss: 0.4525
Restoring model weights from the end of the best epoch: 5.


In [24]:
model.save("trained_lstm.keras")

In [25]:
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print("Test score:", score)
print("Test accuracy:", acc)

7114/7114 - 372s - 52ms/step - accuracy: 0.7866 - loss: 0.4525
Test score: 0.4525470733642578
Test accuracy: 0.7865970134735107


In [49]:
def predict_sentiment(text):
    text = text.lower().replace(r'[^\w\s]', '')
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=X.shape[1])
    pred = model.predict(padded)
    return 'Positive' if np.argmax(pred) == 1 else 'Negative'

In [50]:
texts = [
    "Akokoľvek sa na to pozriem tak to nevyzerá dobre.",
    "Tento deň je skvelý",
    "Neviem či som správne pochopil.",
    "To si vážne urobila?!",
    "To myslíš vážne?",
    "Ja som veľmi rád, že si sa to konečne naučila.",
    "Som pekný.",
    "Zajtra si kúpim pivo.",
    "Tak teraz neviem čo ďalej.",
    "Prehrali sme ale vidím na tom aj pozitívum.",
    "Tak som rozmýšlal, že to asi aj výjde."
]
for i in texts:
    print(predict_sentiment(i))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Positive


In [45]:
import pickle 
with open("tweets_lstm_tokenizer.pickle", "wb") as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
os.getcwd()

'/mnt/c/Users/Erik/Projects/nlp_sentiment_analysis'

In [46]:
X.shape[1]

255