In [71]:
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [72]:
import re
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A) 
    text = text.lower() 
    return text

In [73]:
data = pd.read_csv('data.csv')
texts = data['text']
labels = data['label']

data.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [74]:
texts = texts.apply(clean_text)
texts.head()

0    i always wrote this series off as being a comp...
1    st watched    out of dirsteve purcell typical ...
2    this movie was so poorly written and directed ...
3    the most interesting thing about miryang secre...
4    when i first read about berlin am meer i didnt...
Name: text, dtype: object

In [75]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

Found 50503 unique tokens.


In [76]:
data = pad_sequences(sequences, maxlen=100)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [78]:
model = Sequential()
model.add(Embedding(input_dim=len(word_index), output_dim=128))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])          

In [79]:
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 35ms/step - accuracy: 0.5738 - loss: 0.6559 - val_accuracy: 0.7810 - val_loss: 0.4469
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.8844 - loss: 0.2847 - val_accuracy: 0.7840 - val_loss: 0.5112
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.9524 - loss: 0.1639 - val_accuracy: 0.7860 - val_loss: 0.5251
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.9807 - loss: 0.0682 - val_accuracy: 0.8010 - val_loss: 0.5924
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.9885 - loss: 0.0447 - val_accuracy: 0.7900 - val_loss: 0.6881
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.9889 - loss: 0.0393 - val_accuracy: 0.7640 - val_loss: 0.6481
Epoch 7/10
[1m125/125

<keras.src.callbacks.history.History at 0x24eb6c665a0>

In [80]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

print(f'Accuracy: {accuracy:.2f}%')

Accuracy: 0.79%


In [83]:
test_text = ['I fell asleep through Dune 1, it was so boring']
test_text = pd.Series(test_text).apply(clean_text)

sequences = tokenizer.texts_to_sequences(test_text)
data = pad_sequences(sequences, maxlen=100)

predictions = model.predict(data)
print(predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[[0.0001103]]
