In [30]:
import pandas as pd
import numpy as np

np.random.seed(1337)

data = pd.read_csv('./clean_dataset.csv')
data = data.iloc[np.random.permutation(len(data))]
print(data['text'].str.len().describe())

CORPUS_MAX_SIZE = 1024
data['text'] = data['text'].str.slice(0, CORPUS_MAX_SIZE)

count    50000.000000
mean       858.389040
std        658.428061
min         22.000000
25%        452.000000
50%        633.000000
75%       1044.000000
max       9434.000000
Name: text, dtype: float64


In [31]:
train_size = 0.8
validation_size = 0.5

split_id = int(len(data) * train_size)

temp_train_x, test_x = data.text[:split_id], data.text[split_id:]
temp_train_y, test_y = data.label[:split_id], data.label[split_id:]

In [32]:
from collections import Counter
from tqdm import tqdm
tqdm.pandas()

words = temp_train_x.str.cat(sep=' ').split()

# build vocabulary
frequency_counter = Counter(words)
# sort words by the frequency they appear in the text
vocab = sorted(frequency_counter, key=frequency_counter.get, reverse=True)

# associate a number to each word in the list in ascending order
# in this way the most frequent words have lower numbers
int2word = dict(enumerate(vocab[:5000], 2))
int2word[0] = '<PAD>'
int2word[1] = '<UNK>'
word2int = {word: id for id, word in int2word.items()}
# encode words
reviews_enc = [[word2int.get(word, 1) for word in review.split()] for review in tqdm(temp_train_x.values)]

features = np.zeros((len(reviews_enc), CORPUS_MAX_SIZE), dtype=int)

for i, row in enumerate(reviews_enc):
  index = CORPUS_MAX_SIZE - len(row)
  features[i, index:] = np.array(row)[:CORPUS_MAX_SIZE]

# make val and test set
split_val_id = int(len(temp_train_x) * validation_size)
train_x, val_x = features[:split_val_id], features[split_val_id:]
train_y, val_y = temp_train_y[:split_val_id], temp_train_y[split_val_id:]

100%|██████████| 40000/40000 [00:00<00:00, 52706.05it/s]


now i have a train, validation and test set

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Parameters
vocab_size = 10000  # Adjust to your vocabulary size
embedding_dim = 128  # Dimension of the word embeddings
sequence_length = 1024  # Length of the input sequences
lstm_units = 128  # Number of LSTM units
batch_size = 128
epochs = 4

X = train_x
y = train_y

# Build the LSTM model
model = Sequential([
  Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length),
  Bidirectional(LSTM(lstm_units, return_sequences=True)),  # Bidirectional LSTM
  LSTM(lstm_units, return_sequences=False),  # Unidirectional LSTM
  Dense(64, activation='relu'),  # Dense layer with ReLU activation
  Dropout(0.2),  # Dropout layer
  Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy', 'F1Score'])




In [34]:
print("Training...")
model.fit(X, y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Save the model to disk
model.save("lstm_binary_classifier.keras")
print("Model saved to lstm_binary_classifier.h5")

Training...
Epoch 1/4
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m575s[0m 5s/step - F1Score: 0.6634 - accuracy: 0.6579 - loss: 0.5906 - val_F1Score: 0.6722 - val_accuracy: 0.8443 - val_loss: 0.3730
Epoch 2/4
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m572s[0m 5s/step - F1Score: 0.6666 - accuracy: 0.8910 - loss: 0.2755 - val_F1Score: 0.6722 - val_accuracy: 0.8530 - val_loss: 0.3468
Epoch 3/4
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m562s[0m 4s/step - F1Score: 0.6666 - accuracy: 0.9166 - loss: 0.2164 - val_F1Score: 0.6722 - val_accuracy: 0.8572 - val_loss: 0.3872
Epoch 4/4
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m574s[0m 5s/step - F1Score: 0.6671 - accuracy: 0.9440 - loss: 0.1559 - val_F1Score: 0.6722 - val_accuracy: 0.8342 - val_loss: 0.3901
Model saved to lstm_binary_classifier.h5


In [35]:
# Evaluate the model
loss, accuracy = model.evaluate(val_x, val_y, verbose=1)
print(f"Final Loss: {loss:.4f}, Final Accuracy: {accuracy:.4f}")

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m340s[0m 544ms/step - F1Score: 0.6609 - accuracy: 0.8391 - loss: 0.3837


ValueError: too many values to unpack (expected 2)