In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping





In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df = train_df.dropna()

In [4]:
train_df = train_df[:1000]

In [5]:
train_df

Unnamed: 0,overall,Review
0,5,I love these glitter pens. They sparkle deligh...
1,5,It works well with my machine. I use mostly c...
2,5,"This is a great assortment of colors, though t..."
3,5,Just what I was looking for.
4,5,I make 400 birds for the hospital each month.
...,...,...
995,5,worked great
996,5,good assortment
997,1,work on the design did not make a Dresden pla...
998,5,As described


In [6]:
x_train = train_df['Review'].values
y_train = train_df['overall'].values -1

In [7]:
len(x_train)

1000

In [8]:
len(y_train)

1000

In [9]:
train_df['text_length'] = train_df['Review'].apply(len)
train_df.head()

Unnamed: 0,overall,Review,text_length
0,5,I love these glitter pens. They sparkle deligh...,181
1,5,It works well with my machine. I use mostly c...,57
2,5,"This is a great assortment of colors, though t...",318
3,5,Just what I was looking for.,28
4,5,I make 400 birds for the hospital each month.,45


In [10]:
X_train, X_test, y_train, y_test = train_test_split(train_df['Review'], y_train, test_size=0.2, random_state=42)


In [11]:
max_len = 100 
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # out of vocabulary token
vocab_size = 500
# tokenizer = Tokenizer(num_words=vocab_size, char_level=False, oov_token=oov_tok)
# tokenizer.fit_on_texts(X_train)
# word_index = tokenizer.word_index
# total_words = len(word_index)

# print(total_words)


In [12]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

word_index = tokenizer.word_index
total_words = len(word_index)

print(total_words)

print("Sample from X_train after tokenization and padding:")
print(X_train_padded[0])

print("\nSample from X_test after tokenization and padding:")
print(X_test_padded[0])


3439
Sample from X_train after tokenization and padding:
[  3 181 347 287  21 142 263  58   3   1   7   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]

Sample from X_test after tokenization and padding:
[444 170   1   4  88 249  99  99 170   1   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


In [13]:
vocab_size = 500 
embedding_dim = 32
model = Sequential()
model.add(Embedding(vocab_size, 100 , input_length=max_len))
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
model.add(Bidirectional(LSTM(units=64)))
model.add(Dense(units=5, activation='softmax'))


# sparse_categorical_crossentropy cause multi-class+no one hot encoding
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(patience=3, restore_best_weights=True, mode='min')





In [14]:
model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test), callbacks=[early_stopping])

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.src.callbacks.History at 0x17cc6bd2dd0>

In [15]:
loss, accuracy = model.evaluate(X_test_padded, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.7200000286102295


# Word2Vec + LSTM