In [2]:
import numpy as np 
import pandas as pd

In [5]:
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, Bidirectional, Dropout
from sklearn.model_selection import train_test_split

import gensim

In [3]:
train_df = pd.read_csv('preprocessed_train.zip')
test_df = pd.read_csv('test.csv')

In [8]:
# Drop rows with null values
train_df = train_df.dropna()

In [4]:
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

def early_stopping(monitor='val_loss', min_delta=0, patience=5, mode='auto'):
    return EarlyStopping(monitor=monitor, min_delta=min_delta, patience=patience, mode=mode)

# Function to create step decay learning rate scheduler
def step_decay(initial_lr=0.001, drop_factor=0.5, epochs_drop=5):
    def scheduler(epoch):
        return initial_lr * np.power(drop_factor, np.floor((1 + epoch) / epochs_drop))
    return scheduler

In [11]:
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder

# Split the data into features and target labels
X = train_df['lemprocessing_text']
y = train_df['overall']

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Padding sequences
max_len = 100  # Define your maximum sequence length
X_padded = pad_sequences(X_sequences, maxlen=max_len)


vocab_size = len(tokenizer.word_index) + 1 
embedding_size = 100  

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=X, vector_size=embedding_size, window=5, min_count=1, workers=4)

# Convert words to Word2Vec embeddings
word_index = tokenizer.word_index
embedding_matrix = np.zeros((vocab_size, embedding_size))
# Convert words to Word2Vec embeddings
embedding_matrix = np.zeros((vocab_size, embedding_size))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]


# Convert target labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Define BiLSTM model with Word2Vec embeddings
def BiLSTM_model_with_Word2Vec_lem(input_length, vocab_size, embedding_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=input_length, weights=[embedding_matrix], trainable=False))
    model.add(Bidirectional(LSTM(150)))
    model.add(Dropout(0.3))
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Early stopping and decay callbacks
early_stop_callback = early_stopping(patience=5)
decay = LearningRateScheduler(step_decay(initial_lr=0.001, drop_factor=0.5, epochs_drop=5))
callbacks_list = [early_stop_callback, decay]

# Create and compile the LSTM model with Word2Vec embeddings
bilstm_model_with_word2vec_lem = BiLSTM_model_with_Word2Vec_lem(max_len, vocab_size, embedding_size, embedding_matrix)

# Train the model
bilstm_model_with_word2vec_lem.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), callbacks=callbacks_list)

# Evaluate the model
loss, accuracy = bilstm_model_with_word2vec_lem.evaluate(X_test, y_test)
print("Test Accuracy with Word2Vec embeddings:", accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy with Word2Vec embeddings: 0.7633305191993713


In [12]:
test_df['Review'] = test_df['Review'].fillna('')
X_test = test_df['Review'] 
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len)

# Predict ratings
predicted_labels = bilstm_model_with_word2vec_lem.predict(X_test_padded)
predicted_ratings = predicted_labels.argmax(axis=1)

# Decode numerical labels back to original categories
predicted_sentiments = label_encoder.inverse_transform(predicted_ratings)

# Combine predictions with IDs
predictions_df = pd.DataFrame({'id': test_df['id'], 'overall': predicted_sentiments})

# Save predictions
predictions_df.to_csv('bilstm_lem.csv', index=False)

# Combine predictions with IDs
tm_df = pd.DataFrame({'id': test_df['id'], 'Review': test_df['Review'], 'overall': predicted_sentiments})

# Save predictions
tm_df.to_csv('bilstm_tm_lem.csv', index=False)



