In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Embedding, Dense
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [5]:
# Preprocessing: lowercase, stopwords, lemmatization
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

# Load data
data = pd.read_csv('train.csv')  
# remove emptyrecords
data = data.dropna()
# ===============================================REMOVE THIS LINE FOR FULL DATASET==========================================
data = data[:1000] 
#===========================================================================================================================


data['Review'] = data['Review'].apply(preprocess_text)

X = data['Review']
y = data['overall'].values - 1

embedding_dim = 100
max_len = 100

# tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_len, padding='post')

vocab_size = len(tokenizer.word_index) + 1


#smote to oversample records that are not well-represented
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_padded, y)
print(len(X))
print(len(y))

# Bi-LSTM 
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    Bidirectional(LSTM(64)),
    Dense(5, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

epochs = 10
batch_size = 64
# 10-Fold CV
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracies = []
for train_idx, test_idx in kfold.split(X_resampled, y_resampled):
    X_train, X_test = X_resampled[train_idx], X_resampled[test_idx]
    y_train, y_test = y_resampled[train_idx], y_resampled[test_idx]

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

    _, accuracy = model.evaluate(X_test, y_test, verbose=0)
    accuracies.append(accuracy)
mean_accuracy = np.mean(accuracies)
print("Mean Accuracy with 10-Fold Cross-Validation:", mean_accuracy)


1000
1000
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Mean Accuracy with 10-Fold Cross-Validation: 0.842098492383957


In [6]:
test_data = pd.read_csv('test.csv') 
test_data = test_data.dropna()
# ===============================================REMOVE THIS LINE FOR FULL DATASET==========================================
test_data = test_data[:500]
#===========================================================================================================================

test_data['Review'] = test_data['Review'].apply(preprocess_text)

test_sequences = tokenizer.texts_to_sequences(test_data['Review'])
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

test_predictions = model.predict(test_padded)
predicted_labels = np.argmax(test_predictions, axis=1)

submission_df = pd.DataFrame({'Review': test_data['Review'], 'Predicted': predicted_labels + 1}) 

submission_df.to_csv('smote_lstm.csv', index=False)

