In [2]:
# Install imbalanced-learn (the package that provides the `imblearn` module).
# The PyPI project named `imblearn` is only a small stub that depends on
# `imbalanced-learn`, so install the real package directly and pin the version
# that was resolved when this notebook was last run. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install -q imbalanced-learn==0.12.0

Collecting imblearn
  Obtaining dependency information for imblearn from https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl.metadata
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/92/e8/86c36e1b13007ca9c89381adac6c078cfc8fb71841a76c08a3fe3eca91d3/imbalanced_learn-0.12.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.12.0-py3-none-any.whl.metadata (8.2 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.0-py3-none-any.whl (257 kB)
   ---------------------------------------- 0.0/257.7 kB ? eta -:--:--
   - -------------------------------------- 10.2/257.7 kB ? eta -:--:--
   - -------------------------------------- 10.2/257.7 kB ? eta -:--:--
   --------- ----------------------------

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Embedding, Dense
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [4]:
# Preprocessing: lowercase, stopword removal, lemmatization

# Lazily filled cache for the NLTK resources, so they are built once per
# kernel session instead of once per review.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both objects before storing them so a failure cannot leave
        # the cache half-populated.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise one review for tokenization.

    The text is lowercased, split with NLTK's ``word_tokenize``, stripped of
    English stopwords, and each remaining token is passed through
    ``WordNetLemmatizer.lemmatize``.

    Parameters
    ----------
    text : str or None
        Raw review text. ``None`` and float NaN (pandas' usual marker for a
        missing cell) are treated as an empty review; any other non-string
        value is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces, or ``""``
        when the input is missing or contains only whitespace.
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    if not text.strip():
        # Nothing to tokenize; skip the NLTK calls entirely.
        return ""

    stop_words, lemmatizer = _preprocess_resources()
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

In [6]:
from tensorflow.keras.models import load_model

# Restore the previously saved network from its HDF5 file in the working
# directory; nothing is trained in this notebook as shown.
model = load_model(filepath='bilstm_model.h5')

In [7]:
# Hyperparameters. Only `max_len` is used by the cells shown here; the others
# are kept because other cells may read them.
embedding_dim = 100
max_len = 100
epochs = 10
batch_size = 64

# Load the training set and remove empty records (any row with a missing value).
data = pd.read_csv('train.csv').dropna()

# Normalise the review text (lowercase, stopword removal, lemmatization).
data['Review'] = data['Review'].apply(preprocess_text)

X = data['Review']
# Shift the labels down by one; the prediction cell adds the one back before
# writing the submission. Presumably ratings start at 1 - TODO confirm.
y = data['overall'].values - 1

# Tokenization and padding.
# NOTE(review): the tokenizer is fitted here from scratch rather than loaded
# from disk. The word indices fed to `model` are only meaningful if this
# reproduces the vocabulary used when the saved model was trained - confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_len, padding='post')

# One more than the number of distinct words seen during fitting.
vocab_size = len(tokenizer.word_index) + 1

In [10]:
# Read the test set; missing cells become empty strings so every review can
# be passed through the text preprocessing.
test_data = pd.read_csv('test.csv').fillna('')
test_data['Review'] = test_data['Review'].apply(preprocess_text)

# Encode the reviews with the tokenizer fitted on the training reviews and
# pad them to the same length as the training sequences.
test_sequences = tokenizer.texts_to_sequences(test_data['Review'])
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# For each review, take the index of the highest model output.
test_predictions = model.predict(test_padded)
predicted_labels = np.argmax(test_predictions, axis=1)

# Add one to undo the `- 1` applied to the training labels.
submission_df = pd.DataFrame(
    {
        'id': test_data['id'],
        'overall': predicted_labels + 1,
    }
)



In [None]:
# Write the predictions to disk without the DataFrame index column.
submission_df.to_csv(path_or_buf='smote_lstm.csv', index=False)