In [6]:
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.models import Sequential
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

Preprocess the data

In [11]:
def preprocess_data(file_path):
    df = pd.read_csv(file_path, usecols=["Review", "Label"])
    df.rename(columns={"Label": "Rating"}, inplace=True)
    tokenizer = Tokenizer(num_words=20000, lower=True)
    tokenizer.fit_on_texts(df["Review"])
    X = tokenizer.texts_to_sequences(df["Review"].values)
    X = pad_sequences(X, maxlen=200)
    y = df["Rating"].values
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test, tokenizer

Build LSTM model

In [None]:
def build_lstm_model(input_dim, output_dim, input_length):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
    return model

Build Bidirectional LSTM

In [None]:
def build_bidirectional_lstm_model(input_dim, output_dim, input_length):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    model.add(SpatialDropout1D(0.2))
    model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
    return model

Build bert model

In [4]:
#build random forest model with GridSearchCV
def build_random_forest_model_with_gridsearch():
  param_grid = {
      'n_estimators': [25,50],
      'max_depth': [None, 10, 20],
      'min_samples_split': [2, 5],
      'min_samples_leaf': [1, 2, 4]
  }
  rf_grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error')
  return rf_grid

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def build_linear_regression_model_with_gridsearch():
    # Create a pipeline to include scaling
    pipeline = make_pipeline(
        StandardScaler(),  # Scale the features
        LinearRegression()
    )

    param_grid = {
        'linearregression__fit_intercept': [True, False],
        'linearregression__positive': [True, False]
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    return grid_search

In [24]:
#build xgboost model with Grid SearchCV
def build_xgboost_model_with_gridsearch():
  param_grid = {
      'n_estimators': [50, 75],
      'max_depth': [3, 5],
      'learning_rate': [0.1, 0.01]
  }
  xgb_grid = GridSearchCV(XGBRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error')
  return xgb_grid

In [None]:
# Train and evaluate model
def train_and_evaluate(model, X_train, y_train, X_test, y_test, batch_size=64, epochs=40):
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"Mean Squared Error: {mse}")
    return mse

In [8]:
#train and evaluate Linear Regression, RandomForest and Xgboost
def train_and_evaluate_with_gridsearch(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"Mean Squared Error: {mse}")
    return mse

In [2]:
def save_model_and_tokenizer(model, tokenizer, model_path, tokenizer_path):
    # Check if the model is a GridSearchCV object
    if hasattr(model, 'best_estimator_'):
        model = model.best_estimator_  # Extract the best model if it is

    # Save the model with .keras extension if it's a Keras model
    if hasattr(model, 'save'):
        model.save(model_path + ".keras")  # Add .keras extension here
    else:
        # Handle saving non-Keras models here, e.g., using joblib for scikit-learn models
        import joblib
        joblib.dump(model, model_path + ".pkl")

    # Save the tokenizer
    with open(tokenizer_path, 'wb') as file:
        pickle.dump(tokenizer, file)

In [13]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
#preprocess data
file_path = "/content/drive/MyDrive/reviews.csv"
X_train, X_test, y_train, y_test, tokenizer = preprocess_data(file_path)

In [None]:
# LSTM Model
lstm_model = build_lstm_model(input_dim=20000, output_dim=128, input_length=200)
print("Training LSTM Model")
lstm_mse = train_and_evaluate(lstm_model, X_train, y_train, X_test, y_test)


In [None]:
# Bidirectional LSTM Model
bi_lstm_model = build_bidirectional_lstm_model(input_dim=20000, output_dim=128, input_length=200)
print("Training Bidirectional LSTM Model")
bi_lstm_mse = train_and_evaluate(bi_lstm_model, X_train, y_train, X_test, y_test)


Training Bidirectional LSTM Model
Epoch 1/40
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 446ms/step - loss: 2.2702 - mean_squared_error: 2.2702 - val_loss: 1.4229 - val_mean_squared_error: 1.4229
Epoch 2/40
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 445ms/step - loss: 1.3442 - mean_squared_error: 1.3442 - val_loss: 1.2438 - val_mean_squared_error: 1.2438
Epoch 3/40
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 445ms/step - loss: 1.0864 - mean_squared_error: 1.0864 - val_loss: 1.2854 - val_mean_squared_error: 1.2854
Epoch 4/40
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 447ms/step - loss: 0.9335 - mean_squared_error: 0.9335 - val_loss: 1.2258 - val_mean_squared_error: 1.2258
Epoch 5/40
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 448ms/step - loss: 0.8231 - mean_squared_error: 0.8231 - val_loss: 1.3293 - val_mean_squared_error: 1.3293
Epoch 6/40
[1m467/467[0m [32m━━━━

In [16]:
#Random Forest model with GridSearchCV
rf_grid = build_random_forest_model_with_gridsearch()
print("Training Random Forest Model with GridSearchCV")
rf_mse = train_and_evaluate_with_gridsearch(rf_grid, X_train, y_train, X_test, y_test)
save_model_and_tokenizer(rf_grid, tokenizer, "best_model", "tokenizer.pkl")

Training Random Forest Model with GridSearchCV
Mean Squared Error: 0.46708895029097236


In [31]:
#Linear Regression model with Grid Search CV
lr_grid = build_linear_regression_model_with_gridsearch()
print("Training Linear Regression Model with GridSearchCV")
lr_mse = train_and_evaluate_with_gridsearch(lr_grid, X_train, y_train, X_test, y_test)

Training Linear Regression Model with GridSearchCV
Mean Squared Error: 1.8454456323675659


In [25]:
#xgboost model with grid search cv
xgb_grid = build_xgboost_model_with_gridsearch()
print("Training Xgboost Model with GridSearchCV")
xgb_mse = train_and_evaluate_with_gridsearch(xgb_grid, X_train, y_train, X_test, y_test)

Training Xgboost Model with GridSearchCV
Mean Squared Error: 1.1023760374139577


In [None]:
mse_scores = {
    "LSTM": lstm_mse,
    "Bidirectional LSTM": bi_lstm_mse,
    "Random Forest": rf_mse,
    "Linear Regression": lr_mse,
    "XGBoost": xgb_mse
}

In [1]:
# Determine the best model
best_model = min(mse_scores, key=mse_scores.get)
print(f"The best model is {best_model} with a mean squared error of {mse_scores[best_model]}.")


The best model is Random Forest with a mean squared error of 0.4726753968709584.


Save the best model

In [None]:
#save the best model, tokenizer based on the mse
from google.colab import files
if best_model == "LSTM":
    save_model_and_tokenizer(lstm_model, tokenizer, "best_model", "tokenizer.pkl")
    files.download('best_model.pkl')
elif best_model == "Bidirectional LSTM":
    save_model_and_tokenizer(bi_lstm_model, tokenizer, "best_model", "tokenizer.pkl")
    files.download('best_model.pkl')
elif best_model == "Random Forest":
    save_model_and_tokenizer(rf_grid, tokenizer, "best_model", "tokenizer.pkl")
    files.download('best_model.pkl')
elif best_model == "Linear Regression":
    save_model_and_tokenizer(lr_grid, tokenizer, "best_model", "tokenizer.pkl")
    files.download('best_model.pkl')
elif best_model == "XGBoost":
    save_model_and_tokenizer(xgb_grid, tokenizer, "best_model", "tokenizer.pkl")
    files.download('best_model.pkl')