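# Phase 3.2 — Model training: LSTM

Trains LSTM classifiers on the pre-computed Word2Vec training sequences for two targets (5-star ratings and positive/neutral/negative sentiment), once for each of the 70/30 and 80/20 data versions.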

In [None]:
import ast

import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

## 5-star classification

In [None]:
def train_lstm_on_w2v_5_stars(
    train_text_path: str,
    train_labels_path: str,
    model_path: str,
    encoder_path: str = "../models/w2v_LSTM_label_encoder.pkl",
) -> None:
    """Train an LSTM classifier on pre-computed Word2Vec sequences and save it.

    Reads features and labels from CSV, keeps only the rows whose index occurs
    in both files, zero-pads every sequence at the end to the length of the
    longest one, fits a ``Masking -> LSTM(64) -> Dense(32) -> softmax`` network
    and writes both the fitted label encoder and the trained model to disk.

    Args:
        train_text_path: CSV whose first column is the index and whose ``text``
            column holds one Python-literal sequence per row (parsed with
            ``ast.literal_eval``). Presumably a list of per-token Word2Vec
            vectors; the padded result must be 3-D for the model to be built.
        train_labels_path: CSV whose first column is the index; expected to
            contain a single label column. Every distinct label value becomes
            one output class.
        model_path: Destination passed to ``model.save``.
        encoder_path: Where the fitted ``LabelEncoder`` is pickled. The default
            is the location this function has always written to; pass a
            run-specific path so that successive runs do not overwrite each
            other's encoder.

    Raises:
        ValueError: If no rows remain after index alignment, or if the padded
            features are not a 3-D array.
    """
    # load data
    X_train_raw = pd.read_csv(train_text_path, index_col=0)
    # Squeeze the column axis only: a bare .squeeze() would collapse a
    # one-row label file into a scalar, which cannot be aligned below.
    y_train = pd.read_csv(train_labels_path, index_col=0).squeeze("columns")

    # align indexes (inner join drops rows missing from either file)
    X_train_raw, y_train = X_train_raw.align(y_train, join='inner', axis=0)
    if len(X_train_raw) == 0:
        raise ValueError(
            f"No overlapping rows between {train_text_path!r} and {train_labels_path!r}"
        )

    # parse and pad W2V vectors
    X_train_sequences = [
        np.array(ast.literal_eval(cell)) for cell in X_train_raw['text']
    ]
    X_train = pad_sequences(X_train_sequences, padding='post', dtype='float32')
    if X_train.ndim != 3:
        raise ValueError(
            "Expected padded features of shape (samples, timesteps, features), "
            f"got shape {X_train.shape}"
        )

    # encode labels
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_train_categorical = to_categorical(y_train_encoded)

    # save the encoder for later decoding
    with open(encoder_path, 'wb') as file:
        pickle.dump(label_encoder, file)

    # build the LSTM model
    input_shape = (X_train.shape[1], X_train.shape[2])
    model = Sequential()
    # all-zero timesteps (the post-padding added above) are masked out
    model.add(Masking(mask_value=0., input_shape=input_shape))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(y_train_categorical.shape[1], activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # train LSTM
    # NOTE: no validation data is passed to fit(), so early stopping watches
    # the training loss.
    early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
    model.fit(X_train, y_train_categorical, epochs=10, batch_size=128, callbacks=[early_stopping])

    # save LSTM to file
    model.save(model_path)


In [None]:
# Train and save the 5-star LSTM for the 70/30 version of the data.

train_lstm_on_w2v_5_stars(
    model_path="../models/w2v_LSTM_70_30.h5",
    train_labels_path='../data/70_30/train_labels.csv',
    train_text_path='../data/70_30/train_texts_w2v.csv',
)

In [None]:
# Train and save the 5-star LSTM for the 80/20 version of the data.

train_lstm_on_w2v_5_stars(
    model_path="../models/w2v_LSTM_80_20.h5",
    train_labels_path='../data/80_20/train_labels.csv',
    train_text_path='../data/80_20/train_texts_w2v.csv',
)

## Positive/neutral/negative classification

In [None]:
def train_lstm_on_w2v_pnn(
    train_text_path: str,
    train_labels_path: str,
    model_path: str,
    encoder_path: str = "../models/w2v_LSTM_label_encoder_PNN.pkl",
) -> None:
    """Train a positive/neutral/negative LSTM on Word2Vec sequences and save it.

    Reads features and labels from CSV, keeps only the rows whose index occurs
    in both files, collapses the star ratings into three sentiment classes,
    zero-pads every sequence at the end to the length of the longest one, fits
    a ``Masking -> LSTM(64) -> Dense(32) -> softmax`` network and writes both
    the fitted label encoder and the trained model to disk.

    Args:
        train_text_path: CSV whose first column is the index and whose ``text``
            column holds one Python-literal sequence per row (parsed with
            ``ast.literal_eval``). Presumably a list of per-token Word2Vec
            vectors; the padded result must be 3-D for the model to be built.
        train_labels_path: CSV whose first column is the index; expected to
            contain a single column of star ratings.
        model_path: Destination passed to ``model.save``.
        encoder_path: Where the fitted ``LabelEncoder`` is pickled. The default
            is the location this function has always written to; pass a
            run-specific path so that successive runs do not overwrite each
            other's encoder.

    Raises:
        ValueError: If no rows remain after index alignment, or if the padded
            features are not a 3-D array.
    """
    # load data
    X_train_raw = pd.read_csv(train_text_path, index_col=0)
    # Squeeze the column axis only: a bare .squeeze() would collapse a
    # one-row label file into a scalar, which cannot be aligned below.
    y_train = pd.read_csv(train_labels_path, index_col=0).squeeze("columns")

    # align indexes (inner join drops rows missing from either file)
    X_train_raw, y_train = X_train_raw.align(y_train, join='inner', axis=0)
    if len(X_train_raw) == 0:
        raise ValueError(
            f"No overlapping rows between {train_text_path!r} and {train_labels_path!r}"
        )

    # map stars to sentiments
    def map_sentiment(star_rating):
        """Map a star rating to 'Positive' (4, 5), 'Neutral' (3) or 'Negative'."""
        if star_rating in [4, 5]:
            return 'Positive'
        elif star_rating == 3:
            return 'Neutral'
        else:
            # catch-all: every value other than 3, 4 and 5 (including a
            # missing rating) is labelled 'Negative'
            return 'Negative'

    y_train = y_train.map(map_sentiment)

    # parse and pad W2V vectors
    X_train_sequences = [
        np.array(ast.literal_eval(cell)) for cell in X_train_raw['text']
    ]
    X_train = pad_sequences(X_train_sequences, padding='post', dtype='float32')
    if X_train.ndim != 3:
        raise ValueError(
            "Expected padded features of shape (samples, timesteps, features), "
            f"got shape {X_train.shape}"
        )

    # encode labels
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_train_categorical = to_categorical(y_train_encoded)

    # save the encoder for later decoding
    with open(encoder_path, 'wb') as file:
        pickle.dump(label_encoder, file)

    # build the LSTM model
    input_shape = (X_train.shape[1], X_train.shape[2])
    model = Sequential()
    # all-zero timesteps (the post-padding added above) are masked out
    model.add(Masking(mask_value=0., input_shape=input_shape))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(y_train_categorical.shape[1], activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # train LSTM
    # NOTE: no validation data is passed to fit(), so early stopping watches
    # the training loss.
    early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
    model.fit(X_train, y_train_categorical, epochs=10, batch_size=128, callbacks=[early_stopping])

    # save LSTM to file
    model.save(model_path)


In [None]:
# Train and save the positive/neutral/negative LSTM for the 70/30 version of the data.

train_lstm_on_w2v_pnn(
    model_path="../models/w2v_LSTM_70_30_PNN.h5",
    train_labels_path='../data/70_30/train_labels.csv',
    train_text_path='../data/70_30/train_texts_w2v.csv',
)

In [None]:
# Train and save the positive/neutral/negative LSTM for the 80/20 version of the data.

train_lstm_on_w2v_pnn(
    model_path="../models/w2v_LSTM_80_20_PNN.h5",
    train_labels_path='../data/80_20/train_labels.csv',
    train_text_path='../data/80_20/train_texts_w2v.csv',
)