# Multi-Channel LSTM

In [1]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
import pickle
from numpy import asarray

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, auc, roc_curve, RocCurveDisplay, confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
from itertools import cycle

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Embedding, concatenate, LSTM, Dense
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.legacy import Adam, SGD, RMSprop, Adagrad

import skopt
from skopt import gbrt_minimize, gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer  
from tensorflow.keras import backend as K


# --- Dataset schema and label constants ---
# Column names of the review dataset.
DATASET_COLUMNS = ['Id', 'Review', 'Sentiment']

# Numeric sentiment code -> category name.
senti_labels = {
    1: 'Negative',
    2: 'Neutral',
    3: 'Positive',
}
# Category names in the insertion order of the mapping above.
senti_categories = [category for category in senti_labels.values()]
NUM_of_CLASSES = 3

# --- Data locations (relative paths) ---
input_folder_path = "./pls/Thesis_Jupyter_Final/src/input/"
processed_folder_path = "./pls/Thesis_Jupyter_Final/src/input/processed"

2023-06-16 01:51:25.949447: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))
# Sanity check: list the GPU devices TensorFlow can see before training.

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
train = pd.read_csv(os.path.join(input_folder_path, "train.csv"))
val = pd.read_csv(os.path.join(input_folder_path, "val.csv"))
test = pd.read_csv(os.path.join(input_folder_path, "test.csv"))

x_train = train['x']
y_train = train['y']
x_val = val['x']
y_val = val['y']
x_test = test['x']
y_test = test['y']

x_train_encoded = np.load(os.path.join(processed_folder_path, "train_encoded_x.npy"))
y_train_encoded = np.load(os.path.join(processed_folder_path, "train_encoded_y.npy"))
x_val_encoded = np.load(os.path.join(processed_folder_path, "val_encoded_x.npy"))
y_val_encoded = np.load(os.path.join(processed_folder_path, "val_encoded_y.npy"))
x_test_encoded = np.load(os.path.join(processed_folder_path, "test_encoded_x.npy"))
y_test_encoded = np.load(os.path.join(processed_folder_path, "test_encoded_y.npy"))

w2v_embedding_vectors = np.load(os.path.join(processed_folder_path, "embedding_matrix.npy"))
print(w2v_embedding_vectors)

%store -r embedding_vocab_size
%store -r EMBEDDING_DIM
%store -r max_seq_length

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.2965     -0.25454     0.52863002 ... -0.57709002  0.11623
   0.59446001]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.089755    0.045177    0.14989001 ... -0.12161    -1.33650005
  -0.012269  ]]


## Evaluation Functions

In [4]:
def calculate_metrics(score):
    """Print and return the accuracy and loss held in ``score``.

    Args:
        score: Indexable pair-like object with the loss at position 0 and the
            accuracy at position 1.

    Returns:
        tuple: ``(acc, loss)``.
    """
    acc, loss = score[1], score[0]

    for line in (f"Accuracy: {acc:.2%}", f"Loss: {loss:.2f}"):
        print(line)

    return acc, loss

def calculate_classification_report(y, y_pred, labels):
    """Print scikit-learn's classification report for the given predictions.

    Args:
        y: True class labels.
        y_pred: Predicted class labels, in the same encoding as ``y``.
        labels: Human-readable class names, one per class and ordered like the
            sorted class values; they become the row names of the report.
    """
    # The names are display text, so they go into ``target_names``.
    # ``labels=`` selects which class *values* are reported; feeding it names
    # does not line up with integer-encoded ``y`` / ``y_pred``.
    report = classification_report(y, y_pred, target_names=list(labels))
    print("Classification Report:\n", report)

def plot_confusion_matrix(y_true, y_pred, labels):
    """Render the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        labels: Tick labels shown on the axes of the display.
    """
    display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, y_pred),
        display_labels=labels,
    )
    display.plot(cmap='Blues', xticks_rotation='vertical')
    plt.title(f'Confusion Matrix')
    plt.show()

def evaluate_model(model, model_name, x_encoded, y_encoded, y=None, only_metrics=True):
    """Print accuracy/loss for ``model`` and, optionally, a detailed report.

    Args:
        model: Model exposing ``evaluate`` and ``predict``; ``evaluate`` must
            return the loss at index 0 and the accuracy at index 1.
        model_name: Label printed ahead of the metrics.
        x_encoded: Encoded inputs. Either a list/tuple with one array per
            model input, or a single array, which is then fed to every input
            of a multi-input model.
        y_encoded: Targets handed to ``model.evaluate``.
        y: Class labels for the classification report and confusion matrix.
            Predictions are converted with ``argmax + 1``, so these are
            compared against 1-based class ids. Required when
            ``only_metrics`` is False.
        only_metrics: When True, only accuracy and loss are printed.

    Raises:
        ValueError: If ``only_metrics`` is False and ``y`` is None.
    """
    if not only_metrics and y is None:
        raise ValueError("y is required when only_metrics is False")

    # The multi-channel model has one Input per channel and is trained with the
    # same encoded sequences on each of them; mirror that here when the caller
    # passes a single array.
    declared_inputs = getattr(model, "inputs", None)
    n_inputs = len(declared_inputs) if declared_inputs is not None else 1
    if n_inputs > 1 and not isinstance(x_encoded, (list, tuple)):
        model_inputs = [x_encoded] * n_inputs
    else:
        model_inputs = x_encoded

    print(f"*{model_name}")

    score = model.evaluate(model_inputs, y_encoded, verbose=0)
    calculate_metrics(score)

    if not only_metrics:
        # Local name chosen so the module-level ``senti_labels`` is not shadowed.
        class_names = ['negative', 'neutral', 'positive'] #TODO: to constants

        # Predictions are only needed for the detailed report.
        y_pred_prob = model.predict(model_inputs)
        # argmax yields 0-based column indices; class ids start at 1.
        y_pred = np.argmax(y_pred_prob, axis=1) + 1
        calculate_classification_report(y, y_pred, labels=class_names)
        plot_confusion_matrix(y, y_pred, labels=class_names)

    print()

def one_hot_encode(y, num_classes=None):
    """One-hot encode 1-based integer class labels.

    Args:
        y: One-dimensional sequence of integer labels in ``1..num_classes``.
        num_classes: Number of output columns. Defaults to the module-level
            ``NUM_of_CLASSES`` when omitted.

    Returns:
        numpy.ndarray: Float array of shape ``(len(y), num_classes)`` with a
        single 1 per row, in column ``label - 1``.

    Raises:
        IndexError: If any label lies outside ``1..num_classes``.
    """
    if num_classes is None:
        num_classes = NUM_of_CLASSES

    labels = np.asarray(y)
    y_encoded = np.zeros((len(labels), num_classes))
    if len(labels) == 0:
        return y_encoded

    # Reject out-of-range labels explicitly: a label of 0 (or below) would
    # otherwise wrap around through negative indexing and mark the wrong class.
    if labels.min() < 1 or labels.max() > num_classes:
        raise IndexError(f"labels must lie in 1..{num_classes}")

    y_encoded[np.arange(len(labels)), labels - 1] = 1
    return y_encoded

def plot_roc_curve(prob_test_vec, y_test, labels):
    """Draw one ROC curve per class on a single 10x10 axes.

    For every class index ``k`` in ``range(NUM_of_CLASSES)``, column ``k`` of
    ``y_test`` is plotted against column ``k`` of ``prob_test_vec`` and the
    curve is named after ``labels[k]``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    palette = cycle(['limegreen', 'dodgerblue', 'red'])
    for class_idx in range(NUM_of_CLASSES):
        RocCurveDisplay.from_predictions(
            y_test[:, class_idx],
            prob_test_vec[:, class_idx],
            name=f"ROC curve for {labels[class_idx]}",
            color=next(palette),
            ax=ax,
        )
    
def calculate_OvR_roc_auc_score(model, x, y, x_test, y_test, labels): # TODO: averaging strategy?
    """Fit a one-vs-rest wrapper around ``model`` and report per-class ROC AUC.

    The wrapper is fitted on ``(x, y)`` and its ``predict_proba`` output on
    ``x_test`` is scored column by column against ``y_test``, so ``y_test``
    must be two-dimensional with one column per class. The per-class scores
    and their plain mean are printed, then the ROC curves are plotted.

    NOTE(review): ``model`` has to be usable as a scikit-learn estimator by
    ``OneVsRestClassifier`` - confirm before passing a Keras model.
    """
    ovr_model = OneVsRestClassifier(model).fit(x, y)
    prob_test_vec = ovr_model.predict_proba(x_test)

    # One AUC value per class, in class-index order.
    auc_score = []
    for class_idx in range(NUM_of_CLASSES):
        fpr, tpr, _ = roc_curve(y_test[:, class_idx], prob_test_vec[:, class_idx])
        auc_score.append(auc(fpr, tpr))

    print(f"AUC score: {auc_score}")
    averaged_auc_score = (sum(auc_score) / NUM_of_CLASSES)
    print(f"Averaged AUC score: {averaged_auc_score:.2f}")

    plot_roc_curve(prob_test_vec, y_test, labels)

In [5]:
def plot_development(history):
    """Plot training vs. validation accuracy and loss over the epochs.

    Args:
        history: Object whose ``history`` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch sequences.
    """
    log = history.history
    train_acc, valid_acc, train_loss, valid_loss = (
        log[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    )
    epoch_axis = range(len(train_acc))

    # Accuracy curves are drawn on the current figure.
    plt.plot(epoch_axis, train_acc, 'b', label='Training Accuracy')
    plt.plot(epoch_axis, valid_acc, 'r', label='Validation Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.legend()

    # Loss curves get a figure of their own.
    plt.figure()
    plt.plot(epoch_axis, train_loss, 'b', label='Training Loss')
    plt.plot(epoch_axis, valid_loss, 'r', label='Validation Loss')
    plt.title('Training and validation Loss')
    plt.legend()

    plt.show()

## Hyperparameter Tuning

In [6]:
# Training settings that stay fixed for every trial and for the final fits.
num_output_classes = 3
batch_size = 16
epochs = 30

# Candidate unit counts shared by the two channels.
_unit_choices = [16, 32, 64, 128]

# One dimension per tunable hyperparameter; every ``name`` matches the keyword
# argument of the objective function that receives the sampled value.
num_gru_layersA = Integer(1, 3, name='num_gru_layersA')
num_gru_layersB = Integer(1, 3, name='num_gru_layersB')
num_gru_unitsA = Categorical(list(_unit_choices), name='num_gru_unitsA')
num_gru_unitsB = Categorical(list(_unit_choices), name='num_gru_unitsB')
learning_rate = Categorical([1e-4, 1e-3, 1e-2], name='learning_rate')
adam_decay = Categorical([1e-6, 1e-4, 1e-2], name="adam_decay")

search_space = [
    num_gru_layersA, num_gru_layersB,
    num_gru_unitsA, num_gru_unitsB,
    learning_rate, adam_decay,
]

# Initial point handed to the optimizers, in the same order as ``search_space``.
default_params = [1, 1, 16, 16, 1e-4, 1e-6]

In [7]:
def _stack_lstm_layers(sequence, num_layers, num_units, name_template):
    """Chain ``num_layers`` LSTM layers; only the last one drops the time axis."""
    for depth in range(1, num_layers + 1):
        sequence = LSTM(
            num_units,
            # Intermediate layers must emit full sequences for the next LSTM.
            return_sequences=bool(depth < num_layers),
            name=name_template.format(depth),
        )(sequence)
    return sequence


def define_multi_channel_gru_model(num_gru_layersA, num_gru_layersB, num_gru_unitsA, num_gru_unitsB, learning_rate, adam_decay):
    """Build and compile the two-channel recurrent classifier.

    Despite the ``gru`` in the parameter and layer names, both channels are
    built from ``LSTM`` layers.

    Channel A embeds the token ids with a trainable embedding; channel B uses
    an embedding initialised from ``w2v_embedding_vectors`` and kept frozen.
    Each channel runs through its own LSTM stack, the two final states are
    concatenated, and a softmax layer with ``num_output_classes`` units
    produces the output.

    Args:
        num_gru_layersA: Number of stacked LSTM layers in channel A.
        num_gru_layersB: Number of stacked LSTM layers in channel B.
        num_gru_unitsA: Units per LSTM layer in channel A.
        num_gru_unitsB: Units per LSTM layer in channel B.
        learning_rate: Learning rate of the Adam optimizer.
        adam_decay: ``decay`` argument of the Adam optimizer.

    Returns:
        The compiled two-input Keras ``Model`` (categorical cross-entropy loss,
        accuracy metric).
    """
    # Vocabulary-based embedding layer
    inputsA = Input(shape=(max_seq_length,), name="input regular embeddings")
    # Word2Vec embedding layer
    inputsB = Input(shape=(max_seq_length,), name="input word2vec embeddings")

    # Define an embedding layer for each input
    embeddingsA = Embedding(embedding_vocab_size, EMBEDDING_DIM, input_length=max_seq_length, name="embeddingsA")(inputsA)
    embeddingsB = Embedding(embedding_vocab_size, EMBEDDING_DIM, input_length=max_seq_length, weights=[w2v_embedding_vectors], trainable=False, name="embeddingsB")(inputsB)

    # Pass both embeddings through their own LSTM stacks
    gru_layersA = _stack_lstm_layers(embeddingsA, num_gru_layersA, num_gru_unitsA, 'layer_gruA_{0}')
    gru_layersB = _stack_lstm_layers(embeddingsB, num_gru_layersB, num_gru_unitsB, 'layer_gruB_{0}')

    # Concatenate the outputs of the two channels
    merged = concatenate([gru_layersA, gru_layersB])

    # Output layer on top of the merged representation
    outputs = Dense(num_output_classes, activation='softmax', name="output")(merged)

    # Create the model
    model = Model(inputs=[inputsA, inputsB], outputs=outputs)

    # Compile the model
    adam = Adam(learning_rate=learning_rate, decay=adam_decay)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    return model

In [8]:
@use_named_args(dimensions=search_space)
def multi_objective_function(num_gru_layersA, num_gru_layersB, num_gru_unitsA, num_gru_unitsB, learning_rate, adam_decay):
    """Objective for the hyperparameter search: negative best validation accuracy.

    Builds the multi-channel model for the given hyperparameters, trains it on
    the training split with early stopping on ``val_accuracy``, and scores the
    trial by the highest validation accuracy reached during training.

    Returns:
        float: ``-accuracy``, because the optimizers minimise the objective.
    """
    model = define_multi_channel_gru_model(num_gru_layersA=num_gru_layersA,
                                            num_gru_layersB=num_gru_layersB,
                                            num_gru_unitsA=num_gru_unitsA,
                                            num_gru_unitsB=num_gru_unitsB,
                                            learning_rate=learning_rate,
                                            adam_decay=adam_decay
                                            )

    early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
    # The same encoded sequences feed both input channels.
    history = model.fit([x_train_encoded, x_train_encoded],
                        y_train_encoded,
                        validation_data=([x_val_encoded, x_val_encoded], y_val_encoded),
                        epochs=epochs, # TODO
                        batch_size=batch_size,
                        callbacks=[early_stopping],
                        verbose=2
                        )

    # Score the trial by its best epoch. When early stopping fires, the last
    # epoch is ``patience`` epochs past the best one, so reading index -1
    # would systematically under-report the configuration.
    val_accuracies = history.history['val_accuracy']
    best_epoch = max(range(len(val_accuracies)), key=val_accuracies.__getitem__)
    accuracy = val_accuracies[best_epoch]
    loss = history.history['val_loss'][best_epoch]

    # Print the validation metrics of the best epoch.
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Loss: {loss:.2f}\n")

    # Delete the Keras model with these hyper-parameters from memory.
    del model

    # Clear the Keras session, otherwise it will keep adding new
    # models to the same TensorFlow graph each time we create
    # a model with a different set of hyper-parameters.
    K.clear_session()
    tf.compat.v1.reset_default_graph()

    # the optimizer aims for the lowest score, so we return our negative accuracy
    return -accuracy

## Gradient Boosted Model

In [9]:
# Search with the gradient-boosted-trees optimizer, starting from the default
# configuration.
gbrt_search_settings = dict(
    dimensions=search_space,
    n_calls=12,
    n_jobs=-1,
    x0=default_params,
)
gbrt_result = gbrt_minimize(multi_objective_function, **gbrt_search_settings)

2023-06-16 01:51:33.597641: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 36469 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:e3:00.0, compute capability: 8.0


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input regular embeddings (Inpu  [(None, 659)]       0           []                               
 tLayer)                                                                                          
                                                                                                  
 input word2vec embeddings (Inp  [(None, 659)]       0           []                               
 utLayer)                                                                                         
                                                                                                  
 embeddingsA (Embedding)        (None, 659, 100)     2480700     ['input regular embeddings[0][0]'
                                                                 ]                            

2023-06-16 01:51:40.328513: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8401
2023-06-16 01:51:40.555947: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [None]:
# TODO data frame summarizing parameter search
# Pair the name of every search dimension with the best value found for it.
gbrt_best_params = dict(zip((dim.name for dim in gbrt_result.space), gbrt_result.x))
print("Best Hyperparameters:", gbrt_best_params)

In [None]:
# Rebuild the model with the best GBRT hyperparameters; the dict keys are the
# search-dimension names, which equal the builder's parameter names.
model = define_multi_channel_gru_model(**gbrt_best_params)

# The same encoded sequences feed both input channels.
train_inputs = [x_train_encoded, x_train_encoded]
val_inputs = [x_val_encoded, x_val_encoded]

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) # TODO: should I, again?
history = model.fit(
    train_inputs,
    y_train_encoded,
    validation_data=(val_inputs, y_val_encoded),
    epochs=epochs, # TODO
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=2,
)
plot_development(history)

In [None]:
# The model has two input channels that both receive the same encoded
# sequences, so each split is passed as a two-element list. evaluate_model
# already runs model.evaluate internally, so no separate evaluate call is made.
evaluate_model(model, "Train multi-LSTM", [x_train_encoded, x_train_encoded], y_train_encoded, only_metrics=True)

evaluate_model(model, "Val multi-LSTM", [x_val_encoded, x_val_encoded], y_val_encoded, only_metrics=True)

# The test split additionally gets the classification report and confusion matrix.
evaluate_model(model, "Test multi-LSTM", [x_test_encoded, x_test_encoded], y_test_encoded, y_test, only_metrics=False)
senti_labels = ['negative', 'neutral', 'positive'] # TODO
#calculate_OvR_roc_auc_score(model, x_train, y_train, x_test, y_test, senti_labels)

## Gaussian Process Model

In [None]:
# Search with the Gaussian-process optimizer, starting from the default
# configuration.
gp_search_settings = dict(
    dimensions=search_space,
    n_calls=12,
    noise=0.01,
    n_jobs=-1,
    kappa=5,
    x0=default_params,
)
gp_result = gp_minimize(multi_objective_function, **gp_search_settings)

In [None]:
# TODO data frame summarizing parameter search
# Pair the name of every search dimension with the best value found for it.
gp_best_params = dict(zip((dim.name for dim in gp_result.space), gp_result.x))
print("Best Hyperparameters:", gp_best_params)

In [None]:
# Rebuild the model with the best GP hyperparameters; the dict keys are the
# search-dimension names, which equal the builder's parameter names.
model = define_multi_channel_gru_model(**gp_best_params)

# The same encoded sequences feed both input channels.
train_inputs = [x_train_encoded, x_train_encoded]
val_inputs = [x_val_encoded, x_val_encoded]

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) # TODO: should I, again?
history = model.fit(
    train_inputs,
    y_train_encoded,
    validation_data=(val_inputs, y_val_encoded),
    epochs=epochs, # TODO
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=2,
)
plot_development(history)

In [None]:
# The model has two input channels that both receive the same encoded
# sequences, so each split is passed as a two-element list. evaluate_model
# already runs model.evaluate internally, so no separate evaluate call is made.
evaluate_model(model, "Train multi-LSTM", [x_train_encoded, x_train_encoded], y_train_encoded, only_metrics=True)

evaluate_model(model, "Val multi-LSTM", [x_val_encoded, x_val_encoded], y_val_encoded, only_metrics=True)

# The test split additionally gets the classification report and confusion matrix.
evaluate_model(model, "Test multi-LSTM", [x_test_encoded, x_test_encoded], y_test_encoded, y_test, only_metrics=False)
senti_labels = ['negative', 'neutral', 'positive'] # TODO
#calculate_OvR_roc_auc_score(model, x_train, y_train, x_test, y_test, senti_labels)