### BiLSTM Model training & validation

In [None]:
# --- Experiment identity -----------------------------------------------------
case = "Case_M"
code = "20x1_{}".format(case)

# --- Filesystem layout (all paths are relative) -------------------------------
dataset_splitted_path = "datasets/" + case
models_path = "/".join(["models", case, code, "bilstm"])
embed_path = "features/fastText"
fastText_path = "../fastText/cc.en.300.bin"

# --- Projects to process -------------------------------------------------------
# The project keys are kept in four batches; the cells below iterate over the
# concatenation of all of them, in batch order.
batch_1 = "APSTUD BAM CLOV DM".split()
batch_2 = "DURACLOUD JRESERVER MDL MESOS".split()
batch_3 = "MULE MULESTUDIO TIMOB USERGRID".split()
batch_4 = "TISTUD XD".split()

dataset_names = [*batch_1, *batch_2, *batch_3, *batch_4]

# --- Hyper-parameters ----------------------------------------------------------
# `vocab` is used below as the tokenizer's `num_words` and as the number of
# rows in the embedding matrix.
vocab = 10_001
LEARNING_RATE = 0.0001
# NOTE(review): MAX_LENGTH is not referenced by the cells visible here, which
# pad to the longest train/val sequence instead - confirm whether it is needed.
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 20

print("done")

### Word Embedding

In [None]:
from datasets import Dataset
from sklearn.metrics import median_absolute_error
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional, Embedding, BatchNormalization
from tensorflow.keras.optimizers import Adam
import os
from datasets import concatenate_datasets
import json
import numpy as np

from gensim.models import FastText
from gensim.models.fasttext import load_facebook_vectors

## Load the FastText model
# The vectors are loaded once and reused for every project in the loop below.
fasttext_model = load_facebook_vectors(fastText_path)

# Width of one embedding row, read from the loaded vectors rather than
# hard-coded so the matrix always matches the model file in use.
embedding_dim = fasttext_model.vector_size


# For each project: fit a tokenizer on the train + validation text and write a
# (vocab, embedding_dim) matrix whose row i is the FastText vector of the word
# that the tokenizer maps to index i.
for dataset_name in dataset_names:

    print(f"start processing - {dataset_name}...")

    output_dir = f"{embed_path}/{dataset_name}"

    # load the json data (only the text used to fit the vocabulary is needed
    # here; sequences and padding are built by the cells that consume them)
    raw_train_data = Dataset.from_json(f'{dataset_splitted_path}/{dataset_name}/train.json')
    raw_val_data = Dataset.from_json(f'{dataset_splitted_path}/{dataset_name}/val.json')

    # Tokenization
    tokenizer = Tokenizer(num_words=vocab, oov_token='<OOV>')
    combined = pd.concat([pd.Series(raw_train_data['text']), pd.Series(raw_val_data['text'])])
    tokenizer.fit_on_texts(combined)

    # Create an embedding matrix. Rows that never receive a vector stay zero.
    embedding_matrix = np.zeros((vocab, embedding_dim))
    for word, i in tokenizer.word_index.items():
        # word_index holds every word seen during fitting; only indices below
        # `vocab` have a row in the matrix, so anything beyond is skipped.
        if i < vocab and word in fasttext_model:
            embedding_matrix[i] = fasttext_model[word]

    # save the embedding matrix
    os.makedirs(output_dir, exist_ok=True)
    np.save(f'{output_dir}/{dataset_name}-ft-embed.npy', embedding_matrix)

print("done")

### BiLSTM model training & validation

In [None]:
from datasets import Dataset
from sklearn.metrics import median_absolute_error
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional, Embedding, BatchNormalization
from tensorflow.keras.optimizers import Adam
import os
from datasets import concatenate_datasets
import json
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import plot_model

def get_model_new(embedding_matrix, max_len):
    """Build the (uncompiled) stacked BiLSTM regression model.

    Args:
        embedding_matrix: 2-D array of pretrained word vectors, one row per
            token index. Its shape determines the Embedding layer's
            ``input_dim`` (rows) and ``output_dim`` (columns), so the layer
            always agrees with the weights it is initialised from.
        max_len: Length of the padded input sequences, passed to the
            Embedding layer as ``input_length``.

    Returns:
        A ``Sequential`` model ending in a single linear unit. The caller is
        responsible for compiling it.

    Raises:
        ValueError: If ``embedding_matrix`` is not 2-D.
    """
    # Derive both dimensions from the weights themselves instead of relying on
    # a module-level vocabulary size and a literal 300.
    vocab_size, embedding_dim = embedding_matrix.shape

    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=True))  # pretrained vectors are fine-tuned during training
    # First BiLSTM returns the full sequence so the second one can consume it.
    model.add(Bidirectional(LSTM(128, recurrent_dropout=0.1, return_sequences=True)))
    model.add(Bidirectional(LSTM(64, recurrent_dropout=0.1)))
    model.add(Dropout(0.25))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    # Single linear output: the model predicts one real-valued target.
    model.add(Dense(1, activation='linear'))

    return model

# Train, save and evaluate one BiLSTM model per project.
for dataset_name in dataset_names:

    print(f"start processing - {dataset_name}...")

    # A new model is built on every iteration; clearing Keras' global state
    # first keeps memory from growing as the loop moves through the projects.
    tf.keras.backend.clear_session()

    output_dir = f"{models_path}/{dataset_name}"

    # load the json data
    raw_train_data = Dataset.from_json(f'{dataset_splitted_path}/{dataset_name}/train.json')
    raw_val_data = Dataset.from_json(f'{dataset_splitted_path}/{dataset_name}/val.json')
    raw_test_data = Dataset.from_json(f'{dataset_splitted_path}/{dataset_name}/test.json')

    # Tokenization (vocabulary is fitted on train + validation text)
    tokenizer = Tokenizer(num_words=vocab, oov_token='<OOV>')
    combined = pd.concat([pd.Series(raw_train_data['text']), pd.Series(raw_val_data['text'])])
    tokenizer.fit_on_texts(combined)

    train = pd.Series(raw_train_data['text'])
    val = pd.Series(raw_val_data['text'])

    train_sequences = tokenizer.texts_to_sequences(train)
    val_sequences = tokenizer.texts_to_sequences(val)

    # Padding: every split is padded/truncated at the front to the length of
    # the longest train/val sequence.
    max_len = max(max(len(seq) for seq in train_sequences), max(len(seq) for seq in val_sequences))
    train_padded = pad_sequences(train_sequences, maxlen=max_len, dtype='int32', padding='pre',truncating='pre', value=0)
    val_padded = pad_sequences(val_sequences, maxlen=max_len, dtype='int32', padding='pre',truncating='pre', value=0)

    # Load embedding matrix
    embedding_matrix = np.load(f'{embed_path}/{dataset_name}/{dataset_name}-ft-embed.npy')

    model = get_model_new(embedding_matrix, max_len)

    # optimizer = Adam(learning_rate=LEARNING_RATE)
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE)
    model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

    # Generate the plot
    # plot_model(model, to_file=f'{output_dir}/{dataset_name}_model_plot.png', show_shapes=True, show_layer_names=False, show_layer_activations=True)

    # Fit the model
    train_label = pd.Series(raw_train_data['storypoint'])
    val_label = pd.Series(raw_val_data['storypoint'])

    # NOTE(review): EarlyStopping is created without restore_best_weights, so
    # the weights saved and evaluated below are those of the last epoch run,
    # not necessarily the best val_loss epoch - confirm this is intended.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    history = model.fit(
        train_padded,
        train_label,
        validation_data=(val_padded, val_label),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=1,
        callbacks=[early_stopping],
    )

    os.makedirs(output_dir, exist_ok=True)
    model.save(f'{output_dir}/{dataset_name}.keras')

    val_loss, val_mae = model.evaluate(val_padded, val_label, verbose=1)
    print(f"val_loss: {val_loss}, val_mae: {val_mae}")

    # test: tokenised with the same tokenizer and padded to the same length
    test = pd.Series(raw_test_data['text'])
    test_sequences = tokenizer.texts_to_sequences(test)
    test_padded = pad_sequences(test_sequences, maxlen=max_len, dtype='int32', padding='pre',truncating='pre', value=0)

    test_label = pd.Series(raw_test_data['storypoint'])
    test_loss, test_mae = model.evaluate(test_padded, test_label, verbose=1)
    print(f"test_loss: {test_loss}, test_mae: {test_mae}")

    metrics = {
        'type': 'bilstm',
        'val_loss': val_loss, 'val_mae': val_mae,
        'test_loss': test_loss, 'test_mae': test_mae,
        # 'epoch' is the configured maximum; early stopping may end training
        # sooner, so the number of epochs actually run is recorded as well.
        'epoch': EPOCHS,
        'epochs_trained': len(history.history['loss']),
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE
    }

    # Specify the file path
    file_path = f"{output_dir}/{dataset_name}.json"

    # Write the metrics as JSON next to the saved model
    with open(file_path, 'w') as file:
        json.dump(metrics, file, indent=2)

    # if 1 == 1:
    #     break

print("done")

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from datasets import Dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os

# Relies on vocab, dataset_names, dataset_splitted_path and models_path having
# been defined by an earlier cell.

# For each project: rebuild the tokenizer and padded length from the train and
# validation splits, load the saved model and write its test-set predictions
# next to it as CSV.
for dataset_name in dataset_names:
    print(f"Processing test predictions for {dataset_name}...")

    split_dir = f"{dataset_splitted_path}/{dataset_name}"
    output_dir = f"{models_path}/{dataset_name}"

    # Test split: the texts to score and their reference story points.
    raw_test_data = Dataset.from_json(f"{split_dir}/test.json")
    test_texts = pd.Series(raw_test_data['text'])
    test_labels = pd.Series(raw_test_data['storypoint'])

    # The tokenizer is refitted here on the train + validation text.
    raw_train_data = Dataset.from_json(f"{split_dir}/train.json")
    raw_val_data = Dataset.from_json(f"{split_dir}/val.json")
    train_texts = pd.Series(raw_train_data['text'])
    val_texts = pd.Series(raw_val_data['text'])

    tokenizer = Tokenizer(num_words=vocab, oov_token='<OOV>')
    tokenizer.fit_on_texts(pd.concat([train_texts, val_texts]))

    # Padded length = longest tokenised sequence across train and validation.
    longest_train = max(map(len, tokenizer.texts_to_sequences(train_texts)))
    longest_val = max(map(len, tokenizer.texts_to_sequences(val_texts)))
    max_len = max(longest_train, longest_val)

    # Tokenise the test texts and pad/truncate them at the front to max_len.
    test_padded = pad_sequences(
        tokenizer.texts_to_sequences(test_texts),
        maxlen=max_len,
        dtype='int32',
        padding='pre',
        truncating='pre',
        value=0,
    )

    # Load the saved model for this project and score the test set; the
    # prediction array is flattened to one value per test row.
    model = tf.keras.models.load_model(f"{output_dir}/{dataset_name}.keras")
    test_predictions = model.predict(test_padded).flatten()

    # One row per test item: reference value alongside the model output.
    predictions_df = pd.DataFrame({
        'true_storypoint': test_labels,
        'predicted_storypoint': test_predictions,
    })
    predictions_df.to_csv(f"{output_dir}/{dataset_name}_predictions.csv", index=False)

    print(f"Saved predictions for {dataset_name}")

print("All predictions saved.")