### Ensemble Stacking 

In [10]:
# --- Hyperparameters shared by the cells below ---
BATCH_SIZE = 128            # batch size for base-model prediction and meta-model training
EPOCHS = 600                # upper bound on meta-model training epochs
LEARNING_RATE = 1e-4        # Adam learning rate of the meta-model
MAX_LENGTH = 256            # max_length passed to the RoBERTa tokenizer
BASE_MODEL = "roberta-base" # checkpoint name the RoBERTa tokenizer is loaded from
vocab = 10000               # num_words given to the Keras tokenizer

# --- Experiment identifiers and the paths derived from them ---
case = "Case_N"
code = "20x1_" + case
dataset_splitted_path = "datasets/" + case
models_path = "/".join(("models", case, code))

# --- Dataset keys, grouped in four batches ---
batch_1 = [
    'APSTUD',
    'BAM',
    'CLOV',
    'DM',
]
batch_2 = [
    'DURACLOUD',
    'JRESERVER',
    'MDL',
    'MESOS',
]
batch_3 = [
    'MULE',
    'MULESTUDIO',
    'TIMOB',
]
batch_4 = [
    'TISTUD',
    'USERGRID',
    'XD',
]

# All dataset keys, in batch order.
dataset_names = [*batch_1, *batch_2, *batch_3, *batch_4]

# Short project labels; looked up by the position of the dataset key in
# dataset_names, so the two lists must stay aligned.
project_names = [
    'AS', 'BB', 'CV', 'DM',
    'DC', 'JS', 'MD', 'ME',
    'MU', 'MS', 'AP',
    'TS', 'UG', 'XD',
]

### Initialize

In [11]:
import tensorflow as tf
from tensorflow.keras import callbacks
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error, median_absolute_error
import torch
from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from datasets import Dataset, concatenate_datasets
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import os
from keras.src.callbacks import EarlyStopping

class MakeTorchData(torch.utils.data.Dataset):
    """Map-style torch dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping from feature name to an indexable collection
    holding one entry per example; ``labels`` is an indexable collection with
    one target per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as tensors plus its label."""
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        # The label passes through a one-element tensor and is handed back as
        # a plain Python float.
        sample["labels"] = float(torch.tensor([self.labels[idx]]))
        return sample

class RobertaRegressorWrapper:
    """Inference-only wrapper around a saved sequence-classification model.

    The model is loaded from ``model_path`` with
    ``AutoModelForSequenceClassification.from_pretrained`` and wrapped in a
    ``Trainer`` that is used purely for batched prediction. The method set
    (``fit`` / ``predict`` / ``get_params`` / ``set_params`` / ``score``)
    mirrors the scikit-learn estimator interface.

    Args:
        model_path: Location handed to ``from_pretrained``; also used as the
            parent of the Trainer's ``ensemble`` output directory.
    """

    def __init__(self, model_path):
        self.model_path = model_path
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.trainer = None
        self.built = True
        self.initFit()

    def initFit(self):
        """Build the ``Trainer`` used by :meth:`predict`; no training is run."""
        training_args = TrainingArguments(
            output_dir=f'{self.model_path}/ensemble',
            per_device_eval_batch_size=BATCH_SIZE,
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            compute_metrics=compute_eval_metrics,
        )

    def fit(self, X=None, y=None, **kwargs):
        """Do nothing: the wrapped model is used as loaded.

        ``X``, ``y`` and any keyword arguments are accepted and ignored so the
        method can be called both as ``fit()`` and in the estimator style
        ``fit(X, y)``, matching ``BiLSTMRegressorWrapper.fit``.
        """
        pass

    def predict(self, X):
        """Return the Trainer's raw ``predictions`` array for dataset ``X``."""
        self.trainer.eval_dataset = X
        predictions = self.trainer.predict(X).predictions
        return predictions

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is unused)."""
        return {"model_path": self.model_path}

    def set_params(self, **parameters):
        """Set each keyword as an attribute on the wrapper and return ``self``.

        Only attributes are updated; changing ``model_path`` here does not
        reload the model or rebuild the Trainer.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def score(self, X, y):
        """Return the mean absolute error of the predictions on ``X`` against ``y``.

        Note that this is an error measure, so lower values are better.
        """
        predictions = self.predict(X)
        return mean_absolute_error(y, predictions)


class BiLSTMRegressorWrapper:
    """Inference-only wrapper around a Keras model loaded from ``model_path``."""

    def __init__(self, model_path):
        self.model_path = model_path
        self.model = load_model(model_path)
        self.built = True

    def fit(self, X, y, **kwargs):
        """Accept the estimator-style arguments and do nothing.

        The loaded model is used as is; no compilation or training happens.
        """
        return None

    def predict(self, X):
        """Run the wrapped model on ``X`` and return the output flattened to 1-D."""
        raw_output = self.model.predict(X)
        return raw_output.flatten()
    
def get_max_len(data, tokernizer):
    """Fit the tokenizer on ``data`` and return the longest sequence length.

    Side effect: ``tokernizer.fit_on_texts(data)`` is called, so the tokenizer
    passed in is updated by this function.

    Args:
        data: Iterable of texts.
        tokernizer: Object exposing ``fit_on_texts`` and ``texts_to_sequences``.

    Returns:
        The length of the longest sequence produced for ``data``, or ``0``
        when ``data`` yields no sequences.
    """
    tokernizer.fit_on_texts(data)
    sequences = tokernizer.texts_to_sequences(data)
    # default=0 keeps an empty input from raising ValueError inside max().
    return max((len(sequence) for sequence in sequences), default=0)

def compute_eval_metrics(eval_pred):
    """Compute MAE and MdAE for a ``(predictions, labels)`` pair.

    The labels are reshaped to a single column before being compared with the
    predictions. Returns a dict with the keys ``"mae"`` and ``"mdae"``.
    """
    predictions, targets = eval_pred
    targets = targets.reshape(-1, 1)
    return {
        "mae": mean_absolute_error(targets, predictions),
        "mdae": median_absolute_error(targets, predictions),
    }

def preprocess_function(examples, tokernizer):
    """Tokenize ``examples['text']`` and pair it with ``examples['storypoint']``.

    Texts are truncated and padded to ``MAX_LENGTH``; the result is returned
    as a ``MakeTorchData`` dataset.
    """
    tokenized = tokernizer(
        examples['text'],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )
    return MakeTorchData(tokenized, examples['storypoint'])

def load_dataset(dataset_name):
    """Load the train, val and test JSON splits of ``dataset_name``.

    Returns a ``(train, val, test)`` tuple read from
    ``{dataset_splitted_path}/{dataset_name}/``.
    """
    return tuple(
        Dataset.from_json(f'{dataset_splitted_path}/{dataset_name}/{split}.json')
        for split in ('train', 'val', 'test')
    )

def load_dataset_type(dataset_name, dataset_type):
    """Load a single split (``{dataset_type}.json``) of ``dataset_name``."""
    json_path = f'{dataset_splitted_path}/{dataset_name}/{dataset_type}.json'
    return Dataset.from_json(json_path)

def load_models(dataset_name):
    """Load both base models saved for ``dataset_name``.

    Returns a ``(bilstm_wrapper, roberta_wrapper)`` tuple; the BiLSTM model is
    loaded first, then the RoBERTa model.
    """
    roberta_dir = f'{models_path}/roberta/{dataset_name}/model'
    bilstm_file = f'{models_path}/bilstm/{dataset_name}/{dataset_name}.keras'
    return BiLSTMRegressorWrapper(bilstm_file), RobertaRegressorWrapper(roberta_dir)

def get_padding_sequence(data, tokenizer, max_len):
    """Convert ``data`` to integer sequences padded/truncated to ``max_len``.

    Padding (with 0) and truncation are both applied at the front of each
    sequence; the result has dtype ``int32``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(data),
        maxlen=max_len,
        dtype='int32',
        padding='pre',
        truncating='pre',
        value=0,
    )

def get_meta_model():
    """Build and compile the stacking meta-model.

    The network takes two inputs (one prediction per base model), passes them
    through two ReLU layers of 10 units and ends in a single linear output.
    It is compiled with Adam at ``LEARNING_RATE``, using MAE both as the loss
    and as the reported metric.
    """
    network = Sequential([
        Dense(10, input_dim=2, activation='relu'),  # one input per base model
        Dense(10, activation='relu'),
        Dense(1, activation='linear'),
    ])
    network.compile(
        loss='mae',
        optimizer=Adam(learning_rate=LEARNING_RATE),
        metrics=['mae'],
    )
    return network

def load_meta_model(dataset_name):
    """Load the saved ``.keras`` meta-model of ``dataset_name``."""
    return load_model(
        f'{models_path}/meta-model/{dataset_name}/{dataset_name}.keras'
    )

# Marker printed once every statement above in this cell has executed.
print("done")

done


### Stacking training

In [None]:
from sklearn.model_selection import KFold
from keras.callbacks import ModelCheckpoint

# Debug switch: when True the loop below stops after the first dataset
# (this is what the previous unconditional `break` did). Set it to False to
# train a meta-model for every entry of dataset_names.
STOP_AFTER_FIRST_DATASET = True

roberta_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# NOTE(review): this Keras tokenizer is shared by all loop iterations and
# get_max_len() calls fit_on_texts() on it each time, so its vocabulary keeps
# growing when more than one dataset is processed. Confirm this matches how
# the BiLSTM base models were trained.
bilstm_tokenizer = KerasTokenizer(num_words=vocab, oov_token=0)

kf = KFold(n_splits=5)

for dataset_name in dataset_names:
    print(f"start processing - {dataset_name}...")
    os.makedirs(f'{models_path}/meta-model/{dataset_name}', exist_ok=True)

    # load the pre-trained base models
    bilstm_model, roberta_model = load_models(dataset_name)

    # load dataset
    raw_train_data, raw_val_data, raw_test_data = load_dataset(dataset_name)

    # calculate the max length of the sequences (also fits the Keras tokenizer)
    max_len = get_max_len(pd.concat([pd.Series(raw_train_data['text']), pd.Series(raw_val_data['text'])]), bilstm_tokenizer)
    print(f"max_len: {max_len}")

    # convert raw_train_data to a DataFrame if it's not already
    if not isinstance(raw_train_data, pd.DataFrame):
        raw_train_data = pd.DataFrame(raw_train_data)

    meta_model_inputs = []
    meta_model_targets = []

    # The base models are never refit in this loop, so only the held-out part
    # of each fold is needed: it is fed to both base models and their
    # predictions become the meta-model's training inputs.
    for _, val_index in kf.split(raw_train_data):
        raw_val_data_fold = raw_train_data.iloc[val_index]

        # prepare the fold for the RoBERTa model
        roberta_val_data = preprocess_function(raw_val_data_fold.to_dict('list'), roberta_tokenizer)

        # prepare the fold for the BiLSTM model
        val_data = pd.Series(raw_val_data_fold['text'])
        val_padded_seq = get_padding_sequence(val_data, bilstm_tokenizer, max_len)

        # predict on the fold
        roberta_val_preds = roberta_model.predict(roberta_val_data)
        bilstm_val_preds = bilstm_model.predict(val_padded_seq)

        # one column per base model
        val_preds_meta = np.column_stack((roberta_val_preds, bilstm_val_preds))

        meta_model_inputs.append(val_preds_meta)
        meta_model_targets.append(raw_val_data_fold['storypoint'])

    # Concatenate all the predictions and targets
    meta_model_inputs = np.concatenate(meta_model_inputs)
    meta_model_targets = np.concatenate(meta_model_targets)

    meta_model = get_meta_model()

    # Weights-only checkpoint file. Keras 3 requires such paths to end in
    # `.weights.h5`; earlier Keras versions accept the suffix too and write
    # the weights in HDF5 format.
    checkpoint_filepath = f'{models_path}/meta-model/{dataset_name}/checkpoint.weights.h5'

    # Keep only the weights of the epoch with the lowest validation loss
    model_checkpoint_callback = ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True)

    # Prepare the validation split for the base models
    eval_label = pd.Series(raw_val_data['storypoint']).astype(float)
    roberta_val_data = preprocess_function(raw_val_data, roberta_tokenizer)
    val_padded_seq = get_padding_sequence(pd.Series(raw_val_data['text']), bilstm_tokenizer, max_len)

    # Generate predictions from the base models on the validation split
    roberta_val_preds = roberta_model.predict(roberta_val_data)
    bilstm_val_preds = bilstm_model.predict(val_padded_seq)

    # Stack the predictions together
    val_preds_meta = np.column_stack((roberta_val_preds, bilstm_val_preds))

    # Train the meta-model with early stopping and best-weights checkpointing
    history = meta_model.fit(
        meta_model_inputs,
        meta_model_targets,
        validation_data=(val_preds_meta, eval_label),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=1,
        callbacks=[EarlyStopping(monitor='val_loss', patience=3), model_checkpoint_callback],
    )

    # Load the weights of the best model observed during training
    meta_model.load_weights(checkpoint_filepath)

    # save the meta model
    meta_model.save(f'{models_path}/meta-model/{dataset_name}/{dataset_name}.keras')

    val_loss, val_mae = meta_model.evaluate(val_preds_meta, eval_label)
    print(f"val_loss: {val_loss}, val_mae: {val_mae}")

    if STOP_AFTER_FIRST_DATASET:
        break

### Validation

In [None]:
roberta_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# NOTE(review): this Keras tokenizer is shared by all loop iterations and
# get_max_len() calls fit_on_texts() on it each time, so its vocabulary keeps
# growing from one dataset to the next. Confirm this matches how the BiLSTM
# base models were trained.
bilstm_tokenizer = KerasTokenizer(num_words=vocab, oov_token=0)

results = []
residuals_all = []

for dataset_name in dataset_names:
    print(f"start processing - {dataset_name}...")

    # project label at the same position as the dataset key
    project_name = project_names[dataset_names.index(dataset_name)]

    # load models
    bilstm_model, roberta_model = load_models(dataset_name)
    meta_model = load_meta_model(dataset_name)

    # load dataset
    raw_train_data, raw_val_data, raw_test_data = load_dataset(dataset_name)

    # calculate the max length of the sequences (also fits the Keras tokenizer)
    max_len = get_max_len(pd.concat([pd.Series(raw_train_data['text']), pd.Series(raw_val_data['text'])]), bilstm_tokenizer)
    print(f"max_len: {max_len}")

    # Prepare the validation data for the base models
    eval_label = pd.Series(raw_val_data['storypoint']).astype(float)
    roberta_val_data = preprocess_function(raw_val_data, roberta_tokenizer)
    val_padded_seq = get_padding_sequence(pd.Series(raw_val_data['text']), bilstm_tokenizer, max_len)

    # Generate predictions from the base models on the validation set
    roberta_val_preds = roberta_model.predict(roberta_val_data)
    bilstm_val_preds = bilstm_model.predict(val_padded_seq)

    # calculate MAPE for each model
    roberta_val_preds_flat = roberta_val_preds.flatten()
    mape_roberta = np.mean(np.abs((eval_label - roberta_val_preds_flat) / eval_label))
    mape_bilstm = np.mean(np.abs((eval_label - bilstm_val_preds) / eval_label))

    print(f"MAPE RoBERTa: {mape_roberta}")
    print(f"MAPE BiLSTM: {mape_bilstm}")

    # Stack the predictions together
    val_preds_meta = np.column_stack((roberta_val_preds, bilstm_val_preds))

    stacking_pred_nn = meta_model.predict(val_preds_meta)

    # save the predictions exactly as returned by the meta-model
    np.save(f'{models_path}/meta-model/{dataset_name}/{dataset_name}_val_pred.npy', stacking_pred_nn)

    # Work on aligned 1-D arrays from here on. Subtracting the meta-model
    # output (one column per row) from a flat label vector would broadcast to
    # an N x N matrix of every label/prediction pair and corrupt the
    # statistics below.
    stacking_pred_flat = np.asarray(stacking_pred_nn).reshape(-1)
    val_storypoint = eval_label.to_numpy()

    # Calculate the MAE and MdAE
    mae_nn = mean_absolute_error(val_storypoint, stacking_pred_flat)
    mdae_nn = median_absolute_error(val_storypoint, stacking_pred_flat)
    print(f"NN Staking MdAE: {mdae_nn}")
    print(f"NN Staking MAE: {mae_nn}")
    print(f"Finish process {dataset_name}")

    # Per-example residuals of the stacked prediction
    residuals = val_storypoint - stacking_pred_flat

    # Standard Deviation of Residuals
    std_dev = np.std(residuals)

    # Calculate the percentage error for each prediction
    percentage_errors = residuals / val_storypoint

    # Calculate the mean percentage error
    mpe = np.mean(percentage_errors)

    # calculate the Mean Absolute Percentage Error
    mape = np.mean(np.abs(percentage_errors))

    result = {
        'dataset': dataset_name,
        'project_name': project_name,
        'mae': mae_nn,
        'mdae': mdae_nn,
        'std_dev': std_dev,
        'mpe': mpe,
        'mape_stack': mape,
        'mape_roberta': mape_roberta,
        'mape_bilstm': mape_bilstm
    }

    results.append(result)

results_df = pd.DataFrame(results)
results_df.to_csv(f'{models_path}/meta-model/results.csv', index=False)

print("done")