# SERIEMA: A Framework to Enhance Clustering Stability by Fusing Multimodal Data
 
SERIEMA is a multimodal framework that seamlessly integrates categorical, numerical, and text data to bolster clustering robustness. It represents a novel approach to customer segmentation and paves the way for future exploration of data fusion techniques in marketing and other applications.

### Loading dependencies and variables

In [1]:
import logging
import os
from statistics import mean, stdev
import sys

import pandas as pd
from config.definitions import ROOT_DIR

# Switch the working directory to <ROOT_DIR>/src/model (presumably so the
# project-local modules imported in the next cell resolve -- confirm).
# os.path.join picks the platform separator; the previous hard-coded "\\"
# only produced a valid path on Windows.
os.chdir(os.path.join(ROOT_DIR, 'src', 'model'))

In [2]:

from transformers import (
    AutoTokenizer,
    AutoConfig,
    HfArgumentParser,
    set_seed,
    TrainerCallback,
    EarlyStoppingCallback,
    Trainer,
    EvalPrediction,
    TrainingArguments,
)

from multimodal_exp_args import (
    MultimodalDataTrainingArguments,
    ModelArguments,
    OurTrainingArguments,
    ComputerStabilityArguments,
)

from evaluation import calc_stability_metrics
from data import load_data_from_folder, load_data_into_folds
from multimodal import TabularConfig
from multimodal import AutoModelWithTabular
from multimodal import CustomTrainer
from util import create_dir_if_not_exists, get_args_info_as_str

### Model, Data, Training, and Stability arguments

In [3]:
# Argument parser spanning the four argument groups of this experiment:
# model, data, training and stability.
parser = HfArgumentParser(
    (
        ModelArguments,
        MultimodalDataTrainingArguments,
        OurTrainingArguments,
        ComputerStabilityArguments,
    )
)

# The same pretrained checkpoint name is used for the model and the tokenizer.
# config_name is left unset, so later cells fall back to model_name_or_path.
model_args = ModelArguments(
    model_name_or_path='bert-base-multilingual-uncased',
    tokenizer_name='bert-base-multilingual-uncased',
    config_name=None,
    cache_dir=None,
)

# Data / tabular-model settings. These are later splatted into TabularConfig
# via vars(data_args) and drive the dataset loaders.
data_args = MultimodalDataTrainingArguments(
    # Folder handed to the dataset loaders. Built with os.path.join so it is
    # valid on every OS (the hard-coded "\\" separators only worked on Windows).
    # The trailing '' keeps the trailing separator of the original value, in
    # case the loaders append file names by plain string concatenation
    # (not visible here -- TODO confirm).
    data_path=os.path.join(ROOT_DIR, 'src', 'model', 'notebook', ''),
    # create_folds=False selects the pre-split train/val/test CSV branch below.
    create_folds=False,
    num_folds=5,
    validation_ratio=0.2,
    freeze_transformer_weights=True,
    latent_dim=3,
    bn_enc=False,
    bn_dec=False,
    vae_out_dim=1,
    VAE_architecture=[1000, 1000, 600, 300],
    column_info_path='column_info.json',
    # Column roles: one text column, no categorical columns, 25 numerical columns.
    column_info={
        'text_cols': ['text'],
        'cat_cols': [],
        'num_cols': [
            'review_count', 'useful_user', 'funny_user', 'cool_user', 'fans',
            'average_stars', 'compliment_hot', 'compliment_more',
            'compliment_profile', 'compliment_cute', 'compliment_list',
            'compliment_note', 'compliment_plain', 'compliment_cool',
            'compliment_funny', 'compliment_writer', 'compliment_photos',
            'friend_count', 'elite_count', 'yelp_since_YRMO', 'yelp_since_year',
            'stars', 'useful_review', 'funny_review', 'cool_review',
        ],
        # None -> the tokenizer's own sep_token is used when loading the data.
        'text_col_sep_token': None,
    },
    categorical_encode_type='binary',
    numerical_transformer_method='none',
    mlp_division=4,
    vae_division=2,
    combine_feat_method='gating_on_cat_and_num_feats_then_sum',
    mlp_dropout=0.1,
    vae_dropout=0,
    numerical_bn=True,
    use_simple_classifier=False,
    mlp_act='relu',
    vae_act='lrelu',
    gating_beta=0.2,
    train_csv_name='train.csv',
    val_csv_name='val.csv',
    test_csv_name='test.csv',
    train_samples=20,
    val_samples=20,
    test_samples=20,
)

# Training settings for the run.
training_args = OurTrainingArguments(
    # Output folder for the log file, metrics, predictions and saved model.
    # Built with os.path.join so the path is valid on every OS (the hard-coded
    # "\\" separators only worked on Windows); the trailing '' preserves the
    # trailing separator of the original value.
    output_dir=os.path.join(ROOT_DIR, 'src', 'model', 'notebook', ''),
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    # Evaluate every `eval_steps` optimisation steps.
    evaluation_strategy='steps',
    prediction_loss_only=False,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    per_gpu_train_batch_size=None,
    per_gpu_eval_batch_size=None,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=None,
    eval_delay=0,
    eval_steps=5,
    learning_rate=0.003,
    weight_decay=0.0,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    max_grad_norm=1.0,
    num_train_epochs=5,
    max_steps=-1,
    lr_scheduler_type='linear',
    warmup_ratio=0.0,
    warmup_steps=0,
    log_level='passive',
    log_level_replica='passive',
    log_on_each_node=True,
    # Same portable folder as output_dir.
    logging_dir=os.path.join(ROOT_DIR, 'src', 'model', 'notebook', ''),
    logging_strategy='steps',
    logging_first_step=False,
    logging_steps=5,
    logging_nan_inf_filter=True,
    save_strategy='steps',
    save_steps=3000,
    save_total_limit=None,
    save_on_each_node=False,
    no_cuda=False,
    use_mps_device=False,
    # Seed applied through set_seed() after the datasets are loaded.
    seed=42,
    data_seed=None,
    jit_mode_eval=False,
    use_ipex=False,
    bf16=False,
    fp16=False,
    fp16_opt_level='O1',
    half_precision_backend='auto',
    bf16_full_eval=False,
    fp16_full_eval=False,
    tf32=None,
    # -1 is treated as a main process by the logging setup below.
    local_rank=-1,
    experiment_name='SERIEMA - Notebook',
)

# Settings forwarded to calc_stability_metrics by the training callback.
stability_args = ComputerStabilityArguments(
    clusters=[2],
    n_samples=50,
    k_means_random_state=42,
    randomStateSeed=1,
    # 0 disables the periodic per-step stability check in the callback;
    # stability is then only computed at the end of training.
    compute_stability_steps=0,
)

### Loading training, validation, and test datasets

In [4]:
# The output folder has to exist before the log file is opened inside it.
create_dir_if_not_exists(training_args.output_dir)

# Log records go both to stderr and to train_log.txt (truncated on open).
file_handler = logging.FileHandler(
    filename=os.path.join(training_args.output_dir, "train_log.txt"),
    encoding='utf-8',
    mode="w+",
)
stream_handler = logging.StreamHandler(sys.stderr)

# local_rank -1 or 0 logs at INFO; every other rank only logs warnings and above.
if training_args.local_rank in (-1, 0):
    root_log_level = logging.INFO
else:
    root_log_level = logging.WARN

logging.basicConfig(
    handlers=[stream_handler, file_handler],
    level=root_log_level,
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
)

# Use the explicitly configured tokenizer when one is set; otherwise fall back
# to the tokenizer that ships with the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)

# Keyword arguments shared by both loaders. The separator placed between text
# columns is the configured one, or the tokenizer's sep_token when none is set.
col_info = data_args.column_info
shared_loader_kwargs = dict(
    categorical_cols=col_info["cat_cols"],
    numerical_cols=col_info["num_cols"],
    categorical_encode_type=data_args.categorical_encode_type,
    numerical_transformer_method=data_args.numerical_transformer_method,
    sep_text_token_str=col_info["text_col_sep_token"] or tokenizer.sep_token,
    max_token_length=training_args.max_token_length,
    debug=training_args.debug_dataset,
    debug_dataset_size=training_args.debug_dataset_size,
)

if data_args.create_folds:
    # Let the loader build the folds from the data folder.
    train_datasets, val_datasets, test_datasets = load_data_into_folds(
        data_args.data_path,
        data_args.num_folds,
        data_args.validation_ratio,
        col_info["text_cols"],
        tokenizer,
        **shared_loader_kwargs,
    )
else:
    # Pre-split CSV files: load them and wrap each split in a one-element list
    # so the rest of the notebook can treat both cases as a list of folds.
    train_dataset, val_dataset, test_dataset = load_data_from_folder(
        data_args.train_csv_name,
        data_args.val_csv_name,
        data_args.test_csv_name,
        data_args.train_samples,
        data_args.val_samples,
        data_args.test_samples,
        data_args.data_path,
        col_info["text_cols"],
        tokenizer,
        **shared_loader_kwargs,
    )
    train_datasets = [train_dataset]
    val_datasets = [val_dataset]
    test_datasets = [test_dataset]

train_dataset = train_datasets[0]
# Seeding happens only after the data has been loaded.
set_seed(training_args.seed)

01/24/2024 16:50:21 - INFO - data.data_utils -   0 categorical columns
01/24/2024 16:50:21 - INFO - data.data_utils -   25 numerical columns
01/24/2024 16:50:21 - INFO - data.load_data -   Text columns: ['text']
01/24/2024 16:50:21 - INFO - data.load_data -   Raw text example: Cutest spot we've visited so far. The ambiance is great, very friendly service and the food was wonderful. We had the Eggs Benedict and Spicy Boudin Omelette, both were great choices. The peach and strawberry mimosas were a hit as well. I definitely recommend it!
01/24/2024 16:50:21 - INFO - data.data_utils -   0 categorical columns
01/24/2024 16:50:21 - INFO - data.data_utils -   25 numerical columns
01/24/2024 16:50:21 - INFO - data.load_data -   Text columns: ['text']
01/24/2024 16:50:21 - INFO - data.load_data -   Raw text example: I've been here maybe a dozen times?  It's a pretty gothy-feeling bar, that's for sure!  Very dimly lit maroon-colored with random Russian-esque art haphazardly hung about and small

### Setting up model configuration

In [5]:
total_results = []
# NOTE(review): config/model are rebuilt for every fold, but the cells below
# use `model`, `train_dataset`, `val_dataset` and `test_dataset` as left by the
# LAST iteration -- with a single fold (create_folds=False) that is the only one.
fold_splits = zip(train_datasets, val_datasets, test_datasets)
for i, (train_dataset, val_dataset, test_dataset) in enumerate(fold_splits):
    # An explicit config name wins; otherwise the model checkpoint name is used.
    pretrained_source = model_args.config_name or model_args.model_name_or_path

    config = AutoConfig.from_pretrained(
        pretrained_source,
        cache_dir=model_args.cache_dir,
    )

    # Width of each tabular feature block; 0 when the dataset has none.
    if train_dataset.cat_feats is not None:
        cat_feat_dim = train_dataset.cat_feats.shape[1]
    else:
        cat_feat_dim = 0

    if train_dataset.numerical_feats is not None:
        numerical_feat_dim = train_dataset.numerical_feats.shape[1]
    else:
        numerical_feat_dim = 0

    # Every field of data_args is forwarded to the tabular configuration.
    tabular_config = TabularConfig(
        cat_feat_dim=cat_feat_dim,
        numerical_feat_dim=numerical_feat_dim,
        **vars(data_args),
    )
    config.tabular_config = tabular_config

    model = AutoModelWithTabular.from_pretrained(
        pretrained_source,
        config=config,
        cache_dir=model_args.cache_dir,
    )

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertWithTabular: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertWithTabular were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['bert.tabular_cla

### Custom Callback based on TrainerCallback

In [6]:
class CustomCallback(TrainerCallback):
    """Trainer callback that computes clustering-stability metrics.

    The metrics are computed by ``calc_stability_metrics`` on the trainer's
    predictions for its evaluation dataset, either every
    ``stability_args.compute_stability_steps`` optimisation steps (when that
    value is greater than 0) or once when training ends.

    Args:
        trainer: Trainer whose ``predict`` method and ``eval_dataset`` are used.
        stability_args: Object providing ``compute_stability_steps``,
            ``clusters``, ``k_means_random_state``, ``n_samples`` and
            ``randomStateSeed``.
    """

    def __init__(self, trainer, stability_args):
        super().__init__()
        self.trainer = trainer
        self.stability_args = stability_args

    def _stability_due(self, coming_from_train_end, state):
        """Return True when the stability metrics should be computed now."""
        if coming_from_train_end:
            return True

        interval = self.stability_args.compute_stability_steps
        if interval <= 0:
            # Periodic computation is disabled.
            return False

        if state is None:
            # Fallback for direct callers that pass no state; assumes the
            # trainer exposes `.state` like transformers.Trainer -- TODO confirm.
            state = getattr(self.trainer, "state", None)
        if state is None:
            # The current step is unknown, so the periodic check cannot fire.
            return False

        return state.global_step % interval == 0

    def calc_stability(self, coming_from_train_end = False, state = None):
        """Compute the stability metrics if they are due.

        Args:
            coming_from_train_end: Force the computation regardless of the
                step counter (used at the end of training).
            state: Trainer state supplying ``global_step`` for the periodic
                check. The callback object itself has no ``state`` attribute,
                so the state received by the event hooks must be passed in
                (the previous code read ``self.state`` and raised
                AttributeError as soon as ``compute_stability_steps`` was > 0).

        Returns:
            The value returned by ``calc_stability_metrics``, or ``None`` when
            no computation was due.
        """
        if not self._stability_due(coming_from_train_end, state):
            return None

        val_outputs = self.trainer.predict(self.trainer.eval_dataset).predictions
        stability_out = calc_stability_metrics(
            val_outputs,
            self.stability_args.clusters,
            self.stability_args.k_means_random_state,
            self.stability_args.n_samples,
            self.stability_args.randomStateSeed
        )
        return stability_out

    def on_step_end(self, args, state, control, **kwargs):
        # Deliberately returns None: in transformers' callback handler a
        # non-None return value from an event hook replaces the TrainerControl.
        self.calc_stability(state = state)

    def on_train_end(self, args, state, control, **kwargs):
        # Same as above: the metrics must not be returned from the hook.
        self.calc_stability(coming_from_train_end = True, state = state)

# Stop training early after 3 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # evaluation strategy to adopt during training calling by evaluation_strategy.
    # See bertvaewithtabular/multimodal_transformers/model/custom_trainer.py#L261
    compute_metrics=None,
    callbacks=[early_stopping],
)

# The stability callback needs the trainer instance itself, so it can only be
# registered once the trainer exists.
stability_callback = CustomCallback(trainer, stability_args)
trainer.add_callback(stability_callback)

01/24/2024 16:50:23 - INFO - multimodal_exp_args -   PyTorch: setting up devices


### Training the model

In [7]:
# Pass a model path to train() only when model_name_or_path is a local
# directory; a hub model name yields None.
if os.path.isdir(model_args.model_name_or_path):
    resume_model_path = model_args.model_name_or_path
else:
    resume_model_path = None

trainer.train(model_path=resume_model_path)
trainer.save_model()

***** Running training *****
  Num examples = 20
  Num Epochs = 5
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 173284368


  0%|          | 0/10 [00:00<?, ?it/s]

01/24/2024 16:52:05 - INFO - multimodal.custom_trainer -   ***** Running Evaluation *****
01/24/2024 16:52:05 - INFO - multimodal.custom_trainer -     Num examples = 20
01/24/2024 16:52:05 - INFO - multimodal.custom_trainer -     Batch size = 10


{'loss': 10391.2266, 'learning_rate': 0.0015, 'epoch': 2.5}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 3821.28369140625, 'eval_runtime': 24.6076, 'eval_samples_per_second': 0.813, 'eval_steps_per_second': 0.081, 'epoch': 2.5}


01/24/2024 16:54:02 - INFO - multimodal.custom_trainer -   ***** Running Evaluation *****
01/24/2024 16:54:02 - INFO - multimodal.custom_trainer -     Num examples = 20
01/24/2024 16:54:02 - INFO - multimodal.custom_trainer -     Batch size = 10


{'loss': 3467.8762, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/2 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


01/24/2024 16:54:25 - INFO - multimodal.custom_trainer -   ***** Running Prediction *****
01/24/2024 16:54:25 - INFO - multimodal.custom_trainer -     Num examples = 20
01/24/2024 16:54:25 - INFO - multimodal.custom_trainer -     Batch size = 10


{'eval_loss': 1482.9000244140625, 'eval_runtime': 23.0298, 'eval_samples_per_second': 0.868, 'eval_steps_per_second': 0.087, 'epoch': 5.0}
{'train_runtime': 241.3256, 'train_samples_per_second': 0.414, 'train_steps_per_second': 0.041, 'train_loss': 6929.5513671875, 'epoch': 5.0}


  0%|          | 0/2 [00:00<?, ?it/s]

01/24/2024 16:54:50 - INFO - evaluation -   Computing adjusted_rand_score k = 2
01/24/2024 16:54:51 - INFO - evaluation -   Computing adjusted_mutual_info_score k = 2
01/24/2024 16:54:53 - INFO - evaluation -   Computing [bagclust]
01/24/2024 16:54:54 - INFO - evaluation -   Computing [han]
01/24/2024 16:54:57 - INFO - evaluation -   Computing [OTstab]











Saving model checkpoint to C:\Users\FS-Ma\OneDrive\Documents\projects\SIRIEMA\src\model\notebook\
Configuration saved in C:\Users\FS-Ma\OneDrive\Documents\projects\SIRIEMA\src\model\notebook\config.json
Model weights saved in C:\Users\FS-Ma\OneDrive\Documents\projects\SIRIEMA\src\model\notebook\pytorch_model.bin


### Prediction on validation and test datasets

In [8]:
# Evaluate on the validation split and store the metrics, one "key = value"
# line per metric.
eval_result = trainer.evaluate(eval_dataset=val_dataset)
output_eval_file = os.path.join(
    training_args.output_dir, "eval_metric_results.txt"
)
if trainer.is_world_process_zero():
    with open(output_eval_file, "w") as writer:
        for key, value in eval_result.items():
            writer.write(f"{key} = {value}\n")

# Export the model outputs for the validation split.
output_predict_eval = os.path.join(
    training_args.output_dir, "predict_val.csv"
)
predictions = trainer.predict(test_dataset=val_dataset).predictions
# Only the main process writes the file, consistent with the metrics file above
# and the test-set export; previously every process wrote predict_val.csv.
if trainer.is_world_process_zero():
    pd.DataFrame(predictions).to_csv(output_predict_eval, index = False)

01/24/2024 16:54:59 - INFO - multimodal.custom_trainer -   ***** Running Evaluation *****
01/24/2024 16:54:59 - INFO - multimodal.custom_trainer -     Num examples = 20
01/24/2024 16:54:59 - INFO - multimodal.custom_trainer -     Batch size = 10


  0%|          | 0/2 [00:00<?, ?it/s]

01/24/2024 16:55:30 - INFO - multimodal.custom_trainer -   ***** Running Prediction *****
01/24/2024 16:55:30 - INFO - multimodal.custom_trainer -     Num examples = 20
01/24/2024 16:55:30 - INFO - multimodal.custom_trainer -     Batch size = 10


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Export the model outputs for the test split; only the main process writes.
output_test_file = os.path.join(training_args.output_dir, "predict_test.csv")
predictions = trainer.predict(test_dataset=test_dataset).predictions

if trainer.is_world_process_zero():
    test_predictions_frame = pd.DataFrame(predictions)
    test_predictions_frame.to_csv(output_test_file, index=False)

01/24/2024 16:56:04 - INFO - multimodal.custom_trainer -   ***** Running Prediction *****
01/24/2024 16:56:04 - INFO - multimodal.custom_trainer -     Num examples = 20
01/24/2024 16:56:04 - INFO - multimodal.custom_trainer -     Batch size = 10


  0%|          | 0/2 [00:00<?, ?it/s]