### Merging

1. "anna_vs_carla"
2. "breaking_down_hydrogen_peroxide"
3. "carlos_javier_atomic_model"
4. "dry_ice_model"
5. "gas_filled_balloons"
6. "layers_in_test_tube"
7. "model_for_making_water"
8. "namis_careful_experiment"
9. "natural_sugar"
 
- Method 1: based on our GW-SMM method:
 
Merge tasks 1, 2 and 8; merge tasks 4, 6 and 7; merge tasks 3 and 9. Task 5 is kept as an individual (unmerged) model.
 

In [None]:
import torch
import numpy as np
from transformers import BertForSequenceClassification, BertConfig
import os
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import f1_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
import sys

In [None]:
# Notebook-wide default device: CUDA when PyTorch reports it as available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
def trim_tensor(tensor, k):
    """Zero out all but the largest-magnitude fraction ``k`` of a tensor's entries.

    Args:
        tensor: Tensor to trim. It is never modified; a new tensor is returned.
        k: Fraction of entries to keep. ``k >= 1.0`` keeps everything. If
            ``ceil(n * k)`` is below one, exactly one entry is targeted.

    Returns:
        A tensor with the same shape as ``tensor``. Every entry whose absolute
        value is strictly below the ``ceil(n * k)``-th largest absolute value
        is replaced by zero. Entries tied with that threshold are all kept, so
        more than ``ceil(n * k)`` entries can survive.
    """
    n = tensor.numel()
    # Nothing to trim when k >= 1; torch.topk cannot pick an element from an
    # empty tensor, so that case is returned unchanged as well.
    if k >= 1.0 or n == 0:
        return tensor.clone()

    num_to_keep = max(1, int(np.ceil(n * k)))

    magnitudes = tensor.abs()
    # reshape (rather than view) so non-contiguous inputs are accepted.
    topk_values, _ = torch.topk(magnitudes.reshape(-1), num_to_keep)
    threshold = topk_values[-1]  # topk returns values in descending order

    return torch.where(magnitudes < threshold, torch.zeros_like(tensor), tensor)

def ties_merge_two_models(init_sd, sd_list, k=0.2, lam=1.0):
    """Merge several fine-tuned state dicts into a base state dict.

    Despite the name, ``sd_list`` may hold any number of state dicts. For each
    mergeable key the function:

    1. computes each model's delta from the base (``sd[key] - init_sd[key]``),
    2. trims each delta with ``trim_tensor(delta, k)``,
    3. elects a per-element sign from the sign of the summed trimmed deltas,
    4. averages, per element, only the trimmed deltas whose sign equals the
       elected sign (zero where none agree),
    5. adds ``lam`` times that average to the base tensor.

    Args:
        init_sd: Base state dict (key -> tensor).
        sd_list: List of fine-tuned state dicts to merge.
        k: Fraction passed to ``trim_tensor`` for each delta.
        lam: Scale applied to the merged delta before adding it to the base.

    Returns:
        A new dict with the same keys as ``init_sd``. The base tensor is
        reused unchanged (same object, not a copy) for keys containing
        "classifier", keys missing from any dict in ``sd_list``, and
        non-floating-point tensors. If ``sd_list`` is empty every key is
        reused unchanged.
    """
    merged_sd = {}
    for key, init_tensor in init_sd.items():
        keep_base = (
            "classifier" in key  # classifier head always comes from the base model
            or not sd_list  # nothing to merge; torch.stack([]) would raise
            # Integer buffers have no meaningful averaged delta, and the float
            # arithmetic below would silently change their dtype.
            or not torch.is_floating_point(init_tensor)
            or not all(key in sd for sd in sd_list)
        )
        if keep_base:
            merged_sd[key] = init_tensor
            continue

        # Trimmed parameter deltas (tau values), stacked along a new model axis.
        stacked = torch.stack(
            [trim_tensor(sd[key] - init_tensor, k) for sd in sd_list], dim=0
        )
        elected_sign = torch.sign(torch.sum(stacked, dim=0))

        # elected_sign broadcasts across the leading model axis.
        agrees = (torch.sign(stacked) == elected_sign).to(stacked.dtype)
        masked_sum = torch.sum(stacked * agrees, dim=0)
        count = torch.sum(agrees, dim=0)

        # The division may produce nan/inf where count == 0; torch.where
        # discards those positions in favour of zero.
        merged_tau = torch.where(count > 0, masked_sum / count, torch.zeros_like(masked_sum))
        merged_sd[key] = init_tensor + lam * merged_tau

    return merged_sd

def merge_models_ties(ft_model_paths, pretrained_model_name="bert-base-uncased", num_labels=4, k=0.3, lam=1.0, device=None, base_model_path="../Models/best_model_anna_vs_carla.pt"):
    """Load fine-tuned models from disk and merge them onto a base model.

    Args:
        ft_model_paths: Paths to fine-tuned model files readable by
            ``torch.load``. Only the first three are used; a warning is
            printed when more are supplied.
        pretrained_model_name: Not used by the merge; kept so existing callers
            that pass it keep working.
        num_labels: Not used by the merge; kept so existing callers that pass
            it keep working.
        k: Trim fraction forwarded to ``ties_merge_two_models``.
        lam: Delta scale forwarded to ``ties_merge_two_models``.
        device: Device to load and return the models on. Defaults to CUDA when
            available, otherwise CPU.
        base_model_path: Model file used as the reference ("initial") model
            that deltas are measured against and that supplies the classifier
            head.

    Returns:
        The base model object with the merged weights loaded, moved to
        ``device`` and switched to eval mode.

    Raises:
        ValueError: If a fine-tuned model's non-classifier state-dict keys
            differ from the base model's.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if len(ft_model_paths) > 3:
        print("Warning: More than 3 models provided. Only the first 3 will be used for merging.")
        ft_model_paths = ft_model_paths[:3]

    # NOTE: torch.load unpickles whole objects here; only point it at trusted files.
    ft_state_dicts = []
    for path in ft_model_paths:
        print(f"Loading fine-tuned model from {path} ...")
        model_ft = torch.load(path, map_location=device)
        ft_state_dicts.append(model_ft.state_dict())

    # Reference model. map_location lets a checkpoint saved on another device
    # load here as well.
    base_model = torch.load(base_model_path, map_location=device)
    base_model.to(device)
    init_sd = base_model.state_dict()

    # Ensure all models have the same architecture (excluding classifier)
    keys_init = {name for name in init_sd.keys() if "classifier" not in name}
    for i, sd in enumerate(ft_state_dicts):
        keys_model = {name for name in sd.keys() if "classifier" not in name}
        if keys_model != keys_init:
            raise ValueError(f"Model {i+1} has a different architecture than the initial model.")

    # Merge models (excluding classifier)
    merged_sd = ties_merge_two_models(init_sd, ft_state_dicts, k=k, lam=lam)

    # merged_sd has every key of the base model, so loading it back into the
    # base model gives the same result as reloading the checkpoint from disk.
    base_model.load_state_dict(merged_sd, strict=False)
    base_model.to(device)
    base_model.eval()

    print("Model merging completed successfully (Classifier kept from base model)!")
    return base_model


In [15]:
def _read_labeled_csv(csv_path):
    """Read a CSV, rename its first column to "Sentence" and drop a trailing "Group" column."""
    frame = pd.read_csv(csv_path)
    frame = frame.rename(columns={frame.columns[0]: "Sentence"})
    if frame.columns[-1] == "Group":
        frame = frame.drop(columns=["Group"])
    return frame


def finetune_and_evaluate(
    merged_model,  # Pass the model directly instead of the path
    original_model_path,  # Path to the original model
    train_csv_path,
    test_csv_path,
    max_length=100,
    batch_size=32,
    threshold=0.5,
    train_epochs=5,
    learning_rate=2e-5,
    merged_model_path='merged.pt',
    device=None
):
    """Fine-tune a classifier head on a merged encoder, then evaluate it.

    A fresh ``bert-base-uncased`` sequence-classification model is created,
    its non-classifier weights are replaced by those of ``merged_model``, its
    classifier head is replaced by the one stored in ``original_model_path``,
    and only the classifier parameters are trained. The trained model is then
    evaluated on the test CSV.

    Both CSVs are read with the first column as the sentence and the remaining
    columns as labels; a last column named "Group" is dropped.

    Args:
        merged_model: Model whose non-classifier state-dict entries are copied.
        original_model_path: Path to a model file (read with ``torch.load``)
            whose ``classifier`` module supplies the initial head.
        train_csv_path: Path to the training CSV.
        test_csv_path: Path to the test CSV. Rows with a -1 in any label
            column are excluded from evaluation.
        max_length: Maximum token length used for truncation.
        batch_size: Batch size for training and for evaluation.
        threshold: Probability above which a label is predicted positive.
        train_epochs: Number of passes over the training data.
        learning_rate: Learning rate for the optimizer.
        merged_model_path: Currently unused (only referenced by a
            commented-out save call).
        device: Device to run on. Defaults to CUDA when available, else CPU.

    Returns:
        Dict with keys "exact_match_accuracy", "per_label_accuracy",
        "micro_f1", "macro_f1", "classification_report", "pred_labels" and
        "ground_truth".

    Raises:
        ValueError: If no test rows remain after removing rows containing -1.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load training data to determine the number of labels
    print("Loading training data...")
    df_train = _read_labeled_csv(train_csv_path)

    train_sentences = df_train["Sentence"].tolist()
    train_label_cols = list(df_train.columns[1:])  # Label columns
    num_labels = len(train_label_cols)  # Number of classes
    train_labels = df_train[train_label_cols].astype(int).values

    print(f"[Info] Detected {num_labels} classes from training data.")

    # Initialize a new model with the correct number of labels
    print("Initializing a new BertForSequenceClassification model...")
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels, problem_type="multi_label_classification"
    )
    model.to(device)

    # Transfer parameters from merged model (except classifier)
    print("Transferring encoder parameters from merged model...")
    merged_sd = merged_model.state_dict()
    model_sd = model.state_dict()
    for key in model_sd.keys():
        if "classifier" not in key:  # Only replace encoder layers
            model_sd[key] = merged_sd[key]
    model.load_state_dict(model_sd)

    # Transfer the classifier head from the original model. Doing this after
    # the full load above means the head never depends on state_dict tensors
    # sharing storage with the live parameters.
    print("Transferring classification head from original model...")
    original_model = torch.load(original_model_path, map_location=device)
    model.classifier.load_state_dict(original_model.classifier.state_dict())
    model.to(device)

    # Freeze all layers except the classifier head
    for name, param in model.named_parameters():
        param.requires_grad = name.startswith("classifier")

    print("[Info] Model is ready: Encoder from merged model, Classifier from original model. Fine-tuning classifier head only.")

    # Load tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Tokenize training data
    train_encodings = tokenizer.batch_encode_plus(
        train_sentences,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    train_dataset = TensorDataset(
        train_encodings["input_ids"],
        train_encodings["attention_mask"],
        torch.tensor(train_labels, dtype=torch.float),
    )
    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset)
    )

    # Fine-tuning setup: only parameters left trainable above are optimized.
    optimizer = AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=learning_rate
    )
    loss_fct = torch.nn.BCEWithLogitsLoss()

    model.train()
    print("[Info] Fine-tuning classifier head on training dataset.")

    for epoch in range(train_epochs):
        total_loss = 0.0
        for batch in train_dataloader:
            b_input_ids, b_attention_mask, b_labels = [x.to(device) for x in batch]

            outputs = model(b_input_ids, attention_mask=b_attention_mask)
            loss = loss_fct(outputs.logits, b_labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        print(f"[Epoch {epoch+1}/{train_epochs}] Loss: {avg_loss:.4f}")

    # torch.save(model, './Merged_Models/{}'.format(merged_model_path)) # save the merged model if needed
    model.eval()
    print("[Info] Classifier fine-tuning completed.\n")

    # Load and process test data
    print("Loading test data...")
    df_test = _read_labeled_csv(test_csv_path)
    test_label_cols = list(df_test.columns[1:])

    # The model emits one logit per training label column, so flag training
    # labels that the test file does not provide.
    missing_labels = set(train_label_cols) - set(test_label_cols)
    if missing_labels:
        print(f"Warning: Some labels are missing in the test dataset! Missing labels: {missing_labels}")

    # Remove rows with invalid (-1) labels
    df_test = df_test[~df_test[test_label_cols].eq(-1).any(axis=1)].copy()
    if df_test.empty:
        raise ValueError("No test rows left after removing rows with -1 labels.")

    # Sentences and labels are both taken AFTER filtering so that predictions
    # and ground truth stay row-aligned.
    test_sentences = df_test["Sentence"].tolist()
    ground_truth = df_test[test_label_cols].values

    test_encodings = tokenizer.batch_encode_plus(
        test_sentences,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    test_inputs = test_encodings["input_ids"]
    test_masks = test_encodings["attention_mask"]

    # Run evaluation in batches so the whole test set need not fit on the
    # device in a single forward pass.
    prob_chunks = []
    with torch.no_grad():
        for start in range(0, test_inputs.size(0), batch_size):
            stop = start + batch_size
            outputs = model(
                test_inputs[start:stop].to(device),
                attention_mask=test_masks[start:stop].to(device),
            )
            prob_chunks.append(torch.sigmoid(outputs.logits).cpu())
    pred_prob = torch.cat(prob_chunks, dim=0).numpy()

    # Compute binary predictions using a fixed threshold
    pred_labels = (pred_prob > threshold).astype(int)

    # Ensure proper format for evaluation metrics
    true_bools = (ground_truth == 1)
    pred_bools = (pred_labels == 1)

    # Compute Metrics:
    micro_f1 = f1_score(true_bools, pred_bools, average='micro')  # Micro F1-score
    macro_f1 = f1_score(true_bools, pred_bools, average='macro')
    exact_match = np.mean(np.all(pred_bools == true_bools, axis=1))  # Exact match accuracy
    per_label_acc = np.mean(pred_bools == true_bools)  # Per-label accuracy

    clf_report = classification_report(true_bools, pred_bools, target_names=test_label_cols)

    # Print results
    print("[Evaluation] Exact Match Accuracy:", f"{exact_match:.4f}")
    print("[Evaluation] Per-label Accuracy:  ", per_label_acc)
    print("[Evaluation] Micro F1 Score:      ", f"{micro_f1:.4f}")
    print("[Evaluation] Macro F1 Score:      ", f"{macro_f1:.4f}")
    print("[Evaluation] Classification Report:")
    print(clf_report)

    results = {
        "exact_match_accuracy": exact_match,
        "per_label_accuracy": per_label_acc,
        "micro_f1": micro_f1,
        "macro_f1": macro_f1,
        "classification_report": clf_report,
        "pred_labels": pred_labels,
        "ground_truth": ground_truth
    }

    return results


In [17]:
def get_merged_model(task_ids, k=0.3, lam=0.5):
    """Merge the fine-tuned models registered for ``task_ids``.

    Each id is looked up in the notebook-level ``model_paths`` mapping and the
    resulting paths are handed to ``merge_models_ties`` together with the
    notebook-level ``device``.

    Args:
        task_ids: Iterable of keys into ``model_paths``.
        k: Trim fraction forwarded to ``merge_models_ties``.
        lam: Delta scale forwarded to ``merge_models_ties``.

    Returns:
        Whatever ``merge_models_ties`` returns for the selected models.
    """
    selected_paths = []
    for task_id in task_ids:
        selected_paths.append(model_paths[task_id])

    merge_kwargs = {
        "ft_model_paths": selected_paths,
        "pretrained_model_name": "bert-base-uncased",
        "num_labels": 4,
        "k": k,
        "lam": lam,
        "device": device,
    }
    return merge_models_ties(**merge_kwargs)

def load_single_model(task_id):
    """Return the object ``torch.load`` reads from ``model_paths[task_id]``, mapped onto ``device``."""
    checkpoint_path = model_paths[task_id]
    return torch.load(checkpoint_path, map_location=device)


In [10]:
# 9 models
# Task slugs in task-id order (ids start at 1). Two slugs contain a space;
# they are kept exactly as written because the paths are built from them.
_TASK_SLUGS = (
    "anna_vs_carla",
    "breaking_down_hydrogen_peroxide",
    "carlos_javier_ atomic_model",
    "dry_ice_model",
    "gas_filled_balloons",
    "layers_in_test_tube",
    "model_for_making water",
    "namis_careful_experiment",
    "natural_sugar",
)

# task id -> path of the fine-tuned model file
model_paths = {
    task_id: f"../Models/best_model_{slug}.pt"
    for task_id, slug in enumerate(_TASK_SLUGS, start=1)
}

# train paths: task id -> training CSV
train_paths = {
    task_id: f"../PASTA_data/new_processed_data/{slug}_train.csv"
    for task_id, slug in enumerate(_TASK_SLUGS, start=1)
}

# test paths: task id -> test CSV
test_paths = {
    task_id: f"../PASTA_data/new_processed_data/{slug}_test.csv"
    for task_id, slug in enumerate(_TASK_SLUGS, start=1)
}

In [18]:
print("========== our method ==========")
m1_128 = get_merged_model([1, 2, 8])
m4_67 = get_merged_model([4, 6, 7])
m3_9 = get_merged_model([3, 9])

# output file name
output_log_file = "method1_gw_results.txt"

# Each merged model is evaluated on every task that went into it.
merged_groups = [
    ([1, 2, 8], m1_128),
    ([4, 6, 7], m4_67),
    ([3, 9], m3_9),
]

# Everything printed while stdout is redirected goes to the log file.
with open(output_log_file, "w") as f:
    original_stdout = sys.stdout
    sys.stdout = f
    try:
        print("========== Method 1: GW-SMM method ==========")

        for group_task_ids, group_model in merged_groups:
            group_label = ",".join(str(task_id) for task_id in group_task_ids)
            for task_id in group_task_ids:
                print(f"--- Testing task {task_id} with merged({group_label}) ---")
                finetune_and_evaluate(
                    merged_model=group_model,
                    original_model_path=model_paths[task_id],
                    test_csv_path=test_paths[task_id],
                    train_csv_path=train_paths[task_id],
                    train_epochs=10,
                    device=device
                )
    finally:
        # Restore stdout even if a run fails; otherwise later cells would
        # print into a closed file.
        sys.stdout = original_stdout

print(f"results written  in: {output_log_file}")
    

Loading fine-tuned model from ../Models/best_model_anna_vs_carla.pt ...
Loading fine-tuned model from ../Models/best_model_breaking_down_hydrogen_peroxide.pt ...
Loading fine-tuned model from ../Models/best_model_namis_careful_experiment.pt ...




Model merging completed successfully (Classifier kept from base model)!
Loading fine-tuned model from ../Models/best_model_dry_ice_model.pt ...
Loading fine-tuned model from ../Models/best_model_layers_in_test_tube.pt ...
Loading fine-tuned model from ../Models/best_model_model_for_making water.pt ...
Model merging completed successfully (Classifier kept from base model)!
Loading fine-tuned model from ../Models/best_model_carlos_javier_ atomic_model.pt ...
Loading fine-tuned model from ../Models/best_model_natural_sugar.pt ...
Model merging completed successfully (Classifier kept from base model)!


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

results written  in: method1_gw_results.txt


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
