In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Install the training stack. No versions are pinned here, so every fresh
# runtime resolves whatever release is current at install time.
!pip install torch pandas tqdm scikit-learn
# NOTE(review): torch_geometric is not imported by any cell visible in this notebook — confirm it is still needed.
!pip install torch_geometric
!pip install accelerate
# -U upgrades bitsandbytes if a copy is already present in the runtime.
!pip install -U bitsandbytes
!pip install peft

Mounted at /content/drive
Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


In [None]:
# `datasets` supplies the `Dataset` class imported in the following cell.
!pip install datasets



In [None]:
import random
import os
import sqlite3
import json
import warnings

import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    TrainerCallback,
)

from datasets import Dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from IPython.display import display

# ---- environment report: library versions and accelerator capabilities ----
import torch
import transformers
import accelerate
import peft
import bitsandbytes

# One "<name> <version>" line per library, in the same order as the imports.
print("\n".join(
    f"{label} {module.__version__}"
    for label, module in (
        ("torch", torch),
        ("transformers", transformers),
        ("accelerate", accelerate),
        ("peft", peft),
        ("bitsandbytes", bitsandbytes),
    )
))

# bf16 support is only queried when a CUDA device is actually present.
cuda_present = torch.cuda.is_available()
print("GPU:", torch.cuda.get_device_name(0) if cuda_present else "CPU")
print("bf16 supported:", cuda_present and torch.cuda.is_bf16_supported())



torch 2.9.0+cu126
transformers 5.0.0
accelerate 1.12.0
peft 0.18.1
bitsandbytes 0.49.1
GPU: NVIDIA L4
bf16 supported: True


In [None]:
# -------------------------------
# Environment Variables & Device
# -------------------------------
# NOTE(review): torch has already been imported and queried for CUDA in the
# import cell, so CUDA_VISIBLE_DEVICES may be set too late to matter — confirm.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "WANDB_DISABLED": "true",
})

# Silence every UserWarning for the rest of the session.
warnings.simplefilter("ignore", category=UserWarning)

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# -------------------------------
# Paths
# -------------------------------
BASE_DATA_PATH = '/data'        # prefix of the per-dataset SQLite files
REPORT_SAVE_PATH = '/contentC'  # where report / confusion-matrix / prediction CSVs are written
TEMP_OUTPUT_DIR = "temp"        # Trainer output directory (checkpoints)

# -------------------------------
# Dataset Configurations
# -------------------------------
# Maps "<BASE_DATA_PATH>/<dataset>" to {input mode: max token length}.
DATASETS_CONFIG = {
    f'{BASE_DATA_PATH}/{dataset_name}': {'text_emo': 500, 'text_only': 500}
    for dataset_name in ('fibvid', 'ts')
}

# -------------------------------
# Training Hyperparameters
# -------------------------------
# Keyword arguments for transformers.TrainingArguments. This dict is the single
# source of truth: the precision flags are rewritten further down in this cell,
# and the training loop copies the dict to build per-seed arguments.
TRAINING_ARGS_CONFIG = {
    "output_dir": TEMP_OUTPUT_DIR,

    # Optimisation
    "learning_rate": 1e-5,
    "weight_decay": 0.2,
    "lr_scheduler_type": "linear",
    # The installed transformers release reports warmup_ratio as deprecated in
    # favour of warmup_steps (see the warning in this cell's output).
    "warmup_ratio": 0.01,
    "max_grad_norm": 0.5,
    "label_smoothing_factor": 0.1,

    "gradient_accumulation_steps": 1,
    "num_train_epochs": 300,          # upper bound only; early stopping is expected to end runs sooner

    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,

    # Evaluate and checkpoint on the same step schedule.
    "eval_strategy": "steps",
    "eval_steps": 200,
    "save_strategy": "steps",
    "save_steps": 200,
    "save_total_limit": 2,

    "logging_strategy": "steps",
    "logging_steps": 50,

    "metric_for_best_model": "eval_loss",
    "load_best_model_at_end": True,
    "report_to": "none",

    # Placeholder only: the precision block later in this cell pops both
    # "fp16" and "bf16" and re-selects one from the detected hardware.
    "fp16": True,
    # "bf16": True,
}

# TRAINING_ARGS is intentionally not instantiated here. It is built once, later
# in this cell, after the precision flags have been finalised; building it here
# as well produced a throw-away object and a duplicated deprecation warning.

# -------------------------------
# Model & Tokenizer Hyperparameters
# -------------------------------
MODEL_NAME = "bert-large-cased"                  # Hugging Face checkpoint to fine-tune
BITS_AND_BYTES_CONFIG = dict(load_in_8bit=True)  # baseline quantization settings
MAX_LEN_DEFAULT = 500                            # default token length for model inputs

# Mirror two training settings as short names; they are embedded in output file names.
LR, EP = (TRAINING_ARGS_CONFIG[key] for key in ('learning_rate', 'num_train_epochs'))
print(LR)

# -------------------------------
# LoRA (Low-Rank Adaptation) Configuration
# -------------------------------
# Keyword arguments for peft.LoraConfig, set up for BERT sequence classification.
LORA_CONFIG = dict(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],  # module names as used by BERT's attention layers
    lora_dropout=0.2,
    bias="none",
    task_type="SEQ_CLS",                # sequence classification
)

# -------------------------------
# Early Stopping Configuration
# -------------------------------
# Evaluations without a new best eval_loss that are tolerated before training stops (was 20).
EARLY_STOPPING_PATIENCE = 15

# -------------------------------
# Additional Note:
# -------------------------------
# When instantiating your model and tokenizer, consider using:
#   tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
#   model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=<your_num_labels>)
# if you're performing a classification task.


import torch
from transformers import TrainingArguments

# ---- pick the mixed-precision mode from the hardware (at most ONE flag ends up set) ----
USE_BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Start from a clean slate so a stale flag in the dict can never leave both enabled.
TRAINING_ARGS_CONFIG.pop("fp16", None)
TRAINING_ARGS_CONFIG.pop("bf16", None)
if USE_BF16:
    TRAINING_ARGS_CONFIG["bf16"] = True
elif torch.cuda.is_available():
    TRAINING_ARGS_CONFIG["fp16"] = True
# With no CUDA device neither flag is set: fp16 mixed precision is a GPU
# feature, and forcing it on a CPU-only runtime is expected to fail once the
# Trainer is set up.

# ---- bitsandbytes: keep classifier OUT of int8 to avoid .CB crash ----
BITS_AND_BYTES_CONFIG = {
    "load_in_8bit": True,
    "llm_int8_skip_modules": ["classifier"],  # critical: the classification head stays un-quantized
    # NOTE(review): BitsAndBytesConfig documents `bnb_4bit_compute_dtype` only;
    # this 8-bit key looks like it is accepted but unused — confirm before relying on it.
    "bnb_8bit_compute_dtype": torch.bfloat16 if USE_BF16 else torch.float16,
}

# The only place TRAINING_ARGS needs to be built: precision flags are final here.
TRAINING_ARGS = TrainingArguments(**TRAINING_ARGS_CONFIG)

print("Using bf16:", TRAINING_ARGS.bf16, "| Using fp16:", TRAINING_ARGS.fp16)
print("BITS_AND_BYTES_CONFIG:", BITS_AND_BYTES_CONFIG)


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


1e-05
Using bf16: True | Using fp16: False
BITS_AND_BYTES_CONFIG: {'load_in_8bit': True, 'llm_int8_skip_modules': ['classifier'], 'bnb_8bit_compute_dtype': torch.bfloat16}


In [None]:
def finetune_bert(
    data, input_column, label_column, dataset_identifier,
    max_len=MAX_LEN_DEFAULT,
    class_weights=None,
    training_args=None,
    seed=42,
    ):
    """
    Fine-tune an 8-bit, LoRA-adapted BERT classifier (real news vs fake news) and score it on the test split.

    Args:
        data: DataFrame holding every split. Must contain a 'SPLIT' column with
            the values 'train', 'val' and 'test', plus `input_column` and
            `label_column`.
        input_column: Name of the text column passed to the tokenizer.
        label_column: Name of the label column (integers, 0 = real, 1 = fake).
        dataset_identifier: Accepted for caller compatibility; not used here.
        max_len: Token length every example is padded/truncated to.
        class_weights: Accepted for caller compatibility; not used here. Class
            imbalance is handled by upsampling the training split instead.
        training_args: TrainingArguments for this run. When None, the
            notebook-level TRAINING_ARGS is used so that evaluation, checkpointing
            and early stopping still follow the configured schedule.
        seed: Random state for the training-split upsampling.

    Returns:
        model: The fine-tuned (PEFT-wrapped) model.
        report_df: DataFrame with the classification report.
        cm: 2x2 confusion matrix as a NumPy array (rows = actual 0/1, columns = predicted 0/1).
        test_results: DataFrame with test text, actual label, and predicted label.
    """
    if training_args is None:
        # Passing None on to Trainer would silently swap in library defaults.
        training_args = TRAINING_ARGS

    # Load model and tokenizer with 8-bit quantization
    quantization_config = BitsAndBytesConfig(**BITS_AND_BYTES_CONFIG)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2, quantization_config=quantization_config
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Prepare the quantized base model for training, then attach the LoRA adapters.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, LoraConfig(**LORA_CONFIG))

    # Split the data into training, validation, and test sets
    train_data = data[data['SPLIT'] == 'train']
    val_data = data[data['SPLIT'] == 'val']
    test_data = data[data['SPLIT'] == 'test']

    # Balance the training split only; validation and test keep their natural class mix.
    train_data = upsample_minority_class(train_data, label_col=label_column, random_state=seed)
    display(train_data[label_column].value_counts())

    # Create tokenized datasets for training, validation, and testing
    train_dataset = create_dataset(tokenizer, train_data, input_column, label_column, max_len)
    val_dataset = create_dataset(tokenizer, val_data, input_column, label_column, max_len)
    test_dataset = create_dataset(tokenizer, test_data, input_column, label_column, max_len)

    def compute_metrics(eval_pred):
        """Report plain accuracy for each evaluation pass."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)
        return {"accuracy": np.mean(preds == labels)}

    # Initialize Trainer with early stopping callback
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the test set
    logits = trainer.predict(test_dataset).predictions
    preds = np.argmax(logits, axis=1).tolist()

    # Score against the labels stored in the test DataFrame.
    actual_labels = test_data[label_column].tolist()
    report = classification_report(actual_labels, preds, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # Pin the label set so the matrix is always 2x2, even when a class is absent
    # from both the test labels and the predictions (callers index it as 2x2).
    cm = confusion_matrix(actual_labels, preds, labels=[0, 1])

    # Test examples side by side with their actual and predicted labels.
    test_results = test_data.copy().reset_index(drop=True)
    test_results['predicted'] = preds
    test_results = test_results[[input_column, label_column, 'predicted']].rename(
        columns={input_column: 'text', label_column: 'actual'}
    )

    return model, report_df, cm, test_results


# ---------- Data Upsampling Function ----------
def upsample_minority_class(df, label_col='label', random_state=42):
    """
    Balance the classes by topping up every under-represented class.

    Each class is brought to the size of the largest class. All original rows
    are kept; a class that is short of the target only receives the missing
    number of rows, drawn from that class with replacement. The largest class
    is returned untouched (resampling it with replacement would discard some
    of its rows and duplicate others).

    Args:
        df: Input DataFrame.
        label_col: Name of the column holding the class label.
        random_state: Seed for the with-replacement draw.

    Returns:
        A new DataFrame in which every class has the same number of rows.
        Classes appear in order of first occurrence in `df`; the original
        index values are preserved (duplicated for the added rows). An empty
        input yields an empty copy.
    """
    if df.empty:
        # Nothing to balance; pd.concat([]) would raise on an empty list.
        return df.copy()

    target_count = df[label_col].value_counts().max()
    balanced_parts = []
    for label in df[label_col].unique():
        class_rows = df[df[label_col] == label]
        deficit = target_count - len(class_rows)
        if deficit > 0:
            extra_rows = class_rows.sample(n=deficit, replace=True, random_state=random_state)
            class_rows = pd.concat([class_rows, extra_rows])
        balanced_parts.append(class_rows)
    return pd.concat(balanced_parts)


# ---------- Early Stopping Callback ----------
class EarlyStoppingCallback(TrainerCallback):
    """
    Ask the Trainer to stop once `eval_loss` has gone a set number of
    consecutive evaluations without reaching a new minimum.
    """

    def __init__(self, early_stopping_patience=EARLY_STOPPING_PATIENCE):
        self.early_stopping_patience = early_stopping_patience
        self.best_metric = None    # lowest eval_loss observed so far
        self.patience_counter = 0  # evaluations since the last new minimum

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Update the patience bookkeeping after an evaluation and flag a stop when it runs out."""
        current_loss = metrics.get("eval_loss")
        if current_loss is not None:
            is_new_best = self.best_metric is None or current_loss < self.best_metric
            if is_new_best:
                self.best_metric, self.patience_counter = current_loss, 0
            else:
                self.patience_counter += 1

            # The flag is only ever raised here, never cleared.
            if self.early_stopping_patience <= self.patience_counter:
                control.should_training_stop = True


# ---------- Function to Print Trainable Parameters ----------
def print_trainable_parameters(model):
    """Print the number of trainable parameters, the total parameter count, and the trainable share in percent."""
    total_count = 0
    trainable_count = 0
    for param in model.parameters():
        size = param.numel()
        total_count += size
        if param.requires_grad:
            trainable_count += size

    trainable_share = 100 * trainable_count / total_count
    print(f"Trainable params: {trainable_count} || All params: {total_count} || Trainable%: {trainable_share:.2f}%")


# ---------- Dataset Creation Function ----------
def create_dataset(tokenizer, data, input_column, label_column, max_len):
    """
    Convert a DataFrame into a tokenized `Dataset` in torch format.

    Every text in `input_column` is tokenized as-is (no prefix), truncated and
    padded to exactly `max_len` tokens; the values of `label_column` are copied
    unchanged into a "labels" field.
    """
    def encode_batch(batch):
        encoded = tokenizer(
            batch[input_column],
            truncation=True,
            padding="max_length",
            max_length=max_len,
        )
        # Labels are attached as-is (assumed to be integers: 0 for real, 1 for fake).
        encoded["labels"] = batch[label_column]
        return encoded

    encoded_dataset = Dataset.from_pandas(data).map(encode_batch, batched=True)
    encoded_dataset.set_format("torch")
    return encoded_dataset


In [None]:
# cfg = TRAINING_ARGS.to_dict()
# cfg["output_dir"] = run_output_dir
# cfg["seed"] = seed
# cfg["data_seed"] = seed
# cfg["run_name"] = f"{ds}_{input_mode}_seed{seed}"

# training_args_seed = TrainingArguments(**cfg)

# ===============================
# Main Training Loop (Extended with Confusion Matrix and Test Results)
# ===============================

import sqlite3
import pandas as pd

SEEDS = [41,67,13,61,89,47]

import os, random
import numpy as np
import torch
from transformers import set_seed as hf_set_seed

def set_all_seeds(seed: int):
    """
    Seed every random number source used in this notebook with `seed`:
    Python's `random`, NumPy, PyTorch (CPU and all CUDA devices) and the
    transformers helper, then switch cuDNN to deterministic mode.
    """
    # NOTE(review): this is set after the interpreter has started, so it
    # presumably only reaches child processes — confirm it is needed.
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        hf_set_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # optional: more reproducible, sometimes slower
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# ===============================
# Main Training Loop (one run per seed in SEEDS)
# ===============================

import sqlite3
import pandas as pd
from transformers import TrainingArguments  # ensure imported

import gc

# Results are only written after a complete training run, so make sure the
# target directory exists up front instead of failing at the first to_csv.
os.makedirs(REPORT_SAVE_PATH, exist_ok=True)

# Run every dataset x input mode x seed combination and write three CSVs per
# run: classification report, confusion matrix, and per-example predictions.
model = None
for dataset_path, modes_info in DATASETS_CONFIG.items():
    db_path = f'{dataset_path}_data.db'
    con = sqlite3.connect(db_path)
    print(f"Using database: {db_path}")

    ds = dataset_path.split('/')[-1]
    print(f"Dataset identifier: {ds}")

    try:
        for input_mode, mode_max_len in modes_info.items():
            # Read ONCE per mode (so all seeds start from same data)
            data = pd.read_sql_query('SELECT * FROM data', con)

            class_weights = None

            # 'text_emo' prepends the emotion/stance description to the text.
            if input_mode == 'text_emo':
                data['input'] = data.target_emotion_stance + '[SEP]' + data.text
            else:
                data['input'] = data.text

            # Only rows flagged as root are used.
            data = data[data.is_root == 1]
            display(data.head())

            for seed in SEEDS:
                print(f"\n====================\nDataset={ds} Mode={input_mode} Seed={seed}\n====================")
                set_all_seeds(seed)

                # Release the previous run's model before the next one is loaded;
                # otherwise both stay alive until finetune_bert returns.
                model = None
                gc.collect()
                torch.cuda.empty_cache()

                # Fresh TrainingArguments per seed with a unique output_dir.
                run_output_dir = f"{TEMP_OUTPUT_DIR}/{ds}/{input_mode}/seed_{seed}"

                cfg = dict(TRAINING_ARGS_CONFIG)          # copy
                cfg["output_dir"] = run_output_dir        # override safely
                cfg["seed"] = seed
                cfg["data_seed"] = seed
                cfg["run_name"] = f"{ds}_{input_mode}_seed{seed}"

                training_args_seed = TrainingArguments(**cfg)

                model, report, cm, test_results = finetune_bert(
                    data,
                    input_column='input',
                    label_column='label',
                    dataset_identifier=dataset_path,
                    max_len=mode_max_len,
                    class_weights=class_weights,
                    training_args=training_args_seed,
                    seed=seed,
                )

                print(report)

                # Shared file-name stem; the seed keeps runs from overwriting each other.
                file_stem = f"{REPORT_SAVE_PATH}/{input_mode}_{ds}_seed{seed}_{LR}_es{EARLY_STOPPING_PATIENCE}_ep{EP}_bert_upsample"

                report_filename = f"{file_stem}_report.csv"
                report.to_csv(report_filename)
                print(f"Saved classification report to {report_filename}")

                cm_df = pd.DataFrame(cm, index=['Actual_0', 'Actual_1'], columns=['Pred_0', 'Pred_1'])
                cm_filename = f"{file_stem}_cm.csv"
                cm_df.to_csv(cm_filename)
                print(f"Saved confusion matrix to {cm_filename}")

                test_results_filename = f"{file_stem}_test_results.csv"
                test_results.to_csv(test_results_filename, index=False)
                print(f"Saved test results (text, actual, predicted) to {test_results_filename}")
    finally:
        # Close the connection even when a run fails part-way through.
        con.close()


Using database: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts_data.db
Dataset identifier: ts


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,...,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h,input
1,train,378,1,378,0,378,Says Mike Huckabee appeared in diabetes infome...,neutral emotion & AGAINST stance towards Mike ...,2008-05-05 15:20:48,2008-05-05 15:20:48,...,1,1,1,1,1,1,1,1,1,neutral emotion & AGAINST stance towards Mike ...
3,train,88,1,88,0,88,"""The law says that mental health must be treat...",neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,...,1,1,1,1,1,1,1,1,1,neutral emotion & NEUTRAL stance towards [SEP]...
6,train,34,1,34,0,34,"""Higher education is one of America's stronges...",neutral emotion & FAVOR stance towards America,2008-10-29 07:41:43,2008-10-29 07:41:43,...,1,1,1,1,1,1,1,1,1,neutral emotion & FAVOR stance towards America...
8,train,78,1,78,0,78,"Americans ""say that what they want is a choice...",neutral emotion & NEUTRAL stance towards Medicare,2008-10-29 07:48:44,2008-10-29 07:48:44,...,1,1,1,1,1,1,1,1,1,neutral emotion & NEUTRAL stance towards Medic...
10,train,471,1,471,0,471,"""Supreme Court 15 times over the last 120 year...",neutral emotion & NEUTRAL stance towards Supre...,2008-11-15 06:48:14,2008-11-15 06:48:14,...,1,1,1,1,1,1,1,1,1,neutral emotion & NEUTRAL stance towards Supre...


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



Dataset=ts Mode=text_emo Seed=47




config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-large-cased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,412
1,412


Map:   0%|          | 0/824 [00:00<?, ? examples/s]

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy
200,0.70316,0.666132,0.679389
400,0.675724,0.62317,0.717557
600,0.463145,0.464301,0.816794
800,0.370997,0.41589,0.877863
1000,0.332692,0.429666,0.862595


In [None]:
# prompt: disconnect runtime

# Give up the Colab runtime once the cells above have finished.
# NOTE(review): anything kept only on the runtime's local disk (e.g. the
# relative "temp" checkpoint directory) is presumably lost at this point — confirm
# that all results were written to a persistent location first.
from google.colab import runtime
runtime.unassign()
