In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score

from federate import federated_train
from utils import compute_metrics


import torch

from datasets import Dataset
from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments
print(torch.__version__)
print(torch.cuda.is_available())


  from .autonotebook import tqdm as notebook_tqdm


2.6.0+cu124
True


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
# MODEL_PATH = "mBERT" # I have donloaded distilled mBERT (hugggingface not available in China)
MODEL_PATH = "distilbert-base-multilingual-cased"
DATA_DIR = "data"


In [4]:
def load_and_label(filepath, portion=0.1):
    df = pd.read_csv(filepath).sample(frac=portion, random_state=42)
    # Map stars to 0/1/2
    def map_sentiment(stars):
        if stars <= 2:
            return 0  # negative
        elif stars == 3:
            return 1  # neutral
        else:
            return 2  # positive
    df["label"] = df["stars"].apply(map_sentiment)
    return Dataset.from_pandas(df[["review_body", "label"]])

train_ds = load_and_label(os.path.join(DATA_DIR, "train.csv"))
val_ds = load_and_label(os.path.join(DATA_DIR, "validation.csv"))
test_ds = load_and_label(os.path.join(DATA_DIR, "test.csv"))


In [8]:

# ==== Tokenizer ====
print("Loading tokenizer from local path...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def tokenize(example):
    return tokenizer(example["review_body"], truncation=True, padding="max_length",max_length=512)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

columns_to_keep = ["input_ids", "attention_mask", "label"]
train_ds.set_format(type="torch", columns=columns_to_keep)
val_ds.set_format(type="torch", columns=columns_to_keep)
test_ds.set_format(type="torch", columns=columns_to_keep)


Loading tokenizer from local path...


Map: 100%|██████████| 120000/120000 [00:22<00:00, 5325.11 examples/s]
Map: 100%|██████████| 3000/3000 [00:00<00:00, 5670.11 examples/s]
Map: 100%|██████████| 3000/3000 [00:00<00:00, 5167.63 examples/s]


In [9]:

NUM_LABELS = 3

# ==== Load Model ====
print("Loading model from local path...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=NUM_LABELS)

# ==== Metrics ====


BATCH_SIZE = 32
EPOCHS = 3




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model from local path...


AttributeError: 'dict' object has no attribute 'column_names'

In [10]:
n = len(train_ds)
subset_size = n // 4

client_datasets = [
    train_ds.select(range(0, subset_size)),
    train_ds.select(range(subset_size, 2 * subset_size)),
    train_ds.select(range(2 * subset_size, 3 * subset_size)),
    train_ds.select(range(3 * subset_size, 4 * subset_size)),
]



federated_train(
    base_model=model,
    client_datasets=client_datasets,
    val_ds=val_ds,
    test_ds=test_ds,
    client_weights=[0.25,0.25,0.25,0.25],
    local_epochs=2,
    global_rounds=3,
    batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    device=device
)


--- Global Round 1/3 ---
 Evaluating global model before client updates...
{'eval_loss': 1.0878561735153198, 'eval_model_preparation_time': 0.0009, 'eval_accuracy': 0.4023333333333333, 'eval_f1': 0.2577033526943883, 'eval_runtime': 56.7125, 'eval_samples_per_second': 52.898, 'eval_steps_per_second': 1.657}
 Global Evaluation (Round 1): {'eval_loss': 1.0878561735153198, 'eval_model_preparation_time': 0.0009, 'eval_accuracy': 0.4023333333333333, 'eval_f1': 0.2577033526943883, 'eval_runtime': 56.7125, 'eval_samples_per_second': 52.898, 'eval_steps_per_second': 1.657}
 Training on Client 1/4...


OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 7.91 GiB of which 62.94 MiB is free. Including non-PyTorch memory, this process has 7.19 GiB memory in use. Of the allocated memory 6.96 GiB is allocated by PyTorch, and 110.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)