# Training

## Imports and settings

In [None]:
import csv
import functools
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import wandb
from datasets import Dataset, DatasetDict
# `Dataset` is re-bound by the torch.utils.data import further down, so the
# Hugging Face class is also exposed under an unambiguous alias.
from datasets import Dataset as HFDataset
from huggingface_hub import login
from peft import PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model
from sklearn.metrics import f1_score, accuracy_score
from skmultilearn.model_selection import iterative_train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer

Чтобы логировать результаты модели в wandb, необходимо передать свой API-ключ. Указывать его прямо в коде — не лучшая практика: можно подтягивать ключ из конфига, но на Kaggle было немного лень этим заморачиваться.

Доступ к модели Mistral выдается после принятия лицензионного соглашения, поэтому также понадобится ключ (токен) от HuggingFace.

In [None]:
# Fix the seeds of every RNG the notebook relies on: Python's `random` (row
# shuffling), torch (initialisation of newly created weights) and NumPy.
# np.random.seed is called last because it returns None, so the cell prints
# nothing (torch.manual_seed returns a Generator object).
SEED = 1337
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. Export WANDB_API_KEY and HF_TOKEN before running this cell; a
# missing variable raises KeyError naming the variable.
wandb.login(key=os.environ['WANDB_API_KEY'])
login(token=os.environ['HF_TOKEN'])

Для удобства задаем путь к папке с данными. Оставил его таким, каким он был у меня в Kaggle.

In [None]:
# Folder with the competition CSV files. Defaults to the Kaggle input location
# and can be overridden through the DATA_PATH environment variable when the
# notebook is run elsewhere.
DATA_PATH = Path(os.environ.get('DATA_PATH', '/kaggle/input/dls-nlp-workshop'))

## Data Preprocessing

In [None]:
# Read the training CSV; the first row is the header and is removed from the data rows.
with open(DATA_PATH / 'train.csv.csv', newline='', encoding='utf-8') as csvfile:
    data = list(csv.reader(csvfile, delimiter=','))
    header_row = data.pop(0)

random.shuffle(data)

# Column 0 is parsed as the integer row id, columns 3 and 4 are merged into one
# prompt-like text, and every column from 5 onwards is parsed as an integer label.
idx, text, labels = list(zip(*[(int(row[0]), f'tags: {row[3].strip()}\n\nReview: {row[4].strip()}', row[5:]) for row in data]))
labels = np.array(labels, dtype=int)

# Class weights may, in theory, help with class imbalance:
# weight[c] = 1 - (sum of label column c) / (sum of all labels).
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# Split row positions rather than the texts themselves, then look the texts up by position.
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:,np.newaxis], labels, test_size = 0.1)
x_train = [text[i] for i in train_idx.flatten()]
x_val = [text[i] for i in val_idx.flatten()]

# HFDataset is the Hugging Face `datasets.Dataset`; the bare name `Dataset`
# refers to torch.utils.data.Dataset in this notebook and has no `from_dict`.
ds = DatasetDict({
    'train': HFDataset.from_dict({'text': x_train, 'labels': y_train}),
    'val': HFDataset.from_dict({'text': x_val, 'labels': y_val})
})

text[0]

In [None]:
def tokenize_examples(examples, tokenizer):
    """Tokenize the 'text' column of a batch and carry its 'labels' along.

    Args:
        examples: mapping with a 'text' entry and a 'labels' entry.
        tokenizer: callable applied to ``examples['text']``; its result must
            support item assignment.

    Returns:
        The tokenizer output with ``examples['labels']`` stored under 'labels'.
    """
    encoded = tokenizer(examples['text'])
    encoded['labels'] = examples['labels']
    return encoded

In [None]:
model_name = 'mistralai/Mistral-7B-v0.1'

# Reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize both splits in batches and return the columns as torch tensors.
tokenize_batch = functools.partial(tokenize_examples, tokenizer=tokenizer)
tokenized_ds = ds.map(tokenize_batch, batched=True).with_format('torch')

# 4-bit quantization, following the guide
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # enable 4-bit quantization
    bnb_4bit_quant_type='nf4',  # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant=True,  # quantize the quantized weights
    bnb_4bit_compute_dtype=torch.bfloat16,  # optimized fp format for ML
)

# LoRA settings taken from the guide
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    bias='none',
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
)

# One output logit per label column.
num_labels = labels.shape[1]
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=num_labels,
)
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/4143 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Custom function that turns a list of examples into one padded batch
def collate_fn(batch, tokenizer):
    """Pad and stack a list of tokenized examples.

    Args:
        batch: list of dicts holding 'input_ids', 'attention_mask' and
            'labels' tensors.
        tokenizer: object whose ``pad_token_id`` is used to pad 'input_ids'.

    Returns:
        Dict with 'input_ids' and 'attention_mask' padded at the end to the
        longest sequence in the batch (attention mask padded with 0) and
        'labels' stacked along a new first dimension.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    input_ids = pad(
        [example['input_ids'] for example in batch],
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
    )
    attention_mask = pad(
        [example['attention_mask'] for example in batch],
        batch_first=True,
        padding_value=0,
    )
    stacked_labels = torch.stack([example['labels'] for example in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': stacked_labels,
    }

# Metrics tracked on the validation split
def compute_metrics(p):
    """Compute F1 (micro, macro, weighted) and accuracy from raw scores.

    Args:
        p: pair that unpacks into ``(predictions, labels)``; a prediction
            counts as positive when its raw score is greater than 0.

    Returns:
        Dict with the keys 'f1_micro', 'f1_macro', 'f1_weighted' and 'accuracy'.
    """
    raw_scores, y_true = p
    y_pred = raw_scores > 0
    averages = {'f1_micro': 'micro', 'f1_macro': 'macro', 'f1_weighted': 'weighted'}
    metrics = {
        name: f1_score(y_true, y_pred, average=mode)
        for name, mode in averages.items()
    }
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    return metrics

In [None]:
class CustomTrainer(Trainer):
    """Trainer that optimizes a weighted binary cross-entropy over the label logits.

    Args:
        label_weights: tensor passed as ``pos_weight`` to
            ``F.binary_cross_entropy_with_logits``.
        **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted BCE-with-logits loss for one batch.

        Args:
            model: model called as ``model(**inputs)`` once 'labels' is removed.
            inputs: batch dict; its 'labels' entry is popped (the dict is mutated).
            return_outputs: when true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: accepted so that Trainer versions which pass this
                argument do not fail with a TypeError; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Keep the weights on the same device as the logits.
        pos_weight = self.label_weights.to(logits.device)
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=pos_weight)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir='multilabel_classification',
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Collator with the tokenizer bound, and the class weights placed on the model's device.
batch_collator = functools.partial(collate_fn, tokenizer=tokenizer)
pos_weight = torch.tensor(label_weights, device=model.device)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=batch_collator,
    compute_metrics=compute_metrics,
    label_weights=pos_weight,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mstrangerone[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241002_003149-zdusa2sz[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mmultilabel_classification[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/strangerone/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/strangerone/huggingface/runs/zdusa2sz[0m
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocas

Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,Accuracy
1,0.1333,0.057007,0.551196,0.192709,0.486737,0.354167
2,0.0493,0.043728,0.67237,0.396372,0.635028,0.45625
3,0.0309,0.041099,0.725962,0.475849,0.704479,0.5125


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=1554, training_loss=0.0696430795441263, metrics={'train_runtime': 21091.718, 'train_samples_per_second': 0.589, 'train_steps_per_second': 0.074, 'total_flos': 5.775669079444685e+16, 'train_loss': 0.0696430795441263, 'epoch': 3.0})

In [None]:
# Save the trained model and the tokenizer into the same directory
peft_model_id = 'multilabel_mistral'
trained_model = trainer.model
trained_model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

('multilabel_mistral/tokenizer_config.json',
 'multilabel_mistral/special_tokens_map.json',
 'multilabel_mistral/tokenizer.model',
 'multilabel_mistral/added_tokens.json',
 'multilabel_mistral/tokenizer.json')

# Inference

In [None]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize the quantized weights
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# Directory holding the saved adapter and tokenizer
peft_model_id = '/kaggle/input/mistral-3-epochs-model/multilabel_mistral'
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Load the base checkpoint recorded in the adapter config, not the adapter
# directory itself; the adapter weights are attached exactly once below.
base_model_id = peft_config.base_model_name_or_path
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_id,
    quantization_config = quantization_config,
    num_labels=50)  # NOTE(review): assumed to equal the label count used in training; confirm

model = PeftModel.from_pretrained(base_model, peft_model_id)

tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model.config.pad_token_id = tokenizer.pad_token_id
model.to('cuda')

In [None]:
class TextDataset(Dataset):
    """Dataset over a sequence of raw texts; each item is one text."""

    def __init__(self, texts):
        """Store the indexable collection of texts to serve."""
        self.texts = texts

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the text stored at position ``index``."""
        return self.texts[index]

In [None]:
# The file name must be relative: joining a Path with a string that starts
# with '/' discards DATA_PATH and resolves to the filesystem root.
with open(DATA_PATH / 'test.csv.csv', newline='', encoding='utf-8') as csvfile:
    data = list(csv.reader(csvfile, delimiter=','))
    header_row = data.pop(0)

# Build the text from columns 3 and 4, formatted like the training text; column 0 is the row id.
idx, text = list(zip(*[(int(row[0]), f'tags: {row[3].strip()}\n\nReview: {row[4].strip()}') for row in data]))
texts = list(text)

dataset = TextDataset(texts)
batch_size = 32

# shuffle=False keeps the predictions aligned with `idx`.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [None]:
model.eval()

# Run the model batch by batch without gradient tracking and keep the logits on the CPU.
logit_chunks = []
with torch.no_grad():
    for batch_texts in tqdm(dataloader):
        encoded = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
        encoded = {name: tensor.to('cuda') for name, tensor in encoded.items()}
        logit_chunks.append(model(**encoded).logits.cpu())

# Concatenate the per-batch logits and convert them to per-label probabilities.
all_logits = torch.cat(logit_chunks, dim=0)
probs = all_logits.sigmoid().numpy()

In [None]:
predictions = []
threshold = 0.6  # TODO: choose on cross-validation

def clear(row):
    """Turn the string form of an index array into a space-separated list.

    The first and last characters (the brackets) are dropped, commas become
    separators, and every run of whitespace collapses to one space. The
    collapse matters because ``str()`` of a NumPy array pads elements to a
    common width and wraps long rows onto new lines.

    Args:
        row: string such as ``'[ 1  5 12]'`` or ``'[1, 5, 12]'``.

    Returns:
        The elements joined by single spaces; an empty string for ``'[]'``.
    """
    inner = row[1:-1].replace(',', ' ')
    return ' '.join(inner.split())

# For every row of probabilities keep the indices of the classes above the threshold.
predictions.extend(np.where(row_probs > threshold)[0] for row_probs in probs)

# One line per test row: its id and the predicted class indices as text.
target_strings = [clear(str(found)) for found in predictions]
sub = pd.DataFrame({'index': idx, 'target': target_strings})
sub.to_csv(f'submission_mistral_{threshold}.csv', index=False)

# Also store the raw probabilities next to the submission file.
np.savetxt(f"probs_mistral_{threshold}.csv", probs, delimiter=",")