# Imports

In [None]:
import random
import functools
import csv

from pathlib import Path
from dataclasses import dataclass

import wandb
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset, DatasetDict
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel
from huggingface_hub import login

In [None]:
# Fix the random seeds so the run is reproducible: `random` drives the row
# shuffle, NumPy's global state is seeded for libraries that draw from it, and
# torch is seeded because the classification head is newly initialised.
random.seed(1337)
np.random.seed(1337)
torch.manual_seed(1337)  # seeds the torch RNG on all devices

In [None]:
# Credentials are read from the environment instead of being hard-coded in the
# notebook, so they cannot leak when the notebook is shared or committed.
# If a variable is unset, None is passed and each library falls back to its own
# credential lookup / interactive prompt.
import os

wandb.login(key=os.environ.get('WANDB_API_KEY'))
login(token=os.environ.get('HF_TOKEN'))

In [None]:
# Directory holding the competition CSV files (train / test).
DATA_PATH = Path('/kaggle/input') / 'dls-nlp-workshop'

In [None]:
# Read the whole training CSV into memory; the first row is the header.
with open(DATA_PATH / 'train.csv.csv', newline='') as csvfile:
    data = [row for row in csv.reader(csvfile, delimiter=',')]
    header_row = data.pop(0)

# Shuffle rows in place (seeded earlier via `random.seed`).
random.shuffle(data)


def _to_training_example(row):
    """Convert one CSV row into an ``(id, prompt, label columns)`` triple.

    Column 0 is parsed as the integer id, columns 2-4 are interpolated into the
    prompt, and every column from 5 onwards is kept as a label value.
    NOTE(review): the prompt template here differs from the one used for the
    test set in the inference section — confirm which one the adapter expects.
    """
    return (
        int(row[0]),
        f'user assesment: {row[2]}\n\nTags: {row[3].strip()[1:-1]}\n\nReview: {row[4].strip()}',
        row[5:],
    )


# Transpose the per-row triples into three parallel tuples.
idx, text, labels = zip(*map(_to_training_example, data))

# Multi-hot label matrix of shape (n_samples, n_labels).
labels = np.array(labels, dtype=int)

# Per-label weight: one minus the label's share of all positive entries.
positives_per_label = labels.sum(axis=0)
label_weights = 1 - positives_per_label / labels.sum()

# Stratified multi-label split performed on row indices (as a column vector),
# so the texts can be looked up afterwards.
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(
    row_ids.reshape(-1, 1), labels, test_size=0.1
)
x_train = [text[i] for i in train_idx.ravel()]
x_val = [text[i] for i in val_idx.ravel()]

splits = {'train': (x_train, y_train), 'val': (x_val, y_val)}
ds = DatasetDict({
    split_name: Dataset.from_dict({'text': split_texts, 'labels': split_labels})
    for split_name, (split_texts, split_labels) in splits.items()
})

In [None]:
@dataclass
class Model_params:
    """Hyper-parameters and paths shared by the training cells of this notebook."""

    # --- checkpoint / output ---
    output_dir: str = 'output'
    model_name: str = 'unsloth/gemma-2-27b-it-bnb-4bit'

    # --- optimisation ---
    optim_type: str = 'adamw_8bit'
    train_batch_size: int = 1
    eval_batch_size: int = 4
    gradient_accumulation_steps: int = 2
    n_epochs: int = 2
    # Index of the first decoder layer that receives adapters (presumably the
    # layers below it stay frozen — confirm against the LoRA configuration).
    freeze_layers: int = 16
    lr: float = 0.0002
    warmup_steps: int = 20


# Single shared instance used by the cells below.
model_params = Model_params()

In [None]:
# LoRA adapter configuration for the sequence-classification model.
# Adapters are attached to every attention and MLP projection, but only in
# decoder layers with index >= `model_params.freeze_layers`.
# NOTE(review): the upper bound 42 matches the depth of gemma-2-9b; gemma-2-27b
# presumably has 46 decoder layers, in which case the last four layers get no
# adapter — confirm against `model.config.num_hidden_layers`.
lora_model_params = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "up_proj",
        "o_proj",
        "k_proj",
        "gate_proj",
        "q_proj",
        "down_proj",
        "v_proj"],
    # Driven by the shared config instead of a duplicated literal 16.
    layers_to_transform=list(range(model_params.freeze_layers, 42)),
    lora_dropout=0,
    bias='none',
    task_type=TaskType.SEQ_CLS,
)

In [None]:
def tokenize_examples(examples, tokenizer):
    """Tokenize a batch of examples and carry the labels along.

    Args:
        examples: mapping with a ``'text'`` entry (passed to ``tokenizer``) and
            a ``'labels'`` entry (copied through unchanged).
        tokenizer: callable applied to ``examples['text']``; it must return a
            mutable mapping.

    Returns:
        The mapping returned by ``tokenizer`` with an added ``'labels'`` key.
    """
    encoded = tokenizer(examples['text'])
    encoded.update(labels=examples['labels'])
    return encoded

In [None]:
model_name = model_params.model_name

# Tokenizer: append <eos> to every sequence and pad on the right.
tokenizer = GemmaTokenizerFast.from_pretrained(model_params.model_name)
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

# Tokenize both splits and expose the columns as torch tensors.
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

# Load the pre-quantized checkpoint with a fresh 50-way classification head.
model = Gemma2ForSequenceClassification.from_pretrained(
    model_params.model_name,
    num_labels=50,
    torch_dtype=torch.float16,
    device_map="auto",
)

# The setting lives on the model's config object; the previous
# `model.model_params.use_cache` referenced a non-existent attribute and raised
# AttributeError.
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_model_params)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Map:   0%|          | 0/4623 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors.index.json:   0%|          | 0.00/199k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/7.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/7.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-27b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def collate_fn(batch, tokenizer):
    """Collate variable-length tokenized examples into one padded batch.

    Args:
        batch: list of dicts, each holding tensors under ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        tokenizer: object whose ``pad_token_id`` is used to pad ``input_ids``.

    Returns:
        Dict with ``input_ids`` (padded with the pad token id),
        ``attention_mask`` (padded with 0) and ``labels`` (stacked), each
        batch-first.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    return {
        'input_ids': pad(
            [example['input_ids'] for example in batch],
            batch_first=True,
            padding_value=tokenizer.pad_token_id,
        ),
        'attention_mask': pad(
            [example['attention_mask'] for example in batch],
            batch_first=True,
            padding_value=0,
        ),
        'labels': torch.stack([example['labels'] for example in batch]),
    }

def compute_metrics(p):
    """Compute multi-label F1 (micro / macro / weighted) and exact-match accuracy.

    Args:
        p: pair ``(predictions, labels)``. Predictions are thresholded at 0
            (presumably raw logits, so this corresponds to sigmoid > 0.5).

    Returns:
        Dict with keys ``f1_micro``, ``f1_macro``, ``f1_weighted``, ``accuracy``.
    """
    scores, targets = p
    metrics = {}
    for key, averaging in (('f1_micro', 'micro'), ('f1_macro', 'macro'), ('f1_weighted', 'weighted')):
        metrics[key] = f1_score(targets, scores > 0, average=averaging)
    metrics['accuracy'] = accuracy_score(targets, scores > 0)
    return metrics

In [None]:
class CustomTrainer(Trainer):
    """Trainer that optimises a multi-label BCE-with-logits loss.

    Args:
        label_weights: tensor passed as ``pos_weight`` to the loss (one weight
            per label), or ``None`` for an unweighted loss.
        **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model and return the weighted BCE loss.

        ``num_items_in_batch`` is accepted (and ignored) because newer
        transformers releases pass it to ``compute_loss``; without it the call
        fails with a TypeError. Older releases simply never supply it.
        """
        # The labels are removed so the model does not compute its own loss.
        labels = inputs.pop("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Keep every loss operand on the logits' device.
        pos_weight = self.label_weights
        if pos_weight is not None:
            pos_weight = pos_weight.to(logits.device)

        loss = F.binary_cross_entropy_with_logits(
            logits,
            labels.to(device=logits.device, dtype=torch.float32),
            pos_weight=pos_weight,
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
# Training configuration; every tunable value comes from `model_params`.
training_args = TrainingArguments(
    output_dir=model_params.output_dir,
    report_to="none",
    num_train_epochs=model_params.n_epochs,
    per_device_train_batch_size=model_params.train_batch_size,
    # Previously wired to `train_batch_size` by mistake, which silently ignored
    # the configured accumulation steps.
    gradient_accumulation_steps=model_params.gradient_accumulation_steps,
    per_device_eval_batch_size=model_params.eval_batch_size,
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=500,
    optim=model_params.optim_type,
    fp16=True,
    learning_rate=model_params.lr,
    warmup_steps=model_params.warmup_steps,
)

trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['val'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = compute_metrics,
    # float32 to match the dtype the labels are cast to inside the loss
    # (the NumPy weights are float64 by default).
    label_weights = torch.tensor(label_weights, dtype=torch.float32, device=model.device)
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,Accuracy
1,0.0337,0.024226,0.842632,0.688474,0.830605,0.71


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

TrainOutput(global_step=4623, training_loss=0.06654502172768438, metrics={'train_runtime': 30403.2439, 'train_samples_per_second': 0.152, 'train_steps_per_second': 0.152, 'total_flos': 3.288025744029389e+16, 'train_loss': 0.06654502172768438, 'epoch': 1.0})

# Inference

In [None]:
# Hub repository holding the trained LoRA adapter (and its tokenizer files).
ADAPTER_PATH = "TheStrangerOne/gemma-2-27b-it-bnb-4bit-lora-multilabel"

# Base classifier with the same head size as in training; the adapter weights
# are attached on top of it.
base_model = Gemma2ForSequenceClassification.from_pretrained(
    model_params.model_name,
    num_labels=50,
    torch_dtype=torch.float16,
    device_map="auto",
    use_cache=False,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)

# Same tokenizer settings as in training: append <eos>, pad on the right.
tokenizer = GemmaTokenizerFast.from_pretrained(ADAPTER_PATH)
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

# Device for the input tensors.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Minimal map-style torch dataset over a list of raw text strings.

    The base class is spelled out in full because the bare name ``Dataset`` is
    rebound by ``from datasets import Dataset`` in the import cell; subclassing
    the Hugging Face class by accident makes ``DataLoader`` use its batched
    ``__getitems__`` path, which does not work with this ``__getitem__``.
    """

    def __init__(self, texts):
        # texts: indexable collection of strings.
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

In [None]:
# Read the whole test CSV into memory; the first row is the header.
with open(DATA_PATH / 'test.csv.csv', newline='') as csvfile:
    data = [row for row in csv.reader(csvfile, delimiter=',')]
    header_row = data.pop(0)


def _to_test_example(row):
    """Convert one test CSV row into an ``(id, prompt)`` pair.

    Column 1 is parsed as the integer id; columns 2-4 are interpolated into the
    prompt.
    NOTE(review): both the id column and the prompt template differ from the
    training cell — confirm they match what the adapter was trained on.
    """
    return (
        int(row[1]),
        f'User assesment: {row[2]}, tags: {row[3].strip()[1:-1]}\n\nReview: {row[4].strip()}',
    )


# Transpose the per-row pairs into two parallel tuples.
idx, text = zip(*map(_to_test_example, data))
texts = list(text)

dataset = TextDataset(texts)

# Order is preserved (no shuffling) so predictions line up with `idx`.
batch_size = 6
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Batched inference over the test set: collect the logits on the CPU, then
# turn them into per-label probabilities.
model.eval()
all_logits = []

for batch_texts in tqdm(dataloader):

    # Pad to the longest text in the batch and truncate over-long inputs.
    tokenized_inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
    # Use the `device` chosen when the model was loaded rather than a
    # hard-coded 'cuda', so the cell also runs on a CPU-only machine.
    tokenized_inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}

    with torch.no_grad():
        outputs = model(**tokenized_inputs).logits
        all_logits.append(outputs.cpu())

all_logits = torch.cat(all_logits, dim=0)
probs = torch.sigmoid(all_logits).numpy()

In [None]:
def clear(row):
    """Turn the ``str()`` of an index list/array into space-separated indices.

    The first and last characters (the surrounding brackets) are dropped,
    commas are treated as whitespace, and runs of whitespace collapse to a
    single space, e.g. ``'[1, 2, 3]'`` -> ``'1 2 3'``.
    """
    inner = row[1:-1]
    tokens = inner.replace(',', ' ').split()
    return " ".join(tokens)

# Decode probabilities into label sets: for each sample, lower the threshold
# step by step until at least one label passes it.
predictions = []

for sample_probs in probs:
    for t in (0.55, 0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2):
        row_indices = np.where(sample_probs > t)[0]
        if row_indices.size:
            break
    else:
        # No label passed even the lowest threshold: fall back to label 19.
        # NOTE(review): presumably the default / most frequent class — confirm.
        row_indices = [19]
    predictions.append(row_indices)

# Submission file: one row per test id with space-separated label indices.
targets = [clear(str(label_ids)) for label_ids in predictions]
sub = pd.DataFrame({'index': idx, 'target': targets})
sub.to_csv('submission_gemma_27b.csv', index=False)

# Keep the raw probabilities as well.
np.savetxt("probs_gemma_27b.csv", probs, delimiter=",")