# Config

In [1]:
!nvidia-smi

Sun Jun  8 23:34:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.02              Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1660 Ti     On  |   00000000:1C:00.0  On |                  N/A |
|  0%   52C    P8             19W /  130W |     948MiB /   6144MiB |     27%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import os

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    pipeline,
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

In [3]:
if not torch.cuda.is_available():
    print("Cuda is not available. Exiting.")

In [4]:
BASE_MODEL_ID = "EleutherAI/gpt-neo-125M" # TODO: check few different models
LORA_MODEL_OUTPUT_DIR = "./hate-speech-lora-model"
TRAIN_FILE = "data/hate_train.csv"
TEST_FILE = "data/hate_test_data.txt"
PREDICTION_FILE = "pred.csv"
DO_DATA_AUGMENTATION = True


SEED = 42 # reproductivity

# if os.path.exists(TEST_FILE):
#     print(f"Loading training data from {TRAIN_FILE}...")
# else:
#     raise FileNotFoundError(f"Training file {TRAIN_FILE} not found.")


## Labels, Tokenizer etc.

In [5]:
id2label = {0: "no-hate", 1: "hate"}
label2id = {"no-hate": 0, "hate": 1}
NUM_LABELS = len(id2label)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [7]:
PROMPT_TEMPLATE_FINETUNE = (
    "Classify the following text as 'hate' or 'no-hate'.\n\n"
    "Text: {text}\n"
    "Label: {label_str}{eos_token}"
)

PROMPT_TEMPLATE_INFERENCE = (
    "Classify the following text as 'hate' or 'no-hate'.\n\n"
    "Text: {text}\n"
    "Label:"
)

# TODO prompta chyba lepiej po polsku dla polskich modeli? ale idk

## Dataloading

In [None]:
try:
    df_train_full = pd.read_csv(TRAIN_FILE)
    with open(TEST_FILE, 'r', encoding='utf-8') as f:
        test_texts = [line.strip() for line in f]
    df_test = pd.DataFrame(test_texts, columns=['sentence'])
except FileNotFoundError as e:
    print(f"err: no file {e.filename}")
    exit()

print(f"Loaded {len(df_train_full)} training samples and {len(df_test)} test samples.")

print(df_train_full.head())
print()
print(df_train_full['label'].value_counts())

Loaded 10041 training samples and 1000 test samples.
                                            sentence  label
0  Dla mnie faworytem do tytułu będzie Cracovia. ...      0
1  @anonymized_account @anonymized_account Brawo ...      0
2  @anonymized_account @anonymized_account Super,...      0
3  @anonymized_account @anonymized_account Musi. ...      0
4    Odrzut natychmiastowy, kwaśna mina, mam problem      0

label
0    9190
1     851
Name: count, dtype: int64


### Balancing classes (augmentation)

In [9]:
print('Original distribution:')
print(df_train_full['label'].value_counts(normalize=True))

Original distribution:
label
0    0.915247
1    0.084753
Name: proportion, dtype: float64


In [10]:
df_majority = df_train_full[df_train_full['label'] == 0]
df_minority = df_train_full[df_train_full['label'] == 1]

df_minority_oversampled = df_minority.sample(
    n=len(df_majority),
    replace=True,
    random_state=SEED
)

In [11]:
df_train_balanced = pd.concat([df_majority, df_minority_oversampled])


df_train_balanced = df_train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [12]:
print('Balanced distribution:')
print(df_train_balanced['label'].value_counts(normalize=True))
print(f"\nBalanced class nums \n{df_train_balanced['label'].value_counts().to_string()}")


Balanced distribution:
label
1    0.5
0    0.5
Name: proportion, dtype: float64

Balanced class nums 
label
1    9190
0    9190


In [13]:
# 
df_train_full = df_train_balanced


### Data augmentation v2


In [14]:
# TODO mozna zobaczyc czy to ma wiekszy sens zamiast tego powyzej^

# if DO_DATA_AUGMENTATION:
#     print("\n--- Step 1a: Augmenting Data (Back-Translation) ---")
#     print("This may take a few minutes...")
#     try:
#         # Initialize translation pipelines
#         translator_pl_en = pipeline("translation", model="Helsinki-NLP/opus-mt-pl-en", device=0 if torch.cuda.is_available() else -1)
#         translator_en_pl = pipeline("translation", model="Helsinki-NLP/opus-mt-en-pl", device=0 if torch.cuda.is_available() else -1)

#         def back_translate(text):
#             try:
#                 en_text = translator_pl_en(text, max_length=128)[0]['translation_text']
#                 pl_text_augmented = translator_en_pl(en_text, max_length=128)[0]['translation_text']
#                 return pl_text_augmented
#             except Exception as e:
#                 print(f"Error during translation: {e}")
#                 return text  # Return original text on error

#         # Augment the minority class (hate speech) to balance the dataset
#         df_augmented_list = []
#         # We assume the 'hate' class (1) is the minority
#         texts_to_augment = df_train_full[df_train_full['label'] == 1]['text'].tolist()
#         print(f"Augmenting {len(texts_to_augment)} samples for the 'hate' class (label 1)")
#         for text in tqdm(texts_to_augment, desc="Augmenting 'hate' class"):
#             augmented_text = back_translate(text)
#             if augmented_text != text:
#                 df_augmented_list.append({'text': augmented_text, 'label': 1})

#         if df_augmented_list:
#             df_augmented = pd.DataFrame(df_augmented_list)
#             df_train_full = pd.concat([df_train_full, df_augmented], ignore_index=True)
#             print("\nTraining set after augmentation:")
#             print(f"New number of samples: {len(df_train_full)}")
#             print(df_train_full['label'].value_counts())

#     except Exception as e:
#         print(f"Could not perform data augmentation: {e}. Continuing without it.")




### Datasets

In [15]:
train_df, val_df = train_test_split(df_train_full, test_size=0.15, random_state=42, stratify=df_train_full['label'])

raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df),
    'test': Dataset.from_pandas(df_test)
})

def format_dataset_for_finetuning(examples):
    texts = examples['sentence']
    labels_int = examples['label']
    formatted_prompts = []
    for text, label_int in zip(texts, labels_int):
        label_str = id2label[label_int]
        formatted_prompts.append(
            PROMPT_TEMPLATE_FINETUNE.format(
                text=text,
                label_str=label_str,
                eos_token=tokenizer.eos_token
            )
        )
    return {"formatted_prompt": formatted_prompts}

def set_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

tokenized_datasets = {}
for split, data in raw_datasets.items():
    if split in ['train', 'validation']: # not on test
        formatted_data = data.map(format_dataset_for_finetuning, batched=True)
        
        tokenized_split = formatted_data.map(
            lambda examples: tokenizer(
                examples["formatted_prompt"],
                truncation=True,
                max_length=256, # TODO potencjalnie zwiekszyz (zerknac jaka jest srednia dlugosc tekstu w danych + dlugosc promtu)
                padding=False
            ),
            batched=True,
            remove_columns=data.column_names + ["formatted_prompt"]
        )
        tokenized_datasets[split] = tokenized_split.map(set_labels, batched=True)

print("\nPróbka danych po tokenizacji:")
print(tokenized_datasets["train"][0])
print("\nZdekodowany tekst próbki:")
print(tokenizer.decode(tokenized_datasets["train"][0]['input_ids']))


Map:   0%|          | 0/15623 [00:00<?, ? examples/s]

Map:   0%|          | 0/15623 [00:00<?, ? examples/s]

Map:   0%|          | 0/15623 [00:00<?, ? examples/s]

Map:   0%|          | 0/2757 [00:00<?, ? examples/s]

Map:   0%|          | 0/2757 [00:00<?, ? examples/s]

Map:   0%|          | 0/2757 [00:00<?, ? examples/s]


Próbka danych po tokenizacji:
{'input_ids': [9487, 1958, 262, 1708, 2420, 355, 705, 37035, 6, 393, 705, 3919, 12, 37035, 4458, 198, 198, 8206, 25, 2488, 272, 5177, 1143, 62, 23317, 573, 2188, 33721, 128, 247, 299, 444, 33320, 89, 769, 64, 41615, 368, 745, 13695, 494, 986, 1168, 707, 2101, 41615, 68, 129, 249, 25221, 1058, 35, 2488, 272, 5177, 1143, 62, 23317, 198, 33986, 25, 645, 12, 37035, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [9487, 1958, 262, 1708, 2420, 355, 705, 37035, 6, 393, 705, 3919, 12, 37035, 4458, 198, 198, 8206, 25, 2488, 272, 5177, 1143, 62, 23317, 573, 2188, 33721, 128, 247, 299, 444, 33320, 89, 769, 64, 41615, 368, 745, 13695, 494, 986, 1168, 707, 2101, 41615, 68, 129, 249, 25221, 1058, 35, 2488, 272, 5177, 1143, 62, 23317, 198, 33986, 25, 645, 12, 37035, 50256]}

Zdekodowany t

## Config LoRA

In [18]:
# bq_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_use_double_quant=True,
# ) 
# # TODO mialem err z  BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    # quantization_config=bq_config,
    trust_remote_code=True,
    device_map="auto",
    load_in_8bit=False
)

model.config.use_cache = False
model.config.pretraining_tp = 1

In [None]:
import subprocess
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "c_proj", "c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

/tmp/tmp42yz4h_6/main.c:5:10: fatal error: Python.h: No such file or directory
    5 | #include <Python.h>
      |          ^~~~~~~~~~
compilation terminated.


CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmp42yz4h_6/main.c', '-O3', '-shared', '-fPIC', '-Wno-psabi', '-o', '/tmp/tmp42yz4h_6/cuda_utils.cpython-313-x86_64-linux-gnu.so', '-lcuda', '-L/home/matimat/LLM/.venv/lib/python3.13/site-packages/triton/backends/nvidia/lib', '-L/usr/lib/wsl/lib', '-L/lib/x86_64-linux-gnu', '-I/home/matimat/LLM/.venv/lib/python3.13/site-packages/triton/backends/nvidia/include', '-I/tmp/tmp42yz4h_6', '-I/usr/include/python3.13']' returned non-zero exit status 1.

In [24]:
import bitsandbytes as bnb
print(bnb.__version__)

AttributeError: module 'bitsandbytes' has no attribute '__version__'