# Finetuning

In [2]:
! zip -r datasets_validation.zip datasets_validation

  adding: datasets_validation/ (stored 0%)
  adding: datasets_validation/mma_target.csv (deflated 84%)
  adding: datasets_validation/.DS_Store (deflated 94%)
  adding: datasets_validation/nsfw_363.json (deflated 72%)
  adding: datasets_validation/nsfwmma_adv_prompts.csv (deflated 71%)
  adding: datasets_validation/unsafe_dataset_for_comparison.csv (deflated 66%)
  adding: datasets_validation/sneakydataset.csv (deflated 72%)
  adding: datasets_validation/mma_text.csv (deflated 66%)
  adding: datasets_validation/mma_sanitized.csv (deflated 65%)
  adding: datasets_validation/nsfwsneaky.txt (deflated 66%)


In [None]:
! pip install requirements.txt

In [15]:
# Importing libraries
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, fbeta_score
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
import torch
from torch import cuda, nn
import wandb
from datasets import load_dataset

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Setting up the device for GPU usage
if cuda.is_available():
    device = 'cuda'
    print("A GPU is available and will be utilized.")
    print("Using GPU:", torch.cuda.get_device_name(0))
    print("Total GPU memory:", torch.cuda.get_device_properties(device).total_memory / (1024**3), "GB")
    print("Available GPU memory:", torch.cuda.get_device_properties(device).total_memory / (1024**3), "GB")
    print("Number of available GPUs:", torch.cuda.device_count())
    device = 'cuda'
else:
    print("No GPU available. The CPU will be used instead.")
    print("CPU name:", torch.cuda.get_device_name(0))
    print("Number of logical cores:", torch.cuda.get_device_properties(0).multi_processor_count)
    print("CPU architecture:", torch.cuda.get_device_properties(0).major, ".", torch.cuda.get_device_properties(0).minor)
    device = "cpu"


A GPU is available and will be utilized.
Utilisation de GPU: NVIDIA GeForce RTX 3090
Mémoire totale de GPU: 23.68890380859375 GB
Mémoire disponible sur GPU: 23.68890380859375 GB
Nombre de GPU disponibles: 1


In [16]:
dataset = load_dataset("eliasalbouzidi/NSFW-Safe-Dataset")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 187788
    })
    validation: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 40241
    })
    test: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 40240
    })
    train_nopreprocessing: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 174191
    })
    validation_nopreprocessing: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 37209
    })
    test_nopreprocessing: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 37300
    })
})

In [18]:
labels = ['safe', 'nsfw']
num_labels=len(labels)
id2label = {idx: label for idx, label in enumerate(labels)}
id2label
{ 0: 'safe', 1: 'nsfw'}
label2id = {v:k for k,v in id2label.items()}
label2id
{'safe': 0, 'nsfw': 1}


{'safe': 0, 'nsfw': 1}

In [20]:
max_length=512
name="distilbert-base-uncased"


In [21]:
tokenizer = AutoTokenizer.from_pretrained(name,truncation=True, do_lower_case=True, padding = True,max_length=max_length)




In [22]:
def tokenize_text(examples):
 return tokenizer(examples["text"], truncation=True,max_length=max_length)
dataset = dataset.map(tokenize_text, batched=True)

Map: 100%|██████████| 40241/40241 [00:02<00:00, 13722.57 examples/s]


In [23]:
labels_series = pd.Series(dataset["train"]["labels"])
class_weights = (1 - ( labels_series.value_counts().sort_index() / len(dataset["train"]["labels"]) )   ).values
print(class_weights)

class_weights = torch.from_numpy(class_weights).float().to("cuda")



[0.37529022 0.62470978]


In [24]:
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):

        # Feed inputs to model and extract logits
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Extract labels
        labels = inputs.get("labels")

        # Define loss function with class weights
        loss_func = nn.CrossEntropyLoss(weight=class_weights)  # Assuming class_weights is defined elsewhere

        # Compute loss
        loss = loss_func(logits, labels)

        return (loss, outputs) if return_outputs else loss
        


In [None]:
model = AutoModelForSequenceClassification.from_pretrained(name, num_labels=num_labels,id2label=id2label,label2id=label2id)

# Uncomment for transfer learning
# for param in model.base_model.parameters():
#     param.requires_grad = False

model.to("cuda")

In [26]:

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    accuracy = accuracy_score(labels, preds)
    recall = recall_score(labels, preds)
    precision = precision_score(labels, preds)
    f1 = f1_score(labels, preds)
    fbeta_1_6 = fbeta_score(labels, preds, beta=1.6)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "fbeta_1.6": fbeta_1_6,
        "precision": precision,
        "recall": recall

    }

In [13]:
batch_size =32
output_dir = "distilbert-512-fbeta1.6"
num_train_steps = len(dataset["train"]) // batch_size 

training_args=TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="steps",
    logging_steps=num_train_steps //10,  # Log every 10% of the epoch
    eval_steps=num_train_steps // 10,  # Evaluate every 10% of the epoch
    fp16=True,  # Make it train fast!
    push_to_hub=True, # To push to the hub
    save_steps=num_train_steps // 10,
    save_strategy="steps",  
    # weight_decay=0.01,
    # seed=42,
    # optim="adamw_hf",
    # adam_beta1=0.9,
    # adam_beta2=0.999,
    # adam_epsilon=1e-8,
    lr_scheduler_type="linear",
    save_total_limit=3,
    report_to="wandb",
    warmup_steps = 600,
    load_best_model_at_end=True,
    metric_for_best_model="fbeta_1.6" # We pick the model with the best fbeta1.6 score
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
#distilbert-fbeta1.6

# Define the Trainer
trainer = WeightedLossTrainer(
    compute_metrics=compute_metrics,
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer
)
trainer.train()

In [None]:
trainer.evaluate()

In [16]:
model.save_pretrained("distilbertsave", from_pt=True) 


In [17]:
trainer.push_to_hub() 

CommitInfo(commit_url='https://huggingface.co/eliasalbouzidi/distilbert-512-fbeta1.6/commit/5a1c244498279eb64058703d1b3b827a48de8a96', commit_message='End of training', commit_description='', oid='5a1c244498279eb64058703d1b3b827a48de8a96', pr_url=None, pr_revision=None, pr_num=None)