## Knowledge Distillation approach with Llama3 
- Das zum Training verwendete Dataset ist imdb

### Preparing environment (kdein)
Folgende Befehle in der bash ausführen
- conda create -n kdein python==3.10
- conda activate kdein
- pip install torch==2.0.1 transformers==4.40.2 datasets ipywidgets accelerate==0.30.1 wandb platformdirs scikit-learn
- python -m ipykernel install --user --name=kdein

In [2]:
# List the scikit-* packages installed in the active conda environment.
# NOTE(review): the original comment said this verifies the PyTorch version
# (must be 2.0.1), but the command filters for "scikit" - confirm the intent.
!conda list | grep scikit

scikit-image              0.20.0          py310h9b08913_1    conda-forge
scikit-learn              1.2.2           py310h41b6a48_1    conda-forge


### Define Models, dataset and output dir

In [2]:
### Cuda specifics

In [1]:
import os

# Configure PyTorch's CUDA allocator via its environment variable
# (max_split_size_mb set to 256).
# NOTE(review): presumably meant to limit memory fragmentation - confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
# Expose only the physical GPUs 2 and 3 to this process. This cell runs before
# the cell that imports torch.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
# Echo the variable to confirm the value that was set.
!echo $CUDA_VISIBLE_DEVICES


2,3


In [3]:
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score

ModuleNotFoundError: No module named 'sklearn'

In [3]:

def _load_sequence_classifier(model_dir):
    """Load tokenizer, padding collator and a 2-label classifier from ``model_dir``.

    If the tokenizer has no pad token, its EOS token is registered as the pad
    token. The model config's ``pad_token_id`` is then set from the tokenizer.

    Returns:
        Tuple ``(tokenizer, collator, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Fall back to the EOS token for padding when no pad token is configured.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    collator = DataCollatorWithPadding(tokenizer=tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2)
    model.config.pad_token_id = tokenizer.pad_token_id
    return tokenizer, collator, model


# Teacher model (alternative checkpoints kept for reference)
#teacher_dir = "/home/thsch026/masterarbeit/models/generated/prune/pruneme/merged-llama3"
#teacher_dir = "/home/thsch026/masterarbeit/models/llama3/Meta-Llama-3-8B-Instruct-HF"
#teacher_dir ="meta-llama/Meta-Llama-3-8B"
teacher_dir = "meta-llama/Llama-2-7b-chat-hf"
teacher_tokenizer, teacher_collator, teacher_model = _load_sequence_classifier(teacher_dir)

# Student model (alternative checkpoints kept for reference)
#student_dir = "/home/thsch026/masterarbeit/models/generated/prune/pruneme/merged-llama3"
#student_dir = "/home/thsch026/masterarbeit/models/generated/prune/pruneme/merged-llama3-small"
student_dir = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
student_tokenizer, student_collator, student_model = _load_sequence_classifier(student_dir)

# Report the memory footprint of both models (bytes / 1e6, printed as MB)
print(f"Memory footprint Teacher: {teacher_model.get_memory_footprint() / 1e6:.2f} MB")
print(f"Memory footprint Student: {student_model.get_memory_footprint() / 1e6:.2f} MB")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Memory footprint Teacher: 26563.63 MB
Memory footprint Student: 4161.14 MB


In [4]:
# Report how many CUDA devices torch can see.
print ("Number of GPUs: ", torch.cuda.device_count())

Number of GPUs:  2


In [5]:
# When more than one GPU is visible, wrap both models in torch.nn.DataParallel.
# NOTE(review): after this cell `teacher_model` / `student_model` refer to the
# DataParallel wrappers, not the Hugging Face models. A later cell calls
# `student_model.save_pretrained(...)` - verify that this still works on the wrapper.
# NOTE(review): the Hugging Face Trainer is documented to handle multi-GPU
# execution itself - confirm whether this manual wrapping is needed at all.
if torch.cuda.device_count() > 1:
    
    #teacher_model = torch.nn.parallel.DistributedDataParallel(teacher_model)
    #student_model = torch.nn.parallel.DistributedDataParallel(student_model)
    teacher_model = torch.nn.DataParallel(teacher_model)
    student_model = torch.nn.DataParallel(student_model)
    

## Prepare the dataset

### Dataset MS_Marco

In [23]:
# Load the MS MARCO dataset in its 'v1.1' configuration.
dataset = load_dataset('ms_marco','v1.1') # General dataset

# Helper that attaches a constant label to an example
def add_label(example):
    """Set ``example['label']`` to 1 in place and return the same mapping."""
    example.update({'label': 1})
    return example

# Add the label column by mapping `add_label` over the loaded dataset.
# NOTE(review): the original comment mentioned only the training set, but the
# map is applied to `dataset` as returned by load_dataset, not to one split.
dataset = dataset.map(add_label)

#dataset = dataset['train']
#ms_marco_data = ms_marco_data['train']
#dataset.rename_column('answers','labels')
#print("dataset", ms_marco_data)

Map:   0%|          | 0/10047 [00:00<?, ? examples/s]

Map:   0%|          | 0/82326 [00:00<?, ? examples/s]

Map:   0%|          | 0/9650 [00:00<?, ? examples/s]

In [24]:
# Preprocessing function for the MS MARCO queries
# NOTE(review): `preprocess_function` is redefined in later dataset sections;
# whichever definition ran last is the one in effect.
def preprocess_function(examples):
    """Tokenize ``examples["query"]`` with the teacher tokenizer.

    Inputs are truncated and padded to a fixed length of 512 tokens.
    """
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    queries = examples["query"]
    return teacher_tokenizer(queries, **tokenizer_options)

In [None]:
# Build the training dataset by tokenizing the "train" split in batches
#dataset = dataset.rename_column('answers','labels')
train_dataset = dataset["train"].map(preprocess_function, batched=True)
#eval_dataset = dataset["test"].map(preprocess_function, batched=True)
#eval_dataset = dataset["test"].map(batched=True)

#train_dataset = train_dataset.remove_columns(['passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'])
#eval_dataset = eval_dataset.remove_columns(["text"])


# Show one processed example (index 1)
print("\nBeispiel Train Dataset:\n")
print(train_dataset[1])
#print("\nBeispiel Eval Dataset:\n")
#print(eval_dataset[1])

### Dataset imdb (Beispiel)

In [6]:
# Load the IMDB dataset; this rebinds `dataset`, replacing any dataset loaded earlier.
dataset = load_dataset("imdb")

In [7]:
# Preprocessing function for the IMDB reviews
def preprocess_function(examples):
    """Tokenize ``examples["text"]`` with the teacher tokenizer.

    Inputs are truncated and padded to a fixed length of 64 tokens.
    """
    texts = examples["text"]
    encoded = teacher_tokenizer(
        texts,
        max_length=64,
        padding="max_length",
        truncation=True,
    )
    return encoded

In [8]:
# Build the training dataset: tokenize the "train" split in batches, then drop
# the raw 'text' column.
train_dataset = (
    dataset["train"]
    .map(preprocess_function, batched=True)
    .remove_columns(['text'])
)

# The evaluation split is currently not prepared:
#eval_dataset = dataset["test"].map(preprocess_function, batched=True)
#eval_dataset = dataset["test"].map(batched=True)
#eval_dataset = eval_dataset.remove_columns(["text"])

# Show one processed example (index 1)
print("\nBeispiel Train Dataset:\n")
print(train_dataset[1])

#print("\nBeispiel Eval Dataset:\n")
#print(eval_dataset[1])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]


Beispiel Train Dataset:

{'label': 0, 'input_ids': [1, 376, 29902, 1913, 10837, 2738, 29901, 612, 4743, 29908, 338, 263, 5161, 1821, 322, 14794, 296, 2738, 1886, 11500, 282, 488, 29889, 739, 1838, 29915, 29873, 4383, 825, 697, 29915, 29879, 8604, 8386, 526, 1363, 445, 2706, 508, 15155, 367, 4586, 25798, 373, 738, 3233, 29889, 1094, 363, 278, 5995, 393, 4565, 284, 14263, 302, 566, 537, 338, 385, 18428, 25166, 29899, 29896], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


### Dataset boolq (Yes/no labels)

- Bei dem Dataset handelt es sich um Fragen mit Ja/Nein-Antworten, die für eine Knowledge Distillation gut geeignet sein sollten
- Zur Nutzung für das Training wird die Spalte "answer" in "labels" umbenannt und ihr Datentyp in ClassLabel geändert. Dies ist nötig, damit der Tensor die richtigen Dimensionen hat

In [6]:

# Load BoolQ, rename the "answer" column to "labels" and cast it to a
# two-class ClassLabel with the names "False" and "True".
dataset = (
    load_dataset("boolq")
    .rename_column("answer","labels")
    .cast_column('labels', ClassLabel(names=["False", "True"]))
)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'labels', 'passage'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'labels', 'passage'],
        num_rows: 3270
    })
})


#### Mapping des Inputs

In [7]:
def preprocess_function(examples):
    """Tokenize ``examples["question"]`` with the teacher tokenizer.

    Inputs are truncated and padded to a fixed length of 64 tokens. Only the
    question column is tokenized.
    """
    fixed_length = 64
    questions = examples["question"]
    return teacher_tokenizer(questions, truncation=True, padding="max_length", max_length=fixed_length)

In [8]:
# Tokenize the "train" split in batches, then drop the raw text columns.
train_dataset = (
    dataset["train"]
    .map(preprocess_function, batched=True)
    .remove_columns(['question', 'passage'])
)

# The evaluation split is currently not prepared:
#eval_dataset = dataset["test"].map(preprocess_function, batched=True)
#eval_dataset = dataset["test"].map(batched=True)

# Show one processed example (index 1)
print("\nBeispiel Train Dataset:\n")
print(train_dataset[1])


Map:   0%|          | 0/9427 [00:00<?, ? examples/s]


Beispiel Train Dataset:

{'labels': 1, 'input_ids': [1, 437, 1781, 3514, 279, 8929, 14243, 12566, 1906, 1058, 1371, 472, 385, 11423, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


### Prepare Training and needed functions

### configure Training

In [9]:

# Define the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=1, # optimized for low memory consumption
    per_device_eval_batch_size=1,  # optimized for low memory consumption
    gradient_accumulation_steps=1, # optimized for low memory consumption
    num_train_epochs=5,
    seed=42,                       # fixed seed for the run
    remove_unused_columns=False,   # do not let the Trainer drop dataset columns
    fp16=True,                     # optimized for low memory consumption
    # evaluation_strategy="epoch", # evaluation is disabled; no eval dataset is prepared in this notebook
    save_steps=5000,               # checkpoint interval in steps
    logging_dir="../../work/train/logs",
    output_dir="../../work/train/out"
)

# Distillation loss: soft-target KL term blended with a hard-label cross-entropy term
def compute_distillation_loss(student_logits, teacher_logits, temperature=2.0, alpha=0.5, labels=None):
    """Blend a temperature-softened KL term with a hard-label cross-entropy term.

    Args:
        student_logits: Raw student scores; softmax is taken over the last dimension.
        teacher_logits: Raw teacher scores with the same shape as ``student_logits``.
        temperature: Divisor applied to both logit sets before the softmax.
        alpha: Weight of the soft (KL) term; the hard term is weighted ``1 - alpha``.
        labels: Optional ground-truth class indices for the hard term. When
            omitted, the teacher's arg-max prediction is used as the target.

    Returns:
        Scalar tensor ``alpha * soft_loss + (1 - alpha) * hard_loss``.
    """
    functional = torch.nn.functional
    # The teacher only supplies targets; never back-propagate into it.
    teacher_logits = teacher_logits.detach()
    soft_labels = functional.softmax(teacher_logits / temperature, dim=-1)
    student_log_probs = functional.log_softmax(student_logits / temperature, dim=-1)
    # Gradients of the softened term scale with 1/T^2; multiplying by T^2 keeps
    # the soft and hard terms on a comparable scale when they are blended
    # (Hinton et al., 2015).
    soft_loss = functional.kl_div(student_log_probs, soft_labels, reduction='batchmean') * temperature ** 2
    # Prefer real labels for the hard term; fall back to the teacher's prediction.
    hard_targets = labels if labels is not None else torch.argmax(soft_labels, dim=-1)
    hard_loss = functional.cross_entropy(student_logits, hard_targets)
    return alpha * soft_loss + (1.0 - alpha) * hard_loss


# Metric function handed to the Trainer (accuracy over the evaluation predictions)
def compute_metrics(eval_predictions):
    """Return ``{"accuracy": ...}`` for a prediction object.

    The predicted class is the arg-max of ``eval_predictions.predictions``
    along axis 1; accuracy is the mean of its element-wise match with
    ``eval_predictions.label_ids``.
    """
    predicted_classes = eval_predictions.predictions.argmax(axis=1)
    matches = predicted_classes == eval_predictions.label_ids
    return {"accuracy": matches.mean()}

# Build the Trainer object around the student model.
# NOTE(review): this is the stock `Trainer`; neither `teacher_model` nor
# `compute_distillation_loss` is referenced in this call, so as written the run
# appears to optimize the model's own loss without any distillation term - confirm.
# NOTE(review): the tokenizer and collator passed here are the teacher's, while
# the model is the student - confirm both tokenizers share the same vocabulary.
# NOTE(review): `compute_metrics` is supplied, but no eval_dataset is passed
# (it is commented out below).
trainer = Trainer(
    model=student_model,
    args=training_args,
    tokenizer=teacher_tokenizer,
    train_dataset=train_dataset,
    #eval_dataset=eval_dataset,
    data_collator=teacher_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Train the student model.
# NOTE(review): the original comment said "with Knowledge Distillation"; confirm
# that the `trainer` object actually applies a distillation loss.

#!export WANDB_NOTEBOOK_NAME="pumatest"
# Set the notebook name for Weights & Biases through the environment.
os.environ["WANDB_NOTEBOOK_NAME"] = "pumatest"
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mthomas-t-schmitt[0m ([33mpumaai[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
500,1.3731
1000,1.3372
1500,0.7469
2000,0.701
2500,0.6957
3000,0.6954
3500,0.7573
4000,0.8865
4500,0.8062
5000,0.7496




In [None]:
from torch.utils.data import DataLoader

# Sanity check: batch the training set (batch size 4) with the teacher collator
# and print only the first batch.
train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=teacher_collator)

for batch in train_dataloader:
    print(batch)
    # Stop after the first batch; one example of the batch structure is enough.
    break

In [None]:
# Persist the trained student weights together with the student tokenizer.
save_path="/home/thsch026/masterarbeit/models/generated/kd3"
# An earlier cell wraps the student in torch.nn.DataParallel when several GPUs
# are visible. That wrapper does not expose `save_pretrained`, so save the
# underlying model (`.module`) in that case.
model_to_save = student_model.module if isinstance(student_model, torch.nn.DataParallel) else student_model
model_to_save.save_pretrained(save_path)
student_tokenizer.save_pretrained(save_path)


In [12]:
# Clean up memory: drop the notebook's reference to the student model and
# release cached CUDA memory.
# NOTE(review): `trainer` was built with model=student_model and its `del` is
# commented out, so presumably a reference to the model survives this cell -
# confirm that the memory is actually released.
#del trainer # Specify variable
del student_model
torch.cuda.empty_cache()