In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification,\
DataCollatorForLanguageModeling, DataCollatorWithPadding, Trainer, TrainingArguments, AutoModel, EvalPrediction, AutoConfig
from datasets import load_dataset, Dataset, Features, load_metric, DatasetDict
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import random
import numpy as np
import csv
from typing import Union

In [3]:
import sys
import os
# Check Python executable path
print("Python executable:", sys.executable)

# Check if CUDA_LAUNCH_BLOCKING is set
cuda_launch_blocking = "CUDA_LAUNCH_BLOCKING" in os.environ
print("CUDA_LAUNCH_BLOCKING set:", cuda_launch_blocking)

# Check if TORCH_USE_CUDA_DSA is set
torch_use_cuda_dsa = "TORCH_USE_CUDA_DSA" in os.environ
print("TORCH_USE_CUDA_DSA set:", torch_use_cuda_dsa)


Python executable: /home/amonfadi/.conda/envs/gpu_env2/bin/python
CUDA_LAUNCH_BLOCKING set: False
TORCH_USE_CUDA_DSA set: False


In [4]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [5]:
import json
import csv

# Path to the JSONL input file
input_jsonl_file = "he-nli-test.jsonl"

# Path to the CSV output file
output_csv_file = "jsonl_to_csv.csv"

def convert_jsonl_to_csv(input_jsonl_file, output_csv_file):
    # Open the JSONL input file for reading
    with open(input_jsonl_file, "r", encoding="utf-8") as jsonl_file:
        # Open the CSV output file for writing
        with open(output_csv_file, "w", newline="", encoding="utf-8") as csv_file:
            # Create a CSV writer object
            csv_writer = csv.writer(csv_file)

            # Write the header row to the CSV file
            csv_writer.writerow(["premise", "hypothesis", "label"])

            # Read each line from the JSONL file
            for line in jsonl_file:
                # Parse the JSON object
                data = json.loads(line)

                # Extract relevant fields from the JSON object
                premise = data.get("translation1", "")
                hypothesis = data.get("translation2", "")
                label_list = data.get("annotator_labels", "[]")

                # If label_list is a string representation of a list, parse it
                if isinstance(label_list, str):
                    label_list = eval(label_list)

                # Get the first label if available
                label = label_list[0] if label_list else ""

                # Write the extracted data to the CSV file
                csv_writer.writerow([premise, hypothesis, label])

In [6]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification,\
DataCollatorForLanguageModeling, DataCollatorWithPadding, Trainer, TrainingArguments, AutoModel, EvalPrediction, AutoConfig
from datasets import load_dataset, Dataset, Features, load_metric, DatasetDict
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import random
import numpy as np
import csv
from typing import Union
from transformers import TrainerCallback

    
    
class SaveCheckpointByEpochCallback(TrainerCallback):
    """
    Callback to save the model and tokenizer at the end of each epoch during training.

    This callback saves the model and tokenizer state to a specified directory after each training epoch,
    allowing for periodic checkpoints of the training process.

    """

    def __init__(self, output_dir: str, tokenizer):
        """
        Initialize the SaveCheckpointByEpochCallback.

        Args:
            output_dir (str): The directory where the checkpoints will be saved.
            tokenizer: The tokenizer associated with the model being trained.
        """
        self.output_dir = output_dir  # Set the directory to save the checkpoints
        self.tokenizer = tokenizer  # Set the tokenizer to be saved with the model

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        """
        Save the model and tokenizer at the end of each epoch.

        This method is called automatically by the Trainer at the end of each epoch.
        It saves the model and tokenizer to a subdirectory named after the current epoch.

        Args:
            args: The training arguments.
            state: The current state of the Trainer.
            control: The current control object.
            model: The model being trained.
            **kwargs: Additional keyword arguments.
        """
        epoch = state.epoch  # Get the current epoch number
        checkpoint_dir = f"{self.output_dir}/checkpoint-epoch-{int(epoch)}"  # Define the checkpoint directory for the current epoch
        model.save_pretrained(checkpoint_dir)  # Save the model to the checkpoint directory
        self.tokenizer.save_pretrained(checkpoint_dir)  # Save the tokenizer to the checkpoint directory    


class ModelTrainer:
        
    def __init__(self):
        pass
    
    def _set_nested_attribute(self, obj, attribute_string: str, value):
        """
        Set the value of a nested attribute in an object.

        This method sets the value of a nested attribute (e.g., "layer1.layer2.weight") in an object.

        Args:
            obj: The object containing the nested attribute.
            attribute_string (str): A string representing the nested attribute (e.g., "layer1.layer2.weight").
            value: The value to set for the specified nested attribute.
        """
        attrs = attribute_string.split('.')  # Split the attribute string into individual attributes
        current_obj = obj
        # Traverse the attribute hierarchy except for the last attribute
        for attr in attrs[:-1]:
            current_obj = getattr(current_obj, attr)  # Get the nested object
        setattr(current_obj, attrs[-1], value)  # Set the final attribute value

    def _get_nested_attribute(self, obj, attribute_string: str):
        """
        Get the value of a nested attribute from an object.

        This method retrieves the value of a nested attribute (e.g., "layer1.layer2.weight") from an object.

        Args:
            obj: The object containing the nested attribute.
            attribute_string (str): A string representing the nested attribute (e.g., "layer1.layer2.weight").

        Returns:
            The value of the specified nested attribute.
        """
        attributes = attribute_string.split(".")  # Split the attribute string into individual attributes
        layer_obj = obj
        # Traverse the attribute hierarchy
        for attribute_name in attributes:
            layer_obj = getattr(layer_obj, attribute_name)  # Get the nested object
        return layer_obj  # Return the final attribute value    
    
    
    
    
    
    
    def init_head(self, uninitialized_head : AutoModelForMaskedLM, initialized_head : AutoModelForMaskedLM, layers_to_init : list[str]):
        model_name = uninitialized_head.base_model.config._name_or_path   
        print(f"===================================Copying layers weights and biases to {model_name} model===========")
        # this is done to copy the whole layer and not just an attribute of it, for example, at first we get: "vocab_transform.weight", and I want to access the whole layer "vocab_transform"
        layers_to_init = list(set([".".join(layer.split(".")[:-1]) for layer in layers_to_init]))
        for init_layer_name in layers_to_init:
            if "." in init_layer_name: # if there are iterative nested attributes, for example: lm_head.decoder
                
                layer_obj = self._get_nested_attribute(initialized_head, init_layer_name)
                
                # attributes = init_layer_name.split(".")
                # layer_obj = initialized_head
                # for attribute_name in attributes:
                #     layer_obj = getattr(layer_obj, attribute_name)   
                self._set_nested_attribute(uninitialized_head, init_layer_name, layer_obj)
                
            else:           
                setattr(uninitialized_head, init_layer_name, getattr(initialized_head, init_layer_name))
            print(f"The {init_layer_name} layer was copied from the initialized head!")            
        print("===================================Done copying layers weights and biases===================================")
    
    
    
    
    def _preprocess_logits_for_metrics_mlm(self, logits, labels):
        if isinstance(logits, tuple):
            # Depending on the model and config, logits may contain extra tensors,
            # like past_key_values, but logits always come first
            logits = logits[0]
        return logits.argmax(dim=-1)


    def _compute_metrics_mlm(self, eval_pred):
        predictions, labels = eval_pred
        #predictions = logits.argmax(-1)
        metric = load_metric("accuracy")

        predictions = predictions.reshape(-1)
        labels = labels.reshape(-1)
        # Convert predictions and labels to lists
        mask = labels != -100
        labels = labels[mask]
        predictions = predictions[mask]

        return metric.compute(predictions=predictions, references=labels)
    
    
    def _compute_metrics_nli(self, p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(preds, axis=1)
        metric = load_metric("accuracy")
        result = metric.compute(predictions=preds, references=p.label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result

    
    def _train_mlm(self, model, tokenizer, dataset : Union[str, DatasetDict], num_samples_train, num_samples_validation, val_dataset, validate, batch_size, num_epochs, learning_rate, checkpoint_path, freeze_base, training_model_max_tokens):
                
#         class CustomTrainer(Trainer):
#             def save_model(self, output_dir=None, _internal_call=False):
#                 if output_dir is None:
#                     output_dir = self.args.output_dir
#                 super().save_model(output_dir, _internal_call)
#                 if self.tokenizer is not None:
#                     self.tokenizer.save_pretrained(output_dir)
#                     print(f"Tokenizer saved to {output_dir}")        

        # Tokenize the combined dataset
        def preprocess_function(dataset):
            return tokenizer(dataset['text'], truncation=True, padding=True, max_length=training_model_max_tokens)  
        
        if not isinstance(dataset, str) and not isinstance(dataset, DatasetDict):
            raise TypeError("dataset must be of type 'str' or 'Dataset'")
        
        if val_dataset is not None and not validate:
            raise ValueError("If a validation dataset is provided then validate must be True!")
        
        
        if isinstance(dataset, str):
            if dataset[-4:] != ".csv":
                raise ValueError("The dataset must be a path to a csv file.")
        
        
            sentences = []
            with open(dataset, newline='', encoding='utf-8') as csvfile:
                csv_reader = csv.reader(csvfile)
                for row in csv_reader:
                    if row[0]=="":
                        raise ValueError("There is an empty row at the dataset!")
                    # Assuming each row contains only one value
                    sentences.append(row[0])
                    
            #random.shuffle(sentences)
            if num_samples_train:
                training_set = sentences[:num_samples_train]
            else:
                training_set = sentences
                
            if val_dataset and validate:     
                validation_set=[]
                with open(val_dataset, newline='', encoding='utf-8') as csvfile:
                    csv_reader = csv.reader(csvfile)
                    for row in csv_reader:
                        if row[0]=="":
                            raise ValueError("There is an empty row at the dataset!")
                        # Assuming each row contains only one value
                        validation_set.append(row[0])
                            # Create Dataset objects for each split
                            
                train_dataset = Dataset.from_dict({"text": training_set})
                validation_dataset = Dataset.from_dict({"text": validation_set})

                dataset = DatasetDict({"train": train_dataset, "validation": validation_dataset})
            elif validate:
                # Split samples into training and validation sets
                if num_samples_train:
                    validation_set = sentences[num_samples_train:]
                
                else:
                    raise TypeError("Since num_samples_train is not provided, the validation dataset would include samples from training, so please specify num_samples_train")
                    
                # Create Dataset objects for each split
                train_dataset = Dataset.from_dict({"text": training_set})
                validation_dataset = Dataset.from_dict({"text": validation_set})

                dataset = DatasetDict({"train": train_dataset, "validation": validation_dataset})
                
            else:
                # Create Dataset objects for each split
                train_dataset = Dataset.from_dict({"text": training_set})
                dataset = DatasetDict({"train": train_dataset})
        
                      
        tokenized_dataset = dataset.map(preprocess_function, batched=True)

        if num_samples_train:
            print(f"Sampling {num_samples_train} training samples!")
            train_sampled_dataset = tokenized_dataset['train'].select(range(num_samples_train))
        else:
            print(f"num_samples_train was not provided, using whole {len(tokenized_dataset['train'])} training samples!")
            train_sampled_dataset = tokenized_dataset['train']
                    
        if num_samples_validation and validate:
            print(f"Sampling {num_samples_validation} validation samples!")
            validation_sampled_dataset = tokenized_dataset['validation'].select(range(num_samples_validation))
            
        elif validate:
            print(f"num_samples_validation was not provided, using whole {len(tokenized_dataset['validation'])} validation samples!")
            validation_sampled_dataset = tokenized_dataset['validation']
                
 
#         # Sample the indices of the items
#         train_sampled_indices = random.sample(range(len(tokenized_dataset['train'])), num_samples_train)
#         # Create a new dataset with the sampled items
#         train_sampled_dataset=tokenized_dataset['train'].select(train_sampled_indices)
        
          
#         validation_sampled_indices = random.sample(range(len(tokenized_dataset['validation'])), num_samples_validation)
#         validation_sampled_dataset=tokenized_dataset['validation'].select(validation_sampled_indices)



        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

        # Freeze/unfreeze base model
        for param in model.base_model.parameters():
            param.requires_grad = not freeze_base

        if validate:
            # Define training arguments
            training_args = TrainingArguments(
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=num_epochs,
                learning_rate=learning_rate,
                evaluation_strategy="epoch",  # Log metrics at the end of each epoch
                logging_dir="./mlm_training/logs/logging_mlm",
                output_dir="./mlm_training/output", 
                overwrite_output_dir = True,
                save_strategy="no",
                #save_strategy="epoch",  # Save checkpoint at the end of each epoch
            )

            # Define Trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_sampled_dataset,
                eval_dataset=validation_sampled_dataset,
                data_collator=data_collator,
                compute_metrics=self._compute_metrics_mlm,
                preprocess_logits_for_metrics=self._preprocess_logits_for_metrics_mlm,
                callbacks=[SaveCheckpointByEpochCallback(checkpoint_path, tokenizer)],
            )
        else:
            # Define training arguments
            training_args = TrainingArguments(
                per_device_train_batch_size=batch_size,
                num_train_epochs=num_epochs,
                learning_rate=learning_rate,
                logging_dir="./mlm_training/logs/logging_mlm",  
                output_dir="./mlm_training/output", 
                overwrite_output_dir = True,
                save_strategy="no",
                #save_strategy="epoch",  # Save checkpoint at the end of each epoch
            )

            # Define Trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_sampled_dataset,
                data_collator=data_collator,
                compute_metrics=self._compute_metrics_mlm,
                preprocess_logits_for_metrics=self._preprocess_logits_for_metrics_mlm,
                callbacks=[SaveCheckpointByEpochCallback(checkpoint_path, tokenizer)],

            )

        # Train the model
        trainer.train()
        return model

    
    
    def _train_nli(self, model, tokenizer, dataset : Union[str, DatasetDict], num_samples_train, num_samples_validation, val_dataset, validate, batch_size, num_epochs, learning_rate, checkpoint_path, freeze_base, training_model_max_tokens):
                  
                  
        # Tokenize the combined dataset
        def preprocess_function(dataset):
            return tokenizer(dataset['premise'], dataset['hypothesis'], padding=True, truncation=True, max_length=training_model_max_tokens)  
        
        
        if not isinstance(dataset, str) and not isinstance(dataset, DatasetDict):
            raise TypeError("dataset must be of type 'str' or 'Dataset'")
            
        if val_dataset is not None and not validate:
            raise ValueError("If a validation dataset is provided then validate must be True!")
        
        if isinstance(dataset, str):                
            if dataset[-4:] != ".csv":
                raise ValueError("The dataset must be a path to a csv file.")
        
            training_premise=[]
            training_hypothesis=[]
            training_label=[]
            label2_id = {'entailment': 0, 'neutral': 1, 'contradiction':2}
            with open(dataset, newline='', encoding='utf-8') as csvfile:
                csv_reader = csv.reader(csvfile)
                next(csv_reader)  # Skip the header row
                for row in csv_reader:
                    training_premise.append(row[0])
                    training_hypothesis.append(row[1])
                    training_label.append(label2_id[row[2]])
                    
            if num_samples_train:
                training_premise=training_premise[:num_samples_train]
                training_hypothesis=training_hypothesis[:num_samples_train]
                training_label=training_label[:num_samples_train]
                    
            if val_dataset and validate:     
                validation_premise=[]
                validation_hypothesis=[]
                validation_label=[]
                with open(val_dataset, newline='', encoding='utf-8') as csvfile:
                    csv_reader = csv.reader(csvfile)
                    next(csv_reader)  # Skip the header row
                    for row in csv_reader:
                        validation_premise.append(row[0])
                        validation_hypothesis.append(row[1])
                        validation_label.append(label2_id[row[2]])

                # Create a dictionary with the data
                training_set = {
                    'premise': training_premise,
                    'hypothesis': training_hypothesis,
                    'label': training_label
                }
                
                # Create a dictionary with the data
                validation_set = {
                    'premise': validation_premise,
                    'hypothesis': validation_hypothesis,
                    'label': validation_label
                }

                # Create Dataset objects for each split
                train_dataset = Dataset.from_dict(training_set)
                validation_dataset = Dataset.from_dict(validation_set)

                dataset = DatasetDict({"train": train_dataset, "validation_matched": validation_dataset})
                
                
                
            elif validate:  
                if num_samples_train:                             
                    validation_premise = training_premise[num_samples_train:]
                    validation_hypothesis = training_hypothesis[num_samples_train:]
                    validation_label = training_label[num_samples_train:]
                
                else:
                    raise TypeError("Since num_samples_train is not provided, the validation dataset would include samples from training, so please specify num_samples_train")
                # Create a dictionary with the data
                training_set = {
                    'premise': training_premise,
                    'hypothesis': training_hypothesis,
                    'label': training_label
                }
                
                # Create a dictionary with the data
                validation_set = {
                    'premise': validation_premise,
                    'hypothesis': validation_hypothesis,
                    'label': validation_label
                }

                # Create Dataset objects for each split
                train_dataset = Dataset.from_dict(training_set)
                validation_dataset = Dataset.from_dict(validation_set)

                dataset = DatasetDict({"train": train_dataset, "validation_matched": validation_dataset})
                
            else:
                training_set = {
                    'premise': training_premise,
                    'hypothesis': training_hypothesis,
                    'label': training_label
                }
                # Create Dataset objects for each split
                train_dataset = Dataset.from_dict({"features": training_set})
                dataset = DatasetDict({"train": train_dataset})
        

            
        tokenized_dataset = dataset.map(preprocess_function, batched=True)
        if num_samples_train:
            print(f"Sampling {num_samples_train} training samples!")
            train_sampled_dataset = tokenized_dataset['train'].select(range(num_samples_train))
        else:
            print(f"num_samples_train was not provided, using whole {len(tokenized_dataset['train'])} training samples!")
            train_sampled_dataset = tokenized_dataset['train']

        if num_samples_validation and validate:
            print(f"Sampling {num_samples_validation} validation samples!")
            validation_sampled_dataset = tokenized_dataset['validation_matched'].select(range(num_samples_validation))

        elif validate:
            print(f"num_samples_validation was not provided, using whole {len(tokenized_dataset['validation_matched'])} validation samples!")
            validation_sampled_dataset = tokenized_dataset['validation_matched']            
                  

        
#         train_random_indices = random.sample(range(len(tokenized_dataset['train'])), num_samples_train)
#         train_sampled_dataset = tokenized_dataset['train'].select(train_random_indices)        
        
#         validation_random_indices = random.sample(range(len(tokenized_dataset['validation_matched'])), num_samples_validation)
#         validation_sampled_dataset = tokenized_dataset['validation_matched'].select(validation_random_indices)
        
        
        
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
        # Freeze/unfreeze base model
        for param in model.base_model.parameters():
            param.requires_grad = not freeze_base
        
        
        if validate:
            # Define training arguments
            training_args = TrainingArguments(
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=num_epochs,
                learning_rate=learning_rate,
                evaluation_strategy="epoch",  # Log metrics at the end of each epoch
                logging_dir="./nli_training/logs/logging_nli",  
                output_dir="./nli_training/output_benevolent/",
                #save_strategy="epoch",  # Save checkpoint at the end of each epoch
                overwrite_output_dir = True,
                save_strategy="no",

            )

            # Define Trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_sampled_dataset,
                eval_dataset=validation_sampled_dataset,
                data_collator=data_collator,
                compute_metrics=self._compute_metrics_mlm,
                preprocess_logits_for_metrics=self._preprocess_logits_for_metrics_mlm,
                callbacks=[SaveCheckpointByEpochCallback(checkpoint_path, tokenizer)],
            )
        else:
            # Define training arguments
            training_args = TrainingArguments(
                per_device_train_batch_size=batch_size,
                num_train_epochs=num_epochs,
                learning_rate=learning_rate,
                logging_dir="./nli_training/logs/logging_nli",  
                output_dir="./nli_training/output",
                #save_strategy="epoch",  # Save checkpoint at the end of each epoch
                overwrite_output_dir = True,
                save_strategy="no",

            )
            

            # Define Trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_sampled_dataset,
                data_collator=data_collator,
                compute_metrics=self._compute_metrics_mlm,
                preprocess_logits_for_metrics=self._preprocess_logits_for_metrics_mlm,
                callbacks=[SaveCheckpointByEpochCallback(checkpoint_path, tokenizer)],
            )
    
        # Train the model
        trainer.train()
        return model
    
    
    def get_non_base_layers(self, model):
        
        all_layers = list(model.state_dict().keys())
        base_layers = list(model.base_model.state_dict().keys())
        head_layers=[]
        for layer in all_layers:
            if ".".join(layer.split(".")[1:]) not in base_layers: # when looping over the layers of the base model we want to remove the prefix of the layer which is the name of the model, hence the ".".join(layer.split(".")[1:])
                head_layers.append(layer)
                
        return head_layers
    
    
    def attach_head_to_model(self, model, model_identifier : str, head):       
        setattr(head, model_identifier, getattr(model,model_identifier))
    
        

    def train_head(self, model, tokenizer, dataset, nli_head=False, mlm_head=False, model_to_copy_weights_from=None, num_samples_train=None, num_samples_validation=None,val_dataset=None,validate=True,training_model_max_tokens=512, batch_size=16, num_epochs=10, learning_rate=2e-5, freeze_base = False, copy_weights=False, checkpoint_path=None):
        model_name = model.base_model.config._name_or_path         
        
        if  (not nli_head and not mlm_head) or (nli_head and mlm_head): # if both false or both true
            raise ValueError("You must have one head (nli_head or mlm_head) set to True at a time.")
            

        if copy_weights:
            
            if not model_to_copy_weights_from:
                raise ValueError("Please pass in a model (model_to_copy_weights_from=?) to load the initialized layers from!")
                
            
            get_initialized_layers = self.get_non_base_layers(model_to_copy_weights_from)
            get_uninitialized_layers = self.get_non_base_layers(model)
            if sorted(get_uninitialized_layers)!=sorted(get_initialized_layers):
                raise ValueError(f"Models architecture are not equal, make sure that {model_to_copy_weights_from.base_model.config._name_or_path} head layers are the same as {model_name}'s")
            self.init_head(model, model_to_copy_weights_from, get_uninitialized_layers)

        
        if nli_head:
            print(f"Detected {model_name} with an NLI head...")
            if not checkpoint_path:
                checkpoint_path = "./nli_training_checkpoint"
            self._train_nli(model, tokenizer, dataset, num_samples_train, num_samples_validation,val_dataset, validate, batch_size, num_epochs, learning_rate, checkpoint_path, freeze_base, training_model_max_tokens)
        elif mlm_head:
            print(f"Detected {model_name} with an MLM head...")
            if not checkpoint_path:
                checkpoint_path = "./mlm_training_checkpoint"
            self._train_mlm(model, tokenizer, dataset, num_samples_train, num_samples_validation,val_dataset, validate, batch_size, num_epochs, learning_rate, checkpoint_path, freeze_base, training_model_max_tokens)
            

### Example of an MNLI head loaded from an MNLI model trained with csv.

In [7]:
# Example usage
base_model_name = "typeform/distilbert-base-uncased-mnli"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
nli_model = AutoModelForSequenceClassification.from_pretrained(base_model_name)
dataset = "./jsonl_to_csv.csv"


trainer = ModelTrainer()
trainer.train_head(nli_model, tokenizer, nli_head=True, dataset=dataset, num_samples_train=4000, num_samples_validation=1000, copy_weights=False, freeze_base = False)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Detected typeform/distilbert-base-uncased-mnli with an NLI head...


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Sampling 4000 training samples!
Sampling 1000 validation samples!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mfadi_[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,1.071400,No log
3,1.071400,No log
4,0.922700,No log
5,0.922700,No log
6,0.790300,No log
7,0.790300,No log
8,0.633600,No log
9,0.633600,No log
10,0.534000,No log


### Example of an MLM head loaded from an MLM model trained with dataset.

In [21]:
# Example usage
base_model_name = "distilbert/distilbert-base-uncased" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
mlm_model = AutoModelForMaskedLM.from_pretrained(base_model_name)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
#dataset = "../AMI/datasets/ami_hostility_towards_men.csv"
trainer = ModelTrainer()

trainer.train_head(model=mlm_model, tokenizer=tokenizer,dataset=dataset,freeze_base=False, mlm_head=True, num_samples_train=10000, num_samples_validation=2000,validate=True, batch_size=16, training_model_max_tokens=512)

Detected distilbert/distilbert-base-uncased with an MLM head...
Sampling 10000 training samples!
Sampling 2000 validation samples!


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,2.1207,1.847978,0.625923
2,1.9863,1.80268,0.64054
3,1.9045,1.832532,0.627107
4,1.8634,1.858084,0.628424
5,1.8219,1.836857,0.628479
6,1.7814,1.83294,0.631115
7,1.7669,1.829467,0.629548
8,1.7377,1.794244,0.635866
9,1.7088,1.794745,0.63546
10,1.7172,1.827078,0.634092


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

### Example of an MLM head loaded from an MLM model trained with csv.

In [6]:
# Example usage
base_model_name = "distilbert/distilbert-base-uncased" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
mlm_model = AutoModelForMaskedLM.from_pretrained(base_model_name)
#dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset = "../AMI/datasets/ami_hostility_towards_men.csv"
trainer = ModelTrainer()

trainer.train_head(model=mlm_model, tokenizer=tokenizer,dataset=dataset,freeze_base=False, mlm_head=True,validate=False, batch_size=4, training_model_max_tokens=512)

Detected distilbert/distilbert-base-uncased with an MLM head...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

num_samples_train was not provided, using whole 100 training samples!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mfadi_[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


### Example of an MLM head loaded from an NLI model with weights and biases initialized randomly, trained with dataset.

In [7]:
base_model_name = "typeform/distilbert-base-uncased-mnli" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
mlm_model = AutoModelForMaskedLM.from_pretrained(base_model_name)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
#dataset = "../AMI/datasets/ami_hostility_towards_men.csv"
trainer = ModelTrainer()

trainer.train_head(model=mlm_model, tokenizer=tokenizer,dataset=dataset, mlm_head=True, copy_weights=False,num_samples_train=10000, num_samples_validation=2000, batch_size=16, training_model_max_tokens=512)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Some weights of the model checkpoint at typeform/distilbert-base-uncased-mnli were not used when initializing DistilBertForMaskedLM: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
- This IS expected if you are initializing DistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassific

Detected typeform/distilbert-base-uncased-mnli with an MLM head...
Sampling 10000 training samples!
Sampling 2000 validation samples!


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,5.9817,4.641768,0.32606
2,4.5498,3.976776,0.390801
3,4.0186,3.655701,0.426218
4,3.5206,3.470596,0.454778
5,3.3475,3.303706,0.472434
6,3.2063,3.212016,0.485991
7,3.1312,3.134902,0.495727
8,3.0195,3.025861,0.504404
9,2.9671,3.025415,0.506924
10,2.9444,3.044332,0.510294


  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric fro

### Example of an MLM head loaded from an NLI model with weights and biases initialized randomly, trained with csv.

In [8]:
base_model_name = "typeform/distilbert-base-uncased-mnli" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
mlm_model = AutoModelForMaskedLM.from_pretrained(base_model_name)
#dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset = "../AMI/datasets/ami_hostility_towards_men.csv"
trainer = ModelTrainer()

trainer.train_head(model=mlm_model, tokenizer=tokenizer,dataset=dataset, mlm_head=True, validate=False, batch_size=4, training_model_max_tokens=512)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Some weights of the model checkpoint at typeform/distilbert-base-uncased-mnli were not used when initializing DistilBertForMaskedLM: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
- This IS expected if you are initializing DistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassific

Detected typeform/distilbert-base-uncased-mnli with an MLM head...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

num_samples_train was not provided, using whole 100 training samples!


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


### Example of an MLM head loaded from an NLI model with copied weights and biases from a trained MLM head, trained on dataset.

In [9]:
# Example usage
base_model_name = "typeform/distilbert-base-uncased-mnli" 
mlm_initialized_head = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-uncased")
# Example usage
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
mlm_model = AutoModelForMaskedLM.from_pretrained(base_model_name)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
#dataset = "../AMI/datasets/ami_hostility_towards_men.csv"
trainer = ModelTrainer()

trainer.train_head(model=mlm_model, tokenizer=tokenizer,dataset=dataset, mlm_head=True, model_to_copy_weights_from=mlm_initialized_head, copy_weights=True,num_samples_train=10000, num_samples_validation=2000, batch_size=16, training_model_max_tokens=512)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Some weights of the model checkpoint at typeform/distilbert-base-uncased-mnli were not used when initializing DistilBertForMaskedLM: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
- This IS expected if you are initializing DistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassific

The vocab_transform layer was copied from the initialized head!
The vocab_projector layer was copied from the initialized head!
The vocab_layer_norm layer was copied from the initialized head!
Detected typeform/distilbert-base-uncased-mnli with an MLM head...
Sampling 10000 training samples!
Sampling 2000 validation samples!


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,2.5653,2.080645,0.591742
2,2.2321,1.982854,0.611504
3,2.1126,1.991851,0.602558
4,2.0316,1.996808,0.608984
5,1.9829,1.966853,0.608812
6,1.9306,1.968479,0.609883
7,1.9099,1.959997,0.610469
8,1.8755,1.914934,0.619345
9,1.8448,1.925399,0.616508
10,1.8521,1.952806,0.614754


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

### Example of an MLM head loaded from an NLI model with copied weights and biases from a trained MLM head, trained on csv.

In [10]:
# Example usage
base_model_name = "typeform/distilbert-base-uncased-mnli" 
mlm_initialized_head = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-uncased")
# Example usage
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
mlm_model = AutoModelForMaskedLM.from_pretrained(base_model_name)
#dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset = "../AMI/datasets/ami_hostility_towards_men.csv"
trainer = ModelTrainer()

trainer.train_head(model=mlm_model, tokenizer=tokenizer,dataset=dataset, mlm_head=True, model_to_copy_weights_from=mlm_initialized_head, copy_weights=True,validate=False, batch_size=4, training_model_max_tokens=512)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Some weights of the model checkpoint at typeform/distilbert-base-uncased-mnli were not used when initializing DistilBertForMaskedLM: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
- This IS expected if you are initializing DistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassific

The vocab_transform layer was copied from the initialized head!
The vocab_projector layer was copied from the initialized head!
The vocab_layer_norm layer was copied from the initialized head!
Detected typeform/distilbert-base-uncased-mnli with an MLM head...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

num_samples_train was not provided, using whole 100 training samples!


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


### Example of an NLI head fine tuned with NLI model, trained on dataset.

In [11]:
# Example usage
base_model_name = "typeform/distilbert-base-uncased-mnli"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
nli_model = AutoModelForSequenceClassification.from_pretrained(base_model_name)
dataset = load_dataset('multi_nli')


trainer = ModelTrainer()
trainer.train_head(nli_model, tokenizer, nli_head=True, dataset=dataset, num_samples_train=10000, num_samples_validation=2000, copy_weights=False, freeze_base = False)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Detected typeform/distilbert-base-uncased-mnli with an NLI head...


Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Sampling 10000 training samples!
Sampling 2000 validation samples!


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1799,0.929243,0.8215
2,0.1211,1.063527,0.8055
3,0.0907,1.253619,0.812
4,0.0392,1.351176,0.8155
5,0.0253,1.431156,0.817
6,0.025,1.482971,0.822
7,0.0184,1.523677,0.8215
8,0.0166,1.564941,0.823
9,0.0083,1.591428,0.824
10,0.0094,1.601259,0.821


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

### Example of an NLI head loaded from an MLM model with weights and biases initialized randomly.

In [12]:
# Example usage
config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", num_labels = 3)
base_model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
nli_model = AutoModelForSequenceClassification.from_pretrained(base_model_name , config=config)
trainer = ModelTrainer()
dataset = load_dataset('multi_nli')


trainer.train_head(nli_model, tokenizer,dataset=dataset, nli_head=True, num_samples_train=10000, num_samples_validation=2000, copy_weights=False, freeze_base = False)

Some weights of the model checkpoint at distilbert/distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifi

Detected distilbert/distilbert-base-uncased with an NLI head...
Sampling 10000 training samples!
Sampling 2000 validation samples!


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9192,0.77236,0.664
2,0.6783,0.752366,0.68
3,0.5067,0.811651,0.704
4,0.2502,1.0528,0.6855
5,0.1504,1.327794,0.7
6,0.1255,1.646691,0.6815
7,0.0927,1.760503,0.7025
8,0.0599,1.821774,0.7035
9,0.0377,1.903767,0.7
10,0.0318,1.926256,0.7


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

### Example of an NLI head loaded from MLM model with copied weights and biases from a trained NLI head.

In [14]:
# Example usage
config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", num_labels = 3)
base_model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
nli_model = AutoModelForSequenceClassification.from_pretrained(base_model_name , config=config)
nli_initialized_head = AutoModelForSequenceClassification.from_pretrained("typeform/distilbert-base-uncased-mnli")
trainer = ModelTrainer()
dataset = load_dataset('multi_nli')

trainer.train_head(nli_model, tokenizer,dataset=dataset, nli_head=True, model_to_copy_weights_from=nli_initialized_head, num_samples_train=10000, num_samples_validation=2000, copy_weights=True, freeze_base = False)

Some weights of the model checkpoint at distilbert/distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifi

The pre_classifier layer was copied from the initialized head!
The classifier layer was copied from the initialized head!
Detected distilbert/distilbert-base-uncased with an NLI head...
Sampling 10000 training samples!
Sampling 2000 validation samples!


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8599,0.727506,0.6835
2,0.6282,0.756342,0.692
3,0.45,0.895567,0.698
4,0.2,1.156263,0.7005
5,0.1048,1.516546,0.706
6,0.0802,1.767782,0.7095
7,0.06,1.955309,0.7115
8,0.0245,2.112643,0.7115
9,0.0181,2.162693,0.7165
10,0.0174,2.167689,0.717


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

In [None]:
def test_mlm(model, tokenizer, sentence, device="cpu"):  # Pass device as an argument
    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt")

    # Move inputs to the specified device
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move all tensors to device

    # Find the index of the [MASK] token in the tokenized sequence
    mask_index = inputs['input_ids'][0].tolist().index(tokenizer.mask_token_id)

    # Generate predictions
    with torch.no_grad():
        # Move the model to the specified device
        model.to(device)  # Move the model to device
        outputs = model(**inputs)

    # Get the predicted logits for the masked token
    masked_token_logits = outputs.logits[0, mask_index].cpu().numpy()

    # Convert logits to probabilities
    masked_token_probs = torch.softmax(torch.tensor(masked_token_logits), dim=-1).numpy()

    # Get the predicted token ID with highest probability
    predicted_token_id = int(masked_token_probs.argmax())

    # Get the predicted token
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_token_id])[0]

    return predicted_token

# Example sentence to test
sentence = "The quick brown [MASK] jumps over the lazy dog."

# Test the MLM model
predicted_token = test_mlm(mlm_model, tokenizer, sentence)  # Specify device as "cpu"

print("Predicted token:", predicted_token)
