In [1]:
!pip install --upgrade pip
!pip uninstall transformers -y #uninstall version you have currently, just bc we might need diff versions for diff fine tuning models
#!pip install transformers==4.18.0 #note that we need an older version of transformers to use this model
!pip install transformers 
!pip install --no-cache-dir transformers sentencepiece
!pip install datasets
!pip install accelerate -U

[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/jupyter-1.0.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Successfully uninstalled transformers-4.41.2
[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/jupyter-1.0.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mCollecting transformers
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Using cached transformers-4.41.2-py3-none-any.whl (9.1 MB)
Installing collected packages: transformers
Successfully installed transformers-4.41.2
[33mDEPRECATION: Loading

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset, load_metric
import torch
import pandas as pd
import numpy as np
from google.colab import files
from google.colab import drive
import glob
import zipfile
from sklearn.model_selection import train_test_split

In [3]:
def compute_metrics(p):
    metric = load_metric("accuracy")
    logits, labels = p
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [4]:
# Instantiate the tokenizer for the french model
tokenizer = AutoTokenizer.from_pretrained("osiria/bert-tweet-italian-uncased-sentiment")


def tokenize_function(examples, tokenizer=tokenizer):
    """
    Function to tokenize the data.
    examples : data to tokenize ; dict
    tokenizer : tokenizer to use ; DistilBertTokenizer
    """
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

In [5]:
def model(train_comments, train_labels,
                    val_comments, val_labels, test_comments,
                    test_labels,
                    batch_size_train, batch_size_val,
                    epochs, tokenizer=tokenizer, learning_rate=1e-4, weight_decay=0.001):
    """
    Function to train a  model on the data.
    train_comments : comments for training ; lst of str
    train_labels : labels for training ; lst of int
    val_comments : comments for validation ; lst of str
    val_labels : labels for validation ; lst of int
    batch_size_train : batch size for training ; int
    batch_size_val : batch size for validation ; int
    epochs : number of epochs ; int
    tokenizer : tokenizer to use
    """
    # Set the device to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained("osiria/bert-tweet-italian-uncased-sentiment")
    model.to(device) # French model has no gpu acceleration

    # Setup the Hugging Face Dataset Class
    train_dataset_dict = {"text": train_comments, "label": train_labels}
    val_dataset_dict = {"text": val_comments, "label": val_labels}
    test_dataset_dict = {"text": test_comments, "label": test_labels}

    train_dataset = Dataset.from_dict(train_dataset_dict)
    val_dataset = Dataset.from_dict(val_dataset_dict)
    test_dataset = Dataset.from_dict(test_dataset_dict)

    # Apply the tokenizer to the datasets
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Remove columns we do not need for training
    train_dataset = train_dataset.remove_columns(["text"])
    val_dataset = val_dataset.remove_columns(["text"])
    test_dataset = test_dataset.remove_columns(["text"])

    # Set the format of the datasets to PyTorch tensors
    train_dataset.set_format("torch")
    val_dataset.set_format("torch")
    test_dataset.set_format("torch")


    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=epochs,              # total number of training epochs
        per_device_train_batch_size=batch_size_train,  # batch size for training
        per_device_eval_batch_size=batch_size_val,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=weight_decay,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        evaluation_strategy="steps",     # Evaluate every `eval_steps`
        eval_steps=10,                   # Number of steps between evaluations
        save_steps=10,                   # Save the model every `save_steps`
        load_best_model_at_end=True,     # Load the best model at the end of training#
        learning_rate= learning_rate,              # Set the learning rate
        metric_for_best_model="eval_loss", # Use evaluation loss to check how good our model is performing
        greater_is_better=False,
    )

    # Trainer
    trainer = Trainer(
        model=model,                         # model
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,            # evaluation dataset
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)], # Early Stopping for Overfitting
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the test set
    test_results = trainer.evaluate(eval_dataset=test_dataset)

    # Get test accuracy
    print("Test Accuracy:", test_results['eval_accuracy'])



    return model, tokenizer, test_results



In [6]:
def save_model(model, tokenizer, path):
    """
    Function to save the model
    model : model to save ;
    tokenizer : tokenizer to save ;
    path : path to save the model ; str
    """

    model_save_path =  path
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

In [None]:
# Open a file upload dialog
# Select here all files to upload!
# If already uploaded, just press 'Cancel Upload'
# They are contained within a zip file.
uploaded = files.upload()

In [None]:
# Set the path to the data
# On local machine use the relative path, for example
# path = 'NLP labelled data preview/english set/'
# On Google Colab use this path
# '/content/'
path = '/content/'

In [None]:
# Unzip the folder
with zipfile.ZipFile(path + 'italian_data_finetuning.zip', 'r') as zip_ref:
    zip_ref.extractall('italian_data_finetuning')

In [None]:
# Load the dataset
all_comments_with_labels = glob.glob(path + 'italian_data_finetuning/italian_data_finetuning/*.csv')

In [None]:
# Read in the data
all_comments_with_labels = pd.concat([pd.read_csv(f) for f in all_comments_with_labels], ignore_index = True)

In [7]:
# TODO : remove, just testing purposes on local machine
all_comments_with_labels = pd.read_csv('/Users/marlon/VS-Code-Projects/Youtube/Comments DB/Italian/Finetuning/TEST_ITALIAN.csv')

In [8]:
import random
# FOR TESTING PURPOSES (since we don't have the labels for french yet)
# Add a new column to the dataframe all_comments_with_labels and fill it up with integer values randomly distributed between 0 and 2
all_comments_with_labels['label'] = [random.randint(0, 1) for i in range(len(all_comments_with_labels))]

In [9]:
# Conversion to labels of the model we fine-tune
# model stores 0 as negative, 1 as neutral and 2 as positive
all_comments_with_labels['label'] = all_comments_with_labels['label'].replace({0: 0, 1: 2})

In [10]:
# Seperate the two columns in the dataframe into comments and labels and turn them into lists
comments = all_comments_with_labels['Comment'].tolist()
labels = all_comments_with_labels['label'].tolist()


In [11]:
# Convert the labels to integers
labels = [int(label) for label in labels]
# Turn all elements in comments into strings
comments = [str(comment) for comment in comments]


In [12]:
# Assess distributions
print("Number of comments: ", len(comments))
print("Number of negative comments: ", labels.count(0))
print("Number of neutral comments: ", labels.count(2))



Number of comments:  3907
Number of negative comments:  1956
Number of neutral comments:  1951


In [13]:
# Split the data into a train and test set with stratification
train_comments, temp_comments, train_labels, temp_labels = train_test_split(
    comments, labels, test_size=0.3, random_state=42, stratify=labels)



In [14]:
# Split the data into training and validation sets with stratification
val_comments, test_comments, val_labels, test_labels = train_test_split(temp_comments, temp_labels, test_size=0.1, random_state=42, stratify= temp_labels)

In [15]:
# Train the model
model_trained, tokenizer_trained, test_results = model(train_comments, train_labels, val_comments, val_labels,test_comments, test_labels, batch_size_train = 16, batch_size_val = 8, epochs = 1, tokenizer = tokenizer, learning_rate=1e-4, weight_decay=0.001)


The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !


Map:   0%|          | 0/2734 [00:00<?, ? examples/s]

The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !


Map:   0%|          | 0/1055 [00:00<?, ? examples/s]

The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !


Map:   0%|          | 0/118 [00:00<?, ? examples/s]



  0%|          | 0/171 [00:00<?, ?it/s]



{'loss': 0.5793, 'grad_norm': 23.897062301635742, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.06}


  0%|          | 0/132 [00:00<?, ?it/s]

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.3808104395866394, 'eval_accuracy': 0.36492890995260663, 'eval_runtime': 8.4916, 'eval_samples_per_second': 124.24, 'eval_steps_per_second': 15.545, 'epoch': 0.06}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.2002, 'grad_norm': 14.385143280029297, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.12}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.17519886791706085, 'eval_accuracy': 0.42274881516587676, 'eval_runtime': 6.7832, 'eval_samples_per_second': 155.531, 'eval_steps_per_second': 19.46, 'epoch': 0.12}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0773, 'grad_norm': 2.75414776802063, 'learning_rate': 6e-06, 'epoch': 0.18}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.04268937557935715, 'eval_accuracy': 0.48530805687203793, 'eval_runtime': 6.4746, 'eval_samples_per_second': 162.944, 'eval_steps_per_second': 20.387, 'epoch': 0.18}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0153, 'grad_norm': 0.02131364494562149, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.23}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.006227274425327778, 'eval_accuracy': 0.4985781990521327, 'eval_runtime': 6.2555, 'eval_samples_per_second': 168.651, 'eval_steps_per_second': 21.101, 'epoch': 0.23}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0013, 'grad_norm': 0.4061475992202759, 'learning_rate': 1e-05, 'epoch': 0.29}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.0008363722590729594, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.2904, 'eval_samples_per_second': 167.717, 'eval_steps_per_second': 20.985, 'epoch': 0.29}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0008, 'grad_norm': 0.007836715318262577, 'learning_rate': 1.2e-05, 'epoch': 0.35}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.00041940799565054476, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.2841, 'eval_samples_per_second': 167.884, 'eval_steps_per_second': 21.005, 'epoch': 0.35}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0004, 'grad_norm': 0.012791426852345467, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.41}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.0002989808563143015, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.2991, 'eval_samples_per_second': 167.485, 'eval_steps_per_second': 20.955, 'epoch': 0.41}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0005, 'grad_norm': 0.019003180786967278, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.47}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.00023443963436875492, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.3083, 'eval_samples_per_second': 167.241, 'eval_steps_per_second': 20.925, 'epoch': 0.47}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0005, 'grad_norm': 0.020320503041148186, 'learning_rate': 1.8e-05, 'epoch': 0.53}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.0001916384935611859, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.5665, 'eval_samples_per_second': 160.663, 'eval_steps_per_second': 20.102, 'epoch': 0.53}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0002, 'grad_norm': 0.019119389355182648, 'learning_rate': 2e-05, 'epoch': 0.58}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.0001588244194863364, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.3014, 'eval_samples_per_second': 167.423, 'eval_steps_per_second': 20.948, 'epoch': 0.58}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0003, 'grad_norm': 0.006308269686996937, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.64}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.00013039256737101823, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.5846, 'eval_samples_per_second': 160.223, 'eval_steps_per_second': 20.047, 'epoch': 0.64}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0002, 'grad_norm': 0.00529207568615675, 'learning_rate': 2.4e-05, 'epoch': 0.7}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.00010894491424551234, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 7.3868, 'eval_samples_per_second': 142.822, 'eval_steps_per_second': 17.87, 'epoch': 0.7}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0003, 'grad_norm': 0.007099503185600042, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.76}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 9.237420454155654e-05, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.329, 'eval_samples_per_second': 166.693, 'eval_steps_per_second': 20.856, 'epoch': 0.76}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0002, 'grad_norm': 0.00301172467879951, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.82}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 7.824739441275597e-05, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.3019, 'eval_samples_per_second': 167.409, 'eval_steps_per_second': 20.946, 'epoch': 0.82}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0001, 'grad_norm': 0.0020587800536304712, 'learning_rate': 3e-05, 'epoch': 0.88}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 6.878768908791244e-05, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.4718, 'eval_samples_per_second': 163.016, 'eval_steps_per_second': 20.396, 'epoch': 0.88}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0001, 'grad_norm': 0.0024895912501960993, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.94}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 6.049883450032212e-05, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.7701, 'eval_samples_per_second': 155.831, 'eval_steps_per_second': 19.497, 'epoch': 0.94}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'loss': 0.0001, 'grad_norm': 0.002032345626503229, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.99}


  0%|          | 0/132 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 5.318586772773415e-05, 'eval_accuracy': 0.500473933649289, 'eval_runtime': 6.2946, 'eval_samples_per_second': 167.604, 'eval_steps_per_second': 20.97, 'epoch': 0.99}
The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !
{'train_runtime': 207.787, 'train_samples_per_second': 13.158, 'train_steps_per_second': 0.823, 'train_loss': 0.05128877933733407, 'epoch': 1.0}


  0%|          | 0/15 [00:00<?, ?it/s]

Test Accuracy: 0.5


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
# Save the model
save_model(model_trained, tokenizer_trained, path + 'italian_model')


In [16]:
# TODO : remove, just for local

save_model(model_trained, tokenizer_trained, '/Users/marlon/VS-Code-Projects/Youtube/sentiment_model_finetuned_italian')

The OrderedVocab you are attempting to save contains holes for indices [9, 12, 15, 16, 25, 29, 33, 38, 51, 59, 81, 197, 262, 314, 389, 404, 558, 561, 568, 634, 745, 917, 1143, 1683, 1971, 3283, 3341, 4052], your vocabulary could be corrupted !


In [None]:
# Save the fine-tuned model to your system

# Mount to drive
drive.mount('/content/drive')

!cp -r /content/italian_model /content/drive/MyDrive/

# Now download it from your Google Drive Account !