# The French Sentiment Model for Fine-Tuning

Using the translated and augmented french comments, we now fine tune a state-of-the-art sentiment model, trained on french language, with our data.

In [1]:
!pip install --upgrade pip
!pip uninstall transformers -y #uninstall version you have currently, just bc we might need diff versions for diff fine tuning models
#!pip install transformers==4.18.0 #note that we need an older version of transformers to use this model
!pip install transformers 
!pip install --no-cache-dir transformers sentencepiece
!pip install datasets
!pip install accelerate -U

[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/jupyter-1.0.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Successfully uninstalled transformers-4.41.2
[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/jupyter-1.0.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mCollecting transformers
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Using cached transformers-4.41.2-py3-none-any.whl (9.1 MB)
Installing collected packages: transformers
Successfully installed transformers-4.41.2
[33mDEPRECATION: Loading

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset, load_metric
import torch
import pandas as pd
import numpy as np
#from google.colab import files
#from google.colab import drive
import glob
import zipfile
from sklearn.model_selection import train_test_split




In [4]:
def compute_metrics(p):
    metric = load_metric("accuracy")
    logits, labels = p
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [5]:
# Instantiate the tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained("cmarkea/distilcamembert-base-sentiment")


def tokenize_function(examples, tokenizer=tokenizer):
    """
    Function to tokenize the data.
    examples : data to tokenize ; dict
    tokenizer : tokenizer to use ; DistilBertTokenizer
    """
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

In [6]:
def model(train_comments, train_labels,
                    val_comments, val_labels, test_comments,
                    test_labels,
                    batch_size_train, batch_size_val,
                    epochs, tokenizer=tokenizer, learning_rate=1e-4, weight_decay=0.001):
    """
    Function to train a  model on the data.
    train_comments : comments for training ; lst of str
    train_labels : labels for training ; lst of int
    val_comments : comments for validation ; lst of str
    val_labels : labels for validation ; lst of int
    batch_size_train : batch size for training ; int
    batch_size_val : batch size for validation ; int
    epochs : number of epochs ; int
    tokenizer : tokenizer to use
    """
    # Set the device to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained("cmarkea/distilcamembert-base-sentiment")
    model.to(device) 

    # Setup the Hugging Face Dataset Class
    train_dataset_dict = {"text": train_comments, "label": train_labels}
    val_dataset_dict = {"text": val_comments, "label": val_labels}
    test_dataset_dict = {"text": test_comments, "label": test_labels}

    train_dataset = Dataset.from_dict(train_dataset_dict)
    val_dataset = Dataset.from_dict(val_dataset_dict)
    test_dataset = Dataset.from_dict(test_dataset_dict)

    # Apply the tokenizer to the datasets
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Remove columns we do not need for training
    train_dataset = train_dataset.remove_columns(["text"])
    val_dataset = val_dataset.remove_columns(["text"])
    test_dataset = test_dataset.remove_columns(["text"])

    # Set the format of the datasets to PyTorch tensors
    train_dataset.set_format("torch")
    val_dataset.set_format("torch")
    test_dataset.set_format("torch")


    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=epochs,              # total number of training epochs
        per_device_train_batch_size=batch_size_train,  # batch size for training
        per_device_eval_batch_size=batch_size_val,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=weight_decay,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        evaluation_strategy="steps",     # Evaluate every `eval_steps`
        eval_steps=10,                   # Number of steps between evaluations
        save_steps=10,                   # Save the model every `save_steps`
        load_best_model_at_end=True,     # Load the best model at the end of training#
        learning_rate= learning_rate,              # Set the learning rate
        metric_for_best_model="eval_loss", # Use evaluation loss to check how good our model is performing
        greater_is_better=False,
    )

    # Trainer
    trainer = Trainer(
        model=model,                         # model
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,            # evaluation dataset
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)], # Early Stopping for Overfitting
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the test set
    test_results = trainer.evaluate(eval_dataset=test_dataset)

    # Get test accuracy
    print("Test Accuracy:", test_results['eval_accuracy'])



    return model, tokenizer, test_results



In [7]:
def save_model(model, tokenizer, path):
    """
    Function to save the model
    model : model to save ;
    tokenizer : tokenizer to save ;
    path : path to save the model ; str
    """

    model_save_path =  path
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

In [None]:
# Open a file upload dialog
# Select here all files to upload!
# If already uploaded, just press 'Cancel Upload'
# They are contained within a zip file.
#uploaded = files.upload()

In [None]:
# Set the path to the data
# On local machine use the relative path, for example
# path = 'NLP labelled data preview/english set/'
# On Google Colab use this path
# '/content/'
#path = '/content/'

In [9]:
#all_comments_with_labels = pd.read_csv(path + 'french_combined_ready_for_finetuning.csv')
all_comments_with_labels = pd.read_csv('Comments DB/french/Finetuning/french_combined_ready_for_finetuning.csv')

In [10]:
# Conversion
# Since this model is trained with labels 1 to 5 stars, we will adjust our labels :
# 1 star : if 'Score' > 0.90 and 'Label' = 0
# 2 star : if 'Score' <= 0.90 and 'Label' = 0
# 3 star : no conversion, as we don't analyze neutral comments
# 4 star : if 'Score' <= 0ö.90 and 'Label' = 2
# 5 star : if 'Score' > 0.90 and 'Label' = 2 

# Conversion
# Since this model is trained with labels 1 to 5 stars, we will adjust our labels :
# 1 star : if 'Score' > 0.90 and 'Label' = 0
# 2 star : if 'Score' <= 0.90 and 'Label' = 0
# 3 star : no conversion, as we don't analyze neutral comments
# 4 star : if 'Score' <= 0.90 and 'Label' = 2
# 5 star : if 'Score' > 0.90 and 'Label' = 2 

# Create a new column to store the updated labels
updated_labels = all_comments_with_labels['Label'].copy()

# Apply the rules to the new column
updated_labels.loc[(all_comments_with_labels['Score'] > 0.90) & (all_comments_with_labels['Label'] == 0)] = 1
updated_labels.loc[(all_comments_with_labels['Label'] == 0) & (all_comments_with_labels['Score'] <= 0.90)] = 2
# No conversion needed for neutral comments (Label 1)
updated_labels.loc[(all_comments_with_labels['Label'] == 2) & (all_comments_with_labels['Score'] <= 0.90)] = 4
updated_labels.loc[(all_comments_with_labels['Score'] > 0.90) & (all_comments_with_labels['Label'] == 2)] = 5

# Assign the updated labels back to the original DataFrame
all_comments_with_labels['Label'] = updated_labels

# Print the first few rows to check the changes
print(all_comments_with_labels.head())

# Check the label count
print(all_comments_with_labels['Label'].value_counts())


   index  Label     Score                                            Comment
0      0      1  0.966221  En toute honnêteté libérez laissez lai prendre...
1      1      1  0.966221           Relâchezles laissez lai prendre contrôle
2      2      5  0.953260  jaime façon dont gens brillants pensent toujou...
3      3      5  0.953260          Les gens brillants pensent quils contrôle
4      4      1  0.967700  cest bullshit nest quune espoir va ni quoi uti...
5    21684
1    20040
2      815
4      594
Name: Label, dtype: int64


In [11]:
# Seperate the two columns in the dataframe into comments and labels and turn them into lists
comments = all_comments_with_labels['Comment'].tolist()
labels = all_comments_with_labels['Label'].tolist()

In [12]:
# Convert the labels to integers
labels = [int(label) for label in labels]
# Turn all elements in comments into strings
comments = [str(comment) for comment in comments]

In [13]:
# Split the data into a train and test set with stratification
train_comments, temp_comments, train_labels, temp_labels = train_test_split(
    comments, labels, test_size=0.3, random_state=42, stratify=labels)

In [14]:
# Split the data into training and validation sets with stratification
val_comments, test_comments, val_labels, test_labels = train_test_split(temp_comments, temp_labels, test_size=0.1, random_state=42, stratify= temp_labels)

In [15]:
# Look at the number of comments in the training and validation sets
print("Number of comments in the training set: ", len(train_comments))
print("Number of comments in the validation set: ", len(val_comments))
print("Number of comments in the test set: ", len(test_comments))
# Look at labels in the training and validation sets
print("Number of negative comments in the training set: ", train_labels.count(1) + train_labels.count(2))
print("Number of positive comments in the training set: ", train_labels.count(4)+ train_labels.count(5))
print("Number of negative comments in the validation set: ", val_labels.count(1) + val_labels.count(2))
print("Number of positive comments in the validation set: ", val_labels.count(4)+ val_labels.count(5))
print("Number of negative comments in the test set: ", test_labels.count(2) + test_labels.count(1))
print("Number of positive comments in the test set: ", test_labels.count(4) + test_labels.count(5))


Number of comments in the training set:  30193
Number of comments in the validation set:  11646
Number of comments in the test set:  1294
Number of negative comments in the training set:  14598
Number of positive comments in the training set:  15595
Number of negative comments in the validation set:  5631
Number of positive comments in the validation set:  6015
Number of negative comments in the test set:  626
Number of positive comments in the test set:  668


In [16]:
# Train the model
model_trained, tokenizer_trained, test_results = model(train_comments, train_labels, val_comments, val_labels,test_comments, test_labels, batch_size_train = 128, batch_size_val = 128, epochs = 2, tokenizer = tokenizer, learning_rate=1e-4, weight_decay=0.001)

Map:   0%|          | 0/30193 [00:00<?, ? examples/s]

Map:   0%|          | 0/11646 [00:00<?, ? examples/s]

Map:   0%|          | 0/1294 [00:00<?, ? examples/s]



  0%|          | 0/472 [00:00<?, ?it/s]

{'loss': 0.8235, 'grad_norm': 8.382003784179688, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.04}


  0%|          | 0/91 [00:00<?, ?it/s]

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.7684884667396545, 'eval_accuracy': 0.11274257255710116, 'eval_runtime': 22.9194, 'eval_samples_per_second': 508.128, 'eval_steps_per_second': 3.97, 'epoch': 0.04}
{'loss': 0.657, 'grad_norm': 6.153308391571045, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.08}


  0%|          | 0/91 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5470730662345886, 'eval_accuracy': 0.288768675940237, 'eval_runtime': 22.2794, 'eval_samples_per_second': 522.726, 'eval_steps_per_second': 4.084, 'epoch': 0.08}
{'loss': 0.4517, 'grad_norm': 3.4255590438842773, 'learning_rate': 6e-06, 'epoch': 0.13}


  0%|          | 0/91 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Save the model
#save_model(model_trained, tokenizer_trained, path + 'french_model')
save_model(model_trained, tokenizer_trained, 'sentiment_model_finetuned_french')

In [None]:
# Save the fine-tuned model to your system

# Mount to drive
#drive.mount('/content/drive')

#!cp -r /content/french_model /content/drive/MyDrive/

# Now download it from your Google Drive Account !