In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install transformers
!pip install datasets
#!pip install wordcloud
#!pip install plotly
#!pip install nlp
!pip install huggingface_hub



In [4]:
#Importing data manipulation libraried
import pandas as pd
import numpy as np

#Importing Data visualization libraries
#import seaborn as sns
#import matplotlib.pyplot as plt
#import plotly.express as px
import plotly.graph_objects as go

import os

from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

from datasets import load_metric

import nlp  # Hugging Face's NLP library

import torch  # PyTorch, a deep learning framework
import torch.nn.functional as F  # Functions for PyTorch neural networks

In [5]:
# Disable W&B
os.environ["WANDB_DISABLED"] = "true"

In [6]:
# Load the training dataset from a CSV file located at the specified path.
train_path = "/content/drive/MyDrive/p5/p5 dataset/train_data.csv"

# Read the CSV file into a Pandas DataFrame for initial exploration.
df = pd.read_csv(train_path)
print(df.head())

                                                text  label  agreement  \
0  amp big homie meanboy mb mb mmr stegmanlife st...    0.0        1.0   
1  think devote career prove autism cause vaccine...    1.0        1.0   
2           whatcausesautism vaccine vaccinate child   -1.0        1.0   
3  mean immunize kid something wo secretly kill y...   -1.0        1.0   
4  thanks catch perform la nuit nyc show start je...    0.0        1.0   

                                          lemmatized  
0  ['amp', 'big', 'homie', 'meanboy', 'mb', 'mb',...  
1  ['think', 'devote', 'career', 'prove', 'autism...  
2  ['whatcausesautism', 'vaccine', 'vaccinate', '...  
3  ['mean', 'immunize', 'kid', 'something', 'wo',...  
4  ['thanks', 'catch', 'perform', 'la', 'nuit', '...  


In [7]:
# Split the train data => {train, eval}
train, eval = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

In [8]:
train.head()

Unnamed: 0,text,label,agreement,lemmatized
1641,new hey love mmr manymenrecords youaintgotnoea...,0.0,1.0,"['new', 'hey', 'love', 'mmr', 'manymenrecords'..."
3907,new extends exemption charitable immunity cert...,0.0,1.0,"['new', 'extends', 'exemption', 'charitable', ..."
336,esp mercury free vaccine available,1.0,0.666667,"['esp', 'mercury', 'free', 'vaccine', 'availab..."
6861,life entertainment yotc mmr exotics,0.0,1.0,"['life', 'entertainment', 'yotc', 'mmr', 'exot..."
720,baby luna sore vaccine poorpuppy,0.0,0.666667,"['baby', 'luna', 'sore', 'vaccine', 'poorpuppy']"


In [9]:
eval.head()

Unnamed: 0,text,label,agreement,lemmatized
5818,nervous baby get vaccine clinic southwest,1.0,0.666667,"['nervous', 'baby', 'get', 'vaccine', 'clinic'..."
7842,aid n malaria vaccine child hiv plasmodium fal...,0.0,0.666667,"['aid', 'n', 'malaria', 'vaccine', 'child', 'h..."
880,measles outbreak hit texas church preach vacci...,1.0,0.666667,"['measles', 'outbreak', 'hit', 'texas', 'churc..."
9072,thank mtg look forward support increase child ...,1.0,1.0,"['thank', 'mtg', 'look', 'forward', 'support',..."
288,health district offer immunization uninsured c...,1.0,0.666667,"['health', 'district', 'offer', 'immunization'..."


In [10]:
from datasets import load_dataset
dataset = load_dataset('csv',
                        data_files={'train': '/content/drive/MyDrive/p5/p5 dataset/train_subset.csv',
                        'eval': '/content/drive/MyDrive/p5/p5 dataset/eval_subset.csv'}, encoding = "ISO-8859-1")

In [11]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained( "bert-base-uncased")

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
def transform_labels(label):

    label = label['label']
    num = 0
    if label == -1: #'Negative'
        num = 0
    elif label == 0: #'Neutral'
        num = 1
    elif label == 1: #'Positive'
        num = 2

    return {'labels': num}

def tokenize_data(example):
    return tokenizer(example['lemmatized'], padding='max_length', truncation=True, return_tensors='pt')


#def tokenize_data(example):
#return tokenizer(example['text'], padding='max_length')

# Change the tweets to tokens that the models can exploit
dataset = dataset.map(tokenize_data, batched=True)

# Transform	labels and remove the useless columns
remove_columns = [ 'label', 'text', 'agreement','lemmatized']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

Map:   0%|          | 0/2001 [00:00<?, ? examples/s]

Map:   0%|          | 0/2001 [00:00<?, ? examples/s]

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    eval: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2001
    })
})

In [14]:
!pip install accelerate>=0.20.1 transformers

In [1]:
from transformers import TrainingArguments
# Configure the training parameters like `num_train_epochs`.

training_args = TrainingArguments(output_dir="FineTuned-BertTweet-Classification-Model", num_train_epochs=3, load_best_model_at_end=True, push_to_hub=True,
                                  evaluation_strategy= "steps", save_strategy= 'steps')

In [15]:
# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained( "bert-base-uncased", num_labels=3)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
train_dataset = dataset['train'].shuffle(seed=10)
eval_dataset = dataset['eval'].shuffle(seed=10)

In [17]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
from transformers import Trainer
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
)

In [21]:
# Launch the learning process: training
trainer.train()

Step,Training Loss,Validation Loss
500,0.7979,0.742704
1000,0.7377,0.719788
1500,0.6595,0.686905
2000,0.6483,0.654387
2500,0.5111,0.701024
3000,0.4947,0.707315


TrainOutput(global_step=3000, training_loss=0.6415259857177734, metrics={'train_runtime': 2709.6358, 'train_samples_per_second': 8.857, 'train_steps_per_second': 1.107, 'total_flos': 6314722025472000.0, 'train_loss': 0.6415259857177734, 'epoch': 3.0})

In [22]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [24]:
# Launch the final evaluation
trainer.evaluate()

{'eval_loss': 0.6543869376182556,
 'eval_accuracy': 0.728135932033983,
 'eval_runtime': 68.022,
 'eval_samples_per_second': 29.417,
 'eval_steps_per_second': 3.69}

In [25]:
trainer.push_to_hub("FineTuned-BertTweet-Classification-Model")

events.out.tfevents.1699435128.1f3cce7c474b.1279.0:   0%|          | 0.00/7.35k [00:00<?, ?B/s]

events.out.tfevents.1699437927.1f3cce7c474b.1279.1:   0%|          | 0.00/346 [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/Afia-manubea/FineTuned-BertTweet-Classification-Model/tree/main/'

In [26]:
model.push_to_hub("FineTuned-BertTweet-Classification-Model")

CommitInfo(commit_url='https://huggingface.co/Afia-manubea/FineTuned-BertTweet-Classification-Model/commit/bd232605b606ffd1006cf353b2b00280a7e4bea5', commit_message='Upload BertForSequenceClassification', commit_description='', oid='bd232605b606ffd1006cf353b2b00280a7e4bea5', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
tokenizer.push_to_hub("FineTuned-BertTweet-Classification-Model")

CommitInfo(commit_url='https://huggingface.co/Afia-manubea/FineTuned-BertTweet-Classification-Model/commit/6d15b5caf9b6b400297792952224ebce39338a52', commit_message='Upload tokenizer', commit_description='', oid='6d15b5caf9b6b400297792952224ebce39338a52', pr_url=None, pr_revision=None, pr_num=None)

This table summarizes the key metrics for Model 1 and Model 2, making it easier to compare their performance.

| Metric                 | Model 1            | Model 2            |
|------------------------|--------------------|--------------------|
| Training Loss          | 0.677              | 0.638              |
| Training Runtime (s)   | 2721               | 1361               |
| Training Samples/Sec   | 8.819              | 17.623             |
| Evaluation Loss        | 0.663              | 0.669              |
| Evaluation Accuracy    | 0.739              | 0.723              |
| Evaluation Runtime (s) | 68.238             | 33.543             |
| Evaluation Samples/Sec | 29.324             | 59.655             |


From this table we observe that

* Model 2 has a lower training loss  compared to Model 1. While, Model 1 has a longer training runtime  compared to Model 2.

*  Model 2 has a higher training samples per second compared to Model 1. This indicates that Model 2 processes more training samples per second.

* Model 2 has a slightly higher evaluation loss (0.669) compared to Model 1 (0.663). A lower evaluation loss is usually desirable, but the difference is relatively small in this case.

* Model 1 has a higher evaluation accuracy (0.7386) compared to Model 2 (0.7231). A higher accuracy is much desirable, indicating better performance on the evaluation dataset.