In [None]:
#import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [1]:
#This is needed to make the training arguments (from the huggingface) work in one of the following cells below.
!pip install transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#To make sure that our runtime is using gpu.
#If not you can change your runtime environment above from the toolbar: Runtime, Cambio tipo di runtime then GPU.
import torch
torch.cuda.get_device_name(0)

'Tesla T4'

In [None]:
#!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

# **Inference**

In [3]:
from transformers import pipeline

**Sentiment Analysis**

In [None]:
classifier = pipeline ("sentiment-analysis", model = "distilbert-base-uncased-finetuned-sst-2-english")
classifier("This restaurant is awesome")

[{'label': 'POSITIVE', 'score': 0.9998743534088135}]

**Natural Language Inference**

In [None]:
classifier  = pipeline("zero-shot-classification", model="roberta-large-mnli")
classifier ("Scuola Normale Superiore welcomes students from all over the world.", "Scuola Normale Superiore is a university.")

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'sequence': 'Scuola Normale Superiore welcomes students from all over the world.',
 'labels': ['Scuola Normale Superiore is a university.'],
 'scores': [0.8751774430274963]}

In [None]:
classifier("Scuola Normale Superiore welcomes students from all over the world.","Only people from Italy study at Scuola Normale Superiore.")

{'sequence': 'Scuola Normale Superiore welcomes students from all over the world.',
 'labels': ['Only people from Italy study at Scuola Normale Superiore.'],
 'scores': [0.00018539096345193684]}

**Text Generation**

In [None]:
generator = pipeline("text-generation")
generator("As far as I am concerned, I will", max_length = 50, do_sample = False)

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]

**Named Entity Recognition (NER)**

In [None]:
pipe = pipeline("ner")
pipe("Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window.")

**Summarization**

In [None]:
pipe = pipeline("summarization")
article = "New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband."
pipe(article, max_length = 130, min_length = 30, do_sample = False)

# **Fine-tuning**

In [2]:
#Installing the necessary libraries
!pip install datasets evaluate accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#main_path for the folder in the Google Drive
main_path = "/content/drive/MyDrive/Hands-on/"

In [5]:
import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

In [6]:
# Read in data from Google Drive directly
data = pd.read_csv(main_path + 'amazon_cells_labelled.txt', sep='\t', names=['review', 'label'])

In [7]:
data.head()

Unnamed: 0,review,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [9]:
data['label'].value_counts()

0    500
1    500
Name: label, dtype: int64

In [10]:
#We splitted the dataset into train, validation, and test sets.
#Convention is to split the dataset as 0.6, 0.2, 0.2 for train, val, and test sets.
#In our case, we have 0.64, 0.16, 0.20, one can also use exactly 0.6, 0.2, 0.2.
train = data.sample(frac=0.8, random_state=42)

# Testing dataset
test = data.drop(train.index)

new_train = train.sample(frac=0.8, random_state=42)
val = train.drop(new_train.index)

In [11]:
print(f'The training dataset has {len(new_train)} records.')
print(f'The val dataset has {len(val)} records.')
print(f'The testing dataset has {len(test)} records.')

The training dataset has 640 records.
The val dataset has 160 records.
The testing dataset has 200 records.


In [12]:
# Convert pyhton dataframe to Hugging Face arrow dataset
train_data = Dataset.from_pandas(train)
val_data = Dataset.from_pandas(val)
test_data = Dataset.from_pandas(test)

In [13]:
#Loading the specific tokenizer (model and the tokenizer should be the same)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [14]:
def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True)

In [15]:
#Each data split is tokenized using the defined function.
dataset_train = train_data.map(tokenize_function)
dataset_val = val_data.map(tokenize_function)
dataset_test = test_data.map(tokenize_function)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [16]:
#Loading the model, since this is a binary classification tasks (two classes), we specified it as num_labels = 2. 
#If you have more than 2 classes then you can specify accordingly.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [17]:
#We set the number of epochs as 3 in this example.
#But ideally you should continue till the point that the validation loss starts to increase.
#For this, we need to use a patience value and early stopping by saving the best model at each epoch.
training_args = TrainingArguments(
    output_dir="./sentiment_transfer_learning_transformer/",          
    logging_dir='./sentiment_transfer_learning_transformer/logs',            
    logging_strategy='epoch',
    logging_steps=100,    
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=16,  
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)

In [18]:
# Function to compute the metric
# Since the dataset is balanced, we used accuracy. But for other applications, or for imbalanced datasets one can use F1-score or AUC with the losses.
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [19]:
# Train the model
# At each epoch, we will evaluate our current trained model on the validation set and monitor the train and validation losses to understand if there is overfitting, underfitting etc.
# Patience value for Early Stopping is 1 for this example, but you might want to wait a bit more (# of patience steps) before terminating the training process.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.6971,0.619653,0.6125
2,0.5631,0.432137,0.9
3,0.4252,0.349539,0.91875


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=150, training_loss=0.5618255488077799, metrics={'train_runtime': 255.124, 'train_samples_per_second': 9.407, 'train_steps_per_second': 0.588, 'total_flos': 631466532864000.0, 'train_loss': 0.5618255488077799, 'epoch': 3.0})

In [20]:
# Trainer evaluate
trainer.evaluate(dataset_test)

{'eval_loss': 0.38916337490081787,
 'eval_accuracy': 0.905,
 'eval_runtime': 7.3271,
 'eval_samples_per_second': 27.296,
 'eval_steps_per_second': 1.774,
 'epoch': 3.0}

In [None]:
#trainer.state.log_history