In [None]:
#This is needed to make the training arguments (from the huggingface) work in one of the following cells below.
!pip install transformers==4.28.0

In [None]:
#To make sure that our runtime is using gpu.
#If not you can change your runtime environment above from the toolbar: Runtime, Cambio tipo di runtime then GPU.
import torch
torch.cuda.get_device_name(0)

In [None]:
#!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

# **Inference**

In [3]:
from transformers import pipeline

**Sentiment Analysis**

In [None]:
classifier = pipeline ("sentiment-analysis", model = "distilbert-base-uncased-finetuned-sst-2-english")
classifier("This restaurant is awesome")

**Natural Language Inference**

In [None]:
classifier  = pipeline("zero-shot-classification", model="roberta-large-mnli")
classifier ("Scuola Normale Superiore welcomes students from all over the world.", "Scuola Normale Superiore is a university.")

In [None]:
classifier("Scuola Normale Superiore welcomes students from all over the world.","Only people from Italy study at Scuola Normale Superiore.")

**Text Generation**

In [None]:
generator = pipeline("text-generation")
generator("As far as I am concerned, I will", max_length = 50, do_sample = False)

**Named Entity Recognition (NER)**

In [None]:
pipe = pipeline("ner")
pipe("Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window.")

**Summarization**

In [None]:
pipe = pipeline("summarization")
article = "New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband."
pipe(article, max_length = 130, min_length = 30, do_sample = False)

# **Fine-tuning**

In [None]:
#Installing the necessary libraries
!pip install datasets evaluate accelerate

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#main_path for the folder in the Google Drive
main_path = "/content/drive/MyDrive/Hands-on/"

In [5]:
import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

In [6]:
# Read in data from Google Drive directly
data = pd.read_csv(main_path + 'amazon_cells_labelled.txt', sep='\t', names=['review', 'label'])

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data['label'].value_counts()

In [10]:
#We splitted the dataset into train, validation, and test sets.
#Convention is to split the dataset as 0.6, 0.2, 0.2 for train, val, and test sets.
#In our case, we have 0.64, 0.16, 0.20, one can also use exactly 0.6, 0.2, 0.2.
train = data.sample(frac=0.8, random_state=42)

# Testing dataset
test = data.drop(train.index)

new_train = train.sample(frac=0.8, random_state=42)
val = train.drop(new_train.index)

In [None]:
print(f'The training dataset has {len(new_train)} records.')
print(f'The val dataset has {len(val)} records.')
print(f'The testing dataset has {len(test)} records.')

In [12]:
# Convert pyhton dataframe to Hugging Face arrow dataset
train_data = Dataset.from_pandas(train)
val_data = Dataset.from_pandas(val)
test_data = Dataset.from_pandas(test)

In [13]:
#Loading the specific tokenizer (model and the tokenizer should be the same)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [14]:
def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True)

In [15]:
#Each data split is tokenized using the defined function.
dataset_train = train_data.map(tokenize_function)
dataset_val = val_data.map(tokenize_function)
dataset_test = test_data.map(tokenize_function)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
#Loading the model, since this is a binary classification tasks (two classes), we specified it as num_labels = 2. 
#If you have more than 2 classes then you can specify accordingly.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

In [17]:
#We set the number of epochs as 3 in this example.
#But ideally you should continue till the point that the validation loss starts to increase.
#For this, we need to use a patience value and early stopping by saving the best model at each epoch.
training_args = TrainingArguments(
    output_dir="./sentiment_transfer_learning_transformer/",          
    logging_dir='./sentiment_transfer_learning_transformer/logs',            
    logging_strategy='epoch',
    logging_steps=100,    
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=16,  
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)

In [18]:
# Function to compute the metric
# Since the dataset is balanced, we used accuracy. But for other applications, or for imbalanced datasets one can use F1-score or AUC with the losses.
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Train the model
# At each epoch, we will evaluate our current trained model on the validation set and monitor the train and validation losses to understand if there is overfitting, underfitting etc.
# Patience value for Early Stopping is 1 for this example, but you might want to wait a bit more (# of patience steps) before terminating the training process.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()

In [None]:
# Trainer evaluate
trainer.evaluate(dataset_test)