In [None]:
#import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
!pip install transformers

In [None]:
# Ask TensorFlow for a GPU device name; the returned value is displayed as
# the cell output.
# NOTE(review): presumably an empty string means no GPU is visible — confirm.
import tensorflow as tf
tf.test.gpu_device_name()

In [None]:
#!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

# **Inference**

In [None]:
from transformers import pipeline

**Sentiment Analysis**

In [None]:
# Sentiment-analysis pipeline with no explicit checkpoint requested,
# applied to a single example sentence.
classifier = pipeline(task="sentiment-analysis")
classifier("This restaurant is awesome")

In [None]:
# Sentiment-analysis pipeline built on an explicitly named checkpoint.
# NOTE(review): "roberta-large-mnli" looks like an NLI checkpoint, so the
# labels it returns may not be sentiment labels — confirm that this contrast
# is the intended demonstration.
classifier_roberta = pipeline("sentiment-analysis", model="roberta-large-mnli")
classifier_roberta("This restaurant is awesome")

In [None]:
# Classify several sentences in one call by passing a list of strings.
classifier_roberta(
    [
        "This restaurant is awesome",
        "This restaurant is awful",
    ]
)

In [None]:
# Text-classification pipeline on the roberta-large-mnli checkpoint, fed a
# single string that contains a question followed by a statement.
zero_pipe = pipeline(task="text-classification", model="roberta-large-mnli")
zero_pipe("Where is the capital of France? Paris is the capital of France.")

**Text Generation**

In [None]:
# Text-generation pipeline with no explicit checkpoint requested.
generator = pipeline(task="text-generation")
# Continue the prompt with sampling switched off, capped at max_length=50.
generator("As far as I am concerned, I will", do_sample=False, max_length=50)

In [None]:
# Same prompt and length cap, but without passing do_sample, so the
# pipeline's own decoding defaults apply.
generator(
    "As far as I am concerned, I will",
    max_length=50,
)

**NER**

In [None]:
# Named-entity-recognition pipeline with no explicit checkpoint requested,
# run on a short passage.
ner_pipe = pipeline(task="ner")
ner_pipe(
    "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO,therefore very close to the Manhattan Bridge which is visible from the window."
)

**Summarization**

In [None]:
# Text to be summarised.
article = "New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband."

# Summarization pipeline with no explicit checkpoint requested; sampling is
# switched off and the summary length is bounded to 30..130.
pipe = pipeline(task="summarization")
pipe(article, min_length=30, max_length=130, do_sample=False)

# **Fine-tuning**

In [None]:
!pip install transformers datasets evaluate

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by
# path. The google.colab import means this cell only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Base folder on the mounted Drive that holds the data file. It is joined to
# a file name by plain string concatenation, so the trailing slash matters.
main_path = "/content/drive/MyDrive/Hands-on/"

In [None]:
import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

In [None]:
# Load the tab-separated review file from the Drive folder, supplying the
# two column names explicitly.
data = pd.read_csv(
    main_path + 'amazon_cells_labelled.txt',
    names=['review', 'label'],
    sep='\t',
)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,review,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [None]:
# Count how many rows carry each label value (class-balance check).
data['label'].value_counts()

0    500
1    500
Name: label, dtype: int64

In [None]:
# Draw 80% of the rows as the development pool; every row that was not drawn
# becomes the held-out testing dataset.
train = data.sample(frac=0.8, random_state=42)
test = data.drop(index=train.index)

# Split the development pool again: 80% of it is the actual training set,
# the remainder is the validation set.
new_train = train.sample(frac=0.8, random_state=42)
val = train.drop(index=new_train.index)

In [None]:
# Report the size of each split, one line per split.
print(
    f'The training dataset has {len(new_train)} records.',
    f'The val dataset has {len(val)} records.',
    f'The testing dataset has {len(test)} records.',
    sep='\n',
)

The training dataset has 640 records.
The val dataset has 160 records.
The testing dataset has 200 records.


In [None]:
# Convert the pandas DataFrames to Hugging Face Arrow datasets.
# The training dataset must come from `new_train` (the development pool minus
# the validation rows). `train` still contains every row of `val`, so building
# the training dataset from it would leak the validation set into training and
# make validation metrics (and early stopping) unreliable.
train_data = Dataset.from_pandas(new_train)
val_data = Dataset.from_pandas(val)
test_data = Dataset.from_pandas(test)

In [None]:
# Load the tokenizer for the "bert-base-cased" checkpoint, the same
# checkpoint name later used to load the classification model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"review"`` field of ``examples``.

    The module-level ``tokenizer`` is called with ``padding="max_length"`` and
    ``truncation=True``, and whatever it returns is passed back unchanged.
    """
    review_text = examples["review"]
    encoded = tokenizer(review_text, padding="max_length", truncation=True)
    return encoded

In [None]:
# Apply the tokenizer to the training, validation and testing datasets, in
# that order.
dataset_train, dataset_val, dataset_test = (
    split.map(tokenize_function)
    for split in (train_data, val_data, test_data)
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Load "bert-base-cased" with a sequence-classification head sized for two
# labels, matching the 0/1 values in the `label` column.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

In [None]:
# Training configuration. Logging, checkpointing and evaluation are all
# scheduled per epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    # Output locations
    output_dir="./sentiment_transfer_learning_transformer/",
    logging_dir="./sentiment_transfer_learning_transformer/logs",
    # Optimisation
    num_train_epochs=3,
    learning_rate=5e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    seed=42,
    # Schedules
    # NOTE(review): the *_steps values presumably have no effect while the
    # matching *_strategy is "epoch" — confirm against the installed version.
    logging_strategy="epoch",
    logging_steps=100,
    save_strategy="epoch",
    save_steps=100,
    evaluation_strategy="epoch",
    eval_steps=100,
    load_best_model_at_end=True,
)

In [None]:
# Function to compute the metric
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: A two-item sequence unpacked as ``(logits, labels)``.
            The predicted class is the argmax of ``logits`` over its last axis.

    Returns:
        The value returned by the accuracy metric's ``compute`` call for the
        predicted classes against ``labels``.
    """
    # Load the metric once and cache it on the function object: this function
    # runs on every evaluation pass, and re-loading the metric each time is
    # wasted work.
    metric = getattr(compute_metrics, "_accuracy_metric", None)
    if metric is None:
        metric = evaluate.load("accuracy")
        compute_metrics._accuracy_metric = metric

    logits, labels = eval_pred
    # No softmax needed: argmax over raw logits selects the same class as
    # argmax over the softmax probabilities.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Train the model: assemble the Trainer from the model, the training
# configuration, the tokenized train/validation datasets and the metric
# function, with an early-stopping callback whose patience is 1.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

# Run the training loop.
trainer.train()

In [None]:
# Evaluate the trained model on the tokenized testing dataset.
trainer.evaluate(eval_dataset=dataset_test)

In [None]:
#trainer.state.log_history