# 🤗 Transformers Episodio 3 - Fine tuning en la nube

[twitch.tv/dataista0](http://twitch.tv/dataista0)


In [1]:
import numpy as np
import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer

# Accuracy metric, loaded once at module level; compute_metrics() below calls metric.compute().
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets` releases in favour of
# the separate `evaluate` package -- confirm against the installed version before upgrading.
metric = datasets.load_metric("accuracy")


def get_tokenized_dataset(tokenizer):
    """Load the "imdb" dataset and tokenize its "text" field.

    Args:
        tokenizer: Callable tokenizer applied to batches of raw text.

    Returns:
        The result of mapping the tokenizer over every split of the dataset.
    """
    def tokenize_function(examples):
        # Pad to the tokenizer's max length and truncate longer texts so all
        # encoded examples end up the same size.
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    # batched=True hands the tokenizer whole batches of examples at once.
    return datasets.load_dataset("imdb").map(tokenize_function, batched=True)


def get_tokenizer_and_model(model_name, num_labels=2):
    """Load a pretrained tokenizer and a matching sequence-classification model.

    Args:
        model_name: Checkpoint identifier forwarded to ``from_pretrained``
            (e.g. ``"bert-base-cased"``).
        num_labels: Number of target classes for the classification head.
            Defaults to 2, which keeps the original binary-classification
            behaviour for existing callers.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    return tokenizer, model


def get_trainer(model, ds_train, ds_eval, args=None, compute_metrics=None):
    """Build a ``Trainer`` for the given model and datasets.

    Args:
        model: Model to fine-tune.
        ds_train: Training dataset.
        ds_eval: Evaluation dataset.
        args: Optional ``TrainingArguments``. When omitted, default arguments
            writing to the project data directory are used.
        compute_metrics: Optional callable forwarded to ``Trainer`` so that
            evaluation reports task metrics (e.g. accuracy) in addition to
            the loss. Defaults to ``None``, matching the previous behaviour.

    Returns:
        The configured ``Trainer`` instance.
    """
    if args is None:
        # NOTE(review): absolute, machine-specific output directory -- callers on
        # other machines should pass their own `args`.
        args = TrainingArguments("/home/dataista/git/twitch-streams/data/")
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_train,
        eval_dataset=ds_eval,
        compute_metrics=compute_metrics,
    )
    return trainer


def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` evaluation pair with the module-level metric.

    Args:
        eval_pred: Two-item sequence unpacked as ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        versus the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
# Checkpoint to fine-tune; the helper returns its tokenizer together with a
# sequence-classification model built from the same checkpoint.
model_name = "bert-base-cased"
tokenizer, model = get_tokenizer_and_model(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [3]:
# Load the labelled training CSV and keep only the text plus its target column,
# renamed to 'label' (the key later attached to the tokenized examples).
data = pd.read_csv('data/nlp-getting-started/train.csv')
df = data[['text', 'target']].rename(columns={'target': 'label'})
df

Unnamed: 0,text,label
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [4]:
# Hold out 20% of the labelled rows for validation. The fixed random_state makes
# the split (and therefore every metric computed afterwards) reproducible
# across Restart & Run All.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# Unlabelled test rows to predict on, plus the sample submission file that is
# later filled with predictions and written back out.
data_test = pd.read_csv('data/nlp-getting-started/test.csv')
submission = pd.read_csv('data/nlp-getting-started/sample_submission.csv')
df_test = data_test[['id', 'text']]

In [6]:
def tokenize_function(examples):
    """Tokenize the "text" field of ``examples`` with the global ``tokenizer``.

    Texts are padded to the tokenizer's max length and truncated if longer.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

In [7]:
# Tokenize the three splits with identical settings (pad to max length, truncate
# longer texts). The generator is consumed in order, so the splits are encoded
# train -> val -> test.
train_tokenized, val_tokenized, test_tokenized = (
    tokenizer(split.text.tolist(), padding="max_length", truncation=True)
    for split in (df_train, df_val, df_test)
)

In [14]:
# Training configuration: a single epoch, batches of 8 per device for both
# training and evaluation, and one evaluation pass at the end of each epoch.
# NOTE(review): output_dir is an absolute, machine-specific path.
args = TrainingArguments(
    output_dir="/home/dataista/git/twitch-streams/data/",
    num_train_epochs=1,
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

In [15]:
# Attach the gold labels (as plain Python lists) to the train and validation
# encodings; the test encoding gets no 'label' key here.
train_tokenized['label'] = df_train['label'].tolist()
val_tokenized['label'] = df_val['label'].tolist()

In [16]:
def _to_examples(encoding):
    """Turn a mapping of field -> list of values into a list of per-example dicts."""
    fields = list(encoding)
    return [dict(zip(fields, row)) for row in zip(*encoding.values())]


# Each split becomes a list with one {field: value} dict per example.
train = _to_examples(train_tokenized)
val = _to_examples(val_tokenized)
test = _to_examples(test_tokenized)

In [17]:
#train_pairs = zip(train_tokenized, df_train.label.tolist())
#val_pairs = zip(val_tokenized, df_val.label.tolist())

In [18]:
# Inspect the fields carried by the training encoding (tokenizer outputs plus 'label').
train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'label'])

In [19]:
# import torch
# del model
# del trainer
# torch.cuda.empty_cache()


In [20]:
# Fine-tune on the first 4000 training examples and evaluate on the first 1000
# validation examples; compute_metrics adds accuracy to the evaluation report.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train[:4000],
    eval_dataset=val[:1000],
)

In [21]:
# Run fine-tuning with the configured arguments (evaluation runs once per epoch).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.517,0.459197,0.804


TrainOutput(global_step=500, training_loss=0.5170485229492188, metrics={'train_runtime': 423.3522, 'train_samples_per_second': 9.448, 'train_steps_per_second': 1.181, 'total_flos': 1330935521280000.0, 'train_loss': 0.5170485229492188, 'epoch': 1.0})

In [22]:
# NOTE(review): second consecutive trainer.train() call on the same trainer --
# presumably this trains the already fine-tuned model for one more epoch;
# confirm it is intentional, since Restart & Run All will execute both calls.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3951,0.459197,0.804


TrainOutput(global_step=500, training_loss=0.39512298583984373, metrics={'train_runtime': 443.4282, 'train_samples_per_second': 9.021, 'train_steps_per_second': 1.128, 'total_flos': 1330935521280000.0, 'train_loss': 0.39512298583984373, 'epoch': 1.0})

In [23]:
# Run the trainer's model over the tokenized test examples (which carry no 'label').
preds = trainer.predict(test)

In [28]:
import torch
import torch.nn.functional as F

# Softmax over the last axis turns the raw prediction scores (treated as logits
# here) into per-class probabilities.
final_preds = F.softmax(torch.from_numpy(preds.predictions), dim=-1)


In [36]:
# Sanity check: rows and columns of the submission template.
submission.shape

(3263, 2)

In [34]:
# Threshold the probability of class index 1 at 0.5 to obtain 0/1 integer predictions.
final_binary_preds = (final_preds[:, 1] > 0.5).numpy().astype(int)

In [35]:
# Sanity check: number of predictions (should equal the submission row count).
len(final_binary_preds)

3263

In [37]:
# Peek at the submission template columns (id, target).
submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [38]:
# Predictions are in df_test row order and are assigned positionally, so verify
# that the submission template lists exactly the same ids in the same order;
# otherwise the labels would be silently attached to the wrong rows.
assert len(submission) == len(final_binary_preds), "prediction count does not match submission rows"
assert submission['id'].tolist() == df_test['id'].tolist(), "submission ids are not aligned with df_test"
submission['target'] = final_binary_preds

In [39]:
# Write the filled-in submission to disk without the DataFrame index column.
submission.to_csv("bert_first_submission.csv", index=False)