## TRANSFORMER NOTEBOOK
We fine-tuned BERT and RoBERTa with this notebook. The cells as saved here are configured for RoBERTa (`roberta-base`).

In [None]:
import torch
# Use the first CUDA GPU when one is visible to PyTorch; otherwise fall back
# to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
from datasets import load_dataset
# Load the preprocessed training CSV with the `datasets` "csv" loader. The
# result is indexed by split name further down (`["train"]`).
dataset = load_dataset('csv', data_files='../input/disaster/train_pp_key.csv')

In [None]:
# Notebook display: echo the loaded dataset object as this cell's output.
dataset

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same `roberta-base` checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the global tokenizer.

    The tokenizer is called with ``truncation=True`` and
    ``padding="max_length"``, so over-long inputs are cut and shorter ones are
    padded up to the maximum length.

    Args:
        examples: Mapping that exposes the raw text under the ``"text"`` key
            (a batch of rows when used with ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Apply `tokenize_function` to the whole dataset; batched=True passes batches
# of examples to the function instead of one row at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Hold out 10% of the training rows for evaluation. The seed is fixed so the
# same rows are held out on every run; without it each run (and each model
# being compared) is evaluated on a different random subset.
train_test_dataset = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)

In [None]:
# Notebook display: echo the train/test split object as this cell's output.
train_test_dataset

In [None]:
from transformers import AutoModelForSequenceClassification

# Pretrained `roberta-base` encoder with a sequence-classification head for
# two classes.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
)
# Move the model onto the selected device. As the last expression of the cell
# this also echoes the model in the notebook output.
model.to(device)

In [None]:
import numpy as np
from datasets import load_metric

# Metric objects used by `compute_metrics`, loaded in a fixed order:
# accuracy first, then F1.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package - confirm against the
# version pinned in this environment.
acc_metric, f1_metric = map(load_metric, ("accuracy", "f1"))

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class is taken as
            the argmax over the last axis of ``logits``.

    Returns:
        dict: ``{"accuracy": <score>, "f1": <score>}`` with plain scalar
        values.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # `Metric.compute` returns a dict keyed by the metric name. Unwrap the
    # scalar so the Trainer logs `eval_accuracy` / `eval_f1` as numbers rather
    # than as nested dicts such as {'accuracy': {'accuracy': 0.83}}.
    acc = acc_metric.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    return {'accuracy': acc, 'f1': f1}

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: evaluate once per epoch for 8 epochs at a
# learning rate of 2e-5. `save_steps=800` sets the checkpoint interval; the
# inference cell further down loads `checkpoint-2400` from this output dir.
training_args = TrainingArguments(
    output_dir="roberta_pp_key_trainer",
    num_train_epochs=8,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    save_steps=800,
)

In [None]:
# Wire the model, the hyper-parameters, both halves of the 90/10 split and the
# metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_test_dataset["train"],  # rows used for fine-tuning
    eval_dataset=train_test_dataset["test"],    # held-out rows scored at evaluation time
)

In [None]:
# Run fine-tuning with the Trainer configured above. As the last expression in
# the cell, the value returned by `train()` is shown as the cell output.
trainer.train()

In [None]:
import pandas as pd

# Load the preprocessed test CSV. Its `text` column is classified below and
# its `id` column is copied into the submission file.
test_df = pd.read_csv('../input/disaster/test_pp_key.csv')

In [None]:
from transformers import pipeline
# Rebuild the inference objects from disk. The tokenizer comes from the base
# `roberta-base` checkpoint (the Trainer above was not given a tokenizer).
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# Fine-tuned weights come from the checkpoint written at step 2400.
model = AutoModelForSequenceClassification.from_pretrained(
    '/kaggle/working/roberta_pp_key_trainer/checkpoint-2400',
    num_labels=2,
)
# Build a text-classification pipeline around the fine-tuned model. The model
# was freshly loaded from disk, so without an explicit `device` the pipeline
# would run on the CPU even when a GPU is available. Use GPU 0 when present
# and -1 (CPU) otherwise. `torch` is imported in the first code cell.
classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
# One prediction per test row; each prediction's 'label' field is read when
# the submission file is built.
pred_labels = classifier(list(test_df['text']))

In [None]:
# Build the submission table: one row per test id with the predicted class.
# The class id is looked up through the model config's label -> id mapping
# rather than parsed from the last character of the label string, so it stays
# correct with custom label names or ten or more classes.
sub_dict = {
    'id': list(test_df['id']),
    'target': [int(model.config.label2id[pred['label']]) for pred in pred_labels],
}
sub_df = pd.DataFrame.from_dict(sub_dict)
# index=False keeps the pandas row index out of the CSV.
sub_df.to_csv('submission_clean.csv', index=False)