## FastBERT

In [None]:
!pip install fast-bert

In [None]:
import logging
import os

import numpy as np
import pandas as pd
import torch

from transformers import BertTokenizer
from fast_bert.data_cls import BertDataBunch
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy

Before running the notebook, create the data and label directories with `mkdir twitterdata labels`.

Then set paths:

In [None]:
PATH_TO_DATA = "./twitterdata/"
PATH_TO_LABELS = "./labels/"
OUTPUT_DIR = "./"

Read the relevant data from Chapter 5, split it into training, validation, and test sets (60/20/20), and save each set as a CSV file.

In [None]:
# Load the processed data from Chapter 5, drop every column listed below by
# position, and name the two remaining columns 'text' and 'label'.
df = pd.read_csv('../chapter5/train-processed.csv', encoding='latin-1')
df = df.drop(df.columns[[0, 1, 2, 3, 4, 6]], axis=1)
df.columns = ['text', 'label']

# https://stackoverflow.com/questions/38250710/
# how-to-split-data-into-3-sets-train-validation-and-test/38251213#38251213
# Shuffle once with a fixed seed so the split is reproducible, then cut the
# shuffled frame at the 60 % and 80 % marks to get a 60/20/20 split.
np.random.seed(0)
shuffled = df.sample(frac=1)
train_end = int(.6 * len(df))
valid_end = int(.8 * len(df))

# Positional slicing yields the same three partitions as np.split with these
# cut points, without handing a DataFrame to NumPy's array-splitting code.
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# Write the splits below PATH_TO_DATA so that changing the constant keeps
# this cell and the data loader pointing at the same files.
train.to_csv(os.path.join(PATH_TO_DATA, 'train.csv'), index=False)
valid.to_csv(os.path.join(PATH_TO_DATA, 'valid.csv'), index=False)
test.to_csv(os.path.join(PATH_TO_DATA, 'test.csv'), index=False)

Collect the unique labels and save them as a CSV file in the separate `labels` directory (`PATH_TO_LABELS`).

In [None]:
# One row per distinct value of the 'label' column, in order of first
# appearance in df.
labels = pd.DataFrame(df['label'].unique())

# Written without header or index, so the file contains only the label
# values, one per line. The path is built from PATH_TO_LABELS so it stays in
# sync with the directory passed to the data loader.
labels.to_csv(os.path.join(PATH_TO_LABELS, 'labels.csv'),
              header=False, index=False)

Define and train the model

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# cell still runs (much more slowly) instead of failing on a CUDA-less host.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger = logging.getLogger()
metrics = [{'name': 'accuracy', 'function': accuracy}]

# Lower-casing tokenizer for the uncased checkpoint that is loaded below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

# The CSV files were written with a 'text'/'label' header row, so the columns
# are addressed by name rather than by integer position.
# NOTE(review): test_data is given a file name here - confirm that
# BertDataBunch accepts a file name for this argument; it may expect the
# test texts themselves.
databunch = BertDataBunch(PATH_TO_DATA,
                          PATH_TO_LABELS,
                          tokenizer,
                          train_file="train.csv",
                          val_file="valid.csv",
                          test_data="test.csv",
                          text_col="text", label_col="label",
                          batch_size_per_gpu=32,
                          max_seq_length=140,
                          multi_gpu=False,
                          multi_label=False,
                          model_type="bert")

# Single-device, full-precision, single-label classifier on top of the
# pretrained 'bert-base-uncased' weights.
learner = BertLearner.from_pretrained_model(databunch,
                                            'bert-base-uncased',
                                            metrics=metrics,
                                            device=device,
                                            logger=logger,
                                            output_dir=OUTPUT_DIR,
                                            is_fp16=False,
                                            multi_gpu=False,
                                            multi_label=False)

# Train for 3 epochs.
# NOTE(review): lr=1e-2 is far above the learning rates commonly used for
# fine-tuning BERT (roughly 1e-5 to 1e-4) - confirm this value is intended.
learner.fit(3, lr=1e-2)
