In [1]:
from pathlib import Path
import logging
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, ClassLabel, load_metric

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from evaluation import evaluate
from loading import load_train, load_test
from bert import train

In [3]:
# Configure the root logger at INFO level for the rest of the notebook.
logging.basicConfig(level=logging.INFO)

In [4]:
# Select the compute device: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cpu'

### Configuration

In [5]:
# Run configuration: every tunable constant the later cells read lives here.

# Forwarded as the `full` argument of load_train() and train().
FULL = False

# Checkpoint name for the classifier; the trailing comment keeps an alternative.
MODEL = 'distilbert-base-uncased' # 'cardiffnlp/twitter-roberta-base-sentiment-latest'
# Checkpoint name for the tokenizer.
TOKENIZER = 'bert-base-uncased'

EPOCHS = 1
# Batch size passed to the train() helper.
BATCH_SIZE = 1028
# Per-device batch size for the hand-built TrainingArguments in the "Tuning"
# section. It was referenced there without ever being defined; 32 is the value
# shown in the recorded training log ("Instantaneous batch size per device = 32").
TRAIN_BATCH_SIZE = 32

# Passed to load_or_tokenize(): False reuses tokenized datasets already saved on
# disk, True tokenizes again and overwrites them. It was referenced in the
# "Tokenize" section without ever being defined.
# NOTE(review): the cache paths are fixed strings that do not depend on FULL, so
# set this to True after changing FULL to avoid loading a stale cache.
FORCE_RETOKENIZE = False

In [None]:
# One-call training through the train() helper imported from the local `bert` module.
# NOTE(review): `model` is rebound further down by `model = get_BERT()`, so the
# model returned here is replaced if the step-by-step cells are also run.
model, accuracy = train(MODEL, TOKENIZER, device, full=FULL, batch_size=BATCH_SIZE, epochs=EPOCHS)

### Load data

In [6]:
# Load the labelled data as train/validation frames and wrap each one as a Dataset.
df_train, df_val = load_train(
  full=FULL,
  eval_frac=0.2,
  x_col='text',
  y_col='label',
  neg_label=0,
  pos_label=1,
)
dataset_train, dataset_val = (Dataset.from_pandas(frame) for frame in (df_train, df_val))

# Re-type the 'label' column as a two-class ClassLabel, using the training
# split's feature schema as the template for both splits.
new_features = dataset_train.features.copy()
new_features['label'] = ClassLabel(names=['0', '1'])
dataset_train, dataset_val = (
  split.cast(new_features) for split in (dataset_train, dataset_val)
)

# The test frame is loaded with only a text column name given (no label column).
df_test = load_test(x_col='text')
dataset_test = Dataset.from_pandas(df_test)

Casting the dataset: 100%|██████████| 16/16 [00:00<00:00, 44.93ba/s]
Casting the dataset: 100%|██████████| 4/4 [00:00<00:00, 45.28ba/s]


### Tokenize

In [7]:
# Load the tokenizer named by TOKENIZER.
# NOTE(review): the configuration names a BERT checkpoint for TOKENIZER and a
# DistilBERT checkpoint for MODEL; the recorded training log reports the
# resulting `token_type_ids` column being ignored by the model.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

In [8]:
def tokenize_function(ds):
  """Tokenize the 'text' entry of ``ds`` with the notebook-level ``tokenizer``.

  Both padding and truncation are switched on. The function is used as the
  callback of ``Dataset.map(..., batched=True)`` in ``load_or_tokenize``.

  Args:
    ds: mapping-like object exposing a 'text' key.

  Returns:
    Whatever the tokenizer call returns for that text.
  """
  texts = ds['text']
  encoded = tokenizer(texts, padding=True, truncation=True)
  return encoded


def load_or_tokenize(ds, path, force_retokenize=False):
  """Return a tokenized version of ``ds``, cached on disk at ``path``.

  Args:
    ds: dataset to tokenize when no usable cache exists.
    path: location of the on-disk cache.
    force_retokenize: when true, ignore any existing cache and tokenize again.

  Returns:
    The dataset loaded from ``path`` if it exists and re-tokenization was not
    forced; otherwise the freshly tokenized dataset, which is also written to
    ``path`` before being returned.
  """
  # Tokenize when asked to, or when nothing has been cached at `path` yet.
  if force_retokenize or not Path(path).exists():
    freshly_tokenized = ds.map(tokenize_function, batched=True)
    freshly_tokenized.save_to_disk(path)
    return freshly_tokenized

  return Dataset.load_from_disk(path)

In [9]:
# Tokenize (or load from cache) the three splits, in train/val/test order.
# FORCE_RETOKENIZE is read from notebook scope.
train_tokenized, val_tokenized, test_tokenized = (
  load_or_tokenize(split, path=cache_dir, force_retokenize=FORCE_RETOKENIZE)
  for split, cache_dir in (
    (dataset_train, 'bert/train_tokenized'),
    (dataset_val, 'bert/val_tokenized'),
    (dataset_test, 'bert/test_tokenized'),
  )
)

### Load model

In [10]:
def get_BERT(model_name=MODEL):
  """Load a two-label sequence-classification model and move it to ``device``.

  Args:
    model_name: checkpoint identifier handed to ``from_pretrained``. The default
      is the value ``MODEL`` had when this cell was executed.

  Returns:
    The loaded model, after it has also been saved via ``save_pretrained``.
  """
  classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
  classifier = classifier.to(device)
  # NOTE(review): this writes the model to a local path that is literally
  # `model_name` — confirm that shadowing the checkpoint name with a local
  # directory is intended.
  classifier.save_pretrained(model_name)
  return classifier

In [11]:
# Build the classifier from the default checkpoint (get_BERT defaults to MODEL).
model = get_BERT()

### Tuning

In [12]:
# Training setup for the Trainer built below. TRAIN_BATCH_SIZE and EPOCHS are
# read from notebook scope.
training_args = TrainingArguments(
  output_dir="bert_data/test_trainer",
  per_device_train_batch_size=TRAIN_BATCH_SIZE,
  num_train_epochs=EPOCHS,
  # Evaluate and checkpoint once per epoch, then restore the best checkpoint
  # when training finishes.
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
)

In [13]:
# Accuracy metric object used by compute_metrics() below.
# NOTE(review): `load_metric` has been deprecated in newer `datasets` releases —
# confirm the installed version still provides it.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
  """Score one evaluation pass with the notebook-level ``metric``.

  Args:
    eval_pred: a pair ``(predictions, labels)``; the predictions are reduced to
      class indices by taking the argmax along axis 1.

  Returns:
    The result of ``metric.compute`` for those class indices against the labels.
  """
  raw_scores, references = eval_pred
  predicted_classes = np.argmax(raw_scores, axis=1)
  return metric.compute(predictions=predicted_classes, references=references)

In [14]:
# Wire the model, training arguments, tokenized splits, tokenizer and metric
# callback into a single Trainer.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_tokenized,
  eval_dataset=val_tokenized,
  tokenizer=tokenizer,
  compute_metrics=compute_metrics,
)

In [15]:
# Fine-tune the model; training_args asks for evaluation and a checkpoint after
# every epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: index, token_type_ids, __index_level_0__, text. If index, token_type_ids, __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 160000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 15000


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3091,0.30134,0.87015


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: index, token_type_ids, __index_level_0__, text. If index, token_type_ids, __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40000
  Batch size = 8
Saving model checkpoint to bert_data/test_trainer/checkpoint-5000
Configuration saved in bert_data/test_trainer/checkpoint-5000/config.json
Model weights saved in bert_data/test_trainer/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in bert_data/test_trainer/checkpoint-5000/tokenizer_config.json
Special tokens file saved in bert_data/test_trainer/checkpoint-5000/special_tokens_map.json


In [18]:
# Persist the model to the local directory 'test0'.
# NOTE(review): the recorded output for this cell mentions 'test1', so that
# output comes from an earlier version of this line.
model.save_pretrained('test0')

Configuration saved in test1/config.json
Model weights saved in test1/pytorch_model.bin


In [19]:
# Predict on the training split and store the arg-max class next to the inputs.
train_pred = trainer.predict(train_tokenized)
df_train['Prediction'] = train_pred.predictions.argmax(axis=1)
df_train.to_csv("bert_data/bert_pred_train.csv")

# Also keep the two raw output columns (e.g. for ensemble learning); this second
# file holds the prediction column plus both of them.
df_train['log_neg'], df_train['log_pos'] = (
  train_pred.predictions[:, 0],
  train_pred.predictions[:, 1],
)
df_train.to_csv("bert_data/bert_pred_train_logits.csv")

# Metrics on the training split from the project's evaluate() helper.
(acc_train, prec_train, recall_train,
 f1_train, bce_train, auc_train) = evaluate(df_train['Prediction'], df_train["label"])

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: index, __index_level_0__, token_type_ids, text. If index, __index_level_0__, token_type_ids, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 160000
  Batch size = 8


In [20]:
# Predict on the validation split and store the arg-max class next to the inputs.
val_pred = trainer.predict(val_tokenized)
df_val['Prediction'] = val_pred.predictions.argmax(axis=1)
df_val.to_csv("bert_data/bert_pred_val.csv")

# Also keep the two raw output columns; this second file holds the prediction
# column plus both of them.
df_val['log_neg'], df_val['log_pos'] = (
  val_pred.predictions[:, 0],
  val_pred.predictions[:, 1],
)
df_val.to_csv("bert_data/bert_pred_val_logits.csv")

# Metrics on the validation split from the project's evaluate() helper.
(acc_val, prec_val, recall_val,
 f1_val, bce_val, auc_val) = evaluate(df_val['Prediction'], df_val["label"])

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, token_type_ids, index, __index_level_0__. If text, token_type_ids, index, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 500000
  Batch size = 8


In [19]:
# Predict on the test split and store the arg-max class next to the inputs.
test_pred = trainer.predict(test_tokenized)
df_test['Prediction'] = test_pred.predictions.argmax(axis=1)
df_test.to_csv("bert_data/bert_pred_test.csv")

# Also keep the two raw output columns; this second file holds the prediction
# column plus both of them. No metrics are computed for this split.
df_test['log_neg'], df_test['log_pos'] = (
  test_pred.predictions[:, 0],
  test_pred.predictions[:, 1],
)
df_test.to_csv("bert_data/bert_pred_test_logits.csv")

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, token_type_ids, __index_level_0__. If text, token_type_ids, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10000
  Batch size = 8


In [None]:
# Display validation and training accuracy side by side (validation first).
acc_val, acc_train

(0.87755, 0.90886875)