# Prep data for model

In [42]:
import pandas as pd
import numpy as np
import os

In [43]:
def process_df_classification(path_name:str):
    df = pd.concat (
    [pd.read_csv(os.path.join(path_name, "train_cleaned.csv"), index_col = [0]),
    pd.read_csv(os.path.join(path_name, "val_cleaned.csv"), index_col = [0])], axis=0
        )
    df['len'] = df.statement.str.split().str.len()
    labels = list(df['label'].unique())
    vals = np.linspace(0, 1, 7)
    label_to_val = {
        "pants-fire": 0.0,
        "false" : 1.0,
        "barely-true": 2.0, 
        "half-true": 3.0, 
        "mostly-true" : 4.0, 
        "true" : 5.0,
    }
    df = df.query("len <= 100") # drop everything longer than 100 tokens
    df = df[['label', 'statement']]
    df.label = df['label'].map(label_to_val)
    return (df.statement.values, df.label.values) 

In [44]:
def process_df(path_name:str):
    df = pd.concat (
    [pd.read_csv(os.path.join(path_name, "train_cleaned.csv"), index_col = [0]),
    pd.read_csv(os.path.join(path_name, "val_cleaned.csv"), index_col = [0])], axis=0
        )
    df['len'] = df.statement.str.split().str.len()
    labels = list(df['label'].unique())
    vals = np.linspace(0, 1, 7)
    label_to_val = {
        "pants-fire": vals[0],
        "false" : vals[1],
        "barely-true": vals[2],
        "half-true": vals[3],
        "mostly-true" :vals[4],
        "true" : vals[5]
    }
    df = df.query("len <= 100") # drop everything longer than 100 tokens
    df = df[['label', 'statement']]
    df.label = df['label'].map(label_to_val)
    return (df.statement.values, df.label.values) 

In [70]:
sentences, labels = process_df_classification("")

# Tokenization and Input Formatting

In [71]:
from transformers import BertTokenizer

In [72]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)



In [73]:
max_len = 0

for sent in sentences:
  input_ids = tokenizer.encode(sent, add_special_tokens=True) # add required special tokens
  max_len = max(max_len, len(input_ids))

In [74]:
max_len

96

Use `tokenizer.encode_plus` to split sentence into tokens, add [CLS] and [SEP] tokens, map tokens to Ids, pad or truncate sequences, and create attention masks which differentiate real tokens from [PAD] tokens. We only consider the first 100 tokens.

In [75]:
input_ids = []
attention_masks = []

for sent in sentences:
  encoded_dict = tokenizer.encode_plus(
    sent, 
    add_special_tokens = True, 
    max_length = 100,
    pad_to_max_length = True, 
    return_attention_mask = True,
    return_tensors = 'pt'
  )
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [77]:
import torch

In [78]:
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels, dtype=torch.long)

# Training and Validation split

In [81]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

Iterator using dataloader

In [82]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 32

train_dataloader = DataLoader(
  train_dataset,
  sampler = RandomSampler(train_dataset),
  batch_size = batch_size
)
val_dataloader = DataLoader(
  val_dataset,
  sampler = SequentialSampler(val_dataset),
  batch_size =batch_size
)

In [83]:
# Do BERT for classificaiton first!

In [84]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
num_labels = 6,
output_attentions = False,
output_hidden_states = False
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [85]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [86]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4.
epochs = 3

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [87]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [88]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [89]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [94]:
model.train()

for epoch in range(epochs):
  total_loss = 0
  all_predictions = []
  all_labels = []

  t0 = time.time()
  for i, batch in enumerate(train_dataloader):
      print( " Batch {:>5,} of {:>5,}.".format(i, len(train_dataloader)))
      optimizer.zero_grad()
      
      b_input_ids = batch[0].to(device)
      b_attention_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
      loss = outputs.loss
      total_loss += loss.item()

      # Get predictions
      logits = outputs.logits
      predictions = torch.argmax(logits, dim=-1)

      all_predictions.extend(predictions.cpu().numpy())
      all_labels.extend(labels.cpu().numpy())

      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()

  avg_loss = total_loss / len(dataloader)
  print(f"Training time took {format_time(time.time() - t0)}")
  print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')
# Tracking variables
  total_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:

      # Unpack this training batch from our dataloader.
      #
      # As we unpack the batch, we'll also copy each tensor to the GPU using
      # the `to` method.
      #
      # `batch` contains three pytorch tensors:
      #   [0]: input ids
      #   [1]: attention masks
      #   [2]: labels
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      # Tell pytorch not to bother with constructing the compute graph during
      # the forward pass, since this is only needed for backprop (training).
      with torch.no_grad():

          # Forward pass, calculate logit predictions.
          # token_type_ids is the same as the "segment ids", which
          # differentiates sentence 1 and 2 in 2-sentence tasks.
          result = model(b_input_ids,
                          token_type_ids=None,
                          attention_mask=b_input_mask,
                          labels=b_labels,
                          return_dict=True)

      # Get the loss and "logits" output by the model. The "logits" are the
      # output values prior to applying an activation function like the
      # softmax.
      loss = result.loss
      logits = result.logits

      # Accumulate the validation loss.
      total_eval_loss += loss.item()

      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      # Calculate the accuracy for this batch of test sentences, and
      # accumulate it over all batches.
      total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

 Batch     0 of   324.
