<a href="https://colab.research.google.com/github/elliot-brooks/nlu-coursework/blob/main/src/av_model_two.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
!pip install -U transformers
!pip install -U accelerate



In [2]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import re

# Load training data

In [3]:
training_corpus = pd.read_csv("train.csv", encoding='utf-8')

# Pre-process training data

In [4]:
def preprocess(string):
  output = str(string).lower()
  separated_string = re.sub(r'([^\w\s])', r' \1 ', str(string))
  return output

# Prepare data for Distilled Bert
def prepare_data(data) :
  data["text_1"] = data["text_1"].apply(lambda x: preprocess(x))
  data["text_2"] = data["text_2"].apply(lambda x: preprocess(x))
  CONCAT_SYMBOL = "[SEP]"
  concat_pairs = []
  for index, row in data.iterrows():
      concatenated_pair = row["text_1"] + " " + CONCAT_SYMBOL + " " + row["text_2"]
      concat_pairs.append(concatenated_pair)
  return concat_pairs

# Create BERT Embeddings
tokenizer = BertTokenizer.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(prepare_data(training_corpus), truncation=True, padding=True)


# val_encodings = tokenizer(val_texts, truncation=True, padding=True)

class AuthorshipDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_labels = np.array(training_corpus['label'])
train_dataset = AuthorshipDataset(train_encodings, train_labels)
# val_dataset = AuthorshipDataset(val_encodings, val_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Define Language Model

In [5]:
model = BertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train Model

In [6]:
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    output_dir='./output'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.7052
1000,0.7024
1500,0.7021
2000,0.7022
2500,0.7066
3000,0.7008
3500,0.6998
4000,0.7051
4500,0.7042
5000,0.7023


Step,Training Loss
500,0.7052
1000,0.7024
1500,0.7021
2000,0.7022
2500,0.7066
3000,0.7008
3500,0.6998
4000,0.7051
4500,0.7042
5000,0.7023


KeyboardInterrupt: 

# Save Model

In [7]:
!zip -r /content/SHIT-BERT.zip /content/output/checkpoint-16000

  adding: content/output/checkpoint-16000/ (stored 0%)
  adding: content/output/checkpoint-16000/optimizer.pt (deflated 16%)
  adding: content/output/checkpoint-16000/training_args.bin (deflated 51%)
  adding: content/output/checkpoint-16000/scheduler.pt (deflated 55%)
  adding: content/output/checkpoint-16000/rng_state.pth (deflated 25%)
  adding: content/output/checkpoint-16000/model.safetensors (deflated 7%)
  adding: content/output/checkpoint-16000/trainer_state.json (deflated 80%)
  adding: content/output/checkpoint-16000/config.json (deflated 49%)
