Install the required Python packages with pip

In [None]:
!pip install -q transformers kaggle

Mount Google Drive

In [None]:
# Colab-only helper: mounts the user's Google Drive at /content/drive/ so that
# files can be copied there later in the notebook.
from google.colab import drive
drive.mount("/content/drive/")

Import the required Python dependencies

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd  # the notebook refers to pandas as `pd` (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from collections import defaultdict

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# `transformers.AdamW` is deprecated in favour of torch.optim.AdamW and is no
# longer shipped by newer transformers releases. Keep the name importable either
# way so this cell does not abort on import. Note that the torch implementation
# has no `correct_bias` argument.
try:
  from transformers import AdamW
except ImportError:
  from torch.optim import AdamW

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Upload the Kaggle API credentials (`kaggle.json`) needed to download the Twitter dataset

In [None]:
!mkdir /root/.kaggle
from google.colab import files
uploaded = files.upload()

Download the Twitter dataset and unzip it

In [None]:
!mv /content/kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!mkdir /content/kaggle
!kaggle datasets download -d kazanova/sentiment140 -p '/content/kaggle'
!unzip /content/kaggle/sentiment140.zip -d /content/kaggle/

Load the dataset CSV into a DataFrame and keep only the `target` and `tweet` columns

In [None]:
# The CSV is read without a header row, so the column names are supplied here.
SENTIMENT140_COLUMNS = ['target', 'id', 'date', 'flag', 'user', 'tweet']

df = pd.read_csv(
    "/content/kaggle/training.1600000.processed.noemoticon.csv",
    encoding="ISO-8859-1",
    names=SENTIMENT140_COLUMNS,
    # Only the label and the text are used afterwards; selecting them at parse
    # time avoids loading four columns just to drop them again.
    usecols=['target', 'tweet'],
)

The Twitter dataset labels negative sentiment as 0 and positive sentiment as 4, so every 4 is mapped to 1 to obtain binary labels (0 = negative, 1 = positive).
The dataset is also reduced in size, because training on the full dataset would take days: each class is halved six times, leaving roughly 1/64 of its rows.

In [None]:
def to_sentiment(target):
  """Map a raw dataset label to a binary class: 4 becomes 1, anything else 0.

  The label is converted with ``int()`` first, so numeric strings and floats
  are accepted as well.
  """
  return 1 if int(target) == 4 else 0

df['target'] = df['target'].map(to_sentiment)

# Six rounds of halving. In every round the first half (by row position) of the
# negative rows is dropped, then the first half of the positive rows.
for _ in range(6):
  for label in (0, 1):
    label_positions = np.flatnonzero((df['target'] == label).to_numpy())
    leading_half = label_positions[:label_positions.size // 2]
    df = df.drop(df.index[leading_half])

Set the maximum sequence length of each tweet to 250 tokens

In [None]:
# Upper bound on the sequence length; it is handed to the data loaders and from
# there to the tokenizer as `max_length`.
MAX_LEN = 250

Create a dataset class for the Twitter dataset so that PyTorch can consume it

In [None]:
class TweetDataset(Dataset):
  """Torch dataset that tokenizes one tweet per requested item.

  Args:
    tweets: indexable collection of tweet texts (items are passed through ``str``).
    targets: indexable collection of integer class labels aligned with ``tweets``.
    tokenizer: callable Hugging Face tokenizer used to encode each tweet.
    max_len: length every encoded tweet is padded or truncated to.
  """

  def __init__(self, tweets, targets, tokenizer, max_len):
    self.tweets = tweets
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of tweets in the dataset."""
    return len(self.tweets)

  def __getitem__(self, item):
    """Encode the tweet at position ``item``.

    Returns:
      dict with the raw text ('tweet_text'), the 1-D 'input_ids' and
      'attention_mask' tensors, and the label as a long tensor ('targets').
    """
    tweet = str(self.tweets[item])
    target = self.targets[item]

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    # and `truncation=True` makes the cut-off at `max_length` explicit, so every
    # item has exactly `max_len` tokens and batches can be stacked.
    encoding = self.tokenizer(
      tweet,
      add_special_tokens=True,
      max_length=self.max_len,
      padding='max_length',
      truncation=True,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'tweet_text': tweet,
      # The tokenizer returns tensors with a leading batch axis of size 1.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

Split the dataset into three DataFrames: training (90%), validation (5%) and test (5%)

In [None]:
# Fixed seed so that the train/validation/test membership is identical on every run.
SPLIT_SEED = 42

# 90% goes to training; the held-out 10% is divided evenly into validation and test.
df_train, df_holdout = train_test_split(df, test_size=0.1, random_state=SPLIT_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=SPLIT_SEED)
df_train.shape, df_val.shape, df_test.shape

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a dataframe in a ``TweetDataset`` and return a ``DataLoader`` over it.

  Args:
    df: dataframe providing a ``tweet`` and a ``target`` column.
    tokenizer: tokenizer handed to ``TweetDataset``.
    max_len: sequence length handed to ``TweetDataset``.
    batch_size: number of examples per batch.
    shuffle: reshuffle the examples every epoch. Defaults to False, which keeps
      the dataframe order; pass True for a training loader if per-epoch
      reshuffling is wanted.
    num_workers: number of loader worker processes (defaults to 4).

  Returns:
    A ``DataLoader`` yielding the dict batches produced by ``TweetDataset``.
  """
  ds = TweetDataset(
    tweets=df.tweet.to_numpy(),
    targets=df.target.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
BATCH_SIZE = 16
TRAINED_BERT_MODEL = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(TRAINED_BERT_MODEL)

# One loader per split, all sharing the tokenizer, sequence length and batch size.
train_data_loader, val_data_loader, test_data_loader = (
  create_data_loader(split, tokenizer, MAX_LEN, BATCH_SIZE)
  for split in (df_train, df_val, df_test)
)

# Sanity check: fetch a single training batch and print the tensor shapes.
data = next(iter(train_data_loader))
data.keys()
for key in ('input_ids', 'attention_mask', 'targets'):
  print(data[key].shape)

Create the class for the BERT classifier and its hyperparameters (dropout rate, number of output classes)

In [None]:
class Classifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear output layer.

  Args:
    n_classes: number of output classes, i.e. the width of the output layer.
  """

  def __init__(self, n_classes):
    super(Classifier, self).__init__()
    self.bert = BertModel.from_pretrained(TRAINED_BERT_MODEL)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the unnormalized class scores for a batch of encoded tweets."""
    bert_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Index instead of tuple-unpacking: newer transformers versions return a
    # dict-like ModelOutput, and unpacking that yields its keys (strings) rather
    # than tensors. Integer indexing works for both the tuple and the
    # ModelOutput form; position 1 is the pooled output.
    pooled_output = bert_outputs[1]
    output = self.drop(pooled_output)
    return self.out(output)

In [None]:
# Human-readable names of the two classes, indexed by label id
# (0 = negative, 1 = positive after the label remapping).
class_names = ['negative', 'positive']

model = Classifier(len(class_names))
model = model.to(device)

In [None]:
EPOCHS = 10

# torch's AdamW replaces the deprecated `transformers.AdamW`. weight_decay=0.0
# matches the default of the transformers implementation. Unlike the earlier
# `correct_bias=False` setting, torch always applies Adam bias correction.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# The scheduler is stepped once per batch, so the linear decay has to span
# every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over ``data_loader``.

  Every batch is a mapping holding "input_ids", "attention_mask" and "targets"
  tensors. For each batch the gradients are clipped to a norm of 1.0 before the
  optimizer and the scheduler are stepped.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    ``n_examples`` (float64 tensor) and the mean of the per-batch loss values.
  """
  model.train()

  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    targets = batch["targets"].to(device)
    logits = model(
      input_ids=batch["input_ids"].to(device),
      attention_mask=batch["attention_mask"].to(device)
    )

    loss = loss_fn(logits, targets)
    # Predicted class = index of the largest score in each row.
    predictions = torch.max(logits, dim=1).indices

    n_correct += (predictions == targets).sum()
    batch_losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return n_correct.double() / n_examples, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score ``model`` on ``data_loader`` without computing gradients.

  Every batch is a mapping holding "input_ids", "attention_mask" and "targets"
  tensors. The model is switched to evaluation mode first.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    ``n_examples`` (float64 tensor) and the mean of the per-batch loss values.
  """
  model.eval()

  batch_losses = []
  n_correct = 0

  with torch.no_grad():
    for batch in data_loader:
      targets = batch["targets"].to(device)
      logits = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device)
      )

      # Predicted class = index of the largest score in each row.
      predictions = torch.max(logits, dim=1).indices
      loss = loss_fn(logits, targets)

      n_correct += (predictions == targets).sum()
      batch_losses.append(loss.item())

  return n_correct.double() / n_examples, np.mean(batch_losses)

In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn, 
    device, 
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
# Persist the model weights as they stand after the last training epoch.
# NOTE(review): this is not necessarily the best-validation checkpoint, which the
# training loop writes separately to 'best_model_state.bin' — confirm which one
# is meant to be exported.
torch.save(model.state_dict(), '/content/model.pt')

In [None]:
# Copy the saved weights into the mounted Google Drive so they outlive the Colab session.
!cp /content/model.pt "/content/drive/My Drive"