Refrenced/followed two different tutorials:

https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894

https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613

In [43]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [44]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [45]:
import pandas as pd
from google.colab import files
import io

uploaded = files.upload()

Saving ds340_data - Sheet1.csv to ds340_data - Sheet1 (2).csv


In [46]:
df = pd.read_csv(io.BytesIO(uploaded['ds340_data - Sheet1.csv']))
df.head()

Unnamed: 0,Text,Label
0,"When the earth's axis points away, this seaso...",winter
1,This season tends to be gloomy and very diffic...,winter
2,"cold, dry",winter
3,It’s cold.,winter
4,its is a period of time that is cold and when ...,winter


In [47]:
df['Label'].value_counts()

spring    61
summer    61
fall      55
winter    49
Name: Label, dtype: int64

In [48]:
# adjust labels of seasons to be numerical
possible_labels = df.Label.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}

In [49]:
df['label'] = df.Label.replace(label_dict)
text = df.Text.values
labels = df.label.values

# download the pretrained tokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

# find the longest sentence to know how much padding to add to shorter sentences
max_len = 0

# For every sentence...
for sent in text:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  215


In [50]:
# an example of a sentence and the token ids given
def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample'''
  index = random.randint(0, len(text)-1)
  table = np.array([tokenizer.tokenize(text[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ getting  │        2893 │
├──────────┼─────────────┤
│ sun      │        3103 │
├──────────┼─────────────┤
│ burnt    │       11060 │
├──────────┼─────────────┤
│ while    │        2096 │
├──────────┼─────────────┤
│ being    │        2108 │
├──────────┼─────────────┤
│ in       │        1999 │
├──────────┼─────────────┤
│ an       │        2019 │
├──────────┼─────────────┤
│ outdoor  │        7254 │
├──────────┼─────────────┤
│ pool     │        4770 │
├──────────┼─────────────┤
│ for      │        2005 │
├──────────┼─────────────┤
│ too      │        2205 │
├──────────┼─────────────┤
│ long     │        2146 │
├──────────┼─────────────┤
│ .        │        1012 │
╘══════════╧═════════════╛


In [51]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_len,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                        truncation = False
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)



In [52]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 3

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

df['data_type'] = ['not_set']*df.shape[0]

df.loc[train_idx, 'data_type'] = 'train'
df.loc[val_idx, 'data_type'] = 'val'

df.groupby(['Label', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Text
Label,label,data_type,Unnamed: 3_level_1
fall,3,train,44
fall,3,val,11
spring,1,train,48
spring,1,val,13
summer,2,train,49
summer,2,val,12
winter,0,train,39
winter,0,val,10


In [None]:
# train the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

model.cuda()

In [54]:
#need to change for all 4 classes!
def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TP): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_tp2(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 2'''
  return sum([preds == labels and preds == 2 for preds, labels in zip(preds, labels)])

def b_tp3(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 3'''
  return sum([preds == labels and preds == 3 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Returns the following metrics:
    - accuracy    = (TP + TN) / N
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  tp2 = b_tp2(preds, labels)
  tp3 = b_tp3(preds, labels)

  b_accuracy = (tp + tn + tp2 + tp3) / len(labels)
  return b_accuracy

In [55]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))


Epoch:  25%|██▌       | 1/4 [00:09<00:27,  9.33s/it]


	 - Train loss: 1.3987
	 - Validation Accuracy: 0.3958


Epoch:  50%|█████     | 2/4 [00:18<00:18,  9.34s/it]


	 - Train loss: 1.2959
	 - Validation Accuracy: 0.5208


Epoch:  75%|███████▌  | 3/4 [00:28<00:09,  9.39s/it]


	 - Train loss: 0.6738
	 - Validation Accuracy: 0.7917


Epoch: 100%|██████████| 4/4 [00:37<00:00,  9.42s/it]


	 - Train loss: 0.2705
	 - Validation Accuracy: 0.8750





In [63]:
new_sentence = input("Enter a sentence about season: ")

# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

num = np.argmax(output.logits.cpu().numpy()).flatten().item()
prediction = ""
if num == 0:
  prediction = "winter"
elif num == 1:
  prediction = "spring"
elif num == 2:
  prediction = "summer"
elif num == 3:
  prediction = "fall"

print('Predicted Class: ', prediction)

Enter a sentence about season: The weather is very warm and I like to do a lot of outdoor activities
Predicted Class:  summer


Evaluate results compared to Bert without fine-tuning.
followed tutorial here: https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=iCoyxRJ7ECTA

In [64]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [65]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [66]:
tokenized = df['Text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [67]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
                   
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(226, 215)

In [68]:
input_ids = torch.tensor(np.array(padded))

with torch.no_grad():
    last_hidden_states = model(input_ids)

In [69]:
features = last_hidden_states[0][:,0,:].numpy()
labels = df['Label']

In [70]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [71]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [72]:
lr_clf.score(test_features, test_labels)

0.631578947368421

In [73]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.266 (+/- 0.01)
