In this notebook we compute BERT embeddings of text and apply clustering to these embeddings. <hr>

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import mysql.connector
from datasets import Dataset, DatasetDict

# from gensim.models import Word2Vec
# import numpy as np
# from nltk.corpus import stopwords
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.cluster import KMeans
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt


In [3]:
# Load data 
df = pd.read_csv("data/all_augmented_tasks_EN.csv") 
df = df.dropna(subset=["description"])
df = df[df["word_count"] > 4] # removing test descriptions # TODO move to data cleaning steps
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,taskId,language,description,topic_id,word_count
0,9oqJmtbKXts6Rr9Szw4OIS,eng,What are the courses that clients can book at ...,,12
1,a0pzxEfKq8c9D0dRZlQcm9,eng,Write a rule for astronauts. Use a conditional...,,43
2,9Hjn2yUwBcs7DZK6HARkE4,eng,Can you guess the most frequently spoken langu...,,22
3,6AYw9CEZMTN7LN8u0LfYVb,eng,Complete the sentence with going to. Example: ...,,42
4,8QIKtMOE9zV6lnfc2vHMUd,eng,The following are examples of vegetables. carr...,,16


> **Distribution of words:** Refer to 4_concat_data.ipynb

In [4]:
cnx = mysql.connector.connect(user='root', password='taskbase',
                              host='127.0.0.1', port='3309', database='lernnavi')

query = "SELECT * FROM TaskAspects"
df_taskAspects = pd.read_sql(query, cnx)
df_taskAspects.head()

Unnamed: 0,id,taskId,aspectId,sampleSolution
0,3,5ElPCuVMbAy8pzupzU7R3x,2,
1,54,2VX1HHa4SZp9Cs6Suof4ho,1,
2,74,2j6rJkYxYa98ydGaSCW17D,4,
3,75,2j6rJkYxYa98ydGaSCW17D,20,"{""type"": ""DEFAULT"", ""sampleSolutionGroups"": [{..."
4,76,2j6rJkYxYa98ydGaSCW17D,21,"{""type"": ""DEFAULT"", ""sampleSolutionGroups"": [{..."


In [5]:
# Attribute labels (aspectId) to descriptions
df = pd.merge(df[["taskId", "description"]], df_taskAspects[["taskId", "aspectId"]], how="inner", on=["taskId"]) 
df_pivot = df.pivot_table(index='description', columns='aspectId', aggfunc='size', fill_value=None)
df_pivot = df_pivot.notnull().reset_index()
df_pivot.head(1)

aspectId,description,4,20,21,47,69,70,71,72,74,...,265322,265323,265326,265327,265404,265405,265406,265407,265408,265409
0,"""Climate change is a dangerous threat,"" he said.",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# Convert to Dataset 
dataset = Dataset.from_pandas(df_pivot) 

# Make splits: train, test, validation
train_test = dataset.train_test_split(test_size=0.3)
test_val = train_test["test"].train_test_split(test_size=0.33)

# Recreate Dataset with the three splits 
data = DatasetDict({
    'train': train_test['train'],
    'test': test_val['train'],
    'validation': test_val['test']
})

# data

In [7]:
len(data['train']), len(data['test']), len(data['validation'])

(722, 207, 103)

In [8]:
labels = [label for label in data['train'].features.keys() if label not in ['description']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

# Training BERT-tiny

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    

**Preprocess**

In [10]:
# Define Tokenizer
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny") 

In [11]:
# Preprocessing 
def preprocess_data(examples):
  # take a batch of texts
  text = examples["description"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

encoded_dataset = data.map(preprocess_data, batched=True, remove_columns=data['train'].column_names)

Map:   0%|          | 0/722 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

In [12]:
encoded_dataset.set_format("torch")

**Training the model**

In [13]:
# Define the model
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
batch_size = 8
metric_name = "f1"

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-8,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_steps=1,
    #push_to_hub=True,
)

In [15]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [16]:
encoded_dataset['train'][0]['labels'].type() 

'torch.FloatTensor'

In [17]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  8683, 16473,  1037, 22200,  1999,  2010,  2067,  4220,  2008,
         2003,  7032,  2519,  2146,  1998,  4376,  2519,  2898,  1012,  2828,
         1999,  1996,  8522,  2008, 16463,  1996, 13443,  1997,  2023, 22200,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [18]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.6950, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0490, -0.1720,  0.0216,  ...,  0.1357, -0.1400, -0.0925]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [19]:
# Training 
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.6921,0.693991,0.041511,0.49064,0.0
2,0.6942,0.693964,0.041524,0.490727,0.0
3,0.6943,0.693946,0.041532,0.490779,0.0
4,0.6946,0.693934,0.041536,0.490799,0.0
5,0.6934,0.693931,0.041537,0.49081,0.0


TrainOutput(global_step=455, training_loss=0.6942593687183254, metrics={'train_runtime': 40.5758, 'train_samples_per_second': 88.969, 'train_steps_per_second': 11.214, 'total_flos': 1659126888960.0, 'train_loss': 0.6942593687183254, 'epoch': 5.0})

**Evaluating**

In [20]:
trainer.evaluate()

{'eval_loss': 0.6939312815666199,
 'eval_f1': 0.041537272167407005,
 'eval_roc_auc': 0.4908097957286409,
 'eval_accuracy': 0.0,
 'eval_runtime': 0.3559,
 'eval_samples_per_second': 289.408,
 'eval_steps_per_second': 36.527,
 'epoch': 5.0}