In this notebook we compute BERT embeddings of text and apply clustering to these embeddings. <hr>

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import mysql.connector
from datasets import Dataset, DatasetDict

In [3]:
# Load data 
df = pd.read_csv("data/all_augmented_tasks_EN.csv") 
df = df.dropna(subset=["description"])
df = df[df["word_count"] > 4] # removing test descriptions # TODO move to data cleaning steps
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,taskId,language,description,topic_id,word_count
0,9oqJmtbKXts6Rr9Szw4OIS,eng,What are the courses that clients can book at ...,,12
1,a0pzxEfKq8c9D0dRZlQcm9,eng,Write a rule for astronauts. Use a conditional...,,43
2,9Hjn2yUwBcs7DZK6HARkE4,eng,Can you guess the most frequently spoken langu...,,22
3,6AYw9CEZMTN7LN8u0LfYVb,eng,Complete the sentence with going to. Example: ...,,42
4,8QIKtMOE9zV6lnfc2vHMUd,eng,The following are examples of vegetables. carr...,,16


> **Distribution of words:** Refer to 4_concat_data.ipynb

In [4]:
cnx = mysql.connector.connect(user='root', password='taskbase',
                              host='127.0.0.1', port='3309', database='lernnavi')

query = "SELECT * FROM TaskAspects"
df_taskAspects = pd.read_sql(query, cnx)
df_taskAspects.head()

Unnamed: 0,id,taskId,aspectId,sampleSolution
0,3,5ElPCuVMbAy8pzupzU7R3x,2,
1,54,2VX1HHa4SZp9Cs6Suof4ho,1,
2,74,2j6rJkYxYa98ydGaSCW17D,4,
3,75,2j6rJkYxYa98ydGaSCW17D,20,"{""type"": ""DEFAULT"", ""sampleSolutionGroups"": [{..."
4,76,2j6rJkYxYa98ydGaSCW17D,21,"{""type"": ""DEFAULT"", ""sampleSolutionGroups"": [{..."


In [5]:
# Attribute labels (aspectId) to descriptions
df = pd.merge(df[["taskId", "description"]], df_taskAspects[["taskId", "aspectId"]], how="inner", on=["taskId"]) 
df_pivot = df.pivot_table(index='description', columns='aspectId', aggfunc='size', fill_value=None)
df_pivot = df_pivot.notnull().reset_index()
df_pivot.head(1)

aspectId,description,4,20,21,47,69,70,71,72,74,...,272853,272854,272882,272883,272887,272888,272894,272895,272900,272901
0,"""Climate change is a dangerous threat,"" he said.",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# Convert to Dataset 
dataset = Dataset.from_pandas(df_pivot) 

# Make splits: train, test, validation
train_test = dataset.train_test_split(test_size=0.3)
test_val = train_test["test"].train_test_split(test_size=0.33)

# Recreate Dataset with the three splits 
data = DatasetDict({
    'train': train_test['train'],
    'test': test_val['train'],
    'validation': test_val['test']
})

# data

In [7]:
len(data['train']), len(data['test']), len(data['validation'])

(843, 242, 120)

In [8]:
labels = [label for label in data['train'].features.keys() if label not in ['description']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

# Training BERT-tiny

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    

**Preprocess**

In [10]:
# Define Tokenizer
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny") 

In [11]:
# Preprocessing 
def preprocess_data(examples):
  # take a batch of texts
  text = examples["description"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

encoded_dataset = data.map(preprocess_data, batched=True, remove_columns=data['train'].column_names)

Map:   0%|          | 0/843 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [12]:
encoded_dataset.set_format("torch")

**Training the model**

In [13]:
# Define the model
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
batch_size = 8
metric_name = "f1"

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-8,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_steps=1,
    #push_to_hub=True,
)

In [15]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [16]:
encoded_dataset['train'][0]['labels'].type() 

'torch.FloatTensor'

In [17]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  7981, 10663,  1996,  3160,  2000,  4797,  9587, 29068,  3686,
         7981,  5928,  1029,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [18]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.6968, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.1880,  0.2092,  0.0108,  ..., -0.0116, -0.0600, -0.1763]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [19]:
# Training 
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.6984,0.69813,0.041255,0.48768,0.0
2,0.6985,0.698097,0.041243,0.487612,0.0
3,0.6992,0.698075,0.041228,0.487527,0.0
4,0.6993,0.698061,0.041232,0.487553,0.0
5,0.6985,0.698057,0.041234,0.487564,0.0


TrainOutput(global_step=530, training_loss=0.6983522784035161, metrics={'train_runtime': 29.8038, 'train_samples_per_second': 141.425, 'train_steps_per_second': 17.783, 'total_flos': 1948037310720.0, 'train_loss': 0.6983522784035161, 'epoch': 5.0})

**Evaluating**

In [20]:
trainer.evaluate()

{'eval_loss': 0.6981296539306641,
 'eval_f1': 0.041255046817283744,
 'eval_roc_auc': 0.48767955642333494,
 'eval_accuracy': 0.0,
 'eval_runtime': 0.2274,
 'eval_samples_per_second': 527.693,
 'eval_steps_per_second': 65.962,
 'epoch': 5.0}

In [23]:
len(labels), len(data['train'])

(1461, 843)

# Testing

In [39]:
d = df.groupby('taskId')['aspectId'].apply(list).reset_index()
d.head()

Unnamed: 0,taskId,aspectId
0,14OS9eQKgfv63ZY7d5k4T8,"[70948, 70949, 70950, 70951, 70952, 70953, 709..."
1,14ambh1obhw7TYMQE8lcC1,"[9637, 9638, 9639, 9641, 9642, 9637, 9638, 963..."
2,15kxeWhEKDnaQToOCK9BR2,"[68053, 68174, 8432, 68188, 68177, 68096, 8632..."
3,15sKzdWMaXB8f0Mx9Aomk1,"[190218, 190219, 265063, 172024, 190201, 19020..."
4,18Ccvc8NMJT5xqLv9nAgTH,"[9839, 9843, 9847, 9850, 9859, 9864, 9867, 987..."


In [46]:
df[df['taskId'] == "14OS9eQKgfv63ZY7d5k4T8"]['description'][19494]

'Write a sentence in the past tense. Use the following words: Yesterday - I - tell - story - a'

In [42]:
len(d.loc[0]['aspectId'])

306

In [54]:
text = "Write a sentence in the past tense. Use the following words: Yesterday - I - tell - story - a"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)
logits = outputs.logits

# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.55)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['106', '5792', '5793', '5893', '7987', '7989', '8316', '8512', '8768', '8774', '8775', '8846', '8849', '8945', '9092', '9105', '9156', '9245', '9248', '9285', '9578', '9628', '9635', '9638', '9647', '9654', '9672', '9680', '9843', '9871', '9941', '9943', '9944', '9945', '9984', '9992', '10085', '10145', '10313', '10316', '10331', '10560', '10767', '10768', '10772', '11048', '11093', '11399', '11401', '11410', '11555', '11562', '11564', '11565', '11664', '11665', '11680', '11691', '11703', '11710', '11859', '11866', '12194', '12197', '12201', '12212', '12215', '12216', '12366', '12368', '12576', '12629', '12640', '12642', '12714', '12731', '12739', '12745', '12756', '12764', '12766', '12774', '12778', '12779', '12986', '13078', '13089', '13423', '13924', '13968', '13993', '14019', '14024', '14077', '14155', '16077', '16149', '27901', '27937', '27969', '28035', '60818', '60824', '60864', '60876', '60891', '60894', '60909', '61027', '61090', '62357', '68053', '68096', '68174', '68177', '

In [55]:
len(predicted_labels)

207