In this notebook we compute BERT embeddings of text and apply clustering to these embeddings. <hr>

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import mysql.connector
from datasets import Dataset, DatasetDict

In [3]:
# Load data 
df = pd.read_csv("data/all_augmented_tasks_EN.csv") 
df = df.dropna(subset=["description"])
df.reset_index(inplace=True, drop=True)
# df = df.sample(500)
df.head()

Unnamed: 0,taskId,language,description,topic_id,word_count
0,9oqJmtbKXts6Rr9Szw4OIS,eng,What are the courses that clients can book at ...,,12
1,aOrgjKFodXC7uGMKqdMKMg,eng,Empty response,,2
2,a0pzxEfKq8c9D0dRZlQcm9,eng,Write a rule for astronauts. Use a conditional...,,43
3,9Hjn2yUwBcs7DZK6HARkE4,eng,Can you guess the most frequently spoken langu...,,22
4,6AYw9CEZMTN7LN8u0LfYVb,eng,Complete the sentence with going to. Example: ...,,42


> **Distribution of words:** Refer to 4_concat_data.ipynb

In [4]:
df_taskAspects = pd.read_csv("data/taskAspects.csv") 
df_taskAspects.head()

Unnamed: 0,id,taskId,aspectId,sampleSolution
0,3,5ElPCuVMbAy8pzupzU7R3x,2,
1,54,2VX1HHa4SZp9Cs6Suof4ho,1,
2,74,2j6rJkYxYa98ydGaSCW17D,4,
3,75,2j6rJkYxYa98ydGaSCW17D,20,"{""type"": ""DEFAULT"", ""sampleSolutionGroups"": [{..."
4,76,2j6rJkYxYa98ydGaSCW17D,21,"{""type"": ""DEFAULT"", ""sampleSolutionGroups"": [{..."


# Prepare dataset 

Wanted format: <br>
- Examples look like this: <br>
  <t> {'set': { <br>
  <t> <t> <t> 'query': 's' <br>
  <t> <t> <t> 'pos': ['s1', 's2', ...], <br>
  <t> <t> <t> 'neg': ['sA', 'sB', ...]}, ...}

In [5]:
mapping_df = pd.merge(df[["taskId", "description"]], df_taskAspects[["taskId", "aspectId"]], how="inner", on=["taskId"]) 
mapping_df = mapping_df.groupby(by="taskId")["aspectId"].apply(list).reset_index()
mapping_df.head()

Unnamed: 0,taskId,aspectId
0,14OS9eQKgfv63ZY7d5k4T8,"[70948, 70949, 70950, 70951, 70952, 70953, 709..."
1,14ambh1obhw7TYMQE8lcC1,"[9637, 9638, 9639, 9641, 9642]"
2,15kxeWhEKDnaQToOCK9BR2,"[68053, 68174, 8432, 68188, 68177, 68096, 8632..."
3,15sKzdWMaXB8f0Mx9Aomk1,"[190218, 190219, 265063, 172024, 190201, 19020..."
4,18Ccvc8NMJT5xqLv9nAgTH,"[9839, 9843, 9847, 9850, 9859, 9864, 9867, 987..."


In [6]:
elements = mapping_df["taskId"].to_list()
labels = {}
for index, row in mapping_df.iterrows():
    taskId = row["taskId"]
    labels[taskId] = row["aspectId"]
# labels

In [8]:
# Positive pairs: those that share (a max number of) labels 
# Negative pairs: those that have NO common label 
from itertools import combinations

def extract_pairs(elements, labels):
    positive_pairs = []
    negative_pairs = []
    for e1, e2 in combinations(elements, 2):
        common_labels = set(labels[e1]).intersection(labels[e2])
        if len(common_labels) > 50:
            positive_pairs.append((e1, e2))
        if not common_labels:
            negative_pairs.append((e1, e2))
    return positive_pairs, negative_pairs

positive_pairs, negative_pairs = extract_pairs(elements[:50], labels)

# len(positive_pairs), len(negative_pairs), len(elements)

In [9]:
from itertools import product
_combinations = list(product(positive_pairs, negative_pairs))
len(_combinations)

36330

In [11]:
import random
triplets = {"anchor": [], "positive": [], "negative": []} 
idx = 0
# combinations = random.sample(_combinations, 100000)
for comb in _combinations: 
    anchor_id = comb[0][0] 
    pos_id = comb[0][1] 
    neg_id = comb[1][1]

    anchor = df[df["taskId"] == anchor_id ]["description"].values[0]
    pos = df[df["taskId"] == pos_id ]["description"].values[0]
    neg = df[df["taskId"] == neg_id]["description"].values[0]

    # print(anchor) 
    # print(pos)

    triplets["anchor"].append(anchor)
    triplets["positive"].append(pos)
    triplets["negative"].append(neg)
    

In [20]:
import pickle 

with open('triplets.pkl', 'wb') as f:
    pickle.dump(triplets, f)

<hr>

In [6]:
import pickle
with open('data/triplets.pkl', 'rb') as f:
    triplets = pickle.load(f) 

In [22]:
# Convert to Dataset 
dataset = Dataset.from_dict(triplets) 

In [23]:
# Make splits: train, test, validation
train_test = dataset.train_test_split(test_size=0.3)
test_val = train_test["test"].train_test_split(test_size=0.33)

# Recreate Dataset with the three splits 
dataset = DatasetDict({
    'train': train_test['train'],
    'test': test_val['train'],
    'validation': test_val['test']
})

# data

In [24]:
len(dataset['train']), len(dataset['test']), len(dataset['validation'])

(70000, 20100, 9900)

In [25]:
# dataset["train"][0:5]

In [27]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers import losses

# 1. Load a model to finetune with 2. (Optional) model card data
model = SentenceTransformer("prajjwal1/bert-tiny")

# 3. Load a dataset to finetune on
train_dataset = dataset["train"]#.select(range(100_000))
eval_dataset = dataset["validation"]
test_dataset = dataset["test"]

# 4. Define a loss function
loss = losses.TripletLoss(model=model)

# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/bert-tiny-triplet",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=500,
    # run_name="bert-tiny-triplet",  # Will be used in W&B if `wandb` is installed
)

# 6. (Optional) Create an evaluator & evaluate the base model
dev_evaluator = TripletEvaluator(
    anchors=eval_dataset["anchor"],
    positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
)
dev_evaluator(model)

# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)

trainer.train()

No sentence-transformers model found with name prajjwal1/bert-tiny. Creating a new one with mean pooling.


Step,Training Loss,Validation Loss,Cosine Accuracy,Dot Accuracy,Manhattan Accuracy,Euclidean Accuracy,Max Accuracy
100,No log,3.762044,0.745758,0.249798,0.735657,0.733737,0.745758
200,No log,2.381528,0.855253,0.166162,0.851313,0.85404,0.855253
300,No log,1.14541,0.915152,0.090808,0.915859,0.916869,0.916869
400,No log,0.799755,0.93798,0.061313,0.938283,0.937475,0.938283
500,2.183600,0.72014,0.942222,0.055556,0.941515,0.942121,0.942222
600,2.183600,0.682798,0.943636,0.055556,0.943434,0.944646,0.944646
700,2.183600,0.629473,0.949697,0.04697,0.948586,0.949394,0.949697
800,2.183600,0.626571,0.948687,0.047576,0.947071,0.948788,0.948788
900,2.183600,0.596237,0.949899,0.046162,0.950101,0.95,0.950101
1000,0.568200,0.596051,0.94899,0.045657,0.949495,0.949495,0.949495


Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

KeyboardInterrupt: 

In [None]:
# (Optional) Evaluate the trained model on the test set
test_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"]
)
test_evaluator(model)

In [None]:
# # 8. Save the trained model
# model.save_pretrained("models/mpnet-base-all-nli-triplet/final")

# # 9. (Optional) Push it to the Hugging Face Hub
# model.push_to_hub("mpnet-base-all-nli-triplet")

In [6]:
from itertools import product
a = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 4)]
b = [(0, 5), (0, 6), (1, 7), (1, 8), (1, 9)] 
p = product(a, b)

In [7]:
for i in p: 
    print(i)

((0, 1), (0, 5))
((0, 1), (0, 6))
((0, 1), (1, 7))
((0, 1), (1, 8))
((0, 1), (1, 9))
((0, 2), (0, 5))
((0, 2), (0, 6))
((0, 2), (1, 7))
((0, 2), (1, 8))
((0, 2), (1, 9))
((0, 3), (0, 5))
((0, 3), (0, 6))
((0, 3), (1, 7))
((0, 3), (1, 8))
((0, 3), (1, 9))
((1, 2), (0, 5))
((1, 2), (0, 6))
((1, 2), (1, 7))
((1, 2), (1, 8))
((1, 2), (1, 9))
((1, 4), (0, 5))
((1, 4), (0, 6))
((1, 4), (1, 7))
((1, 4), (1, 8))
((1, 4), (1, 9))


In [12]:
l = list(product(a, b))
[1,2] + [3,4]

[1, 2, 3, 4]