# Data Preprocessing

In [31]:
import pandas as pd

# Load the training split: read at most the first 10,000 rows of the TSV, keep
# only the two sentence columns plus the gold score, and divide the score by 5.0.
# NOTE(review): presumably the raw score is on a 0-5 scale, so this maps it onto
# [0, 1] -- confirm against the dataset.
df_train = (
    pd.read_csv("dataset/sts-indonesia/train.tsv", sep="\t", nrows=10000)
    .loc[:, ["text1_id", "text2_id", "score"]]
    .assign(score=lambda frame: frame["score"] / 5.0)
)
df_train


Unnamed: 0,text1_id,text2_id,score
0,Sebuah sepeda motor diparkir di dekat dinding ...,Sebuah sepeda motor diparkir oleh mural sebuah...,0.68
1,"dia menikahimu, memilih untuk memiliki anak be...",mereka tidak pernah mengangkat masalah moral a...,0.10
2,Wanita yang meninggal itu juga mengenakan cinc...,Seorang wanita berambut pirang mengenakan arlo...,0.52
3,Kedua komponen harus berada di jalur tertutup.,bohlam dan baterai berada di jalur tertutup,0.76
4,Seperti yang sudah saya jelaskan pada bacaan k...,Seperti yang telah saya katakan dalam bacaan k...,0.95
...,...,...,...
9995,Semua ada di kepalanya.,Ini semua tentang adhesi.,0.00
9996,"Ketika kita dihadapkan pada risiko potensial, ...",Ketika kita dihadapkan dengan risiko potensial...,0.96
9997,"Tn. Drummond, ya ... menunjukkan umur saya: Co...","Conrad Bain, Aktor di? Diff? Rent Strokes ,? M...",0.84
9998,Capitol AS dievakuasi kemarin setelah pihak be...,Polisi Capitol AS mengevakuasi Capitol kemarin...,0.64


In [32]:
# Load the test split the same way as the training split: at most the first
# 1,000 rows, the two sentence columns plus the gold score, score divided by 5.0.
# NOTE(review): presumably the raw score is on a 0-5 scale, so this maps it onto
# [0, 1] -- confirm against the dataset.
df_test = (
    pd.read_csv("dataset/sts-indonesia/test.tsv", sep="\t", nrows=1000)
    .loc[:, ["text1_id", "text2_id", "score"]]
    .assign(score=lambda frame: frame["score"] / 5.0)
)
df_test

Unnamed: 0,text1_id,text2_id,score
0,Partai oposisi Thailand memboikot pemilihan umum,Oposisi Thailand mengumumkan boikot pemilu,0.96
1,"WHO mengatakan, kasus terbaru tidak sesuai den...",WHO mengatakan kasus Singapura tidak sesuai de...,0.76
2,"Banyak orang yang mengikuti balap sepeda, term...",Seorang lelaki mengendarai kursi roda tiga.,0.52
3,Jeff Bezos bertaruh $ 250 Juta untuk Menghidup...,Jeff Bezos Membayar $ 250 Juta Untuk The Washi...,0.84
4,perbatasan atau batas suatu objek,"merencanakan, mengatur, dan melaksanakan (suat...",0.00
...,...,...,...
995,Seorang pria menyemprotkan cairan dari selang ...,Seorang pria duduk di sepedanya dengan satu ro...,0.12
996,Upaya tersebut merupakan upaya terbaru pemerin...,10 Juli - Upaya terbaru pemerintah Bush untuk ...,0.65
997,Haruskah saya memberi tahu calon pemberi kerja...,Haruskah saya memberi tahu bos saya bahwa saya...,0.60
998,Ratu Beatrix dari Belanda akan turun tahta unt...,Ratu Beatrix dari Belanda mengundurkan diri de...,1.00


# Model Construction

#### Architecture Initialization

In [33]:
from sentence_transformers import SentenceTransformer, InputExample, models

# TODO: use indobert pretrained model
# TODO: try more complex architecture

# Token-level encoder built from the 'bert-base-uncased' checkpoint, with the
# maximum sequence length set to 256.
word_embedding_model = models.Transformer(
    'bert-base-uncased',
    max_seq_length=256,
)

# Pooling layer in "mean" mode, sized to the encoder's word-embedding dimension.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, pooling_mode="mean")

# Sentence encoder = transformer encoder followed by the pooling layer.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Training Part

In [34]:
from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Turn every training row into an InputExample holding the sentence pair and
# its (already rescaled) similarity score as the label.
train_examples = [
    InputExample(texts=[first_text, second_text], label=gold_score)
    for first_text, second_text, gold_score in zip(
        df_train["text1_id"], df_train["text2_id"], df_train["score"]
    )
]

# Wrap the examples in a standard PyTorch DataLoader, which shuffles the data
# and produces batches of 16 examples.
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)

# Loss function.
# TODO: use and experiment with another loss functions
train_loss = losses.CosineSimilarityLoss(model)

# Evaluator: scores the model on the similarity of the embeddings by computing
# the Spearman and Pearson rank correlation against the gold standard labels.
# https://github.com/UKPLab/sentence-transformers/blob/83eeb5a7b9b81d17a235d76e101cc2912ee1a30d/examples/evaluation/evaluation_stsbenchmark.py#L10
# TODO: this one should be a dev_examples not test_examples
test_examples = [
    InputExample(texts=[first_text, second_text], label=gold_score)
    for first_text, second_text, gold_score in zip(
        df_test["text1_id"], df_test["text2_id"], df_test["score"]
    )
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples)

# Fine-tune the model for a single epoch with 100 warm-up steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=1,
    warmup_steps=100,
)

# TODO: create a new embedding evaluator with test dataset (not dev dataset) and then evaluate the finetuned model here
model.evaluate(evaluator)


Iteration: 100%|██████████| 625/625 [35:40<00:00,  3.42s/it]
Epoch: 100%|██████████| 1/1 [36:12<00:00, 2172.30s/it]


0.6680118226204218

Fine-tuning `bert-base-uncased` on the Indonesian STS data gives a fairly poor score (about 0.668). Next step: try IndoBERT as the base model.

# Experiments

Experiment 1: use IndoBERT as the base word-embedding model.

In [35]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, models
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Same architecture as the baseline (transformer encoder + "mean" pooling), but
# the encoder is loaded from the 'indolem/indobert-base-uncased' checkpoint.
word_embedding_model = models.Transformer(
    'indolem/indobert-base-uncased',
    max_seq_length=256,
)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, pooling_mode="mean")

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Training pairs: one InputExample per row, labelled with the rescaled score.
train_examples = [
    InputExample(texts=[first_text, second_text], label=gold_score)
    for first_text, second_text, gold_score in zip(
        df_train["text1_id"], df_train["text2_id"], df_train["score"]
    )
]

# Shuffled batches of 16 examples, trained with the cosine-similarity loss.
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model)

# Evaluation pairs are built from the test frame, exactly as in the baseline run.
test_examples = [
    InputExample(texts=[first_text, second_text], label=gold_score)
    for first_text, second_text, gold_score in zip(
        df_test["text1_id"], df_test["text2_id"], df_test["score"]
    )
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples)

# Fine-tune for a single epoch with 100 warm-up steps, then report the
# evaluator score as the cell output.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=1,
    warmup_steps=100,
)
model.evaluate(evaluator)


Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Iteration: 100%|██████████| 625/625 [18:37<00:00,  1.79s/it]
Epoch: 100%|██████████| 1/1 [18:58<00:00, 1138.19s/it]


0.7773440751254502

Using IndoBERT as the base model gives a much better evaluator score (0.7773, i.e. 77.73 on a 0–100 scale) than the basic `bert-base-uncased` model (0.6680).