# Data Preprocessing

In [14]:
import pandas as pd

# Read the first 1000 rows of the tab-separated training file and keep only
# the two sentence columns and the similarity score.
TRAIN_TSV_PATH = "dataset/sts-indonesia/train.tsv"
PAIR_COLUMNS = ["text1_id", "text2_id", "score"]

df = pd.read_csv(TRAIN_TSV_PATH, sep="\t", nrows=1000).loc[:, PAIR_COLUMNS]
df


Unnamed: 0,text1_id,text2_id,score
0,Sebuah sepeda motor diparkir di dekat dinding ...,Sebuah sepeda motor diparkir oleh mural sebuah...,3.40
1,"dia menikahimu, memilih untuk memiliki anak be...",mereka tidak pernah mengangkat masalah moral a...,0.50
2,Wanita yang meninggal itu juga mengenakan cinc...,Seorang wanita berambut pirang mengenakan arlo...,2.60
3,Kedua komponen harus berada di jalur tertutup.,bohlam dan baterai berada di jalur tertutup,3.80
4,Seperti yang sudah saya jelaskan pada bacaan k...,Seperti yang telah saya katakan dalam bacaan k...,4.75
...,...,...,...
995,"Bapak Presiden, hadirin sekalian, saya ingin m...","Tuan Presiden, hadirin, saya ingin berbicara t...",4.40
996,wilayah perbatasan afghanistan-iran telah menj...,daerah dekat kota mirjaveh telah menjadi tempa...,3.80
997,"tindakan menggabungkan, memadukan, mengintegra...",tindakan chipping sesuatu.,0.80
998,"influencing_variable, influencing_situation, a...",efek dari satu hal (atau orang) pada yang lain;,2.40


# Model Construction

#### Architecture Initialization

In [15]:
from sentence_transformers import SentenceTransformer, InputExample, models

# TODO: switch to a pretrained IndoBERT checkpoint
# TODO: try a more complex architecture
BASE_CHECKPOINT = 'bert-base-uncased'
MAX_SEQ_LENGTH = 256

# Token-level encoder: produces one contextual embedding per word piece.
word_embedding_model = models.Transformer(BASE_CHECKPOINT, max_seq_length=MAX_SEQ_LENGTH)

# Mean pooling over the token embeddings yields one fixed-size sentence vector.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, pooling_mode="mean")

# Chain encoder and pooling into a single sentence-embedding model.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Training Part

In [16]:
from torch.utils.data import DataLoader
from sentence_transformers import losses

# Build one InputExample per sentence pair.
# CosineSimilarityLoss fits the cosine similarity of the two sentence embeddings
# (a value bounded to [-1, 1]) to the label, so raw similarity scores such as 3.40
# or 4.75 can never be matched; they are rescaled to [0, 1] before training.
# NOTE(review): assumes the gold scores use the standard 0-5 STS scale - confirm against the dataset.
MAX_STS_SCORE = 5.0
train_examples = [
    InputExample(texts=[text1, text2], label=float(score) / MAX_STS_SCORE)
    for text1, text2, score in zip(df["text1_id"], df["text2_id"], df["score"])
]

# Wrap train_examples in a standard PyTorch DataLoader, which shuffles the data and yields batches of 16.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# define the loss function
# TODO: experiment with other loss functions
train_loss = losses.CosineSimilarityLoss(model)

# Warm up the learning rate over roughly 10% of the optimisation steps.
# A fixed warm-up of 100 steps is longer than a whole epoch here (1000 rows at
# batch size 16 is 63 steps), which would keep the learning rate on the warm-up
# ramp for the entire run.
NUM_EPOCHS = 1
total_steps = len(train_dataloader) * NUM_EPOCHS
warmup_steps = max(1, total_steps // 10)

# tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=NUM_EPOCHS, warmup_steps=warmup_steps)

# TODO: evaluate the model on embedding similarity by computing the Spearman and Pearson correlations against the gold-standard labels
# https://github.com/UKPLab/sentence-transformers/blob/83eeb5a7b9b81d17a235d76e101cc2912ee1a30d/examples/evaluation/evaluation_stsbenchmark.py#L10



Iteration: 100%|██████████| 63/63 [03:34<00:00,  3.40s/it]
Epoch: 100%|██████████| 1/1 [03:34<00:00, 214.10s/it]
