In [8]:
import pandas as pd

# Columns kept from the raw TSV files: the two sentences and their gold score.
STS_COLUMNS = ["text1_id", "text2_id", "score"]
# Divisor applied to the raw score column (presumably a 0-5 similarity scale,
# which would map to 0-1 -- TODO confirm against the dataset description).
STS_SCORE_DIVISOR = 5.0


def load_sts_split(path):
    """Read one tab-separated STS split and return it with a rescaled score.

    Parameters
    ----------
    path : str
        Location of the ``.tsv`` file to read.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``STS_COLUMNS``, with ``score`` divided by
        ``STS_SCORE_DIVISOR``.
    """
    raw = pd.read_csv(path, sep="\t")
    # ``assign`` builds a fresh frame instead of writing into the column
    # subset, which avoids pandas' SettingWithCopyWarning.
    return raw[STS_COLUMNS].assign(
        score=lambda frame: frame["score"] / STS_SCORE_DIVISOR
    )


df_train = load_sts_split("dataset/sts-indonesia/train.tsv")
df_test = load_sts_split("dataset/sts-indonesia/test.tsv")

# Train rows followed by test rows. Each split keeps its own index labels,
# so labels repeat in the combined frame.
df = pd.concat([df_train, df_test])
df

Unnamed: 0,text1_id,text2_id,score
0,Sebuah sepeda motor diparkir di dekat dinding ...,Sebuah sepeda motor diparkir oleh mural sebuah...,0.68
1,"dia menikahimu, memilih untuk memiliki anak be...",mereka tidak pernah mengangkat masalah moral a...,0.10
2,Wanita yang meninggal itu juga mengenakan cinc...,Seorang wanita berambut pirang mengenakan arlo...,0.52
3,Kedua komponen harus berada di jalur tertutup.,bohlam dan baterai berada di jalur tertutup,0.76
4,Seperti yang sudah saya jelaskan pada bacaan k...,Seperti yang telah saya katakan dalam bacaan k...,0.95
...,...,...,...
2575,PM Turki mendesak untuk mengakhiri protes di I...,Polisi Turki menembakkan gas air mata ke pengu...,0.56
2576,Karena tegangannya tidak mencapai bohlam.,jalannya tidak tertutup,0.36
2577,Mereka kemudian jatuh dan telah mendukung sera...,"Kedua negara yang menginvasi kemudian jatuh, d...",0.70
2578,Lem underlayment ke beton: berapa lama waktu y...,Berapa lama waktu yang dibutuhkan untuk menyem...,0.60


# INDOBERT

In [7]:
from sentence_transformers import InputExample, SentenceTransformer, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch import nn
from torch.utils.data import DataLoader

# Sentence encoder: IndoBERT token embeddings followed by mean pooling.
word_embedding_model = models.Transformer(
    'indolem/indobert-base-uncased',
    max_seq_length=256,
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode="mean",
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# One InputExample per training row: the sentence pair plus its score as label.
train_examples = [
    InputExample(texts=[first, second], label=gold)
    for first, second, gold in zip(
        df_train["text1_id"], df_train["text2_id"], df_train["score"]
    )
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

# The test split is wrapped the same way and handed to the evaluator.
test_examples = [
    InputExample(texts=[first, second], label=gold)
    for first, second, gold in zip(
        df_test["text1_id"], df_test["text2_id"], df_test["score"]
    )
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples)

# Single epoch of fine-tuning, then persist the model to disk.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    evaluator=evaluator,
)
model.save("model/indobert-base-uncased", model_name="indobert-base-uncased")

# Last expression of the cell: the evaluator's result is shown as the output.
model.evaluate(evaluator)



Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Iteration: 100%|██████████| 646/646 [3:14:13<00:00, 18.04s/it]
Epoch: 100%|██████████| 1/1 [3:15:04<00:00, 11704.38s/it]


0.45549373476299

# BERT BASE UNCASED + DENSE LAYER

In [11]:
from sentence_transformers import InputExample, SentenceTransformer, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch import nn
from torch.utils.data import DataLoader

# Sentence encoder: BERT token embeddings -> mean pooling -> 256-unit Tanh layer.
word_embedding_model = models.Transformer(
    'bert-base-uncased',
    max_seq_length=256,
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode="mean",
)
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dense_model]
)

# One InputExample per training row: the sentence pair plus its score as label.
train_examples = [
    InputExample(texts=[first, second], label=gold)
    for first, second, gold in zip(
        df_train["text1_id"], df_train["text2_id"], df_train["score"]
    )
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

# The test split is wrapped the same way and handed to the evaluator.
test_examples = [
    InputExample(texts=[first, second], label=gold)
    for first, second, gold in zip(
        df_test["text1_id"], df_test["text2_id"], df_test["score"]
    )
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples)

# Single epoch of fine-tuning, then persist the model to disk.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    evaluator=evaluator,
)
model.save("model/bert-base-uncased-dense", model_name="bert-base-uncased-dense")
# Last expression of the cell: the evaluator's result is shown as the output.
model.evaluate(evaluator)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Iteration: 100%|██████████| 646/646 [41:15<00:00,  3.83s/it]
Epoch: 100%|██████████| 1/1 [42:36<00:00, 2556.59s/it]


0.6966998906191842

# BERT BASE CASED + DENSE LAYER

In [10]:
from sentence_transformers import InputExample, SentenceTransformer, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch import nn
from torch.utils.data import DataLoader

# Sentence encoder: cased BERT token embeddings -> mean pooling -> 256-unit
# Tanh layer.
word_embedding_model = models.Transformer('bert-base-cased', max_seq_length=256)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode="mean",
)
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dense_model]
)

# One InputExample per training row: the sentence pair plus its score as label.
train_examples = [
    InputExample(texts=[first, second], label=gold)
    for first, second, gold in zip(
        df_train["text1_id"], df_train["text2_id"], df_train["score"]
    )
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

test_examples = [
    InputExample(texts=[first, second], label=gold)
    for first, second, gold in zip(
        df_test["text1_id"], df_test["text2_id"], df_test["score"]
    )
]

# Build the evaluator BEFORE calling fit(). It was previously created after
# fit(), so fit() used whatever `evaluator` another cell had left in the
# kernel, and raised NameError on a fresh kernel.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples)

# Single epoch of fine-tuning, then persist the model to disk.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    evaluator=evaluator,
)
model.save("model/bert-base-cased-dense", model_name="bert-base-cased-dense")

# Last expression of the cell: the evaluator's result is shown as the output.
model.evaluate(evaluator)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Iteration: 100%|██████████| 646/646 [2:40:46<00:00, 14.93s/it]
Epoch: 100%|██████████| 1/1 [2:42:18<00:00, 9738.29s/it]


0.6806033636732052

# INDOBERT + DENSE LAYER

In [9]:
from sentence_transformers import InputExample, SentenceTransformer, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch import nn
from torch.utils.data import DataLoader

# Sentence encoder: IndoBERT token embeddings -> mean pooling -> 256-unit
# Tanh layer.
word_embedding_model = models.Transformer(
    'indolem/indobert-base-uncased',
    max_seq_length=256,
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode="mean",
)
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dense_model]
)

# One InputExample per training row: the sentence pair plus its score as label.
train_examples = [
    InputExample(texts=[first, second], label=gold)
    for first, second, gold in zip(
        df_train["text1_id"], df_train["text2_id"], df_train["score"]
    )
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

# The test split is wrapped the same way and handed to the evaluator.
test_examples = [
    InputExample(texts=[first, second], label=gold)
    for first, second, gold in zip(
        df_test["text1_id"], df_test["text2_id"], df_test["score"]
    )
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples)

# Single epoch of fine-tuning, then persist the model to disk.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    evaluator=evaluator,
)
model.save("model/indobert-base-uncased-dense", model_name="indobert-base-uncased-dense")

# Last expression of the cell: the evaluator's result is shown as the output.
model.evaluate(evaluator)



Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Iteration: 100%|██████████| 646/646 [19:33<00:00,  1.82s/it]
Epoch: 100%|██████████| 1/1 [20:27<00:00, 1227.91s/it]


0.7783104346209084