In [1]:
import logging
import math
import time
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Logging setup: emit INFO-level (and above) records through the
# sentence-transformers LoggingHandler, each prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(message)s",
)

In [3]:
# Tunable settings for the STS fine-tuning run.
pretrained_model_name = "klue/roberta-base"
sts_num_epochs = 4
train_batch_size = 32

# Output directory: model name (with "/" replaced by "-") plus a start-time
# stamp, so every run writes to its own folder.
sts_model_save_path = (
    f"output/training_sts-{pretrained_model_name.replace('/', '-')}"
    f"-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
)

# 1. Load Dataset & Preprocessing

## 1.1. KLUE-STS

In [4]:
# Load the KLUE-STS splits.
# The first 90% of the official train split is used for training and the last
# 10% of it as the validation set; the official validation split serves as
# the held-out test set.
klue_sts_train = load_dataset(
    "klue", "sts", split="train[:90%]"
)
klue_sts_valid = load_dataset(
    "klue", "sts", split="train[-10%:]"
)
klue_sts_test = load_dataset(
    "klue", "sts", split="validation"
)

# Report the size of each split.
print('Length of Train : ', len(klue_sts_train))
print('Length of Valid : ', len(klue_sts_valid))
print('Length of Test : ', len(klue_sts_test))

2022-06-04 11:12:39 - Reusing dataset klue (/home/irteam/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)
2022-06-04 11:12:40 - Reusing dataset klue (/home/irteam/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)
2022-06-04 11:12:41 - Reusing dataset klue (/home/irteam/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)
Length of Train :  10501
Length of Valid :  1167
Length of Test :  519


In [5]:
def make_sts_input_example(dataset):
    """Convert STS records into a list of ``InputExample`` objects.

    Each record must provide ``sentence1``, ``sentence2`` and
    ``labels['label']``. The label is divided by 5.0, which rescales a
    0-5 similarity score to the 0-1 range, and stored as the example's label.

    Args:
        dataset: Iterable of dict-like records with the keys described above.

    Returns:
        A list with one ``InputExample`` per record, in input order.
    """
    return [
        InputExample(
            texts=[record['sentence1'], record['sentence2']],
            label=record['labels']['label'] / 5.0,
        )
        for record in dataset
    ]

In [6]:
# Convert every split to InputExample lists (train, validation, test — in that order).
sts_train_examples, sts_valid_examples, sts_test_examples = (
    make_sts_input_example(split)
    for split in (klue_sts_train, klue_sts_valid, klue_sts_test)
)

In [7]:
# Train DataLoader.
# No sampler is passed: `shuffle=True` lets the DataLoader reshuffle the
# examples itself. (A DistributedSampler was previously constructed here but
# never handed to the DataLoader; it also needs an initialized process group,
# which this notebook does not set up, so it has been dropped.)
train_dataloader = DataLoader(
    sts_train_examples,
    shuffle=True,
    batch_size=train_batch_size,
)

# Evaluator built from the held-out 10% of the train split; its results are
# logged under the name "sts-dev".
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_valid_examples,
    name="sts-dev",
)

# Evaluator built from the test examples; its results are logged under the
# name "sts-test".
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_test_examples,
    name="sts-test",
)

# 2. Load Embedding Model

In [8]:
# Token-level encoder: the pretrained transformer named in the config cell,
# with inputs limited to 256 tokens.
embedding_model = models.Transformer(
    model_name_or_path=pretrained_model_name,
    do_lower_case=True,
    max_seq_length=256,
)

# Sentence-level pooling: only mean pooling over the token embeddings is
# enabled; CLS-token and max pooling are switched off.
pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain encoder -> pooling into a single sentence-embedding model.
model = SentenceTransformer(modules=[embedding_model, pooling_model])

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for

2022-06-04 11:12:59 - Use pytorch device: cuda


In [9]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Replicate over at most this many GPUs (GPU ids start at 0). The list is
# capped by the number of GPUs actually present so that DataParallel is never
# asked for a device that does not exist on this machine.
max_data_parallel_gpus = 3
data_parallel_device_ids = list(
    range(min(max_data_parallel_gpus, torch.cuda.device_count()))
)

# Wrap only once: re-running this cell must not nest DataParallel wrappers,
# because later cells reach the SentenceTransformer through `model.module`.
if not isinstance(model, torch.nn.DataParallel):
    # An empty id list only occurs without CUDA; pass None in that case.
    model = torch.nn.DataParallel(model, device_ids=data_parallel_device_ids or None)
model.to(device)

DataParallel(
  (module): SentenceTransformer(
    (0): Transformer({'max_seq_length': 256, 'do_lower_case': True}) with Transformer model: RobertaModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)

# 3. STS training

In [10]:
# Training objective: CosineSimilarityLoss built around `model`.
train_loss = losses.CosineSimilarityLoss(model=model)

# Warm-up length: 10% of (number of training examples * epochs / batch size),
# rounded up to a whole number of steps.
warmup_steps = math.ceil(
    len(sts_train_examples) * sts_num_epochs / train_batch_size * 0.1
)
logging.info(f"Warmup-steps: {warmup_steps}")

2022-06-04 11:13:16 - Warmup-steps: 132


In [11]:
# Ask CUDA to number devices by PCI bus id and to expose only GPUs 0, 1 and 2.
# NOTE(review): in the recorded run the following cell still reports 8 devices,
# so this restriction did not take effect — presumably because CUDA had already
# been initialised by earlier cells. Consider setting these variables at the
# very top of the notebook (before CUDA is first used); TODO confirm.
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"  

In [12]:
# Show how many CUDA devices PyTorch can see in this kernel.
torch.cuda.device_count()

8

In [13]:
# Fine-tune on STS.
# `fit` is called on the SentenceTransformer held in `model.module`. The dev
# evaluator is run every 10% of an epoch (len(train_dataloader) * 0.1,
# truncated to an int), and `sts_model_save_path` is passed as the output path.
model.module.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=sts_num_epochs,
    warmup_steps=warmup_steps,
    evaluator=dev_evaluator,
    evaluation_steps=int(len(train_dataloader) * 0.1),
    output_path=sts_model_save_path,
)

                                            
Epoch:   0%|          | 0/4 [00:13<?, ?it/s]               

2022-06-04 11:13:36 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 32 steps:


                                            
                                                           
                                                           
                                                           
                                                           
Epoch:   0%|          | 0/4 [00:15<?, ?it/s]               

2022-06-04 11:13:38 - Cosine-Similarity :	Pearson: 0.8958	Spearman: 0.8927
2022-06-04 11:13:38 - Manhattan-Distance:	Pearson: 0.8774	Spearman: 0.8812
2022-06-04 11:13:38 - Euclidean-Distance:	Pearson: 0.8761	Spearman: 0.8802
2022-06-04 11:13:38 - Dot-Product-Similarity:	Pearson: 0.8470	Spearman: 0.8489
2022-06-04 11:13:38 - Save model to output/training_sts-klue-roberta-base-2022-06-04_11-12-34


                                            
Epoch:   0%|          | 0/4 [00:25<?, ?it/s]               

2022-06-04 11:13:48 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 64 steps:


                                            
                                                           
                                                           
                                                           
Epoch:   0%|          | 0/4 [00:27<?, ?it/s]               

2022-06-04 11:13:51 - Cosine-Similarity :	Pearson: 0.9165	Spearman: 0.8885
2022-06-04 11:13:51 - Manhattan-Distance:	Pearson: 0.9105	Spearman: 0.8883
2022-06-04 11:13:51 - Euclidean-Distance:	Pearson: 0.9100	Spearman: 0.8879
2022-06-04 11:13:51 - Dot-Product-Similarity:	Pearson: 0.9034	Spearman: 0.8720


                                            
Epoch:   0%|          | 0/4 [00:36<?, ?it/s]               

2022-06-04 11:13:59 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 96 steps:


                                            
                                                           
                                                           
                                                           
                                                           
Epoch:   0%|          | 0/4 [00:38<?, ?it/s]               

2022-06-04 11:14:02 - Cosine-Similarity :	Pearson: 0.9364	Spearman: 0.9017
2022-06-04 11:14:02 - Manhattan-Distance:	Pearson: 0.9278	Spearman: 0.9009
2022-06-04 11:14:02 - Euclidean-Distance:	Pearson: 0.9276	Spearman: 0.9005
2022-06-04 11:14:02 - Dot-Product-Similarity:	Pearson: 0.9283	Spearman: 0.8886
2022-06-04 11:14:02 - Save model to output/training_sts-klue-roberta-base-2022-06-04_11-12-34


                                            
Epoch:   0%|          | 0/4 [00:48<?, ?it/s]                

2022-06-04 11:14:12 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 128 steps:


                                            
                                                            
                                                            
                                                            
Epoch:   0%|          | 0/4 [00:51<?, ?it/s]                

2022-06-04 11:14:14 - Cosine-Similarity :	Pearson: 0.9403	Spearman: 0.9004
2022-06-04 11:14:14 - Manhattan-Distance:	Pearson: 0.9327	Spearman: 0.8997
2022-06-04 11:14:14 - Euclidean-Distance:	Pearson: 0.9328	Spearman: 0.8997
2022-06-04 11:14:14 - Dot-Product-Similarity:	Pearson: 0.9322	Spearman: 0.8852


                                            
Epoch:   0%|          | 0/4 [01:00<?, ?it/s]                

2022-06-04 11:14:23 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 160 steps:


                                            
                                                            
                                                            
                                                            
                                                            
Epoch:   0%|          | 0/4 [01:02<?, ?it/s]                

2022-06-04 11:14:25 - Cosine-Similarity :	Pearson: 0.9470	Spearman: 0.9068
2022-06-04 11:14:25 - Manhattan-Distance:	Pearson: 0.9419	Spearman: 0.9066
2022-06-04 11:14:25 - Euclidean-Distance:	Pearson: 0.9419	Spearman: 0.9066
2022-06-04 11:14:25 - Dot-Product-Similarity:	Pearson: 0.9404	Spearman: 0.8925
2022-06-04 11:14:25 - Save model to output/training_sts-klue-roberta-base-2022-06-04_11-12-34


                                            
Epoch:   0%|          | 0/4 [01:12<?, ?it/s]                

2022-06-04 11:14:35 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 192 steps:


                                            
                                                            
                                                            
                                                            
                                                            
Epoch:   0%|          | 0/4 [01:14<?, ?it/s]                

2022-06-04 11:14:37 - Cosine-Similarity :	Pearson: 0.9540	Spearman: 0.9146
2022-06-04 11:14:37 - Manhattan-Distance:	Pearson: 0.9487	Spearman: 0.9133
2022-06-04 11:14:37 - Euclidean-Distance:	Pearson: 0.9487	Spearman: 0.9133
2022-06-04 11:14:37 - Dot-Product-Similarity:	Pearson: 0.9496	Spearman: 0.9047
2022-06-04 11:14:37 - Save model to output/training_sts-klue-roberta-base-2022-06-04_11-12-34


                                            
Epoch:   0%|          | 0/4 [01:24<?, ?it/s]                

2022-06-04 11:14:48 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 224 steps:


                                            
                                                            
                                                            
                                                            
                                                            
Epoch:   0%|          | 0/4 [01:26<?, ?it/s]                

2022-06-04 11:14:50 - Cosine-Similarity :	Pearson: 0.9551	Spearman: 0.9182
2022-06-04 11:14:50 - Manhattan-Distance:	Pearson: 0.9515	Spearman: 0.9173
2022-06-04 11:14:50 - Euclidean-Distance:	Pearson: 0.9514	Spearman: 0.9174
2022-06-04 11:14:50 - Dot-Product-Similarity:	Pearson: 0.9500	Spearman: 0.9063
2022-06-04 11:14:50 - Save model to output/training_sts-klue-roberta-base-2022-06-04_11-12-34


                                            
Epoch:   0%|          | 0/4 [01:37<?, ?it/s]                

2022-06-04 11:15:00 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 256 steps:


                                            
                                                            
                                                            
                                                            
Epoch:   0%|          | 0/4 [01:39<?, ?it/s]                

2022-06-04 11:15:02 - Cosine-Similarity :	Pearson: 0.9580	Spearman: 0.9165
2022-06-04 11:15:02 - Manhattan-Distance:	Pearson: 0.9527	Spearman: 0.9161
2022-06-04 11:15:02 - Euclidean-Distance:	Pearson: 0.9526	Spearman: 0.9161
2022-06-04 11:15:02 - Dot-Product-Similarity:	Pearson: 0.9533	Spearman: 0.9076


                                            
Epoch:   0%|          | 0/4 [01:48<?, ?it/s]                

2022-06-04 11:15:11 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 288 steps:


                                            
                                                            
                                                            
                                                            
Epoch:   0%|          | 0/4 [01:50<?, ?it/s]                

2022-06-04 11:15:13 - Cosine-Similarity :	Pearson: 0.9575	Spearman: 0.9175
2022-06-04 11:15:13 - Manhattan-Distance:	Pearson: 0.9534	Spearman: 0.9161
2022-06-04 11:15:13 - Euclidean-Distance:	Pearson: 0.9532	Spearman: 0.9156
2022-06-04 11:15:13 - Dot-Product-Similarity:	Pearson: 0.9527	Spearman: 0.9062


                                            
Epoch:   0%|          | 0/4 [01:59<?, ?it/s]                

2022-06-04 11:15:22 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 320 steps:


                                            
                                                            
                                                            
                                                            
                                                            
Epoch:   0%|          | 0/4 [02:01<?, ?it/s]                

2022-06-04 11:15:25 - Cosine-Similarity :	Pearson: 0.9595	Spearman: 0.9201
2022-06-04 11:15:25 - Manhattan-Distance:	Pearson: 0.9521	Spearman: 0.9178
2022-06-04 11:15:25 - Euclidean-Distance:	Pearson: 0.9519	Spearman: 0.9174
2022-06-04 11:15:25 - Dot-Product-Similarity:	Pearson: 0.9545	Spearman: 0.9117
2022-06-04 11:15:25 - Save model to output/training_sts-klue-roberta-base-2022-06-04_11-12-34


Iteration: 100%|██████████| 329/329 [02:05<00:00,  2.63it/s]
Epoch:   0%|          | 0/4 [02:05<?, ?it/s]

2022-06-04 11:15:28 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 0:


Epoch:  25%|██▌       | 1/4 [02:07<06:22, 127.60s/it]

2022-06-04 11:15:30 - Cosine-Similarity :	Pearson: 0.9583	Spearman: 0.9171
2022-06-04 11:15:30 - Manhattan-Distance:	Pearson: 0.9503	Spearman: 0.9150
2022-06-04 11:15:30 - Euclidean-Distance:	Pearson: 0.9500	Spearman: 0.9145
2022-06-04 11:15:30 - Dot-Product-Similarity:	Pearson: 0.9531	Spearman: 0.9081


                                                     
Epoch:  25%|██▌       | 1/4 [02:16<06:22, 127.60s/it]      

2022-06-04 11:15:39 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 32 steps:


                                                     
                                                           
                                                           
                                                           
Epoch:  25%|██▌       | 1/4 [02:18<06:22, 127.60s/it]      

2022-06-04 11:15:42 - Cosine-Similarity :	Pearson: 0.9590	Spearman: 0.9184
2022-06-04 11:15:42 - Manhattan-Distance:	Pearson: 0.9539	Spearman: 0.9175
2022-06-04 11:15:42 - Euclidean-Distance:	Pearson: 0.9539	Spearman: 0.9175
2022-06-04 11:15:42 - Dot-Product-Similarity:	Pearson: 0.9544	Spearman: 0.9082


                                                     
Epoch:  25%|██▌       | 1/4 [02:27<06:22, 127.60s/it]      

2022-06-04 11:15:51 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 64 steps:


                                                     
                                                           
                                                           
                                                           
Epoch:  25%|██▌       | 1/4 [02:29<06:22, 127.60s/it]      

2022-06-04 11:15:53 - Cosine-Similarity :	Pearson: 0.9588	Spearman: 0.9174
2022-06-04 11:15:53 - Manhattan-Distance:	Pearson: 0.9537	Spearman: 0.9172
2022-06-04 11:15:53 - Euclidean-Distance:	Pearson: 0.9537	Spearman: 0.9169
2022-06-04 11:15:53 - Dot-Product-Similarity:	Pearson: 0.9536	Spearman: 0.9044


                                                     
Epoch:  25%|██▌       | 1/4 [02:38<06:22, 127.60s/it]      

2022-06-04 11:16:02 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 96 steps:


                                                     
                                                           
                                                           
                                                           
                                                           
Epoch:  25%|██▌       | 1/4 [02:41<06:22, 127.60s/it]      

2022-06-04 11:16:04 - Cosine-Similarity :	Pearson: 0.9612	Spearman: 0.9227
2022-06-04 11:16:04 - Manhattan-Distance:	Pearson: 0.9541	Spearman: 0.9205
2022-06-04 11:16:04 - Euclidean-Distance:	Pearson: 0.9542	Spearman: 0.9202
2022-06-04 11:16:04 - Dot-Product-Similarity:	Pearson: 0.9553	Spearman: 0.9097
2022-06-04 11:16:04 - Save model to output/training_sts-klue-roberta-base-2022-06-04_11-12-34


Iteration:  36%|███▋      | 120/329 [00:41<01:12,  2.89it/s]
Epoch:  25%|██▌       | 1/4 [02:49<08:27, 169.17s/it]


KeyboardInterrupt: 

---

### DDP (DistributedDataParallel) reference example

In [None]:
def main():
    """Launch one ``main_worker`` process per GPU on this node.

    ``torch.multiprocessing.spawn`` calls ``main_worker(i, ngpus_per_node)``
    for ``i`` in ``range(ngpus_per_node)``.

    NOTE(review): spawning functions that are defined in a notebook cell may
    fail because the child processes cannot import them — consider moving
    ``main`` / ``main_worker`` into a ``.py`` module; TODO confirm.
    """
    ngpus_per_node = 3  # alternatively: torch.cuda.device_count()

    torch.multiprocessing.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, ))


def main_worker(gpu, ngpus_per_node):
    """Run one DistributedDataParallel training process bound to one GPU.

    Meant to be the target of ``torch.multiprocessing.spawn``: ``gpu`` is the
    process index, used here both as the distributed rank and as the CUDA
    device id. The process group is torn down when training ends or fails.

    Args:
        gpu: Process index / CUDA device id of this worker.
        ngpus_per_node: Number of worker processes; used as the world size and
            to split the batch size and data-loading workers between them.

    NOTE(review): ``baseline`` and ``get_train_loader`` are not defined or
    imported anywhere in the visible notebook; they must be provided before
    this function can run. Whether each rank receives a distinct shard of the
    data depends on ``get_train_loader`` — verify that it uses a distributed
    sampler.
    """
    image_size = 224
    batch_size = 512
    num_worker = 8
    epochs = 1

    # The totals above are for the whole node; give every process its share.
    batch_size = int(batch_size / ngpus_per_node)
    num_worker = int(num_worker / ngpus_per_node)

    # Bind this process to its GPU before any CUDA / NCCL work happens.
    torch.cuda.set_device(gpu)
    torch.distributed.init_process_group(
            backend='nccl',
            init_method='tcp://127.0.0.1:3456',
            world_size=ngpus_per_node,
            rank=gpu)
    try:
        model = baseline.ResnetModel()
        model = model.cuda(gpu)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])

        train_loader = get_train_loader(
            image_size=image_size,
            batch_size=batch_size,
            num_worker=num_worker)

        optimizer = torch.optim.SGD(
            params=model.parameters(),
            lr=0.001,
            momentum=0.9)
        criterion = torch.nn.CrossEntropyLoss().to(gpu)

        model.train()
        for epoch in range(epochs):

            # Wall-clock time is measured per epoch.
            start_time = time.time()
            for j, (images, labels) in enumerate(train_loader):
                images, labels = images.to(gpu), labels.to(gpu)

                optimizer.zero_grad()
                # The model returns three values; only the first is used for the loss.
                logits, _, _ = model(images)
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()

                print(f'epoch : {epoch} | step : {j} / {len(train_loader)} | mp : {gpu}')
            end_time = time.time()
            print('total time :', end_time - start_time)
    finally:
        # Always release the process group, even if training raised.
        torch.distributed.destroy_process_group()

# 4. Evaluation

In [None]:
# evaluation sts-test
# The evaluator needs the SentenceTransformer itself. When `model` has been
# wrapped in torch.nn.DataParallel (as done earlier in this notebook), the
# SentenceTransformer lives on `.module`, so unwrap it first.
sts_eval_model = model.module if isinstance(model, torch.nn.DataParallel) else model
test_evaluator(sts_eval_model, output_path=sts_model_save_path)

2022-02-25 02:15:39 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-02-25 02:15:43 - Cosine-Similarity :	Pearson: 0.8870	Spearman: 0.8873
2022-02-25 02:15:43 - Manhattan-Distance:	Pearson: 0.8862	Spearman: 0.8835
2022-02-25 02:15:43 - Euclidean-Distance:	Pearson: 0.8869	Spearman: 0.8844
2022-02-25 02:15:43 - Dot-Product-Similarity:	Pearson: 0.8775	Spearman: 0.8745


0.887279591001845