In [14]:
from sentence_transformers import SentenceTransformer, models, losses
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import pandas as pd
from torch.utils.data import DataLoader
import csv

In [15]:
dataset = pd.read_csv('datasets/stsbenchmark.tsv', delimiter='\t', quoting=csv.QUOTE_NONE)
dataset_train = dataset[dataset['split'] == 'train']
dataset_dev = dataset[dataset['split'] == 'dev']
dataset_test = dataset[dataset['split'] == 'test']
dataset_train.head()

Unnamed: 0,split,genre,dataset,year,sid,score,sentence1,sentence2
0,train,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,train,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,train,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,train,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.
4,train,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.


In [16]:
def create_InputExample(row):
    score = float(row['score']) / 5.0
    # Creates one InputExample with the given texts, guid and label
    row['ie'] = InputExample(
        # the texts for the example.
        texts=[row['sentence1'], row['sentence2']],
        # the label for the example
        label=score)  # 连续的分数(浮点类型), 不同类别对应标签(整型)
    return row['ie']


train_samples = dataset_train.apply(create_InputExample, axis=1).tolist()
test_samples = dataset_test.apply(create_InputExample, axis=1).tolist()
dev_samples = dataset_dev.apply(create_InputExample, axis=1).tolist()

In [17]:
word_embedding_model = models.Transformer('distilbert-base-uncased')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model=model)
# Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation in comparison to the gold standard labels.
# The metrics are the cosine similarity as well as euclidean and Manhattan distance The returned score is the Spearman correlation with a specified metric.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

In [19]:
model.fit(
    # Tuples of (DataLoader, LossFunction). Pass more than one for multi-task learning
    train_objectives=[(train_dataloader, train_loss)],
    # An evaluator (sentence_transformers.evaluation) evaluates the model performance during training on held-out dev data.
    # It is used to determine the best model that is saved to disc.
    evaluator=evaluator,
    epochs=5,  # 默认epochs=1
    # If > 0, evaluate the model using evaluator after each number of training steps
    evaluation_steps=1000,  # 默认evaluation_steps=0
    # 其他可选择参数:
    # 优化器(optimizer_class,optimizer_params)
    # 学习率调整(cheduler,warmup_steps)
    # 梯度裁剪(max_grad_norm)
    # 混合精度加速(use_amp )
    output_path='save_sbert')

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

In [20]:
model = SentenceTransformer('save_sbert')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [21]:
# This is called during training to evaluate the model.
# It returns a score for the evaluation with a higher score indicating a better result.
print(evaluator(model, output_path='save_sbert'))  # 达到0.84

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
print(test_evaluator(model, output_path='save_sbert'))

0.8415362480129329
0.7985742750445673


In [25]:
model_init = SentenceTransformer(model_name_or_path='distilbert-base-uncased')  # 默认,不进行微调
print(evaluator(model_init))  # 只有0.69
print(test_evaluator(model_init))  # 只有0.58


No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/distilbert-base-uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.6929568939710936
0.5888452571674063
