# Try a sentence transformer?

In [14]:
from sentence_transformers import SentenceTransformer
from pprint import pprint
import torch
from sentence_transformers import util

In [15]:
# Pick the compute device once: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU. The chosen device is announced so the run is self-describing.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on the GPU" if DEVICE.type == "cuda" else "Running on the CPU")

Running on the CPU


In [16]:
# Load the 'paraphrase-multilingual-mpnet-base-v2' checkpoint by name and place it on DEVICE.
sentence_transformer_model_v2 = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2', device=DEVICE)

In [17]:
# Probe sentences for a quick sanity check of the embeddings:
#   0-1: two English sentences about embedding sentences,
#   2-4: the same "quick brown fox" sentence in English, French and Chinese,
#   5-6: two related English sentences about London.
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
    'Le renard brun et rapide saute par-dessus le chien paresseux.',
    '敏捷的棕色狐狸跳过懒惰的狗',
    'London is the best place on earth.',
    'I love London.']

In [18]:
# Encode every probe sentence in one call; the result holds one embedding row per sentence.
sentence_embeddings = sentence_transformer_model_v2.encode(sentences)

In [19]:
# Sanity check: expect (number of sentences, embedding dimension).
sentence_embeddings.shape

(7, 768)

In [20]:
# Compare selected sentence pairs, given as (index, index) into `sentences`:
#   (0, 4) unrelated English vs. Chinese sentence,
#   (2, 4) English vs. Chinese version of the same sentence,
#   (3, 4) French vs. Chinese version of the same sentence,
#   (5, 6) two related English sentences.
# Both the raw dot product and the cosine similarity are printed for each pair;
# the two differ because the embeddings are used here without normalisation.
pairs_to_compare = [(0, 4), (2, 4), (3, 4), (5, 6)]

for first_idx, second_idx in pairs_to_compare:
    first_vec = sentence_embeddings[first_idx]
    second_vec = sentence_embeddings[second_idx]
    print('sentences: ', sentences[first_idx], '\n', sentences[second_idx])
    print('dot similarity: ', util.dot_score(first_vec, second_vec))
    print('cos similarity: ', util.pytorch_cos_sim(first_vec, second_vec))

sentences:  This framework generates embeddings for each input sentence 
 敏捷的棕色狐狸跳过懒惰的狗
dot similarity:  tensor([[0.5263]])
cos similarity:  tensor([[0.0797]])
sentences:  The quick brown fox jumps over the lazy dog. 
 敏捷的棕色狐狸跳过懒惰的狗
dot similarity:  tensor([[6.1115]])
cos similarity:  tensor([[0.9069]])
sentences:  Le renard brun et rapide saute par-dessus le chien paresseux. 
 敏捷的棕色狐狸跳过懒惰的狗
dot similarity:  tensor([[5.9588]])
cos similarity:  tensor([[0.9261]])
sentences:  London is the best place on earth. 
 I love London.
dot similarity:  tensor([[8.4054]])
cos similarity:  tensor([[0.7646]])


# Okay, now get some passages out of ES to test this

In [10]:
# Query used to exercise retrieval and re-ranking
# (roughly: "What should I do when a classmate owes me money and won't pay it back?").
test_query = '班上同学欠钱不还怎么办'
# Search configuration passed to es_search in a later cell.
INDEX = "efaqa-70" # index to search e.g. "msmacro-full"
FIELDS = ["passage"] # fields to search e.g. ["passage", "query"]

In [11]:
from es_helper import *

Running on the CPU


In [12]:
# Retrieve candidate hits for the test query, then wrap them in a result object whose
# `.table` and `.query_input` attributes are consumed by rerank().
# NOTE(review): es_search / direct_es_search_result are presumably provided by the
# star import of es_helper; `cutoff` looks like a cap on the number of hits and
# 'dummy' like a placeholder identifier — confirm against es_helper.
result_count, es_hits = es_search(test_query, cutoff = 10, index=INDEX, fields = FIELDS)
es_results = direct_es_search_result('dummy', test_query, es_hits)



In [13]:
def rerank(es_results):
    '''
    Re-rank a search result table by embedding similarity to its query.

    The query and every hit's passage are encoded with the notebook-level
    ``sentence_transformer_model_v2``. Both sides are normalised before the
    dot product is taken, so the resulting score is the cosine similarity.

    Parameters:
        es_results: result object exposing ``table`` (a list of hit dicts,
            each with a ``'passage'`` key) and ``query_input`` (the query text).

    Returns:
        The same ``es_results`` object. Its ``table`` is replaced by a new
        list ordered by descending similarity; every hit dict is updated in
        place with the similarity as ``'score'`` and its 0-based position as
        ``'rank'``. An empty table is returned unchanged.
    '''
    passages = [hit['passage'] for hit in es_results.table]
    if not passages:
        # Nothing to encode or rank; avoid sending an empty corpus to the model.
        return es_results

    query_embeddings = sentence_transformer_model_v2.encode(
        [es_results.query_input], convert_to_tensor=True
    )
    passage_embeddings = sentence_transformer_model_v2.encode(
        passages, convert_to_tensor=True
    )

    query_embeddings = util.normalize_embeddings(query_embeddings.to(DEVICE))
    passage_embeddings = util.normalize_embeddings(passage_embeddings.to(DEVICE))

    # semantic_search keeps only `top_k` hits per query and defaults to 10, which
    # would silently drop results from longer tables; ask for every passage back.
    ranking = util.semantic_search(
        query_embeddings,
        passage_embeddings,
        top_k=len(passages),
        score_function=util.dot_score,
    )[0]  # single query -> single ranking

    reranked_table = []
    for rank, match in enumerate(ranking):
        entry = es_results.table[match['corpus_id']]
        entry['score'] = match['score']
        entry['rank'] = rank
        reranked_table.append(entry)

    es_results.table = reranked_table
    return es_results
        
# Re-rank the retrieved hits. rerank() replaces `.table` on the object it is given
# and returns that same object, so the name is simply rebound to it.
es_results = rerank(es_results)

In [16]:
# Inspect the re-ranked hits; each dict carries rank, qid, pid, query_label, passage and score.
# NOTE(review): this displays the whole table, including full passage text — consider
# showing only a slice before sharing the notebook.
es_results.table

[{'rank': 0,
  'qid': 'q156',
  'pid': 'p156',
  'query_label': '我现在已经无法面对接下来的生活了，现在欠了一屁股债，每天都有债主追债，已经无能为力了，我想在近期结束自己。',
  'passage': '能够理解你现在无能为力的痛苦，但比欠债更可怕的是放弃生命，欠债可以还，放弃生命就再无回头的可能。你连死都不怕了，还有什么无法面对？专业咨询请关注我，希望陪你面对困难，看到解决问题的可能其实我不建议你自杀，因为逃债而寻死，是一种极不负责任的逃避行为。人活在世上，要不愧天，不愧地，不愧自己。你还不到30岁，你可以拿10年，20年来还债，当然你会过的很辛苦，但，债还清的那一刻，你又是一个挺起腰的顶天立地的男人。你也可以选择自私的死亡，留家人为你还债。一切在你的选择。建议你和家人坐下来好好沟通一下。如果你欠了很多钱，还没跟你提离婚，愿意和你一起还钱的人，我相信她还是爱你的。或许只是有些恨铁不成钢，女人有时候很爱抱怨，但，她为了爱的人可以变得很坚强。加油吧！',
  'score': 0.5227701663970947},
 {'rank': 1,
  'qid': 'q963',
  'pid': 'p963',
  'query_label': '代朋友问，别人跟我朋友借了1万元写了欠条，对方不想还否认赖账。怎么办',
  'passage': '如果是起诉了法院调解让对方在两月内还清，对方说他不想还，他说会告诉法院他没工作没钱，法院会拿他没办法最多他进去监狱呆一阵子。他说告法院能怎么样他就是不想还钱',
  'score': 0.520677387714386},
 {'rank': 2,
  'qid': 'q1510',
  'pid': 'p1510',
  'query_label': '就是懒，好懒，不想懒，怎么办，很长时间了19岁',
  'passage': '刚开始我觉得特别苦，我想我那时候都快崩溃了，就不想上学了，可我自己心里不甘心，我不想自己以后过下等生活，不想为了几个小钱去看尽人的脸色，结果我就逼着自己学啊学，五个月后，我成了班里的中好生，觉得也就那么一回事我常吓唬自己说你爸妈老了，万一他们生病了，你没本事你没钱，你爸妈该咋办，生了我还没让他们享福呢，我的等大了带他们出去玩玩，我的

# Now fine-tune the sentence transformer approach

In [34]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
import pandas as pd

In [35]:
# Load the tab-separated training pairs. Later cells read the columns
# 'training_queries', 'training_passages' and 'training_target_scores'.
training_df = pd.read_csv('sentencetransformer training.tsv', sep='\t')

In [36]:
# Turn every row of training_df into a sentence-transformers training example, as in:
# train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8), InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)]
#
# The three columns are zipped directly rather than walked with DataFrame.iterrows(),
# which builds a Series per row and forces positional `row[1][...]` access.
# Labels are cast to float so an integer-typed score column (e.g. 0/1) still yields
# floating-point targets for the similarity loss.
training_examples = [
    InputExample(texts=[query, passage], label=float(target_score))
    for query, passage, target_score in zip(
        training_df['training_queries'],
        training_df['training_passages'],
        training_df['training_target_scores'],
    )
]

In [37]:
# Define the training dataloader and the training loss.
# The dataloader serves the InputExamples in shuffled batches of 16; the loss is bound
# to the model that will be fine-tuned and uses each example's label as the target
# similarity for its text pair.
train_dataloader = DataLoader(training_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(sentence_transformer_model_v2)

In [38]:
# Build an evaluator to monitor fine-tuning on a sample of up to 300 pairs.
# NOTE(review): the pairs are sampled from the training data itself, so the reported
# score reflects fit rather than generalisation — consider a held-out split.
#
# The sample size is capped at the frame length because DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
# A fixed random_state keeps the evaluation set identical across re-runs.
sampled_eval_data = training_df.sample(
    n=min(300, len(training_df)),
    random_state=42,
)

eval_queries = sampled_eval_data['training_queries'].to_list()
eval_passages = sampled_eval_data['training_passages'].to_list()
eval_scores = sampled_eval_data['training_target_scores'].to_list()

evaluator = evaluation.EmbeddingSimilarityEvaluator(eval_queries, eval_passages, eval_scores)

In [39]:
# Fine-tune sentence_transformer_model_v2 on the (query, passage, score) pairs:
# 10 epochs, 100 warm-up steps, with the evaluator run every 100 training steps.
# NOTE(review): the same model object is used by rerank(), so cells run after this one
# see the tuned weights. No output_path is passed, so it looks like the tuned model is
# not written to disk — confirm, and save explicitly if it is needed later.
sentence_transformer_model_v2.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10, warmup_steps=100, evaluator=evaluator, evaluation_steps=100)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]