thesis
- Take a line of sentence, transform it into a vector.
- Take various other penalties, and change them into vectors.
- Spot sentences with the shortest distance (Euclidean) or tiniest angle (cosine similarity) among them.
- We instantly get a standard of semantic similarity connecting sentences.

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import csv

In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
corpus_path = ["data/bq_corpus/test.tsv", "data/lcqmc/test.tsv", "data/paws-x-zh/test.tsv"]

In [4]:
for path in corpus_path:
    cls_result = []
    test_pairs = open(path, encoding='utf-8').readlines()    
    
    for i in tqdm(range(len(test_pairs))):
        pair = test_pairs[i].split('\t')

        sentence_embeddings = model.encode(pair)
        cossin = cosine_similarity(
            sentence_embeddings[0:1],
            sentence_embeddings[1:2]
        )

        if cossin > 0.9:
            cls_result.append(1)
        else:
            cls_result.append(0)

    with open('result/' + path.split('/')[1] + ".tsv", 'w') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(['index', 'prediction'])
        for i in range(len(cls_result)):
            tsv_writer.writerow([str(i), str(cls_result[i])])

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:28<00:00, 112.88it/s]
100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [01:46<00:00, 117.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:32<00:00, 61.51it/s]
