In [1]:
from sentence_transformers import SentenceTransformer, util

In [2]:
# Instantiate the pretrained sentence encoder identified by its checkpoint name.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Bare trailing expression: the notebook renders the model's repr (its module stack).
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [3]:
# Demo corpus held in one flat list; a real corpus could run to tens of
# thousands of sentences.
sentences = [
    "The cat sits outside",
    "A man is playing guitar",
    "I love pasta",
    "The new movie is awesome",
    "The cat plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
    "Do you like pizza?",
]

# Paraphrase mining scores every sentence against every other sentence by
# cosine similarity and returns the pairs with the highest scores.
# Each entry of the result is a triplet [score, id1, id2]; id1 and id2 are
# positions in `sentences`.
paraphrases = util.paraphrase_mining(model, sentences)
# Bare trailing expression so the notebook displays the full list of triplets.
paraphrases

[[0.8939037919044495, 3, 6],
 [0.6787883043289185, 0, 4],
 [0.5095502138137817, 2, 7],
 [0.2560485005378723, 2, 6],
 [0.24403773248195648, 2, 3],
 [0.21046359837055206, 1, 4],
 [0.19693005084991455, 3, 7],
 [0.16922728717327118, 6, 7],
 [0.13100729882717133, 0, 5],
 [0.089983269572258, 4, 7],
 [0.06292402744293213, 4, 5],
 [0.05911204218864441, 4, 6],
 [0.041682302951812744, 5, 7],
 [0.0363304503262043, 0, 1],
 [0.03587252274155617, 2, 5],
 [0.02754371240735054, 3, 4],
 [0.02539319545030594, 0, 7],
 [0.023044737055897713, 2, 4],
 [0.011648576706647873, 1, 7],
 [0.009295160882174969, 1, 3],
 [0.008107375353574753, 0, 2],
 [-0.0028615910559892654, 0, 6],
 [-0.013609589077532291, 1, 6],
 [-0.024680081754922867, 0, 3],
 [-0.03272603452205658, 1, 5],
 [-0.036783091723918915, 1, 2],
 [-0.05019041895866394, 3, 5],
 [-0.05093100666999817, 5, 6]]

In [4]:
# Print the first ten mined pairs as text, resolving each index back to its
# sentence and showing the similarity score to four decimal places.
for score, i, j in paraphrases[:10]:
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], score))

The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6788
I love pasta 		 Do you like pizza? 		 Score: 0.5096
I love pasta 		 The new movie is so great 		 Score: 0.2560
I love pasta 		 The new movie is awesome 		 Score: 0.2440
A man is playing guitar 		 The cat plays in the garden 		 Score: 0.2105
The new movie is awesome 		 Do you like pizza? 		 Score: 0.1969
The new movie is so great 		 Do you like pizza? 		 Score: 0.1692
The cat sits outside 		 A woman watches TV 		 Score: 0.1310
The cat plays in the garden 		 Do you like pizza? 		 Score: 0.0900
