In [None]:
import pandas as pd
import numpy as np

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model shared by every paraphrase-mining cell below.
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Show full cell text in rendered frames so long questions are not truncated.
pd.options.display.max_colwidth = None

In [None]:
# Load the pre-cleaned dataset from a local pickle.
# NOTE(review): unpickling executes arbitrary code — only load files you produced or trust.
test_dataset_df = pd.read_pickle(filepath_or_buffer="amazon_clean_data.pkl")

In [None]:
# Quick size check of the loaded data; for a DataFrame, .shape is (rows, columns).
test_dataset_df.shape

#### Predict paraphrases for 10 examples

In [None]:
# Draw a small sample of questions to sanity-check the pipeline before the full run.
# - random_state pins the draw so Restart & Run All shows the same sentences each time.
# - n is capped at the number of available rows, because sample() raises ValueError
#   when asked for more rows than exist (without replacement).
sentences = (
    test_dataset_df['question']
    .sample(n=min(10, len(test_dataset_df)), random_state=42)
    .tolist()
)
# Number the sampled sentences from 1 for easy visual reference.
for i, s in enumerate(sentences, 1):
    print(f'{i}: {s}')

In [None]:
# Mine candidate paraphrase pairs within the sampled sentences.
# Each result is unpacked downstream as (score, i, j), where i and j index into `sentences`.
paraphrases = util.paraphrase_mining(model=model, sentences=sentences, top_k=1)

In [None]:
# Turn the first (at most) 100 mined pairs into display rows:
# [score rounded to 2 decimals, sentence at index i, sentence at index j].
para_list = [
    [round(pair[0], 2), sentences[pair[1]], sentences[pair[2]]]
    for pair in paraphrases[:100]
]
para_df = pd.DataFrame(
    para_list,
    columns=['Paraphrase Likelihood', 'Sentence 1', 'Sentence 2'],
)
# Number results from 1 instead of 0 and label the index for readability.
para_df.index = pd.Index(np.arange(1, len(para_df) + 1), name='Result')
para_df

#### Predict paraphrases for all examples

In [None]:
# Full list of questions, in frame order, for mining across the whole dataset.
all_sentences = test_dataset_df['question'].to_list()

In [None]:
%%time
paraphrases = util.paraphrase_mining(model, all_sentences, top_k=1)

In [None]:
# One display row per mined pair:
# [score rounded to 2 decimals, sentence at index i, sentence at index j].
para_list = [
    [round(likelihood, 2), all_sentences[first_idx], all_sentences[second_idx]]
    for likelihood, first_idx, second_idx in paraphrases
]
para_df = pd.DataFrame(
    para_list,
    columns=['Paraphrase Likelihood', 'Sentence 1', 'Sentence 2'],
)
# Number results from 1 instead of 0 and label the index for readability.
para_df.index = pd.Index(np.arange(1, len(para_df) + 1), name='Result')
para_df

In [None]:
# Keep only pairs whose score falls in the 0.75–0.85 band (both ends inclusive).
in_band = para_df['Paraphrase Likelihood'].between(0.75, 0.85)
sample_prara_df = para_df[in_band]
sample_prara_df

In [None]:
# Spot-check up to 20 pairs from the filtered band, printed as:
#   sentence 1 / score banner / sentence 2 / separator.
# - n is capped at the number of rows, because sample() raises ValueError when the
#   band contains fewer than 20 pairs.
# - random_state pins the draw so the same pairs are shown on every re-run.
preview_df = sample_prara_df.sample(n=min(20, len(sample_prara_df)), random_state=42)
# index=False/name=None yields plain (score, sentence 1, sentence 2) tuples.
for score, first, second in preview_df.itertuples(index=False, name=None):
    print(first)
    print(f'------------------ {score}------------------')
    print(second)
    print('='*50)

#### Predict paraphrases for all examples using the top 5 paraphrase candidates for each example

In [None]:
# Rebuild the full question list so this section can be run on its own.
all_sentences = test_dataset_df['question'].to_list()

In [None]:
%%time
paraphrases = util.paraphrase_mining(model, all_sentences, top_k=5)

In [None]:
# Order the pairs by the first sentence's index (descending) rather than by score,
# so every candidate that shares the same first sentence sits in adjacent rows.
# Sorting by score would scatter those candidates across the table.
pairs_by_sentence = sorted(paraphrases, key=lambda pair: pair[1], reverse=True)
para_list = [
    [round(likelihood, 2), all_sentences[first_idx], all_sentences[second_idx]]
    for likelihood, first_idx, second_idx in pairs_by_sentence
]
para_df = pd.DataFrame(
    para_list,
    columns=['Paraphrase Likelihood', 'Sentence 1', 'Sentence 2'],
)
# Number results from 1 instead of 0 and label the index for readability.
para_df.index = pd.Index(np.arange(1, len(para_df) + 1), name='Result')
para_df

In [None]:
# Print the first 30 rows as: sentence 1 / score banner / sentence 2 / separator.
# index=False/name=None yields plain (score, sentence 1, sentence 2) tuples.
for likelihood, first, second in para_df.head(30).itertuples(index=False, name=None):
    print(first)
    print(f'------------------ {likelihood}------------------')
    print(second)
    print('='*50)