In [120]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel

roberta_MRPC = "textattack/roberta-base-MRPC"
bert_MRPC = "bert-base-cased-finetuned-mrpc"

paraphraseTokenizer = AutoTokenizer.from_pretrained(roberta_MRPC)  
paraphraseModel = AutoModelForSequenceClassification.from_pretrained(roberta_MRPC)

from sentence_transformers import SentenceTransformer, util
sentenceTransformer = SentenceTransformer("roberta-large-nli-stsb-mean-tokens")

classes = ["not paraphrase", "is paraphrase"]

Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [121]:
def word_similarity(s1, s2):
    num_sim = 0
    seq1 = s1.split(" ")
    for w in seq1:
        if w in s2:
            num_sim += 1
    return num_sim / len(seq1)

def inference_mrpc(seq1s, seq2s):
    for i in range(len(seq1s)):
        paraphrase = paraphraseTokenizer.encode_plus(
            seq1s[i], seq2s[i], return_tensors="pt")
        logits = paraphraseModel(**paraphrase)[0]
        paraphrase_results = torch.softmax(logits, dim=1).tolist()[0]
        print(f"{classes[1]}: {round(paraphrase_results[1] * 100)}%")
        



def inference_sts(seq1s, seq2s):
    embeddings1 = sentenceTransformer.encode(seq1s, convert_to_tensor=True)
    embeddings2 = sentenceTransformer.encode(seq2s, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
    for i in range(len(seq1s)):
        print("Similarity Score: {:.4f}".format(cosine_scores[i][i] * word_similarity(
            seq1s[i], seq2s[i])))
    #for i in range(len(seq1s)):
    #    score = sbert.predict([(seq1s[i]), (seq2s[i])])
    #    print(f"similarity: {round(score * 100, 2)}%")

In [122]:
seq0 = "I caused him to submit his resignation"
seq1 = "I caused the submission of his resignation"
seq2 = "I caused him to submit the resignation"
seq3 = "I caused the submission of his resignation"
seq4 = "I caused the submission of the resignation"
seq5 = "I caused the submission of his resignation"
seq6 = "John ate an apple"
seq7 = "an apple was eaten by john"
seq8 = "Egyption hat"
seq9 = "Egyption headdress"

sentences1 = [seq0, seq2, seq4, seq6, seq8]
sentences2 = [seq1, seq3, seq5, seq7, seq9]

inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 100%
is paraphrase: 100%
is paraphrase: 100%
is paraphrase: 99%
is paraphrase: 99%
Similarity Score: 0.5415
Similarity Score: 0.5464
Similarity Score: 0.9770
Similarity Score: 0.6805
Similarity Score: 0.4420


In [116]:
sentences1 = ["there is a seated women"]
sentences2 = ["there is a women sitting"]

inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 99%
Similarity Score: 0.7890


In [118]:
sentences1 = ["a red apple"]
sentences2 = ["an apple which is red"]

inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 99%
Similarity Score: 0.9588


In [126]:
sentences1 = ["a man is crying"]
sentences2 = ["a man with a smile"]

inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 69%
Similarity Score: 0.0823


In [119]:
sentences1 = ["a problem is solved"]
sentences2 = ["resolved a problem"]

inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 99%
Similarity Score: 0.7147


In [108]:

sentences1 = ["enjoying a balloon joyride"]
sentences2 = ["on a balloon joyride"]

inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 100%
Similarity Score: 0.7243


In [102]:
from transformers import DebertaTokenizer, DebertaForSequenceClassification
import torch

tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-large')
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-large')

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'config', 'deberta.embeddings.position_embeddings.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-lar

In [79]:
inputs = tokenizer.encode_plus("John ate an apple", "an apple was eaten", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)
outputs = model(**inputs)
logits = outputs.logits

In [80]:
paraphrase_results = torch.softmax(logits, dim=1).tolist()[0]
print(f"{classes[1]}: {round(paraphrase_results[0] * 100)}%")

is paraphrase: 52%
