In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel

# Checkpoint identifiers passed to `from_pretrained`.
roberta_MRPC = "textattack/roberta-base-MRPC"
# NOTE(review): bert_MRPC is not referenced by any visible cell; presumably an
# alternative checkpoint kept for manual swapping — confirm or remove.
bert_MRPC = "bert-base-cased-finetuned-mrpc"

# Tokenizer and sequence-classification model are both loaded from the RoBERTa checkpoint.
paraphraseTokenizer = AutoTokenizer.from_pretrained(roberta_MRPC)  
paraphraseModel = AutoModelForSequenceClassification.from_pretrained(roberta_MRPC)

# Display labels; inference_mrpc prints classes[1] next to the probability at index 1.
classes = ["not paraphrase", "is paraphrase"]

Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used by inference_sts to encode both sentence lists.
sentenceTransformer = SentenceTransformer("roberta-large-nli-stsb-mean-tokens")

In [10]:
def word_similarity(s1, s2):
    """Return the fraction of words in ``s1`` that also occur as words in ``s2``.

    Both strings are split on whitespace and compared token by token, so a
    word only counts when it matches a whole word of ``s2`` (not merely a
    substring such as "a" inside "apple"). Repeated words in ``s1`` are each
    counted.

    Args:
        s1: Sentence whose words are looked up.
        s2: Sentence providing the reference vocabulary.

    Returns:
        A float in [0.0, 1.0]; 0.0 when ``s1`` contains no words.
    """
    words1 = s1.split()
    if not words1:
        # Nothing to compare; also avoids dividing by zero.
        return 0.0
    # Set membership gives whole-word matching and constant-time lookups.
    vocabulary2 = set(s2.split())
    shared = sum(1 for word in words1 if word in vocabulary2)
    return shared / len(words1)

def inference_sts(seq1s, seq2s):
    """Print a cosine-similarity score for each aligned sentence pair.

    Both lists are encoded with the module-level ``sentenceTransformer``; the
    score printed for position ``k`` compares ``seq1s[k]`` with ``seq2s[k]``.

    Args:
        seq1s: First list of sentences.
        seq2s: Second list of sentences, aligned by position with ``seq1s``.
    """
    left_vectors = sentenceTransformer.encode(seq1s, convert_to_tensor=True)
    right_vectors = sentenceTransformer.encode(seq2s, convert_to_tensor=True)
    # All-pairs score matrix; only the [k][k] entries (aligned pairs) are reported.
    score_matrix = util.pytorch_cos_sim(left_vectors, right_vectors)
    for k, _ in enumerate(seq1s):
        pair_score = score_matrix[k][k]
        print("Similarity Score: {:.4f}".format(pair_score))

In [2]:
def inference_mrpc(seq1s, seq2s):
    """Print the "is paraphrase" probability for each aligned sentence pair.

    Each pair ``(seq1s[i], seq2s[i])`` is tokenized together with the
    module-level ``paraphraseTokenizer`` and scored by ``paraphraseModel``.
    The first element of the model output is treated as the logits, a softmax
    is taken over dimension 1, and the value at index 1 is printed as a
    rounded percentage labelled with ``classes[1]``.

    Args:
        seq1s: First list of sentences.
        seq2s: Second list of sentences, aligned by position with ``seq1s``.
    """
    for i in range(len(seq1s)):
        encoded_pair = paraphraseTokenizer.encode_plus(
            seq1s[i], seq2s[i], return_tensors="pt")
        # Inference only: disable autograd so no graph is built for the forward pass.
        with torch.no_grad():
            logits = paraphraseModel(**encoded_pair)[0]
        # One pair per call, so row 0 holds the class probabilities.
        probabilities = torch.softmax(logits, dim=1).tolist()[0]
        print(f"{classes[1]}: {round(probabilities[1] * 100)}%")

In [3]:
# Four phrase pairs, aligned by position: sentences1[i] is compared with sentences2[i].
sentences1 = ["ate an apple", 
              "jumping over a fence", 
              "riding a motorbike down the road", 
              "all the people walk"]
sentences2 = ["an apple was eaten",  
              "jumping over a enclosure", 
              "riding a motorbike along a roadway",
              "everyone walks"]

# Print the MRPC paraphrase probability, then the STS cosine similarity, for every pair.
inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 82%
is paraphrase: 100%
is paraphrase: 100%
is paraphrase: 92%
Similarity Score: 0.8932
Similarity Score: 0.8080
Similarity Score: 0.9863
Similarity Score: 0.9202


In [116]:
# Single-pair probe: "seated" placed before the noun versus "sitting" placed after it.
# NOTE(review): "a women" is probably meant to be "a woman"; left untouched
# because the text is model input and editing it could change the recorded scores.
sentences1 = ["there is a seated women"]
sentences2 = ["there is a women sitting"]

# Print the MRPC paraphrase probability, then the STS cosine similarity.
inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 99%
Similarity Score: 0.7890


In [118]:
# Single-pair probe: adjective before the noun versus a "which is" relative clause.
sentences1 = ["a red apple"]
sentences2 = ["an apple which is red"]

# Print the MRPC paraphrase probability, then the STS cosine similarity.
inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 99%
Similarity Score: 0.9588


In [126]:
# Single-pair probe where the two phrases describe different things (crying vs. a smile).
# In the recorded run the two scorers disagree: MRPC prints 69% while the
# similarity score is 0.0823.
sentences1 = ["a man is crying"]
sentences2 = ["a man with a smile"]

# Print the MRPC paraphrase probability, then the STS cosine similarity.
inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 69%
Similarity Score: 0.0823


In [119]:
# Single-pair probe: "is solved" after the noun versus "resolved" before it.
sentences1 = ["a problem is solved"]
sentences2 = ["resolved a problem"]

# Print the MRPC paraphrase probability, then the STS cosine similarity.
inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 99%
Similarity Score: 0.7147


In [108]:

# Single-pair probe: the phrases differ only in their first word ("enjoying" vs. "on").
sentences1 = ["enjoying a balloon joyride"]
sentences2 = ["on a balloon joyride"]

# Print the MRPC paraphrase probability, then the STS cosine similarity.
inference_mrpc(sentences1, sentences2)
inference_sts(sentences1, sentences2)

is paraphrase: 100%
Similarity Score: 0.7243


In [5]:
from Udep2Mono.polarization import PolarizationPipeline
from chunker import Chunker

2021-03-08 05:45:24 INFO: Loading these models for language: en (English):
| Processor | Package                  |
----------------------------------------
| tokenize  | ../model/e...ize/gum.pt  |
| pos       | ../model/en/pos/ewt.pt   |
| lemma     | ../model/en/lemma/gum.pt |
| depparse  | ../model/e...rse/gum.pt  |

2021-03-08 05:45:24 INFO: Use device: gpu
2021-03-08 05:45:24 INFO: Loading: tokenize
2021-03-08 05:45:26 INFO: Loading: pos
2021-03-08 05:45:26 INFO: Loading: lemma
2021-03-08 05:45:26 INFO: Loading: depparse
2021-03-08 05:45:27 INFO: Done loading processors!
2021-03-08 05:45:27 INFO: Loading these models for language: en (English):
| Processor | Package                 |
---------------------------------------
| tokenize  | ../model/e...ize/gum.pt |

2021-03-08 05:45:27 INFO: Use device: gpu
2021-03-08 05:45:27 INFO: Loading: tokenize
2021-03-08 05:45:27 INFO: Done loading processors!


In [7]:
# Shared instances reused by the later chunking cells.
chunker = Chunker()
pipeline = PolarizationPipeline(verbose=1)
# P and H differ only by "arrival of" before "guests"; only P is processed in this cell.
# NOTE(review): P/H presumably stand for premise/hypothesis — confirm.
P = "We did not expect guests"
H = "We did not expect arrival of guests"
# single_polarization returns a mapping; the "polarized_tree" entry is what the chunker consumes.
p_tree = pipeline.single_polarization(P)["polarized_tree"]
# Last expression of the cell, so the chunker result is displayed as the cell output.
chunker.get_chunks_byDepTree(p_tree)

['not expect', 'not expect guests']

In [8]:
# Same steps for H: take the "polarized_tree" entry and display its chunks.
h_tree = pipeline.single_polarization(H)["polarized_tree"]
chunker.get_chunks_byDepTree(h_tree)

['of guests',
 'arrival of guests',
 'not expect',
 'not expect arrival of guests']

In [3]:
# Both phrases appear in the recorded chunker outputs for
# "We did not expect guests" and "We did not expect arrival of guests".
sentences1 = ["not expect guests"]
sentences2 = ["not expect arrival of guests"]

# Only the MRPC paraphrase probability is printed here.
inference_mrpc(sentences1, sentences2)

is paraphrase: 99%


In [8]:
# Rebinds P and p_tree to a new sentence, then displays the chunks of its polarized tree.
P = "A man with a warm smile is giving a lecture"
p_tree = pipeline.single_polarization(P)["polarized_tree"]
chunker.get_chunks_byDepTree(p_tree)

['with a warm smile',
 'A man with a warm smile',
 'a lecture',
 'giving a lecture']

In [9]:
# Rebinds H and h_tree to a new sentence, then displays the chunks of its polarized tree.
# NOTE(review): the recorded output lists 'smiling warmly' twice — confirm
# whether the chunker is expected to return duplicate chunks.
H = "A man smiling warmly is giving a lecture"
h_tree = pipeline.single_polarization(H)["polarized_tree"]
chunker.get_chunks_byDepTree(h_tree)

['smiling warmly',
 'smiling warmly',
 'A man smiling warmly',
 'a lecture',
 'giving a lecture']

In [10]:
# Both phrases appear in the recorded chunker outputs for the
# "smiling warmly" and "with a warm smile" sentences.
sentences1 = ["A man smiling warmly"]
sentences2 = ["A man with a warm smile"]

# Only the MRPC paraphrase probability is printed here.
inference_mrpc(sentences1, sentences2)

is paraphrase: 100%


In [11]:
# Pairs two different chunks ("giving a lecture" and the subject phrase);
# the recorded run prints 1% for this pair.
sentences1 = ["giving a lecture"]
sentences2 = ["A man with a warm smile"]

# Only the MRPC paraphrase probability is printed here.
inference_mrpc(sentences1, sentences2)

is paraphrase: 1%


In [12]:
# Rebinds P and p_tree to a new sentence, then displays the chunks of its polarized tree.
# NOTE(review): "a women" is probably meant to be "a woman"; left untouched
# because the text is pipeline input and editing it could change the recorded output.
# NOTE(review): the recorded output lists 'sitting in front of the office'
# twice — confirm whether the chunker is expected to return duplicate chunks.
P = "There is a women sitting in front of the office"
p_tree = pipeline.single_polarization(P)["polarized_tree"]
chunker.get_chunks_byDepTree(p_tree)

['of the office',
 'in front of the office',
 'sitting in front of the office',
 'sitting in front of the office',
 'a women sitting in front of the office']

In [13]:
# Rebinds H and h_tree to a new sentence, then displays the chunks of its polarized tree.
# NOTE(review): "a seated women" is probably meant to be "a seated woman";
# left untouched because the text is pipeline input.
H = "There is a seated women in front of the office"
h_tree = pipeline.single_polarization(H)["polarized_tree"]
chunker.get_chunks_byDepTree(h_tree)

['of the office',
 'in front of the office',
 'a seated women in front of the office']

In [14]:
# Both phrases appear in the recorded chunker outputs for the
# "sitting" and "seated" office sentences.
sentences1 = ["a women sitting in front of the office"]
sentences2 = ["a seated women in front of the office"]

# Only the MRPC paraphrase probability is printed here.
inference_mrpc(sentences1, sentences2)

is paraphrase: 100%
