
# **Install libraries**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pytorch_lightning > /dev/null
!pip install transformers > /dev/null
!pip install sentencepiece > /dev/null

In [3]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# **Set a seed**

In [4]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
set_seed(42)

In [5]:
our_path = '/content/drive/MyDrive/Paraphrasing API/models/Ruslan_finetune_t5'
data_path = f"{our_path}/data"
df_test = pd.read_csv(f"{data_path}/train.tsv", sep='\t').astype(str)
len(df_test)

21829

# **Start testing**

In [6]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

t5_paraphrase_dir = '/content/drive/MyDrive/Paraphrasing API/models/Ruslan_finetune_t5/t5_paraphrase'

best_model_path = f"{t5_paraphrase_dir}/epoch_10"
model = T5ForConditionalGeneration.from_pretrained(best_model_path)
tokenizer = T5Tokenizer.from_pretrained('t5-large')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device ",device)
model = model.to(device)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

device  cuda


In [16]:
def predict(sentence):
  text =  "paraphrase: " + sentence + " </s>"
  max_len = 256

  encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
  input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

  # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
  beam_outputs = model.generate(
     input_ids=input_ids, attention_mask=attention_masks,
      do_sample=True,
      max_length=256,
      top_k=512,
      top_p=1,
      early_stopping=True,
      num_return_sequences=3
  )

  final_outputs =[]
  for beam_output in beam_outputs:
      sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
      if sent.lower() != sentence.lower():
          return sent
  return sent

def predict_multiple(sentence):
  text =  "paraphrase: " + sentence + " </s>"

  max_len = 256

  encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
  input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


  # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
  beam_outputs = model.generate(
      input_ids=input_ids, attention_mask=attention_masks,
      do_sample=True,
      max_length=256,
      top_k=120,
      top_p=0.98,
      early_stopping=True,
      num_return_sequences=3
  )

  final_outputs = []
  for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    if sent.lower() != sentence.lower() and sent not in final_outputs:
        final_outputs.append(sent)
  return final_outputs

In [17]:
predict_multiple("Trump, Biden pitch dueling visions in final sprint.")

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


[]

In [15]:
sentence_1 = "Washing your hands Properly will keep you away from COVID-19."
sentence_2 = "Wikipedia was launched on January 15, 2001, and was created by Jimmy Wales and Larry Sanger."
sentence_3 = "NLP is one of the interesting fields for Data Scientists to focus on."
sentence_4 = "Do I really need to take a flu shot if I’m healthy with few or no underlying conditions?"
sentence_5 = "Which course should I take to get started in data science?"
sentence_6 = "There will be 3 Walmart Black Friday events held in November starting on November 4, November 11 and November 25!"
sentence_7 = "The FCC says the $200 million civil penalty is the largest fixed-amount penalty in the commission's history."
sentence_8 = "Southwest Airlines travelers can now fly directly from San Diego to Honolulu on a new service that took off Wednesday out of the San Diego International Airport."
sentence_9 = "Gasoline production averaged 9.1 million bpd last week, slightly down on the previous week."
sentence_10 = "If you fall into the latter group, here’s how to replace Google’s new icons for Gmail, Calendar, and other apps with the older, arguably better versions on Android, iPhone, and Chrome."
sentence_11 = "Apple has been working on ARM-based Macs for some time, but only made them official at this year's WWDC."
sentence_12 = "Microsoft is investigating reports that some users are seeing error 0x80070426 when using their Microsoft account to sign into various apps."
sentence_13 = "On Saturday, Connery’s family announced that the Oscar-winning Scottish actor died peacefully in his sleep at home in the Bahamas."
sentence_14 = "Baby Shark Dance, from South Korean brand Pinkfong, officially surpassed the song by Luis Fonsi as the most viewed YouTube video of all time, having racked up 7.05 billion views to 7.04 billion."
sentence_15 = "The University of Washington has informed the NFL office that due to an increase in COVID-19 infection rate and indications of increased community spread in the local area, NFL personnel are no longer allowed to attend games at Husky Stadium."
sentence_16 = "The NBA's basketball-related income was down $1.5 billion last season, according to data provided to teams and obtained by ESPN."
sentence_17 = "Yesterday, the huge orbiting laboratory celebrated 20 years of continuous human occupation, a big milestone in humanity's push to extend its footprint into the final frontier."
sentence_18 = "A team of researchers led by Osaka University and National Taiwan University created a system of nanoscale silicon resonators that can act as logic gates for light pulses."
sentence_19 = "The research on 100 people shows that all had T-cell responses against a range of the coronavirus’s proteins, including the spike protein used as a marker in many vaccine studies, after half a year."
sentence_20 = "A group of researchers at MIT recently developed an artificial intelligence model that can detect asymptomatic COVID-19 cases by listening to subtle differences in coughs between healthy people and infected people."

for sentence in [sentence_1, sentence_2, sentence_3, sentence_4, sentence_5, sentence_6, sentence_7, sentence_8,
                 sentence_9, sentence_10, sentence_11, sentence_12, sentence_13, sentence_14, sentence_15,sentence_16, sentence_17, sentence_18,
                 sentence_19, sentence_20]:
  final_outputs = predict_multiple(sentence)
  print()
  print ("Original sentence: ")
  print (sentence)
  print("-----------------")
  print ("Paraphrased sentences: ")
  for i, final_output in enumerate(final_outputs):
      print("{}: {}".format(i, final_output))

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."



Original sentence: 
Washing your hands Properly will keep you away from COVID-19.
-----------------
Paraphrased sentences: 
0: Properly washing your hands will keep you away from COVID-19.

Original sentence: 
Wikipedia was launched on January 15, 2001, and was created by Jimmy Wales and Larry Sanger.
-----------------
Paraphrased sentences: 

Original sentence: 
NLP is one of the interesting fields for Data Scientists to focus on.
-----------------
Paraphrased sentences: 
0: NLP is one of the interesting fields to focus on for data scientists to study.
1: NLP is one of the interesting fields to focus on for data scientists to focus on.

Original sentence: 
Do I really need to take a flu shot if I’m healthy with few or no underlying conditions?
-----------------
Paraphrased sentences: 
0: Do I really need a flu shot if I am healthy with few or no underlying conditions?
1: Do I really need to take a flu shot if I am healthy with a few or no underlying conditions?

Original sentence: 
W

In [None]:
df_test = df_test[:100]
df_test.head()

Unnamed: 0,id,sentence1,sentence2,label,prediction
0,3,"In January 2011 , the Deputy Secretary General...","In January 2011 , FIBA Asia deputy secretary g...",1,0
1,6,"During her sophomore , junior and senior summe...","During her second , junior and senior summers ...",1,0
2,8,"His father emigrated to Missouri in 1868 , but...",His father emigrated to Missouri in 1868 but r...,1,0
3,10,It is situated south of Köroğlu Mountains and ...,It is situated south of Köroğlu - mountains an...,1,0
4,11,The Río Blanco mine is a large copper mine loc...,The Río Blanco - Mine is a large copper mine i...,1,0


In [None]:
df_test['prediction'] = df_test.sentence1.apply(lambda x: predict(x))

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


In [None]:
df_test.head(5)

Unnamed: 0,id,sentence1,sentence2,label,prediction
0,3,"In January 2011 , the Deputy Secretary General...","In January 2011 , FIBA Asia deputy secretary g...",1,"In January 2011, the deputy secretary general ..."
1,6,"During her sophomore , junior and senior summe...","During her second , junior and senior summers ...",1,"During her sophomore, junior and senior summer..."
2,8,"His father emigrated to Missouri in 1868 , but...",His father emigrated to Missouri in 1868 but r...,1,"His father emigrated to Missouri in 1868, but ..."
3,10,It is situated south of Köroğlu Mountains and ...,It is situated south of Köroğlu - mountains an...,1,It is situated south of the Körolu mountains a...
4,11,The Río Blanco mine is a large copper mine loc...,The Río Blanco - Mine is a large copper mine i...,1,The Ro Blanco Mine is a large copper mine in n...


In [None]:
df_test['sentence1'] = df_test['sentence1'].astype(str)
df_test['sentence2'] = df_test['sentence2'].astype(str)
df_test['prediction'] = df_test['prediction'].astype(str)

In [None]:
!pip install sacrebleu > /dev/null

In [None]:
from sacrebleu import sentence_bleu, corpus_bleu
import numpy as np

def compute_bleu(outputs, references) -> int:

  assert len(np.shape(outputs)) == 1
  if len(np.shape(references)) == 1:
    references = np.expand_dims(references, axis=1).tolist()

  return corpus_bleu(list(outputs), references, lowercase=True).score / 100

In [None]:
compute_bleu(df_test["prediction"], df_test["sentence2"])

0.6106954529917809

In [None]:
!pip install bert-score > /dev/null

In [None]:
from bert_score import score

In [None]:
df_test

Unnamed: 0,id,sentence1,sentence2,label,prediction
0,3,"In January 2011 , the Deputy Secretary General...","In January 2011 , FIBA Asia deputy secretary g...",1,"In January 2011, the FIBA Asia Deputy Secretar..."
1,6,"During her sophomore , junior and senior summe...","During her second , junior and senior summers ...",1,"During her senior, junior, and sophomore summe..."
2,8,"His father emigrated to Missouri in 1868 , but...",His father emigrated to Missouri in 1868 but r...,1,His father emigrated to Missouri in 1868 but r...
3,10,It is situated south of Köroğlu Mountains and ...,It is situated south of Köroğlu - mountains an...,1,It is south of the Körolu Mountains and north ...
4,11,The Río Blanco mine is a large copper mine loc...,The Río Blanco - Mine is a large copper mine i...,1,The Ro Blanco mine is a large copper mine in t...
...,...,...,...,...,...
95,235,"On 1 December , Tinnish announced that he had ...","On December 1 , Tinnish announced he had withd...",1,"On December 1, Tinnish announced that he had w..."
96,239,"On 6 March 2016 , he debuted in the Ukrainian ...",He made his debut in the Ukrainian Premier Lea...,1,On 6 March 2016 he debuted in the Ukrainian Pr...
97,241,Sadar Bazar Railway Station is a small railway...,Sadar Bazar railway station is a small railway...,1,Sadar Bazar railway station is a small railway...
98,242,The man tells Natasha how he had been given cu...,The man tells Natasha how he had been given cu...,1,The man tells Natasha how he had been given cu...


In [None]:
refs = list(df_test['sentence1'])
cands = list(df_test['prediction'])
inputs = list(df_test['sentence2'])

In [None]:
P, R, F1 = score(cands, refs, lang="en")

In [None]:
print(f"System level F1 score: {F1.mean():.3f}")

System level F1 score: 0.969


In [None]:
def get_translation_table():
  # Punctuations according to python
  punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

  # Good symbols that should not be removed
  good_symbols = '\'`-'

  # Bad punctuation that should be removed
  bad_punctuations = punctuations.translate("".maketrans("", "", good_symbols))
  
  return "".maketrans(bad_punctuations, ' '*len(bad_punctuations))
  
translation_table = get_translation_table()

def tokenize_text_for_diversity(text):
  # Remove bad punctuation
  text = text.translate(translation_table)
  return text.lower().strip().split()

def tokenize_for_diversity(input, output):
  assert (type(input) == type(output))

  if type(input) == str:
    return tokenize_text_for_diversity(input), tokenize_text_for_diversity(output)
  
  # Input and output are batches
  input_batch = [tokenize_text_for_diversity(text) for text in input]
  output_batch = [tokenize_text_for_diversity(text) for text in output]
  return input_batch, output_batch

In [None]:
# Function for tokenized
def intersection_over_union_tokenized(input_tokenized, output_tokenized):
  input_tokens = set(input_tokenized)
  output_tokens = set(output_tokenized)
  return len(input_tokens & output_tokens) / len(input_tokens | output_tokens)

# Final function
def intersection_over_union(input, output):
  return intersection_over_union_tokenized(*tokenize_for_diversity(input, output))

In [None]:
def compute_for_every_sample(func, first_argument_batch, second_argument_batch):
  return  [func(arg1, arg2) for arg1, arg2 in zip(first_argument_batch, second_argument_batch)]

def get_corpus_metrics(sentence_metrics_func, first_argument_batch, second_argument_batch):
  values = compute_for_every_sample(sentence_metrics_func, first_argument_batch, second_argument_batch)
  return sum(values) / len(values)

In [None]:
print(get_corpus_metrics(intersection_over_union, inputs, cands))

0.869723555962096
