<a href="https://colab.research.google.com/github/dmi3eva/araneae/blob/main/p4_paraphrasing/01_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mounting Google Drive

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Walk into the project folder on the mounted drive, one path component per %cd.
%cd /content
%cd drive
%cd My\ Drive
%cd PhD
%cd Paper_01

/content
/content/drive
/content/drive/My Drive
/content/drive/My Drive/PhD
/content/drive/My Drive/PhD/Paper_01


### Modules

In [8]:
!pip install transformers==2.8.0



In [10]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# One name for the checkpoint so the model and its tokenizer cannot drift apart.
MODEL_NAME = 'ramsrigouthamg/t5_paraphraser'

# paraphrase() samples (do_sample=True); seeding here makes a fresh
# "Restart & Run All" repeatable on the same setup.
SEED = 42
torch.manual_seed(SEED)

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# Use the GPU when one is available; inputs built later are moved to the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

### Exploring T5's generation parameters

In [17]:
def paraphrase(sentence, amount, top_k=120, top_p=0.98, max_len=256):
  """Sample paraphrase candidates for ``sentence``.

  Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

  Args:
    sentence: Text to paraphrase.
    amount: Number of sequences requested from ``model.generate``
      (``num_return_sequences``).
    top_k: ``top_k`` value forwarded to ``model.generate``.
    top_p: ``top_p`` value forwarded to ``model.generate``.
    max_len: Length limit used both when encoding (and padding) the
      prompt and as ``max_length`` for generation.

  Returns:
    Decoded candidates in generation order. Exact repeats are dropped,
    as is any candidate equal to ``sentence`` ignoring case, so the list
    may hold fewer than ``amount`` items.
  """
  # Prompt = task prefix + sentence + explicit "</s>" marker.
  text = "paraphrase: " + sentence + " </s>"

  encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=True, return_tensors="pt")
  input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

  # Sampling (do_sample=True) restricted by top_k / top_p; `amount` sequences per call.
  sampled_outputs = model.generate(
      input_ids=input_ids, attention_mask=attention_masks,
      do_sample=True,
      max_length=max_len,
      top_k=top_k,
      top_p=top_p,
      early_stopping=True,
      num_return_sequences=amount
  )

  final_outputs = []
  for sampled_output in sampled_outputs:
      sent = tokenizer.decode(sampled_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      # Skip echoes of the source (case-insensitive) and exact repeats (case-sensitive).
      if sent.lower() != sentence.lower() and sent not in final_outputs:
          final_outputs.append(sent)

  return final_outputs

In [16]:
# Request 10 candidates for a single query using paraphrase()'s built-in settings.
source = 'Amount of films with Greg Norman'
result = paraphrase(source, 10)
print(f"Source: \"{source}\"")
# Bare trailing expression: the notebook renders the returned list as the cell result.
result

input_ids:
tensor([[ 3856, 27111,    10,    71, 11231,    13,  4852,    28, 11859, 13615,
             1,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     

['Amount of movies with Greg Norman. Amount of films with Greg Norman.',
 'How many movies is Greg Norman worth watching?',
 'How many movies do you have with Greg Norman?',
 'What is your review of Greg Norman film series?',
 'What are Greg Norman movies like?',
 'What are the most successful films by Greg Norman?',
 'Amount of films with Greg Norman?',
 'What film characters have Greg Norman played?',
 'Amount of movies starring Greg Norman.',
 'Amount of movies with Greg Norman.']

In [19]:
# Same query and call as the previous run; generation samples (do_sample=True),
# so a repeat can return a different list of candidates.
source = 'Amount of films with Greg Norman'
result = paraphrase(source, 10)
print(f"Source: \"{source}\"")
# Bare trailing expression: the notebook renders the returned list as the cell result.
result

Source: "Amount of films with Greg Norman"


['Can you give me proof of Greg Norman movies?',
 'Is there any single movie by Greg Norman?',
 'The number of films with Greg Norman.',
 'Books with Greg Norman in full length.',
 'What movies are Greg Norman produced?',
 'What are the most liked movies done by Greg Norman?',
 'Amount of films with Greg Norman: How many came out?',
 'No Hollywood movies have Greg Norman directed or shot.',
 'Amount of movies with Greg Norman.',
 'In what countries does Greg Norman do a movie?']

In [21]:
# Same query with explicit sampling settings (top_k=2, top_p=0.95);
# paraphrase() must accept these keyword arguments for this call to work.
source = 'Amount of films with Greg Norman'
result = paraphrase(source, 10, top_k=2, top_p=0.95)
print(f"Source: \"{source}\"")
# Bare trailing expression: the notebook renders the returned list as the cell result.
result

Source: "Amount of films with Greg Norman"


['Amount of films with Greg Norman. Greg Norman: Amount of movies with Greg Norman:',
 "What is Greg Norman's filmography and what is his filmography?",
 'Amount of movies with Greg Norman?',
 'What are some of the best movies with Greg Norman?',
 'Amount of movies with Greg Norman.',
 'What are the most popular movies with Greg Norman?',
 'Amount of films with Greg Norman.',
 'What are some of the best films by Greg Norman?']

# Generating paraphrases

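$amount$ - number of sentences to sample; the returned list can be shorter, because exact repeats and copies of the source sentence are dropped

$top\_k$ - at each generation step, sampling is restricted to the $k$ most probable tokens

$top\_p$ - nucleus sampling: at each generation step, sampling is restricted to the smallest set of most probable tokens whose cumulative probability reaches $p$

$max\_len$ - maximum length in tokens; the input is padded to this length and each generated sentence is capped at it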


In [20]:
def paraphrase(sentence, amount, top_k=50, top_p=0.95, max_len=256):
  """Generate paraphrase candidates for ``sentence`` by sampling.

  Depends on the notebook-level ``tokenizer``, ``model`` and ``device``.

  Args:
    sentence: Text to paraphrase.
    amount: Number of sequences requested from ``model.generate``.
    top_k: ``top_k`` value forwarded to ``model.generate``.
    top_p: ``top_p`` value forwarded to ``model.generate``.
    max_len: Limit applied when encoding (and padding) the prompt and
      again as ``max_length`` during generation.

  Returns:
    Decoded candidates in generation order, excluding exact repeats and
    anything equal to ``sentence`` ignoring case; may be shorter than
    ``amount``.
  """
  # Prompt = task prefix + sentence + explicit "</s>" marker.
  prompt = "".join(("paraphrase: ", sentence, " </s>"))

  encoded = tokenizer.encode_plus(
    prompt,
    max_length=max_len,
    pad_to_max_length=True,
    return_tensors="pt",
  )

  generation_kwargs = dict(
    input_ids=encoded["input_ids"].to(device),
    attention_mask=encoded["attention_mask"].to(device),
    do_sample=True,
    max_length=max_len,
    top_k=top_k,
    top_p=top_p,
    early_stopping=True,
    num_return_sequences=amount,
  )
  candidates = model.generate(**generation_kwargs)

  source_lowered = sentence.lower()
  unique_paraphrases = []
  for token_ids in candidates:
    decoded = tokenizer.decode(
      token_ids,
      skip_special_tokens=True,
      clean_up_tokenization_spaces=True,
    )
    # Drop echoes of the source (case-insensitive) and exact repeats (case-sensitive).
    if decoded.lower() == source_lowered or decoded in unique_paraphrases:
      continue
    unique_paraphrases.append(decoded)

  return unique_paraphrases