<a href="https://colab.research.google.com/github/guilherme-vieira/recipe_sorting/blob/main/synthesising_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook uses HuggingFace PEGASUS Transformer model to create paraphrases os recipe methods



# Setting up PEGASUS model
> This model was developed by Google Research in 2019. More information on github.com/google-research/pegasus






In [2]:
!pip install sentence-splitter



In [3]:
!pip install transformers



In [4]:
!pip install SentencePiece



In [5]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences):
  batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=10, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

# Defining sentence generating function based on the pegasus model

In [6]:
from sentence_splitter import SentenceSplitter, split_text_into_sentences
import numpy as np

In [7]:
def generate_paragraphs(paragraph, number_paraphrases=2):

  # split paragraph into sentences
  splitter = SentenceSplitter(language='en')
  sentence_list = splitter.split(paragraph)

  # getting a matrix of paraphrases for each sentence
  paraphrases = []

  for sentence in sentence_list:
    paraphrases.append(get_response(sentence, number_paraphrases))

  
  # combining generated sentences in multiple methods
  paraphrases_matrix = np.array(paraphrases)

  m = paraphrases_matrix.shape[0] #number of sentences
  n = paraphrases_matrix.shape[1] #numeber of paraphrases for sentence

  # linear vectors for each dimention of pharaphrases matrix
  y = np.arange(0, m, 1)
  x = np.arange(0, n, 1)

  # grid containing all possibilities for phrases
  grid = np.array(np.meshgrid(*[x for i in y])).reshape(n**m, -1)


  # creating all sentences and exporting in a list
  list_paragraphs = []

  for paragraph_num in range(grid.shape[0]): 
    paragraph = ''

    for sentence_num in range(grid.shape[1]):
      paraphrase_num = grid[paragraph_num, sentence_num]
      paragraph += ' ' + paraphrases_matrix[sentence_num, paraphrase_num]

    list_paragraphs.append(paragraph)

  return list_paragraphs

Example:

In [8]:
paragraph = '''
 Add more seasonings to make it taste better. 
 Make smooth with the back of a spoon by turning out into a dinner plate. 
 Scatter the chickpeas with the extra virgin olive oil.
'''
generate_paragraphs(paragraph, 2)



[' It will taste better if you add more seasonings. If you want to make smooth with the back of a spoon, turn it into a dinner plate. Put the chickpeas in the extra virgin olive oil.',
 ' It will taste better if more seasonings are added. If you want to make smooth with the back of a spoon, turn it into a dinner plate. The chickpeas should be Scattered with extra virgin olive oil.',
 ' It will taste better if more seasonings are added. A dinner plate can be made smooth with the back of a spoon. The chickpeas should be Scattered with extra virgin olive oil.',
 ' It will taste better if you add more seasonings. If you want to make smooth with the back of a spoon, turn it into a dinner plate. The chickpeas should be Scattered with extra virgin olive oil.',
 ' It will taste better if more seasonings are added. A dinner plate can be made smooth with the back of a spoon. Put the chickpeas in the extra virgin olive oil.',
 ' It will taste better if more seasonings are added. If you want to ma

# Importing the methods table from csv

In [9]:
import pandas as pd
classified_methods = pd.read_csv('classified_methods.csv')
classified_methods.head()

Unnamed: 0,description,Measuring,Plating,Smoking,Toasting,Microwaving,Air Frying,Double Boiler,Bain Marie,Reducing,Water Bathing,Deglazing,Caramelising,Poaching,Simmering,Boiling,Steaming,Stir Frying,Deep Frying,Flambing,Braising,Searing,Sauteeing,Grilling,Pan Frying,Grating,Baking,Roasting,Squeezing,Mashing,Rehidrating,Drying,Kneading,Tenderizing,Whisking,Mixing,Blending,Refrigerating,Pickling,Curing,Freezing,Infusing,Marinating,Seasoning,Salting,Slicing,Chopping Fruits,Chopping Mushroom,Chopping Herbs,Mincing,Batonnet,Dicing,Roughly Chopping
0,"Put the mushrooms, chard, oil, garlic, chilli,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Bring a large pan of salted water to the boil,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Add the drained pasta and chopped tomatoes to ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Toast the cumin seeds, fennel seeds and black ...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Transfer to a mortar and pestle and grind to a...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Main function that synthesises data on the given data frame

In [10]:
def generate_methods(df):
  num_rows = df.shape[0]
  techniques_names = list(df.columns)[1:]
  techniques_dict = classified_methods[techniques_names].to_dict('r')

  for row in range(num_rows):
    description = df.at[row, 'description']
    technique_vector = df.values[row]
    
    generated_descriptions = generate_paragraphs(description)

    for generated_description in generated_descriptions:
      extra_row = techniques_dict[row]
      extra_row['description'] = generated_description
      df = df.append(extra_row, ignore_index=True)

  return df

Running and showing tail of the data frame

At first we had 12 methods. That was expanded to 67, more than 5 times what we had!

In [11]:
synthetic_classified_methods = generate_methods(classified_methods)
synthetic_classified_methods.tail()



Unnamed: 0,description,Measuring,Plating,Smoking,Toasting,Microwaving,Air Frying,Double Boiler,Bain Marie,Reducing,Water Bathing,Deglazing,Caramelising,Poaching,Simmering,Boiling,Steaming,Stir Frying,Deep Frying,Flambing,Braising,Searing,Sauteeing,Grilling,Pan Frying,Grating,Baking,Roasting,Squeezing,Mashing,Rehidrating,Drying,Kneading,Tenderizing,Whisking,Mixing,Blending,Refrigerating,Pickling,Curing,Freezing,Infusing,Marinating,Seasoning,Salting,Slicing,Chopping Fruits,Chopping Mushroom,Chopping Herbs,Mincing,Batonnet,Dicing,Roughly Chopping
62,The butter can be heated in a pan over a medi...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
63,"Add the onion and cook for a while, stirring ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
64,"Add the onion and cook, stirring occasionally...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
65,Cook the garlic and half of the cumin seeds f...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
66,Add the garlic and half of the cumin seeds an...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Exporting CSV

In [12]:
synthetic_classified_methods.to_csv('synthetic_classified_methods.csv', index=False)