In [None]:
!pip install pandas
!pip install transformers
!pip install torch



In [None]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import time
import random
import nltk
from nltk.tokenize import word_tokenize


In [None]:
# --------------------------- Configuration --------------------------- #

# Input and output file paths
INPUT_CSV = '/content/english_recipes.csv'             # Replace with your input CSV file path
OUTPUT_CSV = 'modified_recipes_2.csv'  # Desired output CSV file path

# Model configuration
MODEL_NAME = 'ramsrigouthamg/t5_paraphraser'  # A T5 model fine-tuned for paraphrasing

# Device configuration: use GPU if available
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Paraphrasing settings
MAX_LENGTH = 256
NUM_BEAMS = 10          # Number of beams for beam search
NUM_BEAM_GROUPS = 5     # Must be >1 and divisor of NUM_BEAMS
NUM_RETURN_SEQUENCES = 3  # Generate multiple paraphrases per question
# Forwarded to generate(). NOTE(review): generate() is called without sampling
# enabled, so this value presumably has no effect on the beam search - confirm.
TEMPERATURE = 1.0

# Templates for rule-based rephrasing.
# {action} is always filled with a base-form verb ("cook", "prepare"), so every
# template places it where a bare infinitive is grammatical (after "to", a
# modal, or a subject). Templates that need a gerund or participle
# ("involved in {action}", "typically {action}") would yield broken sentences.
TEMPLATES = [
    "What is the best way to {action} {item}?",
    "Can you provide a method to {action} {item}?",
    "Could you explain how to {action} {item}?",
    "What steps do I need to take to {action} {item}?",
    "How might I {action} {item}?",
    "What’s the procedure to {action} {item}?",
    "How would you {action} {item}?",
    "Could you guide me on how to {action} {item}?",
    "What are the instructions to {action} {item}?",
    "Can you show me how to {action} {item}?",

    # Additional templates
    "How should I go about trying to {action} {item}?",
    "What techniques are used to {action} {item}?",
    "Can you detail the process to {action} {item}?",
    "What is required to {action} {item}?",
    "What approach do you take to {action} {item}?",
    "What are the necessary steps to {action} {item}?",
    "Could you outline a recipe to {action} {item}?",
    "What are the best practices to {action} {item}?",
    "How should I {action} {item}?",
    "What’s involved if I want to {action} {item}?",
    "Can you walk me through how to {action} {item}?",
    "What are the key steps to {action} {item}?",
    "How do people typically {action} {item}?",
    "What is a good recipe to {action} {item}?",
    "How do I successfully {action} {item}?",
    "What process should I follow to {action} {item}?",
    "Could you provide a step-by-step guide to {action} {item}?",
    "What are the guidelines to {action} {item}?",
    "How would you recommend I {action} {item}?",
    "What methods can I use to {action} {item}?"
]

# --------------------------------------------------------------------- #

In [None]:



def load_model(model_name):
    """
    Fetch the T5 tokenizer and paraphrasing model for ``model_name``.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been moved to DEVICE.
    """
    t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
    # Load the weights and place them on the configured device in one expression.
    t5_model = T5ForConditionalGeneration.from_pretrained(model_name).to(DEVICE)
    return t5_tokenizer, t5_model

def paraphrase(text, tokenizer, model, max_length=MAX_LENGTH, num_beams=NUM_BEAMS,
              num_return_sequences=NUM_RETURN_SEQUENCES, temperature=TEMPERATURE, num_beam_groups=NUM_BEAM_GROUPS):
    """
    Paraphrase the input text using the T5 model.
    Generates multiple paraphrases for each input.

    Args:
        text: Sentence to paraphrase (a ``str``).
        tokenizer: Tokenizer matching ``model``; called on the prompt and used
            to decode the generated ids.
        model: Model whose ``generate`` method produces the paraphrases.
        max_length: ``max_length`` forwarded to ``generate``.
        num_beams: Beam count forwarded to ``generate``.
        num_return_sequences: Number of sequences requested from ``generate``.
        temperature: Forwarded to ``generate``. NOTE(review): sampling is not
            enabled in this call, so this presumably has no effect - confirm.
        num_beam_groups: Beam-group count forwarded to ``generate``.

    Returns:
        List of decoded paraphrases with duplicates removed, in the order the
        model returned them.
    """
    # The tokenizer appends the end-of-sequence token itself (see the
    # transformers T5 docs), so it is not written into the prompt by hand;
    # doing so relied on a deprecated "EOS already present" code path.
    input_text = "paraphrase: " + text
    # A single sequence needs no padding; calling the tokenizer directly
    # replaces the legacy encode_plus entry point.
    encoding = tokenizer(input_text, return_tensors="pt")
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # Generate paraphrased outputs
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        early_stopping=True,
        diversity_penalty=1.0  # Encourage diversity among beams
    )

    # Decode the generated texts
    paraphrased_texts = [
        tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for output in outputs
    ]
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(paraphrased_texts))

def template_rephrase(question, templates=None):
    """
    Apply a rule-based template to rephrase a "How can I ..." / "How do I ..." question.

    The verb (action) and its object (item) are extracted from the raw
    question text, e.g. "How can I cook red fruit smoothie?" gives
    action "cook" and item "red fruit smoothie", and are substituted into a
    randomly chosen template.

    Args:
        question: Question text to rephrase.
        templates: Optional sequence of format strings containing ``{action}``
            and ``{item}``. Defaults to the module-level TEMPLATES.

    Returns:
        The templated question, or ``question`` unchanged when it does not
        have the "How can/do I <verb> <item>" shape. Returning it unchanged
        avoids wrapping an entire unrelated sentence inside a template.
    """
    # Local import: this module does not import re at file level.
    import re

    # Match on the original string instead of re-joining tokens, so the item
    # keeps its punctuation and spacing exactly as written.
    match = re.search(
        r"\bhow\s+(?:can|do)\s+i\s+([\w'’-]+)\s+(.+?)[\s?]*$",
        question,
        flags=re.IGNORECASE | re.DOTALL,
    )
    if match is None:
        return question

    action, item = match.group(1), match.group(2)
    pool = TEMPLATES if templates is None else templates
    template = random.choice(pool)
    return template.format(action=action, item=item)

def diversify_paraphrases(paraphrases, tokenizer, model):
    """
    Further diversify paraphrases by applying rule-based rephrasing.

    Args:
        paraphrases: Iterable of paraphrase strings.
        tokenizer: Unused; kept so existing callers keep working.
        model: Unused; kept so existing callers keep working.

    Returns:
        List of unique strings: the input paraphrases in their original order,
        followed by any new template-based variants. The order is stable
        across runs, so a seeded ``random.choice`` over it is reproducible
        (a plain ``set`` of strings has no stable iteration order).
    """
    originals = list(paraphrases)
    # A dict keeps insertion order while still de-duplicating by key.
    diversified = dict.fromkeys(originals)
    for paraphrase_text in originals:
        # Apply template-based rephrasing
        diversified.setdefault(template_rephrase(paraphrase_text), None)
    return list(diversified)

def modify_questions(input_csv, output_csv, tokenizer, model):
    """
    Paraphrase every question of a CSV file and save the result.

    The input must provide 'Question' and 'Content' columns. The output file
    holds the same 'Content' next to a 'Question' column containing one
    randomly selected paraphrase per row; a row whose paraphrasing fails
    keeps its original question. Problems are reported with print() and the
    function always returns None.
    """
    # Load the input, bailing out with a message on any read problem.
    try:
        recipes = pd.read_csv(input_csv)
    except FileNotFoundError:
        print(f"Input file '{input_csv}' not found.")
        return
    except pd.errors.EmptyDataError:
        print("Input CSV is empty.")
        return
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return

    # Both columns are needed further down.
    absent = [name for name in ('Question', 'Content') if name not in recipes.columns]
    if absent:
        print("CSV must contain 'Question' and 'Content' columns.")
        return

    # One entry per input row, in row order.
    rewritten = []
    for label, record in recipes.iterrows():
        source_question = record['Question']
        number = label + 1
        print(f"Processing Question {number}: {source_question}")
        try:
            # Model paraphrases first, then the template-based variants.
            candidates = diversify_paraphrases(
                paraphrase(source_question, tokenizer, model), tokenizer, model
            )
            # Keep a single candidate, picked at random.
            chosen = random.choice(candidates)
            print(f"Selected Paraphrase {number}: {chosen}\n")
            rewritten.append(chosen)
            # Optional: pause between rows to avoid overloading the system
            time.sleep(1)
        except Exception as e:
            print(f"Error paraphrasing question {number}: {e}")
            # Fall back to the untouched question for this row.
            rewritten.append(source_question)

    # Attach the new questions, keep only the two output columns, and expose
    # the rewritten text under the 'Question' header.
    result = (
        recipes
        .assign(Modified_Question=rewritten)
        [['Modified_Question', 'Content']]
        .rename(columns={'Modified_Question': 'Question'})
    )

    # Persist the result; a write failure is reported, not raised.
    try:
        result.to_csv(output_csv, index=False)
        print(f"Modified questions have been written to '{output_csv}'.")
    except Exception as e:
        print(f"Error writing to CSV: {e}")

# Entry point: load the paraphrasing model, then rewrite the questions of
# INPUT_CSV into OUTPUT_CSV.
if __name__ == "__main__":
    print("Loading model and tokenizer...")
    # The tokenizer and model are loaded once and passed to the processing step.
    tokenizer, model = load_model(MODEL_NAME)
    print("Model loaded successfully.\n")
    # Reads INPUT_CSV, paraphrases each question and writes OUTPUT_CSV.
    modify_questions(INPUT_CSV, OUTPUT_CSV, tokenizer, model)


Loading model and tokenizer...
Model loaded successfully.

Processing Question 1: How can I cook red fruit smoothie?




Selected Paraphrase 1: Can you provide a method to cook a red fruit smoothie?

Processing Question 2: How can I cook affogato ice cream?
Selected Paraphrase 2: What is the best way to cook affogato ice cream?

Processing Question 3: How can I cook morning fantasy?
Selected Paraphrase 3: What’s the procedure to prepare What is the best way to cook a morning fantasy?

Processing Question 4: How can I cook breakfast mix?
Selected Paraphrase 4: How can I go about prepare How do you cook breakfast mix?

Processing Question 5: How can I cook applesauce?
Selected Paraphrase 5: What methods can I use to cook applesauce?

Processing Question 6: How can I cook a savory breakfast?
Selected Paraphrase 6: What steps are involved in make a savory breakfast?

Processing Question 7: How can I cook non-alcoholic sorbet?
Selected Paraphrase 7: Can you provide a method to prepare How do you cook non-alcoholic sorbet?

Processing Question 8: How can I cook apple pancake?
Selected Paraphrase 8: How do I co