# Data Generation Code for Quora Dataset

In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd 
import os

In [2]:
# Ensure that pip is up-to-date by upgrading it
! pip install --upgrade pip  
# Install the 'transformers' and 'sentencepiece' packages using pip
! pip install transformers sentencepiece
# Clone the BLEURT repository from GitHub
! git clone https://github.com/google-research/bleurt.git
# Install the BLEURT package using pip
# This installs the BLEURT package from the local directory (./bleurt/)
! pip install ./bleurt/

fatal: destination path 'bleurt' already exists and is not an empty directory.
Processing ./bleurt
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25ldone
[?25h  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456783 sha256=c777db1b7a8438669ab379d7a0ca1ade6d37248a05124784d74c9bcb629a7c66
  Stored in directory: /private/var/folders/f_/_d229_2x1nsg_hg3f8lpq4pw0000gn/T/pip-ephem-wheel-cache-czghm5wz/wheels/ff/db/da/7d95dfb747a2a426742968f05f5b4feebd822f680766573d19
Successfully built BLEURT
Installing collected packages: BLEURT
  Attempting uninstall: BLEURT
    Found existing installation: BLEURT 0.0.2
    Uninstalling BLEURT-0.0.2:
      Successfully uninstalled BLEURT-0.0.2
Successfully installed BLEURT-0.0.2


In [3]:
# Import necessary libraries
import csv
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSequenceClassification
from bleurt.score import BleurtScorer
from tqdm import tqdm
from numpy import argmax

# Check if a CUDA-enabled GPU is available; if yes, use it, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the T5 tokenizer from the 't5-base' pre-trained model
paraphrasing_tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Load the T5 model for conditional generation from the
# 'coderpotter/T5-for-Adversarial-Paraphrasing' pre-trained model and place it on
# `device`. Without the explicit move the weights stay on the CPU, and feeding the
# model tensors that live on a GPU raises a device-mismatch error.
paraphrasing_model = T5ForConditionalGeneration.from_pretrained("coderpotter/T5-for-Adversarial-Paraphrasing").to(device)

# Set the values for various parameters
bleurt_threshold = 0.5  # paraphrases scoring below this BLEURT value go to the APT file
initial_top_k = 120     # top-k used for the first sampling attempt
initial_top_p = 0.95    # top-p (nucleus) used for the first sampling attempt
offset_top_k = 20       # added to top-k before every retry
offset_top_p = 0.05     # subtracted from top-p before every retry

# Initialize the BLEURT scorer with the 'bleurt-base-128' pre-trained model
bleurt_scorer = BleurtScorer("bleurt-base-128")

# Load the tokenizer for the adversarial paraphrasing detector from 'coderpotter/adversarial-paraphrasing-detector'
mi_tokenizer = AutoTokenizer.from_pretrained("coderpotter/adversarial-paraphrasing-detector")

# Load the pre-trained MI model for sequence classification from 'coderpotter/adversarial-paraphrasing-detector'.
# It is intentionally left on the default device (no `.to(device)` here).
mi_model = AutoModelForSequenceClassification.from_pretrained("coderpotter/adversarial-paraphrasing-detector")

def _mi_class_probabilities(first, second):
    """Return the MI model's softmax class probabilities for the ordered pair (first, second)."""
    encoded = mi_tokenizer.encode_plus(first, second, max_length=256, return_token_type_ids=True, truncation=True)

    # Build integer tensors directly (no float round-trip) on whatever device the
    # model lives on, adding the batch dimension the model expects.
    model_device = mi_model.device
    input_ids = torch.tensor(encoded["input_ids"], dtype=torch.long, device=model_device).unsqueeze(0)
    token_type_ids = torch.tensor(encoded["token_type_ids"], dtype=torch.long, device=model_device).unsqueeze(0)
    attention_mask = torch.tensor(encoded["attention_mask"], dtype=torch.long, device=model_device).unsqueeze(0)

    # Inference only: skip autograd bookkeeping so no graph is kept alive per call.
    with torch.no_grad():
        outputs = mi_model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=None,
        )

    # outputs[0] holds the logits; the batch contains a single pair.
    return torch.softmax(outputs[0], dim=1)[0].tolist()


def get_mi_score(s1, s2):
    """Classify the sentence pair in both directions and combine the verdicts.

    The pair is scored as (s1, s2) and as (s2, s1) with the sequence-classification
    model `mi_model`.

    Args:
        s1: First sentence.
        s2: Second sentence.

    Returns:
        1 if class index 0 is the most probable class for BOTH orderings, else 0.
        NOTE(review): class 0 is presumably the "mutually implicative" label of the
        detector -- confirm against the model card.
    """
    probabilities_12 = _mi_class_probabilities(s1, s2)
    probabilities_21 = _mi_class_probabilities(s2, s1)
    return int(argmax(probabilities_12) == 0 and argmax(probabilities_21) == 0)


def get_bleurt(s1, s2):
    """Return the symmetric BLEURT score of two sentences.

    BLEURT is computed once with `s1` as the reference and `s2` as the candidate,
    once with the roles swapped, and the two values are averaged.
    """
    forward = bleurt_scorer.score(references=[s1], candidates=[s2])[0]
    backward = bleurt_scorer.score(references=[s2], candidates=[s1])[0]
    return (forward + backward) / 2


def generate_paraphrases(sentence, top_k, top_p):
    """Sample paraphrases of `sentence` with the T5 paraphrasing model.

    Args:
        sentence: Source sentence to paraphrase.
        top_k: `top_k` value forwarded to `paraphrasing_model.generate`.
        top_p: `top_p` value forwarded to `paraphrasing_model.generate`.

    Returns:
        A list of decoded paraphrases, in generation order, without entries that
        equal `sentence` case-insensitively and without exact duplicates. The list
        may be empty.
    """
    # Construct the text prompt for paraphrasing, including the input sentence
    text = "paraphrase: " + sentence + " </s>"
    encoding = paraphrasing_tokenizer.encode_plus(text, max_length=256, padding="max_length", return_tensors="pt")

    # Send the inputs to the device the model actually lives on. Using the model's
    # own device (rather than a separately chosen one) rules out device-mismatch
    # errors when the model and the chosen device differ.
    model_device = paraphrasing_model.device
    input_ids = encoding["input_ids"].to(model_device)
    attention_mask = encoding["attention_mask"].to(model_device)

    # Sampling-based generation; a single sequence is requested per call
    sampled_outputs = paraphrasing_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        do_sample=True,
        max_length=256,
        top_k=top_k,
        top_p=top_p,
        early_stopping=True,
        num_return_sequences=1,
    )

    # Decode and keep only new, non-trivial paraphrases
    source_lowered = sentence.lower()
    paraphrases = []
    for token_ids in sampled_outputs:
        candidate = paraphrasing_tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if candidate.lower() != source_lowered and candidate not in paraphrases:
            paraphrases.append(candidate)

    return paraphrases



def write_paraphrases(input_file, apt_output_file, mi_output_file, nmi_output_file, position, startFrom=1, endAt=10000):
    """Generate paraphrases for the `question1` column of a CSV and sort them into three files.

    For every selected row, paraphrases are sampled with `generate_paraphrases` and
    scored with `get_bleurt` and `get_mi_score`. Each scored paraphrase is appended
    as a tab-separated line `sentence<TAB>paraphrase<TAB>bleurt<TAB>miscore`:

    * to `apt_output_file` when `miscore` is truthy and `bleurt < bleurt_threshold`;
    * to `mi_output_file` when `miscore` is truthy and `bleurt >= bleurt_threshold`;
    * to `nmi_output_file` when `miscore` is falsy.

    If the first sampling attempt puts nothing in the APT file, up to five more
    attempts are made, each raising `top_k` by `offset_top_k` and lowering `top_p`
    by `offset_top_p`.

    The function is resumable: sentences already present as the first field of any
    of the three output files are skipped.

    Args:
        input_file: Path of the UTF-8 CSV file (comma-delimited, with a header row
            containing a `question1` column).
        apt_output_file: Path of the file receiving accepted paraphrases.
        mi_output_file: Path of the file receiving high-BLEURT paraphrases.
        nmi_output_file: Path of the file receiving paraphrases rejected by the MI model.
        position: Name of the CSV column holding the integer row id
            (it is used as `row[position]`).
        startFrom: Smallest row id to process (inclusive).
        endAt: Largest row id to process (inclusive). Reading stops at the first
            row whose id exceeds it, so ids are assumed to be in ascending order.
    """
    output_paths = (apt_output_file, mi_output_file, nmi_output_file)
    max_attempts = 6  # one initial attempt plus up to five retries

    # Collect sentences handled by a previous run. The files read here are the very
    # files appended to below; a missing file simply means there is nothing to resume.
    written_sentences = set()
    for path in output_paths:
        try:
            with open(path, "r", encoding="utf-8") as previous:
                for line in previous:
                    first_field = line.rstrip("\n").split("\t")[0]
                    if first_field:
                        written_sentences.add(first_field)
        except FileNotFoundError:
            continue

    # Append mode does not create missing parent directories.
    for path in output_paths:
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Context managers guarantee that all handles are flushed and closed, even if
    # generation or scoring raises midway through the dataset.
    with open(apt_output_file, "a", encoding="utf-8") as apt, \
            open(mi_output_file, "a", encoding="utf-8") as mi, \
            open(nmi_output_file, "a", encoding="utf-8") as nmi, \
            open(input_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, delimiter=",")
        # Iterate through rows with a progress bar
        for row in tqdm(reader, total=endAt - startFrom + 1, desc="Processing"):
            current_id = int(row[position])
            if current_id < startFrom:
                continue
            if current_id > endAt:
                break

            sentence = row["question1"]
            # Nothing to paraphrase for empty/missing questions; skip already-written ones too
            if not sentence or sentence in written_sentences:
                continue

            bad_sentences = set()
            top_k, top_p = initial_top_k, initial_top_p
            for attempt in range(max_attempts):
                if attempt:
                    # Retry with adjusted sampling parameters
                    top_k += offset_top_k
                    top_p -= offset_top_p

                written = False
                for p in generate_paraphrases(sentence, top_k, top_p):
                    if p in bad_sentences:
                        continue
                    bleurt, miscore = get_bleurt(sentence, p), get_mi_score(sentence, p)
                    record = sentence + "\t" + p + "\t" + str(bleurt) + "\t" + str(miscore) + "\n"
                    if miscore and bleurt < bleurt_threshold:
                        apt.write(record)
                        written = True
                    else:
                        # Remember rejected paraphrases so they are not scored and logged twice
                        bad_sentences.add(p)
                        (mi if miscore else nmi).write(record)

                if written:
                    break

  from .autonotebook import tqdm as notebook_tqdm
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


INFO:tensorflow:Reading checkpoint bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


2023-10-27 19:00:37.442457: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO:tensorflow:BLEURT initialized.


In [4]:
# Set variables specific to the Quora Question Pairs dataset
input_file = "Quora_QuestionPairs.csv"  # Update this with the correct path to your dataset
output_file_prefix = "data_created-1-404363/"  # Update this as needed
# NOTE: not used by the call below, which selects the id column by its name ("id")
position_of_sentence = 3  # Update this based on the position of the sentence in your dataset

# The output files are opened in append mode, which does not create missing
# directories, so make sure the directory part of the prefix exists first.
output_dir = os.path.dirname(output_file_prefix)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

write_paraphrases(input_file, output_file_prefix + "apt.txt", output_file_prefix + "mi.txt", output_file_prefix + "nmi.txt", "id", startFrom=1, endAt=404363)

Processing: 51001it [1:16:36, 11.10it/s]  
