In [14]:
# Load eval data
import pandas as pd

# synonym replacement libraries
import random
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# t5 paraphrasing libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# LLM paraphrasing libraries
from ollama import chat
from ollama import ChatResponse

In [2]:
eval_df = pd.read_csv("./data/val.csv")
eval_df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_token_len,question2_token_len
0,229726,32797,55300,What's the best way to learn Python?,Where should I start at to learn about how to ...,1,7,12
1,66828,32100,115759,Were the IRA freedom fighters or terrorists?,Is the IRA a group of freedom fighters or terr...,1,7,11


In [4]:
paraphrased_eval_df = eval_df.copy()
# For time constraints, we will only use 300 samples
paraphrased_eval_df = paraphrased_eval_df.sample(n=300)

# 1. Paraphrasing with simple synonym replacement

In [None]:
# nltk.download('punkt_tab')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

In [5]:
def get_wordnet_pos(tag):
    """Map NLTK POS tag to WordNet POS tag"""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def synonym_replacement(sentence):
    """
    Simple paraphrasing by replacing nouns and verbs with their synonyms
    """
    tokens = word_tokenize(sentence)
    tagged = pos_tag(tokens)
    
    new_tokens = tokens.copy()
    
    # Replace 1-3 words with synonyms
    replaceable = []
    for i, (word, tag) in enumerate(tagged):
        pos = get_wordnet_pos(tag)
        if pos in [wordnet.NOUN, wordnet.ADJ] and len(word) > 2:
            replaceable.append(i)
    
    # If we found replaceable words, replace 1-3 of them
    if replaceable:
        num_to_replace = min(len(replaceable), random.randint(1, 3))
        indices_to_replace = random.sample(replaceable, num_to_replace)
        
        for idx in indices_to_replace:
            word, tag = tagged[idx]
            pos = get_wordnet_pos(tag)
            
            # Get synonyms
            synonyms = []
            for syn in wordnet.synsets(word.lower(), pos=pos):
                for lemma in syn.lemmas():
                    synonym = lemma.name().replace('_', ' ')
                    if synonym != word.lower() and synonym not in synonyms:
                        synonyms.append(synonym)
            
            # Replace with a synonym if we found any
            if synonyms:
                synonym = random.choice(synonyms)
                # Maintain capitalization
                if word[0].isupper():
                    synonym = synonym.capitalize()
                new_tokens[idx] = synonym
    
    return ' '.join(new_tokens)

In [6]:
paraphrased_eval_df["synonym_replacement"] = paraphrased_eval_df["question1"].apply(synonym_replacement)
paraphrased_eval_df[["question1", "synonym_replacement"]].head()

Unnamed: 0,question1,synonym_replacement
2738,How do I find a hacker?,How do I find a cyberpunk ?
816,What would the sky look like if Andromeda was ...,What would the sky look like if Japanese andro...
1258,Is world war 3 likely?,Is domain state of war 3 likely ?
1694,What are the career options after electrical e...,What are the calling options after electrical ...
1782,What are the chances that the electoral colleg...,What are the chance that the electoral college...


# 2. Paraphrasing with finetuned t5 model

In [7]:
def t5_paraphrase(
    question,
    num_beams=3,
    num_beam_groups=3,
    num_return_sequences=3,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=64
):
    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,        
        truncation=True,
    ).input_ids
    
    outputs = model.generate(
        input_ids,temperature=temperature,repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res_list = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    res_list = [r for r in res_list if r != question]
    res = random.choice(res_list)

    return res

In [8]:
# Paraphrasing with the fine-tuned T5 model
# Load model and tokenizer
device = "cpu"
model_name = "coco101010/t5-paraphrase-quora-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

paraphrased_eval_df["finetuned_t5"] = paraphrased_eval_df["question1"].apply(lambda x: t5_paraphrase(x))



In [9]:
# Paraphrasing with the original T5 model
# Load model and tokenizer
model_name = "Vamsi/T5_Paraphrase_Paws"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

paraphrased_eval_df["original_t5"] = paraphrased_eval_df["question1"].apply(lambda x: t5_paraphrase(x))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
# Paraphrasing with the GPT T5 model
model_name = "humarin/chatgpt_paraphraser_on_T5_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

paraphrased_eval_df["gpt_t5"] = paraphrased_eval_df["question1"].apply(lambda x: t5_paraphrase(x))



In [13]:
paraphrased_eval_df[["question1", "finetuned_t5", "original_t5", "gpt_t5"]].head()

Unnamed: 0,question1,finetuned_t5,original_t5,gpt_t5
2738,How do I find a hacker?,What are the best ways to find a hacker?,What is the best way to find a hacker?,Is there a way to find if someone is hacking?
816,What would the sky look like if Andromeda was ...,How would the sky look if Andromeda collided w...,How the sky would look if Andromeda was right ...,How would the sky appear if Andromeda collided...
1258,Is world war 3 likely?,Is world war 3 possible?,Is world war 3 probable?,Is the possibility of world war 3 a realistic ...
1694,What are the career options after electrical e...,How can I get a career in electrical engineeri...,What are career options after electrical engin...,"Apart from multinational corporations, what ot..."
1782,What are the chances that the electoral colleg...,What are the chances that the electoral colleg...,What are the chances that if Hillary wins the ...,In the event that Hillary wins the popular vot...


# 3. Paraphrasing with LLM

In [29]:

def LLM_paraphrase(question):
    response: ChatResponse = chat(model='gemma3:1b', messages=[
        {
            'role': 'user',
            'content': """Give 1 paraphrase for the following question, output the paraphrase only.
            """ + question,
        },
    ])
    return response.message.content

print(LLM_paraphrase("How can I get back my instagram deleted dms?"))

How can I recover my Instagram DM?


In [30]:
paraphrased_eval_df["llm"] = paraphrased_eval_df["question1"].apply(lambda x: LLM_paraphrase(x))

In [31]:
paraphrased_eval_df[["question1", "llm"]].head()

Unnamed: 0,question1,llm
2738,How do I find a hacker?,What's the best way to identify a hacker?
816,What would the sky look like if Andromeda was ...,What would the sky appear to be like during a ...
1258,Is world war 3 likely?,Is there a high probability of a large-scale g...
1694,What are the career options after electrical e...,What other career paths are available to engin...
1782,What are the chances that the electoral colleg...,What is the probability that the Electoral Col...


# Save paraphrased data

In [32]:
paraphrased_eval_df.to_csv("data/paraphrased_eval.csv", index=False)