In [1]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
#!pip install sentence-transformers


In [3]:
#!pip install sentencepiece


In [4]:
pip install --upgrade transformers sentencepiece


Collecting transformers
  Downloading transformers-4.39.2-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Downloading transformers-4.39.2-py3-none-any.whl (8.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.39.1
    Uninstalling transformers-4.39.1:
      Successfully uninstalled transformers-4.39.1
Successfully installed transformers-4.39.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
with open('/Users/eraparihar/Desktop/Semester 2/SI 630/project/squad_train-v2.0.json', 'r') as file:
    train_data = json.load(file)

In [6]:
def convert_to_dataset(data):
    dataset = []
    for entry in data['data']:
        title = entry['title']
        for paragraph in entry['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                q = qa['question']
                id_ = qa['id']
                is_impossible = qa['is_impossible']
                answers = qa['answers'] if not is_impossible else []
                # For each question, we will create a dictionary with the question, its context and the answers
                dataset.append({
                    'id': id_,
                    'title': title,
                    'context': context,
                    'question': q,
                    'is_impossible': is_impossible,
                    'answers': [{'text': answer['text'], 'answer_start': answer['answer_start']} for answer in answers]
                })
    return dataset

In [7]:
dataset = convert_to_dataset(train_data)

trial_data = dataset[:10]

In [8]:
trial_data

[{'id': '56be85543aeaaa14008c9063',
  'title': 'Beyoncé',
  'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
  'question': 'When did Beyonce start becoming popular?',
  'is_impossible': False,
  'answers': [{'text': 'in the late 1990s', 'answer_start': 269}]},
 {'id': '56be85543aeaaa14008c9065',
  'title': 'Beyoncé',
  'context': 'Beyo

In [9]:
tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")  
model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")   #.to('cuda')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
#formatted_questions = ['paraphrase: ' + row['question'] + ' </s>' for row in trial_data]

for item in trial_data:
    sentence = item['question']  
    text = "paraphrase: " + sentence + " </s>"

In [11]:
# Encode the text input for the model
encoding = tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]   #.to("cuda")



In [12]:
outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=256,
        do_sample=True,
        top_k=150,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences= 3
    )

    # Decode and print the paraphrases
print(f"Original: {sentence}")
for i, output in enumerate(outputs):
    line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(f"Paraphrase {i+1}: {line}")
print("\n" + "-"*50 + "\n")




Original: What role did Beyoncé have in Destiny's Child?
Paraphrase 1: What role did Beyoncé have in Destiny's Child?
Paraphrase 2: What role did Beyoncé have in Destiny's Child?
Paraphrase 3: What role played Beyoncé in Destiny's Child?

--------------------------------------------------



In [13]:
def calculate_cosine_similarity(model, tokenizer, sentence, paraphrases):
    # Tokenize and encode the original sentence and the paraphrases
    encoded_original = tokenizer.encode_plus(sentence, return_tensors='pt')
    original_embedding = model(**encoded_original)[0].mean(1)  # Taking the mean of the last hidden state to get a single vector

    similarities = []
    for paraphrase in paraphrases:
        encoded_paraphrase = tokenizer.encode_plus(paraphrase, return_tensors='pt')
        paraphrase_embedding = model(**encoded_paraphrase)[0].mean(1)
        
        # Calculate cosine similarity and store
        similarity = cosine_similarity(original_embedding, paraphrase_embedding)
        similarities.append(similarity.item())
    
    return similarities

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import util
import torch


embedding_model = 'all-MiniLM-L6-v2'  
embedder = SentenceTransformer(embedding_model)

def paraphrase_and_similarity(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=256,
        do_sample=True,
        top_k=150,
        top_p=2,
        early_stopping=True,
        num_return_sequences=10

    )
    
    paraphrases = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    original_embedding = embedder.encode(sentence, convert_to_tensor=True)
    paraphrase_embeddings = embedder.encode(paraphrases, convert_to_tensor=True)

    #similarities = [util.pytorch_cos_sim(original_embedding, paraphrase_embedding)[0][0].item() for paraphrase_embedding in paraphrase_embeddings]

    paraphrase_similarity_dict = {}
    for paraphrase, paraphrase_embedding in zip(paraphrases, paraphrase_embeddings):
        similarity = util.pytorch_cos_sim(original_embedding, paraphrase_embedding)[0][0].item()
        paraphrase_similarity_dict[paraphrase] = similarity

    print(f"Original: {sentence}")
    for i, (paraphrase, similarity) in enumerate(zip(paraphrases, similarities)):
        print(f"Paraphrase {i+1}: {paraphrase} (Cosine Similarity: {similarity})")
    print("\n" + "-"*50 + "\n")

    return paraphrases, similarities

sentence = "What are the challenges and opportunities associated with the use of renewable energy sources to meet the world's increasing energy demands?"

paraphrases, similarities = paraphrase_and_similarity(sentence)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Original: What are the challenges and opportunities associated with the use of renewable energy sources to meet the world's increasing energy demands?


NameError: name 'similarities' is not defined

**work on generating more question from all that haev very high cosine similarity**

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import torch

tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

embedding_model = 'all-MiniLM-L6-v2'  # or any other model you prefer
embedder = SentenceTransformer(embedding_model)

def paraphrase_and_similarity(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=256,
        do_sample=True,
        top_k=150,
        top_p=2, 
        early_stopping=True,
        num_return_sequences=7
    )
    
    paraphrases = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    original_embedding = embedder.encode(sentence, convert_to_tensor=True)
    paraphrase_embeddings = embedder.encode(paraphrases, convert_to_tensor=True)

    paraphrase_similarity_dict = {}
    for paraphrase, paraphrase_embedding in zip(paraphrases, paraphrase_embeddings):
        similarity = util.pytorch_cos_sim(original_embedding, paraphrase_embedding)[0][0].item()
        paraphrase_similarity_dict[paraphrase] = similarity

    selected_paraphrases = {p: s for p, s in paraphrase_similarity_dict.items() if 0.97 <= s <= 0.99}

    print("Selected Paraphrases:")
    for paraphrase, similarity in selected_paraphrases.items():
        print(f"{paraphrase} (Cosine Similarity: {similarity})")

    generated_questions = {}
    for paraphrase in selected_paraphrases.keys():
        more_outputs = model.generate(
            input_ids=tokenizer.encode("paraphrase: " + paraphrase + " </s>", return_tensors="pt"),
            max_length=256,
            do_sample=True,
            top_k=200,
            top_p=2,  
            early_stopping=True,
            num_return_sequences=3
        )
        
        more_paraphrases = [tokenizer.decode(output, skip_special_tokens=True) for output in more_outputs]
        generated_questions[paraphrase] = more_paraphrases

    print("\nGenerated Questions for Selected Paraphrases:")
    for paraphrase, questions in generated_questions.items():
        print(f"Original Paraphrase: {paraphrase}")
        for question in questions:
            print(f"Generated Question: {question}")
        print("\n" + "-"*50 + "\n")

    return selected_paraphrases, generated_questions

sentence = "What are the challenges and opportunities associated with the use of renewable energy sources to meet the world's increasing energy demands?"

selected_paraphrases, generated_questions = paraphrase_and_similarity(sentence)

Selected Paraphrases:
What are the challenges and opportunities associated with the use of renewable energy sources to meet the growing energy demands of the world? (Cosine Similarity: 0.986009418964386)
How are the challenges and opportunities associated with the use of renewable energy sources to meet the increasing global energy demands? (Cosine Similarity: 0.9716543555259705)
What are the challenges and opportunities associated with the use of renewable energy sources to meet the growing energy demands in the world? (Cosine Similarity: 0.98566734790802)

Generated Questions for Selected Paraphrases:
Original Paraphrase: What are the challenges and opportunities associated with the use of renewable energy sources to meet the growing energy demands of the world?
Generated Question: What are the challenges and opportunities associated with the use of renewable energy sources to meet the global energy demands?
Generated Question: What are the challenges and opportunities associated wit

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import torch

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

# Embedding model
embedding_model = 'all-MiniLM-L6-v2'  # Adjust with your preference
embedder = SentenceTransformer(embedding_model)

def cosine_similarity(tensor1, tensor2):
    """
    Compute the cosine similarity between two tensors.
    """
    tensor1_norm = tensor1 / tensor1.norm(dim=1, keepdim=True)
    tensor2_norm = tensor2 / tensor2.norm(dim=1, keepdim=True)
    return torch.mm(tensor1_norm, tensor2_norm.transpose(0, 1))

def paraphrase_and_similarity(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'],
                             max_length=256, do_sample=True, top_k=150, top_p=0.95,
                             early_stopping=True, num_return_sequences=7)

    paraphrases = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    original_embedding = embedder.encode([sentence], convert_to_tensor=True)
    paraphrase_embeddings = embedder.encode(paraphrases, convert_to_tensor=True)

    similarities = cosine_similarity(original_embedding, paraphrase_embeddings).squeeze(0).tolist()

    selected_paraphrases = {paraphrase: similarity for paraphrase, similarity in zip(paraphrases, similarities) if 0.97 <= similarity <= 0.99}

    print("Selected Paraphrases:")
    for paraphrase, similarity in selected_paraphrases.items():
        print(f"{paraphrase} (Cosine Similarity: {similarity})")

    generated_questions = {}
    for paraphrase in selected_paraphrases.keys():
        paraphrase_input = tokenizer("paraphrase: " + paraphrase + " </s>", return_tensors="pt")
        more_outputs = model.generate(paraphrase_input['input_ids'], max_length=256, do_sample=True,
                                      top_k=150, top_p=0.95, early_stopping=True, num_return_sequences=3)

        questions = [tokenizer.decode(output, skip_special_tokens=True) for output in more_outputs]
        generated_questions[paraphrase] = questions

    print("\nGenerated Questions for Selected Paraphrases:")
    for paraphrase, questions in generated_questions.items():
        print(f"Original Paraphrase: {paraphrase}")
        for question in questions:
            print(f"Generated Question: {question}")
        print("\n" + "-"*50 + "\n")

    return selected_paraphrases, generated_questions

# Example usage
sentence = "What are the challenges and opportunities associated with the use of renewable energy sources to meet the world's increasing energy demands?"
selected_paraphrases, generated_questions = paraphrase_and_similarity(sentence)


Selected Paraphrases:
What are the challenges and opportunities associated with the use of renewable energy sources to meet the increasingly high energy demand of the world? (Cosine Similarity: 0.9819177985191345)
What are the challenges and opportunities associated with the use of renewable energy sources to meet the growing global energy demands? (Cosine Similarity: 0.9747074842453003)
What are the challenges and opportunities associated with the use of renewable energy sources to meet the ever-changing energy demands in the world? (Cosine Similarity: 0.9765499234199524)

Generated Questions for Selected Paraphrases:
Original Paraphrase: What are the challenges and opportunities associated with the use of renewable energy sources to meet the increasingly high energy demand of the world?
Generated Question: What are the challenges and opportunities associated with the use of renewable energy sources to meet the global energy demand ever increasing?
Generated Question: What are the cha

In [2]:
import pickle
with open("/home/tzujohsu/SI630/Question-answer-Generation/enlarged_finetune.pickle", "rb") as fp:
    enlarged_finetune = pickle.load(fp)

In [5]:
enlarged_finetune[0]

['Some scholars[note 44] use other schemes. Buddhists themselves have a variety of other schemes. Hinayana (literally "lesser vehicle") is used by Mahayana followers to name the family of early philosophical schools and traditions from which contemporary Theravada emerged, but as this term is rooted in the Mahayana viewpoint and can be considered derogatory, a variety of other terms are increasingly used instead, including Śrāvakayāna, Nikaya Buddhism, early Buddhist schools, sectarian Buddhism, conservative Buddhism, mainstream Buddhism and non-Mahayana Buddhism.',
 'lesser vehicle',
 ['What does Hinayana mean in English?',
  'What means Hinayana in English?',
  'What does the word Hinayana mean?',
  'What does the word Hinayana mean in English?']]

In [6]:
train = []
for i in enlarged_finetune:
    ctx = f"""
    answer: {i[1]}, context: {i[0]}
    """
    for q in i[2]:
        train.append({'input':ctx, 'label':q})

In [9]:
import random
random.shuffle(train)

In [11]:
len(train)

10347

In [12]:

import json

with open('qg_train.json', 'w') as file:  # The 'jsonl' or 'ndjson' extension is often used for newline-delimited JSON
    for entry in train[:8000]:
        json.dump(entry, file)  # Dump the dictionary as a JSON string
        file.write('\n')

In [13]:

import json

with open('qg_test.json', 'w') as file:  # The 'jsonl' or 'ndjson' extension is often used for newline-delimited JSON
    for entry in train[8000:]:
        json.dump(entry, file)  # Dump the dictionary as a JSON string
        file.write('\n')