In [1]:
!pip install --upgrade transformers



In [2]:
!pip install sentencepiece



In [3]:
!git clone https://github.com/ozcangundes/multitask-question-generation.git

Cloning into 'multitask-question-generation'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 50 (delta 21), reused 23 (delta 5), pack-reused 0[K
Receiving objects: 100% (50/50), 18.00 KiB | 9.00 MiB/s, done.
Resolving deltas: 100% (21/21), done.


In [4]:
%cd multitask-question-generation/

/content/multitask-question-generation


In [5]:
from google.colab import drive

drive.mount('/content/drive')

main_path = '/content/drive/MyDrive/Question-Answer-Metric-Bleu/'

Mounted at /content/drive


In [6]:
import pandas as pd
import os
import json

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pipelines import pipeline

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
class JsonParser:
    smoothing = SmoothingFunction().method3

    def __init__(self, main_path):
        self.main_path = main_path
        self.json_path = self.main_path + 'jsonpars'

        tokenizer = AutoTokenizer.from_pretrained("ozcangundes/mt5-multitask-qa-qg-turkish")
        model = AutoModelForSeq2SeqLM.from_pretrained("ozcangundes/mt5-multitask-qa-qg-turkish")
        self.multimodel = pipeline("multitask-qa-qg",tokenizer=tokenizer,model=model)

        if not os.path.exists('data_bleu_score.csv'):
            self.json_list = self.load_json_files()
            self.qa_df = self.parse_json()
            print("Total number of questions and answers: ", len(self.qa_df))

        else:
            self.qa_df = pd.read_csv('data_bleu_score.csv')
            print("Total number of questions and answers: ", len(self.qa_df))

    # Load all the json files
    def load_json_files(self):
        try:
            file_lists = os.listdir(self.json_path)
            json_list = [file_ for file_ in file_lists if file_.endswith('.json')]

            return json_list
        except FileNotFoundError:
            print("Directory not found")
            return []

    # Parse the json files to a pandas dataframe
    def parse_json(self):
        qa_df = pd.DataFrame(columns=['Title', 'Context', "id", 'Question', 'Answer'])
        for json_file in self.json_list:
            try:
                with open(os.path.join(self.json_path, json_file), 'r') as f:
                    json_data = json.load(f)
                    for data in json_data['data']:
                        title = data['title']
                        context = data['paragraphs'][0]['context']
                        for qa in data['paragraphs'][0]['qas']:
                            id_ = qa.get('id', None)
                            question = qa['question']
                            answer = qa['answers'][0]['text']
                            qa_df.loc[len(qa_df)] = [title, context, id_, question, answer]

            except (FileNotFoundError, json.JSONDecodeError) as e:
                print(f"Error reading {json_file}: {e}")

        return qa_df

    # Generate questions and answers using the mT5 model and save the results to a new dataframe
    def generate_qa(self):
        try:
            generated_qa_df = pd.read_csv(self.main_path + "generated_qa_df.cvs")
        except FileNotFoundError:
            print("generated_qa_df not found")
            df_questions = [q for q in self.qa_df['Question']]
            df_answers = [a for a in self.qa_df['Answer']]

            len_context = len(set(self.qa_df['Context']))
            generated_qa_df = pd.DataFrame(columns=['Title', 'Context', "id", 'Question', 'Answer'])

            counter = 1
            # Generate questions and answers for each context
            for context, title in zip(set(self.qa_df['Context']), set(self.qa_df['Title'])):
                print(f"Generated questions and answers for context: {counter}/{len_context}")
                generated_qa = self.multimodel(context)
                counter += 1

                for qa in generated_qa:
                    if qa['question'] in df_questions or qa['answer'] in df_answers:
                        generated_qa_df.loc[len(generated_qa_df)] = [title, context, 2, qa['question'], qa['answer']]

            generated_qa_df.to_csv(self.main_path + 'generated_qa_df.csv', index=False, encoding='utf-8')

        return generated_qa_df

    # Find the bleu score for the original and generated text
    @staticmethod
    def get_bleu_score(ref_question, gen_question):
        bleu_score = sentence_bleu([ref_question], gen_question, smoothing_function=JsonParser.smoothing)
        return bleu_score

    # Find the same questions and their answers in the original and generated dataframes
    def get_same_qa_blue_score(self, generated_qa_df, merge_on, bleu_score_column):
        try:
            merged_df = pd.read_csv(self.main_path + 'same_answers_bleu_scores.csv')
            print(f"merged_df found in {self.main_path}")
            merged_df['Answer'], merged_df['Question_generated'] = merged_df['Question_generated'], merged_df['Answer']

        except FileNotFoundError:
            print(f'merged_df not found in {self.main_path}')
            merged_df = pd.merge(self.qa_df, generated_qa_df, on=merge_on, suffixes=('_original', '_generated'))
            merged_df = merged_df.drop(columns=['Title_generated', 'Context_generated', 'id_generated'])
            merged_df['bleu_score'] = merged_df.apply(lambda row: self.get_bleu_score(row[bleu_score_column + '_original'], row[bleu_score_column + '_generated']), axis=1)
            merged_df['Answer'], merged_df['Question_generated'] = merged_df['Question_generated'], merged_df['Answer']
            merged_df.columns = ['Title', 'Context', 'id', 'Question', 'Answer', 'Generated_Question', 'Bleu_Score']
            merged_df.to_csv(self.main_path + "same_answers_bleu_scores.csv", index=False, encoding='utf-8')

        return merged_df

In [8]:
json_parser = JsonParser(main_path)
generated_qa_df = json_parser.generate_qa()

tokenizer_config.json:   0%|          | 0.00/374 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/33.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Total number of questions and answers:  3508
generated_qa_df not found
Generated questions and answers for context: 1/111
Generated questions and answers for context: 2/111
Generated questions and answers for context: 3/111
Generated questions and answers for context: 4/111
Generated questions and answers for context: 5/111
Generated questions and answers for context: 6/111
Generated questions and answers for context: 7/111
Generated questions and answers for context: 8/111
Generated questions and answers for context: 9/111
Generated questions and answers for context: 10/111
Generated questions and answers for context: 11/111
Generated questions and answers for context: 12/111
Generated questions and answers for context: 13/111
Generated questions and answers for context: 14/111
Generated questions and answers for context: 15/111
Generated questions and answers for context: 16/111
Generated questions and answers for context: 17/111
Generated questions and answers for context: 18/111
Ge

In [14]:
same_answers_bleu_scores = json_parser.get_same_qa_blue_score(generated_qa_df, merge_on='Answer', bleu_score_column='Question')

merged_df found in /content/drive/MyDrive/Question-Answer-Metric-Bleu/
merged_df not found in /content/drive/MyDrive/Question-Answer-Metric-Bleu/


In [16]:
same_answers_bleu_scores.head()

Unnamed: 0,Title,Context,id,Question,Answer,Generated_Question,Bleu_Score
0,Sırp Ayaklanmaları,"19. yüzyıl başlarında Avusturya ve Rusya, Sırb...",1,19. Yüzyıl başlarında hangi devletler Sırbista...,19. yüzyıl başlarında kimler Sırbistan'da halk...,Avusturya ve Rusya,0.648116
1,Sırp Ayaklanmaları,"19. yüzyıl başlarında Avusturya ve Rusya, Sırb...",2,Sırplar kimin önderliğinde ayaklandılar ?,Kim 21 Eylül 1813'te diğer isyancılarla birlik...,Kara Yorgi,0.080631
2,Sırp Ayaklanmaları,"19. yüzyıl başlarında Avusturya ve Rusya, Sırb...",10,Kara Yorgi ve isyancılar canlarını kurtarmak i...,Kara Yorgi ne zaman diğer isyancılarla birlikt...,21 Eylül 1813'te,0.761657
3,Osmanlı-Rus Savaşı (1806-1812),1806-1812 Osmanlı-Rus Savaşı Osmanlı Devleti i...,34,III. Selim boğazları hangi tarihte kapattı ?,III. Selim ne zaman boğazları kapattı ve Rusya...,22 Aralık 1805 tarihinde,0.441969
4,Osmanlı-Rus Savaşı (1806-1812),1806-1812 Osmanlı-Rus Savaşı Osmanlı Devleti i...,35,Rus donanması Osmanlı donanmasını hangi tariht...,Rus donanması Osmanlı donanmasını hangi tariht...,11 Mayıs 1807 tarihinde,0.519534
