In [1]:
!pip install --upgrade transformers



In [2]:
!pip install sentencepiece



In [3]:
!git clone https://github.com/ozcangundes/multitask-question-generation.git

Cloning into 'multitask-question-generation'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 50 (delta 21), reused 23 (delta 5), pack-reused 0[K
Receiving objects: 100% (50/50), 18.00 KiB | 9.00 MiB/s, done.
Resolving deltas: 100% (21/21), done.


In [4]:
%cd multitask-question-generation/

/content/multitask-question-generation


In [10]:
from google.colab import drive

drive.mount('/content/drive')

main_path = '/content/drive/MyDrive/novelmetricwork/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import pandas as pd
import os
import json

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pipelines import pipeline

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

import warnings
warnings.filterwarnings('ignore')

In [21]:
class JsonParser:
    smoothing = SmoothingFunction().method3

    def __init__(self, main_path):
        self.main_path = main_path
        self.json_path = self.main_path + 'jsonpars'

        tokenizer = AutoTokenizer.from_pretrained("ozcangundes/mt5-multitask-qa-qg-turkish")
        model = AutoModelForSeq2SeqLM.from_pretrained("ozcangundes/mt5-multitask-qa-qg-turkish")
        self.multimodel = pipeline("multitask-qa-qg",tokenizer=tokenizer,model=model)

        self.json_list = self.load_json_files()
        self.qa_df = self.parse_json()
        print("Total number of questions and answers: ", len(self.qa_df))

    # Load all the json files
    def load_json_files(self):
        try:
            file_lists = os.listdir(self.json_path)
            json_list = [file_ for file_ in file_lists if file_.endswith('.json')]

            return json_list
        except FileNotFoundError:
            print("Directory not found")
            return []

    # Parse the json files to a pandas dataframe
    def parse_json(self):
        qa_df = pd.DataFrame(columns=['Title', 'Context', "id", 'Question', 'Answer'])
        for json_file in self.json_list:
            try:
                with open(os.path.join(self.json_path, json_file), 'r') as f:
                    json_data = json.load(f)
                    for data in json_data['data']:
                        title = data['title']
                        context = data['paragraphs'][0]['context']
                        for qa in data['paragraphs'][0]['qas']:
                            id_ = qa.get('id', None)
                            question = qa['question']
                            answer = qa['answers'][0]['text']
                            qa_df.loc[len(qa_df)] = [title, context, id_, question, answer]

            except (FileNotFoundError, json.JSONDecodeError) as e:
                print(f"Error reading {json_file}: {e}")

        return qa_df

    # Generate questions and answers using the mT5 model and save the results to a new dataframe
    def generate_qa(self):
        try:
            generated_qa_df = pd.read_csv(self.main_path + "generated_qa_by_mT5.cvs")
            print('generated_qa_df not found')

        except FileNotFoundError:
            print("generated_qa_df not found")
            df_questions = [q for q in self.qa_df['Question']]
            df_answers = [a for a in self.qa_df['Answer']]

            len_context = len(set(self.qa_df['Context']))
            generated_qa_df = pd.DataFrame(columns=['Title', 'Context', "id", 'Question', 'Answer'])

            counter = 1
            # Generate questions and answers for each context
            for context, title in zip(set(self.qa_df['Context']), set(self.qa_df['Title'])):
                print(f"Generated questions and answers for context: {counter}/{len_context - 2}")
                generated_qa = self.multimodel(context)
                counter += 1

                for qa in generated_qa:
                    if qa['question'] in df_questions or qa['answer'] in df_answers:
                        generated_qa_df.loc[len(generated_qa_df)] = [title, context, 2, qa['question'], qa['answer']]

            generated_qa_df.to_csv(self.main_path + 'generated_qa_by_mT5.csv', index=False, encoding='utf-8')

        return generated_qa_df

    # Find the bleu score for the original and generated text
    @staticmethod
    def get_bleu_score(ref_question, gen_question):
        bleu_score = sentence_bleu([ref_question], gen_question, smoothing_function=JsonParser.smoothing)
        return bleu_score

    # Find the same questions and their answers in the original and generated dataframes
    def get_same_qa_blue_score(self, generated_qa_df, merge_on):

        print(f'same_{merge_on.lower()}_bleu_scores.csv not found in {self.main_path}')
        merged_df = pd.merge(self.qa_df, generated_qa_df, on=merge_on, suffixes=('_original', '_generated'))
        merged_df = merged_df.drop(columns=['Title_generated', 'Context_generated', 'id_generated'])

        if merge_on == 'Answer':
          merged_df['bleu_score'] = merged_df.apply(lambda row: self.get_bleu_score(row['Question' + '_original'], row['Question' + '_generated']), axis=1)
          merged_df['Answer'], merged_df['Question_generated'] = merged_df['Question_generated'], merged_df['Answer']
          merged_df.columns = ['Title', 'Context', 'id', 'Question', 'Answer', 'Generated_Question', 'Bleu_Score']

        elif merge_on == 'Question':
          merged_df['bleu_score'] = merged_df.apply(lambda row: self.get_bleu_score(row['Answer' + '_original'], row['Answer' + '_generated']), axis=1)
          # merged_df['Answer'], merged_df['Question_generated'] = merged_df['Question_generated'], merged_df['Answer']
          # merged_df.columns = ['Title', 'Context', 'id', 'Question', 'Answer', 'Generated_Question', 'Bleu_Score']
        else:
          print(f"merged_df has no column {merge_on}")

        merged_df.to_csv(self.main_path + f"bleu_scores_merge_on_{merge_on.lower()}.csv", index=False, encoding='utf-8')

        return merged_df

In [22]:
json_parser = JsonParser(main_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Total number of questions and answers:  3508


In [23]:
generated_qa_df = json_parser.generate_qa()

generated_qa_df not found
Generated questions and answers for context: 1/109
Generated questions and answers for context: 2/109
Generated questions and answers for context: 3/109
Generated questions and answers for context: 4/109
Generated questions and answers for context: 5/109
Generated questions and answers for context: 6/109
Generated questions and answers for context: 7/109
Generated questions and answers for context: 8/109
Generated questions and answers for context: 9/109
Generated questions and answers for context: 10/109
Generated questions and answers for context: 11/109
Generated questions and answers for context: 12/109
Generated questions and answers for context: 13/109
Generated questions and answers for context: 14/109
Generated questions and answers for context: 15/109
Generated questions and answers for context: 16/109
Generated questions and answers for context: 17/109
Generated questions and answers for context: 18/109
Generated questions and answers for context: 19

In [24]:
same_answers_bleu_scores = json_parser.get_same_qa_blue_score(generated_qa_df, merge_on='Answer')

same_answer_bleu_scores.csv not found in /content/drive/MyDrive/novelmetricwork/


In [30]:
same_answers_bleu_scores.tail()

Unnamed: 0,Title,Context,id,Question,Answer,Generated_Question,Bleu_Score
5219,III. Murad,"III. Murad, divan edebiyatındaki mahlasıyla Mu...",2896,III. Murad II. Selim'in kimden olan en büyük o...,III. Murad II. Selim'in kimden olan en büyük o...,Nurbanu Sultan'dan,1.0
5220,III. Murad,"III. Murad, divan edebiyatındaki mahlasıyla Mu...",2901,III. Murad babası padişah olduktan sonra nerey...,III. Murad babası padişah olduktan sonra hangi...,Manisa Sancakbeyliğine,0.736133
5221,III. Murad,"III. Murad, divan edebiyatındaki mahlasıyla Mu...",2902,III. Murad ne zaman tahta geçti?,III. Murad hangi tarihte İstanbul'a gelerek Os...,22 Aralık 1574'te,0.274175
5222,II. Bayezid (Sultân Bayezid-î Velî),"II. Bayezid, ayrıca bilinen adıyla Sultân Baye...",1838,II. Bayezid'in bilinen adı nedir?,II. Bayezid'in bilinen adı nedir?,Sultân Bayezid-î Velî,1.0
5223,II. Bayezid (Sultân Bayezid-î Velî),"II. Bayezid, ayrıca bilinen adıyla Sultân Baye...",1839,II. Bayezid'in doğum tarihi nedir?,II. Bayezid'in doğum tarihi nedir?,3 Aralık 1447,1.0


In [26]:
same_question_bleu_scores = json_parser.get_same_qa_blue_score(generated_qa_df, merge_on='Question')

same_question_bleu_scores.csv not found in /content/drive/MyDrive/novelmetricwork/


In [29]:
same_question_bleu_scores.tail()

Unnamed: 0,Title_original,Context_original,id_original,Question,Answer_original,Answer_generated,bleu_score
594,III. Murad,"III. Murad, divan edebiyatındaki mahlasıyla Mu...",2898,III. Murad Alaşehir Sancakbeyliğine kim tarafı...,dedesi Kanuni Sultan Süleyman,Kanuni Sultan Süleyman,0.727471
595,II. Bayezid (Sultân Bayezid-î Velî),"II. Bayezid, ayrıca bilinen adıyla Sultân Baye...",1838,II. Bayezid'in bilinen adı nedir?,Sultân Bayezid-î Velî,Sultân Bayezid-î Velî,1.0
596,II. Bayezid (Sultân Bayezid-î Velî),"II. Bayezid, ayrıca bilinen adıyla Sultân Baye...",1839,II. Bayezid'in doğum tarihi nedir?,3 Aralık 1447,3 Aralık 1447,1.0
597,II. Bayezid (Sultân Bayezid-î Velî),"II. Bayezid, ayrıca bilinen adıyla Sultân Baye...",1842,II. Bayezid'in babası kimdir?,Fatih Sultan Mehmed,Yavuz Sultan Selim,0.407136
598,II. Bayezid (Sultân Bayezid-î Velî),"II. Bayezid, ayrıca bilinen adıyla Sultân Baye...",1843,II. Bayezid'in annesi kimdir?,Sitti Mükrime Hatun ya da Emîne Gül - Bahar Vâ...,Sitti Mükrime Hatun ya da Emîne Gül - Bahar Vâ...,0.931063
