In [None]:
import json
import pandas as pd
import numpy as np
from googletrans import Translator
import re

## Importing SQuAD Dataset

This dataset is imported from the official release of SQuAD.
In the cells below, we convert the JSON files into the desired CSV files.

In [None]:
# Load the SQuAD v1 training file and keep the value stored under its 'data' key.
# The context manager closes the handle as soon as parsing finishes, and the
# explicit encoding keeps the read independent of the platform's default locale.
with open('../data/raw/train-v1.1.json', encoding='utf-8') as f1:  # This is SQuAD version 1 dataset
    data1 = json.load(f1)['data']

# Empty frame carrying the column layout of the flattened v1 table.
df1 = pd.DataFrame(columns=['id', 'question', 'answer', 'title', 'context'])

In [None]:
# Flatten the nested structure (article -> paragraphs -> qas) into one record
# per question. Records are collected in a plain list and turned into a frame
# once, instead of enlarging the DataFrame row by row, which re-allocates on
# every insert and becomes very slow for a dataset of this size.
records = []
for row in data1:
    title = row['title']
    for paragraph in row['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            # Only the first listed answer of each question is kept.
            answer = qa['answers'][0]['text']
            records.append([qa['id'], qa['question'], answer, title, context])

# Rows get the labels 0..n-1, the same labels the counter-based version assigned.
df1 = pd.DataFrame(records, columns=['id', 'question', 'answer', 'title', 'context'])

In [None]:
# Persist the flattened v1 table; the row labels are written as the first CSV column.
df1.to_csv(path_or_buf='../data/processed/squad_v1.csv', index=True)

In [None]:
# Load the SQuAD v2 training file and keep the value stored under its 'data' key.
# The context manager closes the handle as soon as parsing finishes, and the
# explicit encoding keeps the read independent of the platform's default locale.
with open('../data/raw/train-v2.0_1.json', encoding='utf-8') as f2:  # This is SQuAD version 2 dataset
    data2 = json.load(f2)['data']

# Empty frame carrying the column layout of the flattened v2 table
# (the v1 columns plus the 'is_impossible' flag).
df2 = pd.DataFrame(columns=['id', 'question', 'answer', 'title', 'context', 'is_impossible'])

In [None]:
# Flatten the nested structure (article -> paragraphs -> qas) into one record
# per question. Records are collected in a plain list and turned into a frame
# once, instead of enlarging the DataFrame row by row, which re-allocates on
# every insert and becomes very slow for a dataset of this size.
records = []
for row in data2:
    title = row['title']
    for paragraph in row['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            is_impossible = qa['is_impossible']
            # Questions flagged as impossible get no answer text; otherwise
            # only the first listed answer is kept.
            answer = None if is_impossible else qa['answers'][0]['text']
            records.append([qa['id'], qa['question'], answer, title, context, is_impossible])

# Rows get the labels 0..n-1, the same labels the counter-based version assigned.
df2 = pd.DataFrame(
    records,
    columns=['id', 'question', 'answer', 'title', 'context', 'is_impossible'],
)

In [None]:
# Persist the flattened v2 table; the row labels are written as the first CSV column.
df2.to_csv(path_or_buf='../data/processed/squad_v2.csv', index=True)

## Translation

In [None]:
# Smoke test: create the translator used by the rest of the notebook and
# translate one short English phrase to Persian ('fa'). The last expression is
# the cell output.
text = 'University of Notre Dame'
translator = Translator(service_urls=['translate.google.com'])
translator.translate(src='en', dest='fa', text=text).text  # just for test

In [None]:
# Reload the flattened v1 table from disk, using the first CSV column as the row index.
df1 = pd.read_csv(
    filepath_or_buffer='../data/processed/squad_v1.csv',
    index_col=0,
)

In [None]:
# Inspect the reloaded frame (columns, non-null counts, dtypes) before translating.
df1.info()

In [None]:
# Row labels whose translation failed. This list has to exist before the loop
# starts: the except-branch below appends to it, and an undefined name there
# would raise NameError on the first failure and abort the whole run.
skipped_ids = []

# English text -> Persian translation. Every question of a paragraph repeats the
# same title and context, so remembering results avoids re-sending identical
# text to the translation service.
fa_cache = {}


def translate_to_fa(text):
    """Translate `text` from English to Persian, reusing cached results.

    A failed call is not cached (the exception propagates before anything is
    stored), so a later row with the same text triggers a fresh attempt.
    """
    if text not in fa_cache:
        fa_cache[text] = translator.translate(text=text, src='en', dest='fa').text
    return fa_cache[text]


for i, v in df1.iterrows():
    try:
        df1.loc[i, 'question_fa'] = translate_to_fa(v['question'])
        df1.loc[i, 'answer_fa'] = translate_to_fa(v['answer'])
        # Titles use underscores instead of spaces; restore the spaces first.
        df1.loc[i, 'title_fa'] = translate_to_fa(v['title'].replace('_', ' '))
        df1.loc[i, 'context_fa'] = translate_to_fa(v['context'])
    except Exception:
        # Best effort: record the row label, echo it, and continue with the next row.
        skipped_ids.append(i)
        print(i, end=' ')

df1.to_csv('../data/processed/squad_v1_translated.csv')

The difference between the two versions of the SQuAD dataset is an additional field, called *is_impossible*, which is not translatable. So there is no need to translate the second version of the dataset.