## **Dataset** (JSON file)


**import**





In [None]:
import json

**JSON file**

In [None]:
def import_qas_data(datapath):
    """Load a question-answering JSON file into a list of DataEntity objects.

    The file must contain a top-level ``data`` list whose items each provide
    ``title`` and ``paragraphs`` keys.

    Args:
        datapath: Path of the JSON file to read.

    Returns:
        A list holding one DataEntity per item of ``data``, in file order.
    """
    # Decode explicitly as UTF-8: the dataset contains Arabic text and the
    # platform's default text encoding is not guaranteed to be UTF-8.
    with open(datapath, encoding='utf-8') as data_file:
        data = json.load(data_file)

    return [DataEntity(item['title'], item['paragraphs']) for item in data['data']]

In [None]:
class DataEntity:
    """A titled dataset entry together with its wrapped paragraphs."""

    def __init__(self, title, paragraph_data):
        """Store the title and wrap every raw paragraph record.

        Args:
            title: Title of the entry.
            paragraph_data: Iterable of mappings, each with ``context`` and
                ``qas`` keys.
        """
        self._title_ = title
        self._paragraphs_ = [
            Paragraph(raw['context'], raw['qas']) for raw in paragraph_data
        ]

In [None]:
class Paragraph:
    """A context passage and the question/answer records attached to it."""

    def __init__(self, context, qas):
        """Store the context and build one QAEntity per raw question record.

        Args:
            context: The passage text.
            qas: Iterable of mappings with ``question``, ``id`` and
                ``answers`` keys; every answer mapping provides
                ``answer_start`` and ``text``.
        """
        self._context_ = context
        self._qas_ = []
        for record in qas:
            entity = QAEntity(record['question'], record['id'])
            # Attach all answers of this question before storing the entity.
            entity._answers_.extend(
                Answer(raw['answer_start'], raw['text']) for raw in record['answers']
            )
            self._qas_.append(entity)

In [None]:
class QAEntity:
    """A question, its identifier and the list of answers collected for it."""

    def __init__(self, question, id):
        """Store the question and id; the answer list starts out empty.

        Args:
            question: The question text.
            id: Identifier of the question.
        """
        # A fresh list per instance; answers are appended to it after construction.
        self._question_, self._id_, self._answers_ = question, id, []

In [None]:
class Answer:
    """An answer text and the start position recorded for it."""

    def __init__(self, answer_start, text):
        """Store the answer's start position and its text.

        Args:
            answer_start: Start position of the answer; it is used elsewhere
                in this file as a character offset into the context string.
            text: The answer text.
        """
        self._answer_start_, self._text_ = answer_start, text

## **Tokenize**

**import**

In [None]:
import nltk
nltk.download('punkt')
import numpy as np
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Tokenize**

In [None]:
def tokenize(sent):
    '''Return the tokens that nltk.word_tokenize produces for a sentence.

    Double-backtick and double-apostrophe quote tokens are mapped back to a
    plain double-quote character.

    Note: punctuation is only split off where the NLTK tokenizer splits it.
    In the recorded run on the Arabic sample sentence, 'سعودي.من' and
    'خاشقجي؟' each stay a single token.
    '''
    tokens = []
    for raw_token in nltk.word_tokenize(sent):
        tokens.append(raw_token.replace("``", '"').replace("''", '"'))
    return tokens

In [None]:
# Sanity check: tokenize a sample Arabic sentence and display the resulting tokens.
tokenize('جمال أحمد حمزة خاشقجي ، صحفي و إعلامي سعودي.من هو جمال أحمد حمزة خاشقجي؟')

['جمال',
 'أحمد',
 'حمزة',
 'خاشقجي',
 '،',
 'صحفي',
 'و',
 'إعلامي',
 'سعودي.من',
 'هو',
 'جمال',
 'أحمد',
 'حمزة',
 'خاشقجي؟']

In [None]:
def tokenizeVal(sent):
    '''Tokenize a sentence and locate each token in the original string.

    Args:
        sent: The sentence (or paragraph) to tokenize.

    Returns:
        A ``(tokens, offsets)`` pair. ``tokens`` is built exactly as in
        ``tokenize``. ``offsets[i]`` is the character index in ``sent`` where
        ``tokens[i]`` starts, searching left to right after the previous
        match, or ``None`` when the token text cannot be found there (for
        example because the tokenizer rewrote it).
    '''
    tokenizedSent = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sent)]
    tokenIdx2CharIdx = [None] * len(tokenizedSent)
    search_from = 0
    for token_idx, word in enumerate(tokenizedSent):
        found_at = sent.find(word, search_from)
        if found_at == -1:
            # The token does not occur verbatim in the remaining text. Leave
            # its offset as None and keep the cursor where it is, so that the
            # following tokens can still be aligned instead of all becoming None.
            continue
        tokenIdx2CharIdx[token_idx] = found_at
        search_from = found_at + len(word)
    return tokenizedSent, tokenIdx2CharIdx

In [None]:
# Sanity check: display the tokens of the sample sentence together with their character offsets.
tokenizeVal('جمال أحمد حمزة خاشقجي ، صحفي و إعلامي سعودي.من هو جمال أحمد حمزة خاشقجي؟')

(['جمال',
  'أحمد',
  'حمزة',
  'خاشقجي',
  '،',
  'صحفي',
  'و',
  'إعلامي',
  'سعودي.من',
  'هو',
  'جمال',
  'أحمد',
  'حمزة',
  'خاشقجي؟'],
 [0, 5, 10, 15, 22, 24, 29, 31, 38, 47, 50, 55, 60, 65])

# **Datasets**

In [None]:
def splitDatasets(f):
    '''Flatten parsed QA entities into parallel per-answer training lists.

    Args:
        f: Iterable of parsed entities exposing ``_paragraphs_``. Each
            paragraph exposes ``_context_`` and ``_qas_``; each QA exposes
            ``_question_``, ``_id_`` and ``_answers_``; each answer exposes
            ``_text_`` and ``_answer_start_`` (a character offset into the
            context).

    Returns:
        A tuple ``(xContext, xQuestion, xQuestion_id, xAnswerBegin,
        xAnswerEnd, xAnswerText, maxLenContext, maxLenQuestion)``. The six
        lists are parallel, with one entry per answer. The two maxima are
        token counts over every context and question visited, including
        those that contribute no answer. Answers whose text yields no tokens
        are skipped.
    '''
    xContext = []  # tokenized context paragraph, one entry per answer
    xQuestion = []  # tokenized question, one entry per answer
    xQuestion_id = []  # question id as a string
    xAnswerBegin = []  # token index of the first word of each answer span
    xAnswerEnd = []  # token index of the last word of each answer span
    xAnswerText = []  # raw answer text
    maxLenContext = 0
    maxLenQuestion = 0

    for data in f:
        for paragraph in data._paragraphs_:
            # Each two-character quote marker becomes a quote plus a space, so
            # the string length (and every character offset) is preserved.
            context1 = paragraph._context_.replace("''", '" ').replace("``", '" ')
            # Tokenize the same normalised text that the answer offsets are
            # resolved against below, so token indices and tokens agree.
            contextTokenized = tokenize(context1.lower())
            maxLenContext = max(maxLenContext, len(contextTokenized))

            for qa in paragraph._qas_:
                question = qa._question_.replace("''", '" ').replace("``", '" ')
                questionTokenized = tokenize(question.lower())
                maxLenQuestion = max(maxLenQuestion, len(questionTokenized))
                question_id = str(qa._id_)

                for answer in qa._answers_:
                    answerText = answer._text_
                    answerTokenized = tokenize(answerText.lower())
                    if not answerTokenized:
                        # An answer without tokens has no span to label.
                        continue

                    # Tokenize the context prefix that ends with the answer's
                    # first word; the last token of that prefix is the span start.
                    contextToAnswerFirstWord = context1[:answer._answer_start_ + len(answerTokenized[0])]
                    answerBeginIndex = len(tokenize(contextToAnswerFirstWord.lower())) - 1
                    answerEndIndex = answerBeginIndex + len(answerTokenized) - 1

                    xContext.append(contextTokenized)
                    xQuestion.append(questionTokenized)
                    xQuestion_id.append(question_id)
                    xAnswerBegin.append(answerBeginIndex)
                    xAnswerEnd.append(answerEndIndex)
                    xAnswerText.append(answerText)

    return xContext, xQuestion, xQuestion_id, xAnswerBegin, xAnswerEnd, xAnswerText, maxLenContext, maxLenQuestion

In [None]:
# Validation counterpart of splitDatasets: answers are not tracked here.
def splitValDatasets(f):
    '''Flatten parsed QA entities into parallel per-question validation lists.

    Args:
        f: Iterable of parsed entities exposing ``_paragraphs_``. Each
            paragraph exposes ``_context_`` and ``_qas_``; each QA exposes
            ``_question_`` and ``_id_``.

    Returns:
        A tuple ``(xContext, xToken2CharIdx, xContextOriginal, xQuestion,
        xQuestion_id, maxLenContext, maxLenQuestion)``. The five lists are
        parallel, with one entry per question: tokenized context, the
        character offset of each context token, the untouched context
        string, the tokenized question and the question id as a string. The
        two maxima are token counts over every context and question visited.
    '''
    xContext = []  # tokenized context paragraph, one entry per question
    xQuestion = []  # tokenized question
    xQuestion_id = []  # question id as a string
    xToken2CharIdx = []  # character offset of each context token
    xContextOriginal = []  # context string exactly as stored in the dataset
    maxLenContext = 0
    maxLenQuestion = 0

    for data in f:
        for paragraph in data._paragraphs_:
            context = paragraph._context_
            # Each two-character quote marker becomes a quote plus a space, so
            # the string length is unchanged by the replacement.
            context1 = context.replace("''", '" ').replace("``", '" ')
            contextTokenized, tokenIdx2CharIdx = tokenizeVal(context1.lower())
            maxLenContext = max(maxLenContext, len(contextTokenized))

            for qa in paragraph._qas_:
                question = qa._question_.replace("''", '" ').replace("``", '" ')
                questionTokenized = tokenize(question.lower())
                maxLenQuestion = max(maxLenQuestion, len(questionTokenized))

                xToken2CharIdx.append(tokenIdx2CharIdx)
                xContextOriginal.append(context)
                xContext.append(contextTokenized)
                xQuestion.append(questionTokenized)
                xQuestion_id.append(str(qa._id_))

    return xContext, xToken2CharIdx, xContextOriginal, xQuestion, xQuestion_id, maxLenContext, maxLenQuestion


In [None]:
# Parse the training file and flatten it into per-answer lists.
# NOTE(review): the same load-and-split is repeated in a later cell of this notebook.
trainData =import_qas_data('arcd.json')
tContext, tQuestion, tQuestion_id, tAnswerBegin, tAnswerEnd, tAnswerText, maxLenTContext, maxLenTQuestion = splitDatasets(trainData)

In [None]:
# Display the raw answer texts collected from the training data.
tAnswerText

['صحفي وإعلامي',
 'حمزة خاشقجي (13 أكتوبر 1958، المدينة المنورة - 2 أكتوبر 2018)،',
 'المدينة المنورة',
 'واشنطن بوست',
 'وُصف في الصحف وأجهزة الاعلام العالمية بأنه "وفيّ للدولة السعودية" و"منتقد لسياساتها".',
 'وتقلّد منصب مستشار،',
 'في سبتمبر 2017،',
 'الحكومة السعودية.',
 'ولي العهد السعودي محمد بن سلمان، والملك سلمان بن عبد العزيز.',
 'المملكة العربية السعودية)',
 'الجنوب الغربي',
 'حوالي مليوني كيلومتر مربع.',
 'يحدها من الشمال العراق والأردن',
 'الكويت',
 'قطر والإمارات العربية المتحدة بالإضافة إلى البحرين التي ترتبط بالسعودية من خلال جسر الملك',
 'أول تلك الكيانات إمارة الدرعية',
 'محمد بن سعود',
 'سنة 1157 هـ / 1744',
 'تقع في الركن الشمالي الشرقي من قارة أفريقيا،',
 'حيث تقع شبه جزيرة سيناء داخل',
 'قُدّر عدد سكانها بـ104 مليون نسمة،',
 '1115 كم،',
 'مع السودان',
 'حوالي 1.002.000 كيلومتر مربع',
 'مصر في وادي النيل وفي الحضر',
 'أقل من 4% من المساحة الكلية للبلاد أي حوالي 33000 كم2،',
 'هي القاهرة الكبرى',
 'أَبُو القَاسِم مُحَمَّد بنِ عَبد الله بنِ عَبدِ المُطَّلِب',
 'ليعيد

**load training data, parse, and split**

In [None]:
print('Loading in training data')
# Parse the training JSON file, then flatten it into parallel per-answer lists
# plus the maximum context and question token counts.
trainData =import_qas_data('arcd.json')
tContext, tQuestion, tQuestion_id, tAnswerBegin, tAnswerEnd, tAnswerText, maxLenTContext, maxLenTQuestion = splitDatasets(trainData)

Loading in training data


In [None]:
print('Loading in Validation data...')
# Parse the validation JSON file, then flatten it into parallel per-question
# lists (no answer labels) plus the maximum context and question token counts.
valData = import_qas_data('arcd-test.json')
vContext, vToken2CharIdx, vContextOriginal, vQuestion, vQuestion_id, maxLenVContext, maxLenVQuestion = splitValDatasets(valData)

Loading in Validation data...


# **Building vocabulary**

In [None]:
print('Building vocabulary...')
# Collect every distinct token from the training and validation contexts and
# questions, in sorted order so that the index assignment is reproducible.
vocab = sorted({
    word
    for words in tContext + tQuestion + vContext + vQuestion
    for word in words
})

# Index 0 is kept free for the padding value used with pad_sequences,
# so real tokens are numbered from 1.
vocab_size = len(vocab) + 1
word_index = {token: position for position, token in enumerate(vocab, start=1)}
print(word_index)
# Sequence lengths must accommodate the longest item of either dataset.
context_maxlen = max(maxLenTContext, maxLenVContext)
question_maxlen = max(maxLenTQuestion, maxLenVQuestion)

Building vocabulary...
{'!': 1, '"': 2, '$': 3, '%': 4, "'": 5, "'che": 6, "'s": 7, '(': 8, ')': 9, '*': 10, ',': 11, '-': 12, '-10': 13, '-عاصمة': 14, '-في': 15, '-كلاي-،': 16, '-وفق': 17, '-ومعهم': 18, '.': 19, '..': 20, '...': 21, '.ترك': 22, '.كتب': 23, '.وذات': 24, '.يقول': 25, '/': 26, '/ʃəˈkiːrə/': 27, '/ˌɛkˈsəʊ/': 28, '0°': 29, '1': 30, '1,000': 31, '1-2': 32, '1.002.000': 33, '1.1': 34, '1.2': 35, '1.338': 36, '1.5': 37, '1.5٪': 38, '1.7': 39, '1.8': 40, '10': 41, '10,000': 42, '10.180': 43, '100': 44, '100,000': 45, '1000': 46, '10000000': 47, '103,000': 48, '1037م': 49, '1040م': 50, '1071م': 51, '1093': 52, '11': 53, '11,900,000': 54, '11-14': 55, '11.2': 56, '1115': 57, '1138': 58, '114': 59, '1157': 60, '1157م': 61, '1165': 62, '117': 63, '1171م': 64, '1193': 65, '11،787': 66, '12': 67, '120': 68, '1204': 69, '1206': 70, '1227': 71, '1227م': 72, '1233': 73, '125': 74, '1258م،': 75, '126': 76, '1260': 77, '1262': 78, '127': 79, '128': 80, '1280': 81, '1281م،': 82, '1295م': 

In [None]:
# Display the vocabulary size (distinct tokens plus the reserved index 0).
vocab_size

16353

In [None]:
# Display the longest context length, in tokens, over both datasets.
context_maxlen 

349

In [None]:
# Display the longest question length, in tokens, over both datasets.
question_maxlen

17

# **vectorizeData**

In [None]:
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Convert tokenized training samples into padded index and label arrays.

    Args:
        xContext: List of tokenized contexts.
        xQuestion: List of tokenized questions, parallel to xContext.
        xAnswerBeing: Token index of the answer start for each sample.
        xAnswerEnd: Token index of the answer end for each sample.
        word_index: Mapping from token to integer id; every token must be present.
        context_maxlen: Length the context rows are padded to.
        question_maxlen: Length the question rows are padded to.

    Returns:
        Four arrays from pad_sequences, all padded at the end: context ids,
        question ids, one-hot answer-start rows and one-hot answer-end rows.
    '''
    context_ids = []
    question_ids = []
    begin_rows = []
    end_rows = []
    for i, context_tokens in enumerate(xContext):
        ctx_row = [word_index[token] for token in context_tokens]
        question_row = [word_index[token] for token in xQuestion[i]]

        # One-hot rows are sized to the unpadded context; pad_sequences
        # extends them to context_maxlen afterwards.
        begin_row = np.zeros(len(context_tokens))
        begin_row[xAnswerBeing[i]] = 1
        end_row = np.zeros(len(context_tokens))
        end_row[xAnswerEnd[i]] = 1

        context_ids.append(ctx_row)
        question_ids.append(question_row)
        begin_rows.append(begin_row)
        end_rows.append(end_row)

    padded_contexts = pad_sequences(context_ids, maxlen=context_maxlen, padding='post')
    padded_questions = pad_sequences(question_ids, maxlen=question_maxlen, padding='post')
    padded_begin = pad_sequences(begin_rows, maxlen=context_maxlen, padding='post')
    padded_end = pad_sequences(end_rows, maxlen=context_maxlen, padding='post')
    return padded_contexts, padded_questions, padded_begin, padded_end


In [None]:
# Validation counterpart of vectorizeData: there are no answer labels to encode.
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Convert tokenized validation samples into padded index arrays.

    Args:
        xContext: List of tokenized contexts.
        xQuestion: List of tokenized questions, parallel to xContext.
        word_index: Mapping from token to integer id; every token must be present.
        context_maxlen: Length the context rows are padded to.
        question_maxlen: Length the question rows are padded to.

    Returns:
        Two arrays from pad_sequences, both padded at the end: context ids
        and question ids.
    '''
    context_ids = []
    question_ids = []
    for i, context_tokens in enumerate(xContext):
        ctx_row = [word_index[token] for token in context_tokens]
        question_row = [word_index[token] for token in xQuestion[i]]
        context_ids.append(ctx_row)
        question_ids.append(question_row)

    padded_contexts = pad_sequences(context_ids, maxlen=context_maxlen, padding='post')
    padded_questions = pad_sequences(question_ids, maxlen=question_maxlen, padding='post')
    return padded_contexts, padded_questions


# **vectorize training and validation datasets**

In [None]:
print('Begin vectoring process...')
# tX: training context ids, tXq: training question ids,
# tYBegin: one-hot answer-start rows, tYEnd: one-hot answer-end rows
tX, tXq, tYBegin, tYEnd = vectorizeData(tContext, tQuestion, tAnswerBegin, tAnswerEnd, word_index, context_maxlen, question_maxlen)

Begin vectoring process...


In [None]:
# vX: validation context ids, vXq: validation question ids (no labels are produced)
vX, vXq = vectorizeValData(vContext, vQuestion, word_index, context_maxlen, question_maxlen)
print('Vectoring process completed.')

Vectoring process completed.


In [None]:
# shuffle train data: a single shared permutation keeps the rows of all four
# arrays aligned with one another
randindex = np.random.permutation(tX.shape[0])
tX, tXq, tYBegin, tYEnd = (
    arr[randindex, :] for arr in (tX, tXq, tYBegin, tYEnd)
)

**shape**

In [None]:
# Report the shape of every vectorized array and the two padded sequence lengths.
print('tX.shape = {}'.format(tX.shape))
print('tXq.shape = {}'.format(tXq.shape))
print('tYBegin.shape = {}'.format(tYBegin.shape))
print('tYEnd.shape = {}'.format(tYEnd.shape))
print('vX.shape = {}'.format(vX.shape))
print('vXq.shape = {}'.format(vXq.shape))
print('context_maxlen, question_maxlen = {}, {}'.format(context_maxlen, question_maxlen))

tX.shape = (1395, 349)
tXq.shape = (1395, 17)
tYBegin.shape = (1395, 349)
tYEnd.shape = (1395, 349)
vX.shape = (702, 349)
vXq.shape = (702, 17)
context_maxlen, question_maxlen = 349, 17


# **Model**

**Import**

In [None]:
from keras.layers import recurrent
from keras.layers import recurrent, Input, Bidirectional, LSTM, Lambda
from keras import layers
from keras.layers import concatenate
from keras.layers import Dense,  Dropout, RepeatVector
from keras.models import Model
from keras import optimizers
from keras.optimizers import Adam, RMSprop
from keras.layers import LSTM, Bidirectional, TimeDistributed, GRU, AveragePooling1D, Reshape, GlobalAveragePooling1D
!pip install keras-metrics
import keras
import keras_metrics as km

Collecting keras-metrics
  Downloading https://files.pythonhosted.org/packages/32/c9/a87420da8e73de944e63a8e9cdcfb1f03ca31a7c4cdcdbd45d2cdf13275a/keras_metrics-1.1.0-py2.py3-none-any.whl
Installing collected packages: keras-metrics
Successfully installed keras-metrics-1.1.0


**Arabic QA Model**

In [None]:
# Model hyper-parameters. EMBED_HIDDEN_SIZE is the embedding width.
# NOTE(review): in the cells visible in this notebook the GRU layers use literal
# sizes and fit() uses a literal batch size, so SENT_HIDDEN_SIZE,
# QUERY_HIDDEN_SIZE and BATCH_SIZE appear to be unused beyond this print - confirm.
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
print(' Embed / Sent / Query =  {}, {}, {}'.format( EMBED_HIDDEN_SIZE,SENT_HIDDEN_SIZE,QUERY_HIDDEN_SIZE))

 Embed / Sent / Query =  50, 100, 100


**Input and embedding layers**

In [None]:
# Context branch: integer token ids of length context_maxlen, embedded into
# EMBED_HIDDEN_SIZE-dimensional vectors.
cinput = Input(shape=(context_maxlen,), dtype='int32', name='cinput')
cembed= layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(cinput)

# Question branch: same scheme with its own, separately trained embedding layer.
qinput = Input(shape=(question_maxlen,), dtype='int32', name='qinput')
qembed = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(qinput)

**bidirectional GRU layers**

In [None]:
# Three stacked bidirectional GRU layers per branch (Q*: question, D*: context)
# with 64, 96 and 128 units. The first two layers return the full sequence so
# the next layer can consume it; the last returns a single vector per sample.
Q = Bidirectional(GRU(64, return_sequences=True))(qembed)
D = Bidirectional(GRU(64, return_sequences=True))(cembed)
Q1 = Bidirectional(GRU(96, return_sequences=True))(Q)
D1 = Bidirectional(GRU(96, return_sequences=True))(D)
Q2 = Bidirectional(GRU(128, return_sequences=False))(Q1)
D2 = Bidirectional(GRU(128, return_sequences=False))(D1)

In [None]:
# Join the context and question encodings into one feature vector.
L = concatenate([D2, Q2])
# Answer-start head: softmax over the context_maxlen token positions.
answerPtrBegin_output = Dense(context_maxlen, activation='softmax')(L)
# Answer-end head: sees the joint encoding plus the predicted start distribution.
Lmerge = concatenate([L, answerPtrBegin_output])
answerPtrEnd_output = Dense(context_maxlen, activation='softmax')(Lmerge)

In [None]:
# Two-input (context, question), two-output (answer start, answer end) model.
# Uses the `inputs`/`outputs` keywords; the singular `input`/`output` spellings
# are deprecated legacy names.
model = Model(inputs=[cinput, qinput], outputs=[answerPtrBegin_output, answerPtrEnd_output])

  """Entry point for launching an IPython kernel.


In [None]:
# The same categorical cross-entropy loss and the same metrics (accuracy and
# keras_metrics' F1 score) are applied to both outputs.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy',km.f1_score()])

tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> tp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> fp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> tp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> fn


In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
cinput (InputLayer)             (None, 349)          0                                            
__________________________________________________________________________________________________
qinput (InputLayer)             (None, 17)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 349, 50)      817650      cinput[0][0]                     
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 17, 50)       817650      qinput[0][0]                     
____________________________________________________________________________________________

In [None]:
# Train on the context/question inputs against the start/end one-hot targets,
# holding out 20% of the training samples for validation.
# NOTE(review): batch_size is hard-coded to 128 although BATCH_SIZE = 32 is
# defined earlier in the notebook - confirm which value is intended.
model.fit([tX, tXq], [tYBegin, tYEnd], epochs=80, batch_size=128, shuffle=True, validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1116 samples, validate on 279 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.callbacks.History at 0x7fb157e67be0>

In [None]:
# Show the metric names, to label the values printed after model.evaluate().
print(model.metrics_names)

['loss', 'dense_1_loss', 'dense_2_loss', 'dense_1_accuracy', 'dense_1_f1_score', 'dense_2_accuracy', 'dense_2_f1_score']


In [None]:
# Evaluate on the training data itself. Despite its name, `acc` holds the whole
# list of loss and metric values printed below, not a single accuracy figure.
acc = model.evaluate([tX, tXq], [tYBegin, tYEnd])
print(acc)

[4.5458252009525095, 2.386707067489624, 2.325065851211548, 0.7985662817955017, 0.9330125451087952, 0.800000011920929, 0.9127916693687439]


In [None]:
# Predict on the validation inputs; predictions[0] and predictions[1] are the
# answer-start and answer-end distributions respectively.
predictions = model.predict([vX, vXq], batch_size=128)

In [None]:
print(predictions[0].shape, predictions[1].shape)
# make class prediction: for every sample take the context position with the
# highest predicted score, separately for the start and the end pointer
ansBegin = predictions[0].argmax(axis=1).astype(np.int32)
ansEnd = predictions[1].argmax(axis=1).astype(np.int32)
print(ansBegin.min(), ansBegin.max(), ansEnd.min(), ansEnd.max())

(702, 349) (702, 349)
0 344 0 348


In [None]:
# extract answer tokens and join them
answers = {}
for i in range(len(vQuestion_id)):
    #print i
    if ansBegin[i] >= len(vContext[i]):
        answers[vQuestion_id[i]] = ""
    elif ansEnd[i] >= len(vContext[i]):
        answers[vQuestion_id[i]] = vContextOriginal[i][vToken2CharIdx[i][ansBegin[i]]:]
    else:
        answers[vQuestion_id[i]] = vContextOriginal[i][vToken2CharIdx[i][ansBegin[i]]:vToken2CharIdx[i][ansEnd[i]]+len(vContext[i][ansEnd[i]])]


In [None]:
# write out answers to json file
import io
with io.open('result', 'w', encoding='utf-8') as f:
    f.write((json.dumps(answers, ensure_ascii=False)))

In [None]:
# NOTE(review): the targets passed here are the model's own predictions for the
# same inputs, so these numbers compare the model with itself and do not measure
# validation quality. Ground-truth start/end labels for the validation set
# would be needed for a meaningful evaluation.
pacc = model.evaluate([vX, vXq], [predictions[0], predictions[1]])
print(pacc)

[1.0261926892136577, 0.502328634262085, 0.523270845413208, 1.0, 0.9545453786849976, 1.0, 0.9090908169746399]
