In [52]:
# coding: utf-8
"""
filter and tokenize data
Given train.tsv, dev.tsv, test.tsv
Output WikiPassageQA-train-filtered.txt, WikiPassageQA-dev-filtered.txt, WikiPassageQA-test-filtered.txt
"""
from __future__ import print_function

import os
import sys
import json
import nltk
import re

def read_passages(path_to_passages):
    """Load the document/passage JSON file and return its parsed content.

    Args:
        path_to_passages: path to the JSON file (document_passages.json).

    Returns:
        Whatever ``json.load`` produces for the file. The callers in this
        notebook index it as ``document_dict[doc_id]`` and iterate the result
        as ``{passage_id: passage_text}``.
    """
    with open(path_to_passages, 'r') as passages_file:
        return json.load(passages_file)

def clean_special_chars(q_id, question_text):
    """Strip or rewrite punctuation in a question.

    Args:
        q_id: question id as a string; a few ids get special treatment.
        question_text: the question text to clean.

    Returns:
        The cleaned text. No whitespace normalisation is done here, so
        replacements can leave double spaces.
    """
    # Characters that are dropped outright.
    cleaned = re.sub("[?'\"(),.:>]", '', question_text)
    # Characters that are turned into words / separators.
    cleaned = cleaned.replace('-', ' ').replace('&', ' and ')

    if "[" in cleaned:
        if q_id == "3340":
            # remove the bracket together with its contents
            bracket_pattern, bracket_repl = r'\[[^\(]*?\]', r''
        else:
            # keep the contents; for "[a|b]" only the part after the pipe is kept
            bracket_pattern, bracket_repl = r'\[(?:[^\]|]*\|)?([^\]|]*)\]', r'\1'
        cleaned = re.sub(bracket_pattern, bracket_repl, cleaned)

    if "/" in cleaned:
        # For questions 104 and 857 the slash is simply dropped; everywhere
        # else it is read as an alternative.
        slash_repl = '' if q_id in ("104", "857") else ' or '
        cleaned = cleaned.replace('/', slash_repl)

    return cleaned

def preprocess_q_text(q_text, q_id=None):
    """Tokenize a question and clean it for the Indri run.

    Args:
        q_text: raw question text.
        q_id: id of the question, forwarded to ``clean_special_chars`` for its
            per-question exceptions. The original implementation silently
            read a module-level ``q_id``; to keep one-argument callers working
            that value is still used when this argument is omitted. If neither
            is available the per-question exceptions are simply not applied.

    Returns:
        The tokens joined by single spaces, cleaned, with leading/trailing
        whitespace removed.
    """
    if q_id is None:
        # Backward compatibility: fall back to the notebook-level q_id that the
        # driver loop sets before calling this function.
        q_id = globals().get('q_id')

    # NOTE(review): word_tokenize may rewrite double quotes as `` and ''
    # (Treebank convention); backticks are not in the removal list of
    # clean_special_chars -- confirm whether that matters for Indri.
    q_text = ' '.join(nltk.word_tokenize(q_text))
    q_text = clean_special_chars(q_id, q_text) #needed for our indri run
    return q_text.strip()

def remove_tabs(text):
    """Return ``text`` with every tab character replaced by a single space.

    Tabs are the column separator of the output files, so they must not
    appear inside a field.
    """
    tab_free_parts = text.split('\t')
    return ' '.join(tab_free_parts)

def preprocess_passage(passage):
    """Tokenize a passage and make it safe to write as one TSV field.

    The passage is tokenized with ``nltk.word_tokenize``, the tokens are
    joined by single spaces, tabs are replaced by spaces and surrounding
    whitespace is stripped.
    """
    tokens = nltk.word_tokenize(passage)
    return remove_tabs(' '.join(tokens)).strip()

if __name__ == '__main__':
    # Writes one "<question>\t<passage>\t<label>" line per (question, passage)
    # pair for each of the train/dev/test splits.

    basedir = './WikiPassageQA/'
    q_files = [basedir + 'train.tsv', basedir + 'dev.tsv', basedir + 'test.tsv']
    doc_file = basedir + 'document_passages.json'
    outfiles_matchzoo = [basedir + 'WikiPassageQA-train-filtered.txt', basedir + 'WikiPassageQA-dev-filtered.txt', basedir + 'WikiPassageQA-test-filtered.txt']
    outfile_indri = "corpus.txt"

    q_ids_that_contain_no_question = ["4149", "4148", "1315"]
    # NOTE(review): the comment says "remove it from the train set", but the
    # skip list below is applied to every split, so 3566 is dropped from all
    # of them -- confirm that this is intended.
    q_ids_that_are_contained_in_the_train_and_dev_set = ["3566"] # remove it from the train set
    q_ids_that_contain_the_same_question_but_different_answers = ["3731", "3732"]
    q_ids_that_are_on_a_doc_which_has_a_duplicate_passage = ["2230", "2231", "2232", "2233", "2234"] #see description for doc 769
    q_ids_that_should_be_skipped = q_ids_that_contain_no_question + q_ids_that_are_contained_in_the_train_and_dev_set + q_ids_that_contain_the_same_question_but_different_answers + q_ids_that_are_on_a_doc_which_has_a_duplicate_passage

    # The doc-id lists are kept for the record; the loop below only filters on
    # question ids and never consults them.
    doc_ids_that_have_no_question = ["188"] #The current matchzoo style for processing WikiQA which we follow, does not add the document to the corpus if there is no q about it
    doc_ids_that_have_a_duplicate_passage_with_another_doc = ["769"] #We remove 769 since 769-14 == 243-10, 769 has less questions than 243, hence we remove this doc and its questions
    doc_ids_that_should_be_skipped = doc_ids_that_have_no_question + doc_ids_that_have_a_duplicate_passage_with_another_doc

    document_dict = read_passages(doc_file)

    # Tokenizing is the expensive step, so the passages of a document are
    # preprocessed once and reused for every question about that document.
    preprocessed_documents = {}

    # Fix: the output list is called outfiles_matchzoo; the former `outfile[i]`
    # only resolved through a stale variable left behind by another cell.
    for q_file, out_path in zip(q_files, outfiles_matchzoo):
        with open(q_file, 'r') as fin, open(out_path, 'w') as fout:
            next(fin, None) #skip the header
            for line in fin:
                q_id, q_text, doc_id, doc_name, a_ids = line.rstrip('\r\n').split('\t')

                if q_id in q_ids_that_should_be_skipped:
                    continue
                # preprocess_q_text picks up the module-level q_id assigned above.
                q_text = preprocess_q_text(q_text)

                # Fix: a_ids is a comma separated string; testing `p_id in a_ids`
                # on the raw string is a substring match, which labels passage
                # "2" as relevant whenever e.g. "12" is an answer.
                relevant_p_ids = set(a_id.strip() for a_id in a_ids.split(',') if a_id.strip())

                if doc_id not in preprocessed_documents:
                    preprocessed_documents[doc_id] = [
                        (p_id, preprocess_passage(passage))
                        for p_id, passage in document_dict[doc_id].items()
                    ]

                for p_id, passage in preprocessed_documents[doc_id]:
                    label = "1" if p_id in relevant_p_ids else "0"
                    print(q_text + "\t" + passage + "\t" + label, file=fout)


KeyboardInterrupt: 

In [47]:
import json
# Collect, per kept question, the number of answer passages and the number of
# passages in its document; print the passage ids of very short documents.
basedir = './WikiPassageQA/'
in_corpfile = [basedir + 'train.tsv', basedir + 'dev.tsv', basedir + 'test.tsv']

q_ids_that_contain_no_question = ["4149", "4148", "1315"]
q_ids_that_contain_the_same_question_but_different_answers = ["3731", "3732"]
q_ids_that_are_on_a_doc_which_has_a_duplicate_passage = ["2230", "2231", "2232", "2233", "2234"] #see description for doc 769
q_ids_that_should_be_skipped = q_ids_that_contain_no_question + q_ids_that_contain_the_same_question_but_different_answers + q_ids_that_are_on_a_doc_which_has_a_duplicate_passage
my_a_ids_per_question = []
my_p_ids_per_question = []

doc_file = basedir + 'document_passages.json'
with open(doc_file, 'r') as documents_json:
    document_dict = json.load(documents_json)

for corpfile in in_corpfile:
    # `with` closes the split file; the bare open() in the loop header leaked it.
    with open(corpfile, 'r') as questions_tsv:
        next(questions_tsv, None) #skip the header
        for line in questions_tsv:
            q_id, q_text, doc_id, doc_name, a_ids = line.rstrip('\r\n').split('\t')

            if q_id in q_ids_that_should_be_skipped:
                continue

            document = document_dict[doc_id]
            # Only count non-empty ids: ''.split(',') yields [''], so the plain
            # len(a_ids.split(',')) reported 1 answer for an empty column and
            # the "> 0" sanity check in the next cell could never fail.
            answer_ids = [a_id for a_id in a_ids.split(',') if a_id.strip()]
            my_a_ids_per_question.append(len(answer_ids))
            my_p_ids_per_question.append(len(document))
            if len(document) < 3:
                # Function-call form works with and without print_function.
                print(list(document.keys()))

[u'1', u'0']
[u'1', u'0']
[u'1', u'0']
[u'1', u'0']
[u'1', u'0']
[u'1', u'0']
[u'1', u'0']
[u'1', u'0']
[u'1', u'0']
[u'1', u'0']
[u'1', u'0']
[u'1', u'0']


In [51]:
# Number of questions that have at least one answer passage.
# Written as a call so the cell also parses once print_function is active.
print(sum(1 for a_id_amount in my_a_ids_per_question if a_id_amount > 0))

4155


In [44]:
# Number of questions whose document has more than one passage.
# Written as a call so the cell also parses once print_function is active.
print(sum(1 for p_id_amount in my_p_ids_per_question if p_id_amount > 1))

4155


In [4]:
from collections import Counter

# Scratch check of how Counter tallies repeated values.
mylist = [1] + [2] * 2 + [3] * 3 + [10]
Counter(mylist)

Counter({1: 1, 2: 2, 3: 3, 10: 1})

In [5]:
# coding: utf-8
"""
tokenize data
Given train.tsv, dev.tsv, test.tsv
Output train-filtered.txt, dev-filtered.txt, test-filtered.txt
"""
from __future__ import print_function

import os
import sys
import json
import nltk
import re

def remove_special_chars(q_id, question_text):
    """Strip or rewrite punctuation in a question.

    Args:
        q_id: question id as a string; ids "3340", "104" and "857" are
            handled specially.
        question_text: the question text to clean.

    Returns:
        The cleaned text; whitespace is not normalised.
    """
    # (old, new) pairs, applied in this order.
    substitutions = (
        ('?', ''),
        ("'", ''),
        ('"', ''),
        ('-', ' '),
        ('(', ''),
        (')', ''),
        (',', ''),
        ('.', ''),
        ('&', ' and '),
        (':', ''),
        ('>', ''),  # error in dataset
    )
    for old, new in substitutions:
        question_text = question_text.replace(old, new)

    has_bracket = "[" in question_text
    if has_bracket and q_id == "3340":
        # remove the bracket and its contents
        question_text = re.sub(r'\[[^\(]*?\]', r'', question_text)
    elif has_bracket:
        # keep the contents (for "[a|b]" only the part after the pipe)
        question_text = re.sub(r'\[(?:[^\]|]*\|)?([^\]|]*)\]', r'\1', question_text)

    has_slash = "/" in question_text
    if has_slash and (q_id == "104" or q_id == "857"):
        question_text = question_text.replace('/', '')
    elif has_slash:
        question_text = question_text.replace('/', ' or ')

    return question_text

def remove_tabs(text):
    """Replace each tab in ``text`` with one space so it fits in a TSV field."""
    return re.sub(r'\t', ' ', text)

def read_passages(path_to_passages):
    """Read the document_passages JSON file and return the parsed object.

    The driver code below looks documents up by id in the result and iterates
    each document as passage-id -> passage-text pairs.
    """
    with open(path_to_passages, 'r') as handle:
        raw_json = handle.read()
    return json.loads(raw_json)
    
    
if __name__ == '__main__':
    # Writes one "<question>\t<passage>\t<label>" line per (question, passage)
    # pair for each split, without filtering any question.

    basedir = './WikiPassageQA/'
    in_corpfile = [basedir + 'train.tsv', basedir + 'dev.tsv', basedir + 'test.tsv']
    doc_file = basedir + 'document_passages.json'
    outfile = [basedir + 'train-filtered.txt', basedir + 'dev-filtered.txt', basedir + 'test-filtered.txt']

    j = 0 # number of questions processed so far, across all splits

    document_dict = read_passages(doc_file)

    for i in range(len(in_corpfile)):
        # `with` closes both files even if a line fails to parse.
        with open(in_corpfile[i], 'r') as fin, open(outfile[i], 'w') as fout:
            next(fin, None) #skip the header
            for line in fin:
                q_id, q_text, doc_id, doc_name, a_ids = line.rstrip('\r\n').split('\t')

                document = document_dict[doc_id]
                q_text = ' '.join(nltk.word_tokenize(q_text))
                q_text = remove_special_chars(q_id, q_text).strip() #needed for our indri run

                # Fix: a_ids is a comma separated string; `p_id in a_ids` on the
                # raw string is a substring match, which labels passage "2" as
                # relevant whenever e.g. "12" is an answer.
                relevant_p_ids = set(a_id.strip() for a_id in a_ids.split(',') if a_id.strip())

                # .items() instead of the Python-2-only .iteritems()
                for p_id, passage in document.items():
                    passage = ' '.join(nltk.word_tokenize(passage))
                    passage = remove_tabs(passage).strip()
                    label = "1" if p_id in relevant_p_ids else "0"
                    print(q_text + "\t" + passage + "\t" + label, file=fout)

                j += 1
                if j % 20 == 0: # progress indicator every 20 questions
                    print(j)

20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400
3420
3440
3460
3480
3500
3520
3540
3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
3760
3780
3800
3820
3840
3860
3880
3900
3920
3940
3960
3980
4000
4020
4040
4060
4080
4100
4120
4140
4160


In [2]:
# #### TESTING ON WikiQACORPUS
# # coding: utf-8
# """
# tokenize data
# Given train.tsv, dev.tsv, test.tsv
# Output train-filtered.txt, dev-filtered.txt, test-filtered.txt
# """
# from __future__ import print_function

# import os
# import sys
# import json
# import nltk
# import re

# def remove_special_chars(q_id, question_text):
#     question_text = question_text.replace('?','') #remove ?
#     question_text = question_text.replace("'",'')
#     question_text = question_text.replace('"','')
#     question_text = question_text.replace('-',' ')
#     question_text = question_text.replace('(','')
#     question_text = question_text.replace(')','')
#     question_text = question_text.replace(',','')
#     question_text = question_text.replace('.','')
#     question_text = question_text.replace('&',' and ')
#     question_text = question_text.replace(':','')
#     question_text = question_text.replace('>','')#error in dataset

#     if "[" in question_text:
#         if q_id == "3340": #remove contents
#             question_text = re.sub(r'\[[^\(]*?\]', r'', question_text)
#         else: #keep contents
#             question_text = re.sub(r'\[(?:[^\]|]*\|)?([^\]|]*)\]', r'\1', question_text)
#     if "/" in question_text:
#         if q_id == "104" or q_id == "857":
#             question_text = question_text.replace('/','')
#         else:
#             question_text = question_text.replace('/',' or ')
    
#     return question_text

# def remove_tabs(text):
#     return text.replace('\t', ' ')

# # Returns a dict that contains all passages in document_passages file
# def read_passages(path_to_passages):
#     document_dict = {}
    
#     with open(path_to_passages, 'r') as documents_json:
#         document_dict = json.load(documents_json)
        
#     return document_dict
    
    
# if __name__ == '__main__':


#     basedir = r'C:/Users/drennings/Downloads/WikiQACorpus/'
#     in_corpfile = [basedir + 'WikiQA-dev.tsv', basedir + 'WikiQA-test.tsv']
#     outfile = [basedir + 'dev.txt', basedir + 'test.txt']
    
#     j = 0
    
#     #document_dict = read_passages(doc_file)
    
#     for i in range(len(in_corpfile)):
#         fout = open(outfile[i], 'w')
#         firstline = True
#         for line in open(in_corpfile[i], 'r'):
#             if firstline == True: #skip the header
#                 firstline = False
#                 continue
            
#             #print("A")
#             q_id, q_text, doc_id, doc_name, s_id, sentence, label = line.split('\t')
#             #print("B")
            
#             #document = document_dict[doc_id]
#             q_text = ' '.join(nltk.word_tokenize(q_text))
#             #q_text = remove_special_chars(, q_text) #needed for our indri run            
#             #print("C")
            
#             #for p_id, passage in document.iteritems():
#             sentence = ' '.join(nltk.word_tokenize(sentence))
#             sentence = remove_tabs(sentence)
#             #print("D")
            
#             print(q_text.strip() + "\t" + sentence.strip() + "\t" + label, file=fout, end='')
            
#             j += 1
#             if j > 19 and j % 20 == 0:
#                 print(j)
        
#         fout.close()

20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400
3420
3440
3460
3480
3500
3520
3540
3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
3760
3780
3800
3820
3840
3860


UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 8: ordinal not in range(128)