In [25]:
import pickle
import json
import gzip
import os
import subprocess
import numpy as np
import multiprocessing
import re 
import csv

In [26]:
## Options

# debugging mode

debug = 'yes'
build_index_flag = 'yes'
# N of workers for multiprocessing used grid_search
pool_size = 20

data_split = 'dev'
workdir = './workdir/'
qloc = '../qra_data/superuser/'
galago_loc='./galago-3.10-bin/bin/'

# build_index_flag = 'no'

In [27]:
def remove_sc(text):
###    text = re.sub('[.,?;*!%^&_+():-\[\]{}]', '', text.replace('"', '').replace('/', '').replace('\\', '').replace("'", '').strip())
##    text = re.sub('[\[\]{}.,?;*!%^&_+():-]', '', text.replace('"', '').replace('/', '').replace('\\', '').replace("'", '').strip()) # DeepPaper method
    text = re.sub(r'[^\w\s]',' ',text) # My method
###     text = text.rstrip('.?')
    return text

In [28]:
def read_questions(filename):
    with gzip.open(filename, 'rt') as tsv_in:
        qreader = csv.reader(tsv_in, delimiter = '\t')
        questions = {}
#         q_dict = {}
        for q in qreader:
            question = {}
            if 'quora' in filename:
                print('quora')
            elif 'sprint' in filename:
                print('quora')
            else:
#                 question['id'] = q[0]
#                 q_dict[q[0]] = q[1] + ' ' + q[2]
                question['title'] = q[1]
                question['text'] = q[2]
                questions[q[0]]=(dict(question))
#         return [questions, q_dict]
        return questions

In [29]:
def trectext_format(questions):
    trec_questions = {}
    for key, q in questions,items():
        doc = '<DOC>\n' + \
              '<DOCNO>' + key + '</DOCNO>\n' + \
              '<TITLE>' + q['title'] + '</TITLE>\n' + \
              '<TEXT>' + q['text'] + '</TEXT>\n' + \
              '</DOC>\n'
        trec_questions[key] = doc
    return trec_questions

In [30]:
def save_trectext(trec_questions, filename):
# Generate file to index
    with gzip.open(filename,'wt', encoding='utf-8') as f_out:
        for key, value in trec_questions.items():
            f_out.write(value)

In [31]:
def build_index(index_input, index_loc):
# Build corpus index 
    if not os.path.exists(index_loc):
            os.makedirs(index_loc) 
    index_loc_param = '--indexPath=' + index_loc
    galago_parameters = [galago_loc + 'galago', 'build', '--stemmer+krovetz']
    galago_parameters.append('--inputPath+' + index_input)
    galago_parameters.append(index_loc_param)
    print(galago_parameters)

    index_proc = subprocess.Popen(galago_parameters,
            stdout=subprocess.PIPE, shell=False)
    (out, err) = index_proc.communicate()
    print(out.decode("utf-8"))
    print(err)

In [32]:
def read_dups(dups_file):
    with open(dups_file, 'rt') as dups_in:
        dup_reader = csv.reader(dups_in, delimiter = ' ')
        dup_dict = {}
        for dup in dup_reader:
            dup_dict[dup[0]] = dup[1]
    return dup_dict

In [33]:
def generate_queries_file(q_all, q_dup, filename):
    queries_list = []
    queries_dict = {}
    query = {}
    for key, dup in q_dup.items():
        q = q_all[key]
        text = remove_sc(q['title'] + ' ' + q['text']) #Join title and text 
        query['number'] = key
#         query['text'] = '#stopword(' + text + ')'
        query['text'] = '(' + text + ')'
        queries_list.append(dict(query))
    queries_dict['queries'] = queries_list
    with open(filename, 'wt', encoding='utf-8') as q_file:
        json.dump(queries_dict, q_file, indent = 4)

In [34]:
# Return top 1 bm25 scored question = 'duplicated' question
def get_bm25_docs(queries_file, q_all, q_dup, index_loc, b_val=0.75, k_val=1.2):
    index_loc_param = '--index=' + index_loc  
    b=' --b=' + str(b_val)
    k=' --k=' + str(k_val)
    
    command = galago_loc + 'galago threaded-batch-search --threadCount=500 --verbose=false \
         --casefold=true --requested=1 ' + \
         index_loc_param + ' --scorer=bm25' + \
         b + \
         k + \
         '   ' + \
         queries_file + ' | cut -d" " -f1,3'
#     print(command)
#     command = command.encode('utf-8')
    galago_bm25_exec = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True, encoding='utf-8')
    (out, err) = galago_bm25_exec.communicate()
    ids_docs = out.splitlines()
#     print(ids_docs)
    question = {}
    bm25_docs = []
    return ids_docs

    
    for key, q in q_dup.items():
        question = {}
        question['body'] = q_all['title'] + ' ' + q_all['text']
        question['id'] = key  
        documents = [doc.split(' ')[1] for doc in ids_docs if key+' ' in doc]
#         print(documents)
        question['documents'] = documents
        bm25_docs.append(dict(question))
    return bm25_docs

In [35]:
def remove_work_dirs():
    if debug == 'yes':
        print('yes')
        # Execute remove sequence

In [36]:
dataset_name = qloc.split('/')[-2:]

In [37]:
if not os.path.exists(workdir):
    os.makedirs(workdir)

In [38]:
loc_prefix = workdir + dataset_name[0]
index_loc = loc_prefix + '_index'
questions_file = loc_prefix + '_questions' + '.gz'
queries_file = loc_prefix + '_queries'
trectext_file = loc_prefix + '_trectext.gz'
index_input = trectext_file
dups_file = qloc + data_split + '.pos.txt'
corpus_file = qloc + 'corpus.tsv.gz'

In [39]:
questions = read_questions(corpus_file)

In [40]:
q_dups = read_dups(dups_file)
# trec_questions = trectext_format(questions)
# save_trectext(trec_questions, trectext_file)

In [41]:
generate_queries_file(questions, q_dups, queries_file)

In [42]:
# build_index(index_input, index_loc)

In [43]:
bm25_docs = get_bm25_docs(queries_file, questions, q_dups, index_loc, b_val=0.75, k_val=1.2)

In [44]:
bm25_docs[0]

'26369 26369'

In [45]:
questions['690534']

{'title': "Ping IP returned `` destination net unreachable '' , even with different network",
 'text': "I could n't log in to facebook , so I ping the domain and got this : ping www.facebook.com Pinging www.facebook.com [ 69.171.228.14 ] with 32 bytes of data : Request timed out . Reply from 204.15.23.57 : Destination net unreachable . But when I ping this IP , it is reachable . Pinging 204.15.23.57 with 32 bytes of data : Reply from 204.15.23.57 : bytes=32 time=190ms TTL=51 I realize it is something wrong with my computer , as I have tried using different networks ( on the same network other devices can connect to facebook ) , resetting router"}

In [46]:
questions['690447']    

{'title': "Ping IP returned `` destination net unreachable '' , even with different network",
 'text': "I could n't log in to facebook , so I ping the domain and got this : ping www.facebook.com Pinging www.facebook.com [ 69.171.228.14 ] with 32 bytes of data : Request timed out . Reply from 204.15.23.57 : Destination net unreachable . But when I ping this IP , it is reachable . Pinging 204.15.23.57 with 32 bytes of data : Reply from 204.15.23.57 : bytes=32 time=190ms TTL=51 I realize it is something wrong with my computer , as I have tried using different networks ( on the same network other devices can connect to facebook ) , resetting router"}

In [47]:
for doc in bm25_docs:
    doc_parts = doc.split(' ')
    if not doc_parts[0] == doc_parts[1]:
        print(doc)
        print('no')

690534 690447
no
367402 367043
no
643118 643112
no
561311 561056
no
790938 790557
no
411028 410627
no
422799 422170
no


In [48]:
# len(bm25_docs[0]['documents'])