In [1]:
import numpy as np
import re
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
import _pickle as pk
from time import time
import jieba

from queue import Queue

import os
from os.path import join

import threading

def read_query(filepath):
    """Parse a query XML file into one list of field texts per topic.

    Args:
        filepath: path (or file object) accepted by ``ET.parse``.

    Returns:
        A list with one entry per ``topic`` element found directly under the
        document root. Each entry is the list of whitespace-stripped texts of
        that topic's direct children, in document order.
    """
    tree = ET.parse(filepath)
    return [
        # Element.text is None for an empty element (e.g. <narrative/>); map
        # it to '' so every field keeps its position instead of crashing.
        [(child.text or '').strip() for child in topic.findall('*')]
        for topic in tree.findall('topic')
    ]

def dump_files(path, output_file='filelist.txt'):
    """Write every non-directory path found under ``path`` to ``output_file``.

    Directories are traversed depth-first with their entries visited in
    sorted order; one path is written per line. If ``path`` itself is not a
    directory, it is written as the only line.
    """
    with open(output_file, 'w') as out:
        # Explicit stack instead of recursion; children are pushed in reverse
        # so they are popped (and therefore visited) in sorted order.
        pending = [path]
        while pending:
            current = pending.pop()
            if not os.path.isdir(current):
                out.write(current + '\n')
                continue
            entries = sorted(os.listdir(current))
            pending.extend(join(current, name) for name in reversed(entries))
        
class Dataset:
    """Term statistics loaded from a pre-computed inverted file.

    Attributes:
        filelist: lines of the ``file-list`` file as returned by
            ``readlines()`` (trailing newlines kept); None before loading.
        inverted_file: term -> list of ``(doc_id, count)`` postings.
        words: term -> sum of the posting counts of that term.
        num_docs: number of lines in ``file-list``.
        df: term -> posting count declared in the term's header line.
        vocab: whitespace-separated tokens of ``vocab.all``; header lines of
            the inverted file refer to terms by index into this list.
        start_time: reference timestamp used by the progress messages.
    """

    def __init__(self):
        self.filelist = None

        self.inverted_file = defaultdict(list)  # {'a': [(doc_id, cnt), ...], 'b': []}
        self.words = Counter()  # {'a': 100, 'b': 20}
        self.num_docs = 0  # the number of documents
        self.df = Counter()  # A word 'a' appears in how many documents.

        self.vocab = None
        # Set here so threading_func can report elapsed time even when it is
        # called directly; read_from_file() resets it when loading starts.
        self.start_time = time()

    def threading_func(self, thd_idx, lines, i, end, q):
        """Parse the posting lists in ``lines[i:end]`` and put the result on ``q``.

        Each posting list starts with a header line ``"<st> <nd> <cnt>"``
        followed by ``cnt`` lines ``"<doc_id> <count>"``. The term is
        ``vocab[st]`` when ``nd`` is negative, otherwise
        ``vocab[st] + vocab[nd]``.

        Args:
            thd_idx: worker index; only worker 0 prints progress.
            lines: all lines of the inverted file.
            i: index of the first header line handled by this worker.
            end: index one past the last line handled by this worker.
            q: queue receiving one ``(df, words, inverted_file)`` tuple.
        """
        tmp_df = Counter()
        tmp_words = Counter()
        tmp_inverted_file = defaultdict(list)
        while i < end:
            if thd_idx == 0:
                print('Processing .... %08d / %08d, total time: %06.2f sec.' % ((i+1), end, time() - self.start_time), end='\r')
            st, nd, cnt = [int(x) for x in lines[i].split()]
            w = self.vocab[st] if nd < 0 else self.vocab[st] + self.vocab[nd]
            tmp_df[w] += cnt

            for line in lines[i+1:i+cnt+1]:
                doc_id, w_cnt = tuple([int(x) for x in line.split()])
                tmp_words[w] += w_cnt
                tmp_inverted_file[w].append((doc_id, w_cnt))
            # Jump over the header and its cnt posting lines to the next header.
            i += cnt + 1
        q.put((tmp_df, tmp_words, tmp_inverted_file))

    @staticmethod
    def _chunk_boundaries(lines, num_chunks):
        """Return ascending line indices that split ``lines`` into chunks.

        Every boundary is the index of a header line (or ``len(lines)``), so
        each chunk contains only whole posting lists. At most ``num_chunks``
        chunks are produced; fewer when there are not enough posting lists.
        """
        total = len(lines)
        bounds = [0]
        pos = 0
        for k in range(1, num_chunks):
            target = total * k // num_chunks
            # Walk header to header until the evenly spaced target is reached.
            while pos < target:
                pos += int(lines[pos].split()[2]) + 1
            # A truncated final posting list could push pos past the end.
            pos = min(pos, total)
            if bounds[-1] < pos < total:
                bounds.append(pos)
        bounds.append(total)
        return bounds

    def read_from_file(self, path='./data/model', num_threads=4):
        """Load ``file-list``, ``vocab.all`` and ``inverted-file`` from ``path``.

        The inverted file is split on posting-list boundaries into at most
        ``num_threads`` chunks, each parsed by ``threading_func`` in its own
        thread; the partial results are then merged into ``df``, ``words``
        and ``inverted_file``. Merging adds to whatever the instance already
        holds.

        Args:
            path: directory containing the three model files.
            num_threads: maximum number of worker threads (at least 1).

        Raises:
            RuntimeError: if a worker thread ended without delivering a result
                (for example because it raised while parsing).
        """
        with open(join(path, 'file-list'), 'r') as f:
            self.filelist = f.readlines()
        self.num_docs = len(self.filelist)

        with open(join(path, 'vocab.all'), 'r') as f:
            self.vocab = f.read().split()

        self.start_time = time()
        with open(join(path, 'inverted-file'), 'r') as f:
            lines = f.readlines()

        # Boundaries are derived from the file actually read rather than from
        # fixed line numbers, so any well-formed inverted file can be loaded.
        bounds = self._chunk_boundaries(lines, max(1, int(num_threads)))

        q = Queue()
        thd_list = [
            threading.Thread(
                target=self.threading_func,
                args=(idx, lines, bounds[idx], bounds[idx + 1], q),
            )
            for idx in range(len(bounds) - 1)
        ]

        for thd in thd_list:
            thd.start()
        for thd in thd_list:
            thd.join()

        # All producers have finished, so the queue can be drained without
        # blocking. A worker that raised never called q.put(); detect that
        # here instead of waiting forever on a blocking q.get().
        results = []
        while not q.empty():
            results.append(q.get_nowait())
        if len(results) != len(thd_list):
            raise RuntimeError(
                'only %d of %d worker threads produced a result'
                % (len(results), len(thd_list))
            )

        for df, words, inverted_file in results:
            self.df += df
            self.words += words
            for k, v in inverted_file.items():
                self.inverted_file[k] += v
        print('\ntotal time: %06.2f sec.' % (time() - self.start_time))

    def save(self, name):
        """Pickle all instance attributes to the file ``name``."""
        with open(name, 'wb') as pf:
            pk.dump(self.__dict__, pf)

    def load(self, name):
        """Replace all instance attributes with those pickled in ``name``.

        Raises:
            TypeError: if the pickled object is not a dict.
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that were produced by save() from a trusted source.
        with open(name, 'rb') as pf:
            state = pk.load(pf)
        if not isinstance(state, dict):
            raise TypeError('pickled Dataset state must be a dict, got %s' % type(state).__name__)
        self.__dict__ = state

In [2]:
# Build the dataset from the model files in the default './data/model'
# directory, then pickle its attributes to 'data.pkl'.
d = Dataset()
# NOTE: read_from_file() has no return statement, so `res` is always None.
res = d.read_from_file()
# d.read_data('filelist.txt')  # disabled: Dataset has no active read_data method
# d.load('mydata.pkl')  # alternative: restore attributes from an earlier pickle
d.save('data.pkl')

Processing .... 09327668 / 09330749, total time: 090.70 sec.
total time: 097.48 sec.


In [3]:
# Parse the training queries: read_query returns one list of stripped
# child-element texts per <topic> element.
path = 'data/queries/query-train.xml'
questions = read_query(path)