In [2]:
from IPython.core.display import HTML
import jieba

class SearcherIIndex():
    """倒排索引文本搜索实现类
    
    用倒排索引
    利用Python的集合运算，来实现候选结果集之间交、并运算
    
    Attributes:
        index: 检索使用的倒排索引
        max_id: 当前索引的文档最大ID
        doc_list: 索引文档原文8
    """
    def __init__(self, docs_file): 
        """初始化，用文件中的文本行构建倒排索引
        
        Args:
            docs_file:包含带索引文档(文本)的文件名
            
        """
        self.index = dict()    
        self.max_id = 0
        self.doc_list = [] 
        
        with open(docs_file, 'r') as f:
            docs_data = f.read()
        
        for doc in docs_data.split():
            self.add_doc(doc)

    def add_doc(self, doc):
        """向索引中添加新文档
        
        Args:
            doc:待检索的文档(文本)
        
        Returns:
            新增文档ID
        """
        self.doc_list.append(doc)
        for term in list(jieba.cut_for_search(doc)):
            #构建和更新各Term对应的Posting(集合)
            if term in self.index: 
                self.index[term].add(self.max_id)
            else:
                self.index[term] = set([self.max_id])
        self.max_id += 1
        return self.max_id - 1
    
    def word_match(self, word):
        """从倒排索引中获取包含word的候选文档ID集合
        
        Args:
            word:待检索的词(短语)
            
        Returns：
            包含待检索词(短语)的文档ID集合
        """
        result = None
        for term in list(jieba.cut(word)):
            if result is None:
                result = self.index.get(term, set())
            else:
                result = result & self.index.get(term, set())
        if result is None:
            result = set()
        return result

    def conv_query(self, query):
        """将用户的查询转换成用eval可运行、返回结果ID集合的代码段
        
        Args:
            query:待转换的原始查询字符串
        
        Returns:
            转换完成可通过eval执行返回ID集合的代码段字符串
        """
        query_new_parts = []
        all_parts = list(jieba.cut(query))
        idx = 0
        cache = '' #缓存变量，用于回收分词过程被切开的短语片段
        count_parts = len(all_parts)
        while idx < count_parts:
            if all_parts[idx] == '(' or all_parts[idx] == ')':
                query_new_parts.append(all_parts[idx])
            elif all_parts[idx] == ' ':
                query_new_parts.append(' ')
            elif all_parts[idx] in ('and', 'AND', '+'):
                query_new_parts.append('&')
            elif all_parts[idx] in ('or', 'OR'):
                query_new_parts.append('|')
            elif all_parts[idx] in ('not', 'NOT', '-'):
                query_new_parts.append('-')
            elif (idx + 1 < count_parts #被分词切开的短语部分回收至缓存
                  and all_parts[idx+1] not in (' ', ')')): 
                cache += all_parts[idx]
            elif (idx + 2 < count_parts #处理词间空格的形式
                  and all_parts[idx+1] == " " 
                  and all_parts[idx+2] not in ('(', ')', 'and', 'AND', '+', 'or', 'OR', 'NOT', 'not', '+', '-', ' ')): 
                query_new_parts.append("self.word_match('{}') & ".format(all_parts[idx]))
                idx += 2
                continue
            else:
                query_new_parts.append("self.word_match('{}')".format(cache + all_parts[idx]))
                cache = '' #合并完成清空缓存
            idx += 1
        query_new = ''.join(query_new_parts)
        return query_new

    def highlighter(self, doc, word):
        """用word对doc进行HTML高亮
        
        Args:
            doc:需要高亮的文档
            word:要进行高亮的关键词(查询)
            
        Returns:
            返回对关键词(查询)进行高亮的文档
        """
        for part in list(jieba.cut(word)):
            #TODO(CHG):短语高亮需要先分词
            if part not in ('(', ')', 'and', 'AND', 'or', 'OR', 'NOT', 'not', ' '):
                doc = doc.replace(part, '<span style="color:red">{}</span>'.format(part))
        return doc

    def search(self, query):
        """用query进行查询返回结果文档列表
        
        Args:
            query:用户的(复合)布尔查询字符串
            
        Returns:
            复合查询要求的(高亮)文档结果列表
        """
        result = []
        query_new = self.conv_query(query)
        for did in eval(query_new):
            result.append(self.highlighter(self.doc_list[did], query))
        return result

In [3]:
searcher = SearcherIIndex('titles.txt')

query = '3-0'
result = searcher.search(query)
if result:
    for doc in result:
        display(HTML(doc))
else:
    print('No result.')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/8t/p4w9tmzx5413xgf7hpr2sj7r0000gn/T/jieba.cache
Loading model cost 0.866 seconds.
Prefix dict has been built succesfully.


TypeError: bad operand type for unary -: 'set'

In [71]:
import string

class SearcherIIndexVII(SearcherIIndex):
    """倒排索引文本搜索实现类(改进)
    
    自定义解析，保留英文片段，将中文片段多粒度分词处理
    
    Attributes:
        index: 检索使用的倒排索引
        max_id: 当前索引的文档最大ID
        doc_list: 索引文档原文
    """
    def parse_doc(self, doc):
        """对文档进行自定义解析，保留英文串，对中文串多粒度分词
        
        Args:
            doc:待解析的原始文档
        
        Returns:
            解析结果列表，元素是切分得到的term
        """
        result = []
        state_last = ''
        cache = ''
        for c in doc:
            state_c = c in string.ascii_letters \
                or c.isdigit() \
                or c in ('-', ':', '.')
            if c == ' ':
                if state_last:
                    result.append(cache)
                else:
                    result.extend(list(jieba.cut_for_search(cache)))
                result.append(' ')
                cache = ''
                state_last = '' 
            else:
                if state_c == state_last:
                    cache += c
                else:
                    if state_last != '':
                        if state_last:
                            result.append(cache)
                        else:
                            result.extend(list(jieba.cut_for_search(cache)))
                    cache = c
                state_last = state_c
        if cache:
            if state_last:
                result.append(cache)
            else:
                result.extend(list(jieba.cut_for_search(cache)))
        return result
    
    def add_doc(self, doc):
        """向索引中添加新文档
        
        Args:
            doc:待检索的文档(文本)
        
        Returns:
            新增文档ID
        """
        self.doc_list.append(doc)
        doc = doc.lower()
        for term in self.parse_doc(doc):
            #构建和更新各Term对应的Posting(集合)
            if term in self.index: 
                self.index[term].add(self.max_id)
            else:
                self.index[term] = set([self.max_id])
        self.max_id += 1
        return self.max_id - 1
    
    def dumpIndex(self):
        """原样输出索引，用于检查索引构建结果
        
        Returns:
            对索引(字典结构)的Dump输出
        """
        print(self.index)
    
    def parse_query(self, doc):
        """对查询进行自定义解析，保留英文串，对中文串原型插入
        
        Args:
            doc:待解析的原始文档
        
        Returns:
            解析结果列表，元素是带有串类型标记(首字符，e为英文，c为中文)
            的切分term结果
        """
        doc = doc.lower() + ' '
        result = []
        state_last = ''
        cache = ''
        for c in doc:
            state_c = c in string.ascii_letters \
                or c.isdigit() \
                or c in ('-', ':', '.')
            flag = None
            #增加串标记，e为英文，c为中文(未来可能扩充)
            if state_c:
                flag = 'e'
            else:
                flag = 'c'
                
            if c == ' ':
                if state_last != '':
                    result.append(cache)
                    result.append('s ')
                    cache = ''
                    state_last = '' 
            elif c == '(' or c == ')':
                if cache != '':
                    result.append(cache)
                    cache = ''
                state_last = ''
                result.append('s' + c)
            else:
                if state_c == state_last:
                    cache += c
                else:
                    if state_last != '':
                        result.append(cache)
                    cache = flag + c
                state_last = state_c
        return result
    
    def conv_query(self, query):
        """将用户的查询转换成用eval可运行、返回结果ID集合的代码段
        
        Args:
            query:待转换的原始查询字符串
        
        Returns:
            转换完成可通过eval执行返回ID集合的代码段字符串
        """
        query_new_parts = []
        all_parts = list(self.parse_query(query))
        idx = 0
        cache = '' #缓存变量，用于回收分词过程被切开的短语片段
        count_parts = len(all_parts)
        while idx < count_parts:
            if all_parts[idx][1:] == '(' or all_parts[idx][1:] == ')':
                query_new_parts.append(all_parts[idx][1:])
            elif all_parts[idx][1:] == ' ':
                query_new_parts.append(' ')
            elif all_parts[idx][1:] in ('and', 'AND', '+'):
                query_new_parts.append('&')
            elif all_parts[idx][1:] in ('or', 'OR'):
                query_new_parts.append('|')
            elif all_parts[idx][1:] in ('not', 'NOT', '-'):
                query_new_parts.append('-')
            elif (idx + 1 < count_parts #被分词切开的短语部分回收至缓存
                  and all_parts[idx+1][1:] not in (' ', ')')): 
                cache += all_parts[idx][1:]
            elif (idx + 2 < count_parts #处理词间空格的形式
                  and all_parts[idx+1][1:] == " " 
                  and all_parts[idx+2][1:] not in (')', 'and', 'AND', '+', 'or', 'OR', 'NOT', 'not', '+', '-', ' ')): 
                query_new_parts.append("{} & ".format(self.conv_part(all_parts[idx])))
                idx += 2
                continue
            else:
                query_new_parts.append(self.conv_part(cache + all_parts[idx]))
                cache = '' #合并完成清空缓存
            idx += 1
        query_new = ''.join(query_new_parts)
        return query_new
    
    def term_match(self, term):
        """在索引里找到term对应的posting集合
        
        Args:
            term:要检索的词项
            
        Results:
            term对应的posting集合
        """
        return self.index.get(term, set()) 
    
    def conv_part(self, part):
        """将带有类别标记的解析结果段 转化为 eval能进行计算的代码段
        
        Args:
            part:带有类别标记的解析结果段
            
        Results:
            eval能进行计算的代码段字符串(调用 term_match() 进行计算)
        """
        flag = part[0]
        if flag == 'e':
            return "self.term_match('{}')".format(part[1:])
        elif flag == 'c':
            return "(self.term_match('{}'))".format(
                "') & self.term_match('".join(jieba.cut(part[1:])))
    
    def highlighter(self, doc, query):
        """用query对doc进行HTML高亮
        
        Args:
            doc:需要高亮的文档
            word:要进行高亮的关键词(查询)
            
        Returns:
            返回对关键词(查询)进行高亮的文档
        """
        n = 0
        #生成要进行高亮的关键词串集合
        word_set = set()
        query = query.lower()
        query_parts = self.parse_query(query)
        for query_part in query_parts:
            if query_part[0] == 'e':
                word_set.add(query_part[1:])
                if len(query_part[1:]) > n:
                    n = len(query_part[1:])
            elif query_part[0] == 'c':
                if len(query_part[1:]) > 1:
                    for word in jieba.cut(query_part[1:]):
                        word_set.add(word)
                        if len(word) > n:
                            n = len(word)
        
        #遍历文档替换高亮关键词串
        doc_low = doc.lower()
        i = 0
        result = []
        while True:
            end_idx = i + n
            if end_idx > len(doc_low):
                end_idx = len(doc_low)
            for j in range(end_idx, i, -1):
                if doc_low[i:j] in word_set:
                    break
            if doc_low[i:j] in word_set:
                result.append(
                    '<span style="color:red">{}</span>'.format(doc[i:j]))
            else:
                result.append(doc_low[i:j])
            i = j
            if i == len(doc_low):
                break
                
        return ''.join(result)

In [72]:
searcher = SearcherIIndexVII('titles.txt')
searcher.highlighter(
    '华为Mate30采用安卓系统','中国华为 mate30')

'<span style="color:red">华为</span><span style="color:red">Mate30</span>采用安卓系统'

In [73]:
searcher = SearcherIIndexVII('titles.txt')

query = '华为 mate30'
print(searcher.parse_query(query))
print(searcher.conv_query(query))
result = searcher.search(query)
if result:
    for doc in result:
        display(HTML(doc))
else:
    print('No result.')

['c华为', 's ', 'emate30', 's ']
(self.term_match('华为')) & self.term_match('mate30') 


In [63]:
doc = '华为Mate30采用安卓系统'

n = 6
word_set = set(
    ['华为' ,'安卓', '安卓系统'])

#正向最大分词
i = 0
result_f = []
while True:
    end_idx = i + n
    if end_idx > len(doc):
        end_idx = len(doc)
    for j in range(end_idx, i, -1):
        if doc[i:j] in word_set:
            break
    result_f.append(doc[i:j])
    i = j
    if i == len(doc):
        break
print('|'.join(result_f))

#逆向最大分词
i = len(doc)
result_b = []
while True:
    end_idx = i - n
    if end_idx < 0:
        end_idx = 0
    for j in range(end_idx, i):
#         print(j,i,doc[j:i])
        if doc[j:i] in word_set:
            break
    result_b.insert(0, doc[j:i])
    i = j
    if i == 0:
        break
print('|'.join(result_b))

华为|M|a|t|e|3|0|采|用|安卓系统
华为|M|a|t|e|3|0|采|用|安卓系统


In [64]:
#列表倒序的简单方法：切片法
[1, 2, 3, 4, 5][::-1]

[5, 4, 3, 2, 1]