In [2]:
from IPython.core.display import HTML
import jieba

class SearcherIIndex():
    """倒排索引文本搜索实现类
    
    用倒排索引
    利用Python的集合运算，来实现候选结果集之间交、并运算
    
    Attributes:
        index: 检索使用的倒排索引
        max_id: 当前索引的文档最大ID
        doc_list: 索引文档原文8
    """
    def __init__(self, docs_file): 
        """初始化，用文件中的文本行构建倒排索引
        
        Args:
            docs_file:包含带索引文档(文本)的文件名
            
        """
        self.index = dict()    
        self.max_id = 0
        self.doc_list = [] 
        
        with open(docs_file, 'r') as f:
            docs_data = f.read()
        
        for doc in docs_data.split():
            self.add_doc(doc)

    def add_doc(self, doc):
        """向索引中添加新文档
        
        Args:
            doc:待检索的文档(文本)
        
        Returns:
            新增文档ID
        """
        self.doc_list.append(doc)
        for term in list(jieba.cut_for_search(doc)):
            #构建和更新各Term对应的Posting(集合)
            if term in self.index: 
                self.index[term].add(self.max_id)
            else:
                self.index[term] = set([self.max_id])
        self.max_id += 1
        return self.max_id - 1
    
    def word_match(self, word):
        """从倒排索引中获取包含word的候选文档ID集合
        
        Args:
            word:待检索的词(短语)
            
        Returns：
            包含待检索词(短语)的文档ID集合
        """
        result = None
        for term in list(jieba.cut(word)):
            if result is None:
                result = self.index.get(term, set())
            else:
                result = result & self.index.get(term, set())
        if result is None:
            result = set()
        return result

    def conv_query(self, query):
        """将用户的查询转换成用eval可运行、返回结果ID集合的代码段
        
        Args:
            query:待转换的原始查询字符串
        
        Returns:
            转换完成可通过eval执行返回ID集合的代码段字符串
        """
        query_new_parts = []
        all_parts = list(jieba.cut(query))
        idx = 0
        cache = '' #缓存变量，用于回收分词过程被切开的短语片段
        count_parts = len(all_parts)
        while idx < count_parts:
            if all_parts[idx] == '(' or all_parts[idx] == ')':
                query_new_parts.append(all_parts[idx])
            elif all_parts[idx] == ' ':
                query_new_parts.append(' ')
            elif all_parts[idx] in ('and', 'AND', '+'):
                query_new_parts.append('&')
            elif all_parts[idx] in ('or', 'OR'):
                query_new_parts.append('|')
            elif all_parts[idx] in ('not', 'NOT', '-'):
                query_new_parts.append('-')
            elif (idx + 1 < count_parts #被分词切开的短语部分回收至缓存
                  and all_parts[idx+1] not in (' ', ')')): 
                cache += all_parts[idx]
            elif (idx + 2 < count_parts #处理词间空格的形式
                  and all_parts[idx+1] == " " 
                  and all_parts[idx+2] not in ('(', ')', 'and', 'AND', '+', 'or', 'OR', 'NOT', 'not', '+', '-', ' ')): 
                query_new_parts.append("self.word_match('{}') & ".format(all_parts[idx]))
                idx += 2
                continue
            else:
                query_new_parts.append("self.word_match('{}')".format(cache + all_parts[idx]))
                cache = '' #合并完成清空缓存
            idx += 1
        query_new = ''.join(query_new_parts)
        return query_new

    def highlighter(self, doc, word):
        """用word对doc进行HTML高亮
        
        Args:
            doc:需要高亮的文档
            word:要进行高亮的关键词(查询)
            
        Returns:
            返回对关键词(查询)进行高亮的文档
        """
        for part in list(jieba.cut(word)):
            #TODO(CHG):短语高亮需要先分词
            if part not in ('(', ')', 'and', 'AND', 'or', 'OR', 'NOT', 'not', ' '):
                doc = doc.replace(part, '<span style="color:red">{}</span>'.format(part))
        return doc

    def search(self, query):
        """用query进行查询返回结果文档列表
        
        Args:
            query:用户的(复合)布尔查询字符串
            
        Returns:
            复合查询要求的(高亮)文档结果列表
        """
        result = []
        query_new = self.conv_query(query)
        for did in eval(query_new):
            result.append(self.highlighter(self.doc_list[did], query))
        return result

In [3]:
import string

class SearcherIIndexVII(SearcherIIndex):
    """倒排索引文本搜索实现类(改进)
    
    自定义解析，保留英文片段，将中文片段多粒度分词处理
    
    Attributes:
        index: 检索使用的倒排索引
        max_id: 当前索引的文档最大ID
        doc_list: 索引文档原文
    """
    def parse_doc(self, doc):
        """对文档进行自定义解析，保留英文串，对中文串多粒度分词
        
        Args:
            doc:待解析的原始文档
        
        Returns:
            解析结果列表，元素是切分得到的term
        """
        result = []
        state_last = ''
        cache = ''
        for c in doc:
            state_c = c in string.ascii_letters \
                or c.isdigit() \
                or c in ('-', ':', '.')
            if c == ' ':
                if state_last:
                    result.append(cache)
                else:
                    result.extend(list(jieba.cut_for_search(cache)))
                result.append(' ')
                cache = ''
                state_last = '' 
            else:
                if state_c == state_last:
                    cache += c
                else:
                    if state_last != '':
                        if state_last:
                            result.append(cache)
                        else:
                            result.extend(list(jieba.cut_for_search(cache)))
                    cache = c
                state_last = state_c
        if cache:
            if state_last:
                result.append(cache)
            else:
                result.extend(list(jieba.cut_for_search(cache)))
        return result
    
    def add_doc(self, doc):
        """向索引中添加新文档(正常索引及ngram索引)
        
        Args:
            doc:待检索的文档(文本)
        
        Returns:
            新增文档ID
        """
        self.doc_list.append(doc)
        doc = doc.lower()
        for term in self.parse_doc(doc):
            #构建和更新各Term对应的Posting(集合)
            if term in self.index: 
                self.index[term].add(self.max_id)
            else:
                self.index[term] = set([self.max_id])
        
        #构建ngram索引(以二元为例)
        doclen = len(doc)
        for i in range(doclen-1):
            term = doc[i:i+2]
            if term in self.index_b: 
                self.index_b[term].add(self.max_id)
            else:
                self.index_b[term] = set([self.max_id])
                
        self.max_id += 1
        return self.max_id - 1
    
    def dumpIndex(self):
        """原样输出索引，用于检查索引构建结果
        
        Returns:
            对索引(字典结构)的Dump输出
        """
        print(self.index)
    
    def conv_query(self, query):
        """将用户的查询转换成用eval可运行、返回结果ID集合的代码段
        
        Args:
            query:待转换的原始查询字符串
        
        Returns:
            转换完成可通过eval执行返回ID集合的代码段字符串
        """
        query_new_parts = []
        all_parts = list(self.parse_query(query))
        idx = 0
        cache = '' #缓存变量，用于回收分词过程被切开的短语片段
        count_parts = len(all_parts)
        while idx < count_parts:
            if all_parts[idx][1:] == '(' or all_parts[idx][1:] == ')':
                query_new_parts.append(all_parts[idx][1:])
            elif all_parts[idx][1:] == ' ':
                query_new_parts.append(' ')
            elif all_parts[idx][1:] in ('and', 'AND', '+'):
                query_new_parts.append('&')
            elif all_parts[idx][1:] in ('or', 'OR'):
                query_new_parts.append('|')
            elif all_parts[idx][1:] in ('not', 'NOT', '-'):
                query_new_parts.append('-')
            elif (idx + 1 < count_parts #对连续的内容分段结果集合中间加”&“运算符
                  and all_parts[idx+1][1:] not in (' ', ')')): 
                query_new_parts.append("{} & ".format(self.conv_part(all_parts[idx])))
            elif (idx + 2 < count_parts #处理词间、词与符号间空格的情况
                  and all_parts[idx+1][1:] == " " 
                  and all_parts[idx+2][1:] not in (')', 'and', 'AND', '+', 'or', 'OR', 'NOT', 'not', '+', '-', ' ')): 
                query_new_parts.append("{} & ".format(self.conv_part(all_parts[idx])))
                idx += 2
                continue
            else:
                query_new_parts.append(self.conv_part(cache + all_parts[idx]))
                cache = '' #合并完成清空缓存
            idx += 1
        query_new = ''.join(query_new_parts)
        return query_new
    
    def term_match(self, term):
        """在索引里找到term对应的posting集合
        
        Args:
            term:要检索的词项
            
        Results:
            term对应的posting集合
        """
        return self.index.get(term, set()) 
    
    def conv_part(self, part):
        """将带有类别标记的解析结果段 转化为 eval能进行计算的代码段
        
        Args:
            part:带有类别标记的解析结果段
            
        Results:
            eval能进行计算的代码段字符串(调用 term_match() 进行计算)
        """
        flag = part[0]
        if flag == 'e':
            return "self.term_match('{}')".format(part[1:])
        elif flag == 'c':
            return "(self.term_match('{}'))".format(
                "') & self.term_match('".join(jieba.cut(part[1:])))
        elif flag == 'f':
            return "self.frag_match('{}')".format(part[1:])
    
    def highlighter(self, doc, query):
        """用query对doc进行HTML高亮
        
        Args:
            doc:需要高亮的文档
            word:要进行高亮的关键词(查询)
            
        Returns:
            返回对关键词(查询)进行高亮的文档
        """
        n = 0
        #生成要进行高亮的关键词串集合
        word_set = set()
        query = query.lower()
        query_parts = self.parse_query(query)
        for query_part in query_parts:
            if query_part[0] == 'e' or query_part[0] == 'f':
                word_set.add(query_part[1:])
                if len(query_part[1:]) > n:
                    n = len(query_part[1:])
            elif query_part[0] == 'c':
                if len(query_part[1:]) > 1:
                    for word in jieba.cut(query_part[1:]):
                        word_set.add(word)
                        if len(word) > n:
                            n = len(word)
        
        #遍历文档替换高亮关键词串
        doc_low = doc.lower()
        i = 0
        result = []
        while True:
            end_idx = i + n
            if end_idx > len(doc_low):
                end_idx = len(doc_low)
            for j in range(end_idx, i, -1):
                if doc_low[i:j] in word_set:
                    break
            if doc_low[i:j] in word_set:
                result.append(
                    '<span style="color:red">{}</span>'.format(doc[i:j]))
            else:
                result.append(doc_low[i:j])
            i = j
            if i == len(doc_low):
                break
                
        return ''.join(result)
    
    def __init__(self, docs_file): 
        """初始化，用文件中的文本行构建倒排索引
        
        Args:
            docs_file:包含带索引文档(文本)的文件名
            
        """
        self.index = dict() #标准倒排索引
        self.index_b = dict() #ngram索引
        self.max_id = 0
        self.doc_list = [] 
        
        with open(docs_file, 'r') as f:
            docs_data = f.read()
        
        for doc in docs_data.split():
            self.add_doc(doc)
            
    def frag_match(self, frag):
        """对片段frag用ngram索引实现原样搜索
        
        Args:
            frag:要原样搜索的字符串
            
        Results:
            片段原样搜索的结果(文档ID)集合
        """
        frag = frag.lower() #大小写归一化
        result = None
        doclen = len(frag)
        for i in range(doclen - 1):
            term = frag[i:i+2]
            if result is None:
                result = self.index_b.get(term, set())
            else:
                result = result & self.index_b.get(term, set())
        return result
    
    def get_char_type(self, c):
        """返回当前字符的类型(e,c,s,f,b)
        
        Args:
            c:要进行判断的单个字符
            
        Results:
            返回判断结果(前缀)：e为英文，c为中文，s为空格，f为引号，b为括号
        """
        result = 'c'
        if c in string.ascii_letters \
                or c.isdigit() \
                or c in ('-', ':', '.'):
            result = 'e'
        elif c == '"':
            result = 'f'
        elif c == ' ':
            result = 's'
        elif c in ('(', ')'):
            result = 'b'
        return result
    
    def parse_query(self, doc):
        """对查询进行自定义解析，保留英文串，对中文串原型插入
        
        Args:
            doc:待解析的原始文档
        
        Returns:
            解析结果列表，元素是带有串类型标记(首字符，e为英文，c为中文，s为空格，f为引号，b为括号)
            的切分term结果
        """
        doc = doc.lower() + ' ' #解决末位字符状态切换问题的小技巧
        result = []
        doclen = len(doc)
        i = 0
        while True:
            cur_char_type = self.get_char_type(doc[i])
            for j in range(i+1, doclen):
                if cur_char_type == 'f': #当前符号为引号，找下一个引号
                    if self.get_char_type(doc[j]) == 'f':
                        break
                elif self.get_char_type(doc[j]) != cur_char_type: #当前符号非引号，找下一个状态变化
                    break
            if cur_char_type == 's': #对多个空格连续出现的情况进行合并
                result.append('s ')
            elif cur_char_type == 'f': #对引号只提取引号内字符串
                result.append(cur_char_type + doc[i+1:j])
                j += 1
            else:
                result.append(cur_char_type + doc[i:j])
            i = j
            if i >= doclen - 1:
                break
        return result

In [30]:
query = '"七连胜"'
searcher = SearcherIIndexVII('titles.txt')
print(searcher.parse_query(query))
print(searcher.search(query))

['f七连胜']
['一边倒！中国女排3-0横扫美国取<span style="color:red">七连胜</span>', '一边倒！中国女排3-0横扫美国取<span style="color:red">七连胜</span>', '一边倒！中国女排3-0横扫美国取<span style="color:red">七连胜</span>', '一边倒！中国女排3-0横扫美国取<span style="color:red">七连胜</span>', '一边倒！中国女排3-0横扫美国取<span style="color:red">七连胜</span>']


In [31]:
searcher = SearcherIIndexVII('titles.txt')

query = '"mate"'
print(searcher.parse_query(query))
print(searcher.conv_query(query))
result = searcher.search(query)
if result:
    for doc in result:
        display(HTML(doc))
else:
    print('No result.')

['fmate']
self.frag_match('mate')


In [32]:
import nltk
print(nltk.corpus.words.words()[:10])

['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', 'Aaron']


In [42]:
from collections import Counter
import nltk

class Corrector():
    """用二元索引实现拼写校正
    
    Attributes:
        index_b: 检索使用的二元索引
        max_id: 当前索引的单词最大ID
        doc_list: 索引单词原文
    """
    def __init__(self): 
        """初始化，用NLTK的words词典构建倒排索引
        """
        self.index_b = dict() #ngram索引
        self.max_id = 0
        self.doc_list = [] 
        
        for doc in nltk.corpus.words.words():
            self.add_doc(doc)
            
    def add_doc(self, doc):
        """向索引中添加新词(单词的二元索引)
        
        Args:
            doc:待检索的单词
        
        Returns:
            新增单词ID
        """
        self.doc_list.append(doc)
        doc = doc.lower()
        
        #构建二元索引
        doclen = len(doc)
        for i in range(doclen-1):
            term = doc[i:i+2]
            if term in self.index_b: 
                self.index_b[term].append(self.max_id)
            else:
                self.index_b[term] = [self.max_id]
                
        self.max_id += 1
        return self.max_id - 1
    
    def correct(self, word, limit=5):
        """拼写校正函数
        
        Args:
            word:待校正的词
            limit:返回结果的最大条数，默认值为5
            
        Returns:
            最可能的校正单词列表
        """
        word = word.lower() #大小写归一化
        result = []
        docid_list = []
        doclen = len(word)
        for i in range(doclen - 1):
            term = word[i:i+2]
            docid_list += self.index_b.get(term, [])
        docid_counter = Counter(docid_list)
        count = 0
        for elem in docid_counter.most_common(300):
            cor_word = self.doc_list[elem[0]]
            if len(cor_word) >= doclen - 1 and len(cor_word) <= doclen + 1:
                result.append(cor_word)
                count += 1
                if count > limit:
                    break
        return result

In [43]:
cor = Corrector()

In [44]:
print(cor.correct('retrival'))

['retrieval', 'pretribal', 'rearrival', 'rerival', 'retreatal', 'retrial']


In [25]:
from collections import Counter

Counter([1,1,2,3,4])

Counter({1: 2, 2: 1, 3: 1, 4: 1})

In [23]:
%debug

> [0;32m<ipython-input-13-c918f0d68a99>[0m(168)[0;36mconv_query[0;34m()[0m
[0;32m    166 [0;31m                [0mcache[0m [0;34m=[0m [0;34m''[0m [0;31m#合并完成清空缓存[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    167 [0;31m            [0midx[0m [0;34m+=[0m [0;36m1[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 168 [0;31m        [0mquery_new[0m [0;34m=[0m [0;34m''[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mquery_new_parts[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    169 [0;31m        [0;32mreturn[0m [0mquery_new[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    170 [0;31m[0;34m[0m[0m
[0m


ipdb>  quit


In [63]:
doc = '华为Mate30采用安卓系统'

n = 6
word_set = set(
    ['华为' ,'安卓', '安卓系统'])

#正向最大分词
i = 0
result_f = []
while True:
    end_idx = i + n
    if end_idx > len(doc):
        end_idx = len(doc)
    for j in range(end_idx, i, -1):
        if doc[i:j] in word_set:
            break
    result_f.append(doc[i:j])
    i = j
    if i == len(doc):
        break
print('|'.join(result_f))

#逆向最大分词
i = len(doc)
result_b = []
while True:
    end_idx = i - n
    if end_idx < 0:
        end_idx = 0
    for j in range(end_idx, i):
#         print(j,i,doc[j:i])
        if doc[j:i] in word_set:
            break
    result_b.insert(0, doc[j:i])
    i = j
    if i == 0:
        break
print('|'.join(result_b))

华为|M|a|t|e|3|0|采|用|安卓系统
华为|M|a|t|e|3|0|采|用|安卓系统


In [64]:
#列表倒序的简单方法：切片法
[1, 2, 3, 4, 5][::-1]

[5, 4, 3, 2, 1]

#### 采集网易新闻排行新闻标题、URL列表 

In [12]:
from lxml import etree
import csv
import requests

news_list = []
url = 'http://news.163.com/rank/'
r = requests.get(url) #下载
sel = etree.HTML(r.text) #解析
nodes = sel.xpath('//td/a')
for node in nodes:
    news_list.append([node.text, node.attrib['href']])
    
with open('163news.csv','w') as f: #入库
    f_csv = csv.writer(f)
    f_csv.writerows(news_list)

In [13]:
!head 163news.csv

南充小伙老家建洋气别墅 一栋自己住一栋送堂哥,https://home.163.com/19/1209/07/EVUGCN4I001081EI.html
恋情实锤？杨幂魏大勋入住同一酒店没出来,https://ent.163.com/19/1210/08/F017MIA700038FO9.html
奥巴马花8260万买豪宅 11.7万㎡大庄园拥私,https://home.163.com/19/1210/07/F012SVAN001081EI.html
39岁董洁扮嫩演高中生 被吐槽比剧中姑妈还老气,https://ent.163.com/19/1209/07/EVUJ6ND000038FO9.html
山西古村被“圈”收门票 九成都是危房只剩老人居住,https://home.163.com/19/1209/07/EVUGCSBR001081EI.html
"""冰花男孩""父亲申请贫困户遭拒 村主任:其名下有",https://news.163.com/19/1209/09/EVUP0P590001899O.html
五星级酒店竟是“毛坯房”?一晚9万6人们排队住,https://home.163.com/19/1209/07/EVUGCTIM001081EI.html
心虚?李小璐晒旅游照又秒删 疑与PG One国外,https://ent.163.com/19/1209/09/EVUPOMRO00038FO9.html
62岁老人自制创意鲁班家具 不用一颗钉子还可折叠,https://home.163.com/19/1210/07/F012SP99001081EI.html
飞机起飞前乘客收到亲人噩耗 航班紧急滑回航站楼,https://news.163.com/19/1209/11/EVV0NUPF0001875P.html


#### 采集新闻正文

In [33]:
from lxml import etree
import csv
import requests

from tomorrow import threads

#用tomorrow实现多线程采集
@threads(5)
def download(url):
    return requests.get(url)

news_list = []
with open('163news.csv', 'r') as f:
    f_csv = csv.reader(f)
    for row in f_csv:
        news_list.append([row[0], row[1]])
        
result_detail = []

responses = [download(item[1]) for item in news_list[566:]]
for r in responses:
    url = r.url
    sel = etree.HTML(r.text)
    try:
        title = sel.xpath("//title/text()")[0]
        if 'dy.163.com' in url:
            nodes = sel.xpath("//div[@id='content']/p/text()")
        else:
            nodes = sel.xpath("//div[@id='endText']/p/text()")
        content = ''.join(map(lambda x: x.strip(), nodes))
        if(len(content) == 0):
            print(url)
            break
    except Exception:
        print(url)
        break
    result_detail.append([url, title, content])
#     print(url, len(content))
    
with open('163news_d.csv','w') as f:
    f_csv = csv.writer(f)
    f_csv.writerows(result_detail)
    
print('Done.')

https://play.163.com/photoview/ITPG0031/95896.html
Done.


In [29]:
!head 163news_d.csv

https://home.163.com/19/1209/07/EVUGCN4I001081EI.html,南充小伙老家建洋气别墅 一栋自己住一栋送堂哥_网易家居,四川是一个山清水秀的好地方，本着地理优势，成为国内的旅游大省。近些年农村兴起的建房热，作为四川人，可当然不能浪费了家乡优美的自然环境。南充一小伙就投身成为了建房热中的一员。在自己老家的山脚下建起了别墅，还不是一栋，建了两栋，一栋自己住，一栋送给堂哥家。有钱就是任性啊，快跟家居君来看看。这是小伙家之前的老宅，传统的瓦房，已经有着不少的岁月痕迹了。拆除了旧房子，平整场地之后直接开挖，打地基。小伙家建的是两层别墅，一层结构完成之后，工人师傅们很快就开始了二层的基础工作。因为不是北方，有些农业大省，需要平顶房在房顶晾晒粮食。所以采用了美观的斜坡式封顶，也更利于雨天的排水。蓝色的瓦片一层叠一层很是好看。房子终于完工了，可以看到整个的面积很大很气派，整整有191㎡。浅黄色的外墙和蓝顶的双色搭配，洋气！小伙堂哥家的经济状况不是很好，小伙另外建了一栋一层的小别墅送给堂哥家。完工得很快。配色风格和自家的是一样的，外墙贴上黄色瓷砖，房顶是蓝色瓦片。但是面积小了一些，只有104㎡，不过也足够一家人正常生活了。【】这位四川小伙真是仗义，知道堂哥家经济状况不好，另建了一栋小别墅送给堂哥，这钱花的有排面。虽说这两栋别墅价值不菲，但是现在农村建房的人，不缺钱的可真不少。比如下面这湖南农村一家，承包建设的别墅可是100多万，室内还有大KTV房，（来源：田园雅墅，由网易家居综合整理）
https://ent.163.com/19/1210/08/F017MIA700038FO9.html,恋情实锤？杨幂魏大勋入住同一酒店没出来_网易娱乐,12月3日，魏大勋被拍到与杨幂前后脚进同一家酒店，疑似恋情再添实锤。爆料称，魏大勋进入杨幂剧组酒店后就没再出现，一直到第二天早上六点左右，他才全副武装偷偷溜出酒店，走出酒店大门后还恋恋不舍地回头朝酒店里张望，随后便大步流星离开了酒店。据悉，此前就有媒体曝光杨幂与魏大勋住所中间只隔一条马路，七月时还有媒体曝光魏大勋曾前往大连剧组探班杨幂，八月被网友拍到两人穿同款鞋共游798艺术区，前几日又被网友扒出戴了使用痕迹一模一样的帽子。此前，杨幂和魏大勋多次被传恋爱绯闻，两人曾被网友遇到一同亲密逛街，虽然魏大勋发文表

#### 继承新闻索引类

In [17]:
import csv

class NewsIndex(SearcherIIndexVII):
    """新闻内容索引，实现对CSV文件的索引、对长文的高亮摘要、对新闻链接的支持
    """
    def __init__(self, docs_file): 
        """初始化，用CSV文件中的title+content构建倒排索引，将url保存在link_list里
        
        Args:
            docs_file:包含新闻URL、title和content的csv文件的文件名
            
        """
        self.index = dict() #标准倒排索引
        self.index_b = dict() #ngram索引
        self.max_id = 0
        self.doc_list = [] 
        self.link_list = []
        
        with open(docs_file, 'r') as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                self.add_doc(row[1] + row[2])
                self.link_list.append(row[0])

    def highlighter(self, doc, query):
        """用query对doc进行HTML高亮
        
        Args:
            doc:需要高亮的文档
            word:要进行高亮的关键词(查询)
            
        Returns:
            返回对关键词(查询)进行高亮的文档
        """
        n = 0
        #生成要进行高亮的关键词串集合
        word_set = set()
        query = query.lower()
        query_parts = self.parse_query(query)
        for query_part in query_parts:
            if query_part[0] == 'e' or query_part[0] == 'f':
                word_set.add(query_part[1:])
                if len(query_part[1:]) > n:
                    n = len(query_part[1:])
            elif query_part[0] == 'c':
                if len(query_part[1:]) > 1:
                    for word in jieba.cut(query_part[1:]):
                        word_set.add(word)
                        if len(word) > n:
                            n = len(word)
        
        #遍历文档替换高亮关键词串
        doc_low = doc.lower()
        i = 0
        result = []
        side_len = 10 #设置上下文保留的宽度
        last_end = 0 #上一个命中关键词的末位位置
        while True:
            end_idx = i + n
            if end_idx > len(doc_low):
                end_idx = len(doc_low)
            for j in range(end_idx, i, -1):
                if doc_low[i:j] in word_set:
                    break
            if doc_low[i:j] in word_set:
                #追加从上一个关键词末位到当前关键词开头的周围上下文
                if i - last_end > 2 * side_len:
                    result.append(doc[last_end:last_end+side_len])
                    result.append('...')
                    result.append(doc[i-side_len:i])
                else:
                    result.append(doc[last_end:i])
                #追加高亮处理的关键词串
                result.append(
                    '<span style="color:red">{}</span>'.format(doc[i:j]))
                last_end = j
#             else:
#                 result.append(doc_low[i:j])
            i = j
            if i == len(doc_low):
                right_idx = last_end + side_len
                if right_idx > len(doc):
                    result.append(doc[last_end:])
                else:
                    result.append(doc[last_end:right_idx])
                    result.append('...')
                break
                
        return ''.join(result)
    
    def search(self, query):
        """用query进行查询返回结果文档列表
        
        Args:
            query:用户的(复合)布尔查询字符串
            
        Returns:
            复合查询要求的(高亮)文档结果列表
        """
        result = []
        query_new = self.conv_query(query)
        for did in eval(query_new):
            result.append([self.highlighter(self.doc_list[did], query), self.link_list[did]])
        return result

In [18]:
news_searcher = NewsIndex('163news_d.csv')

In [19]:
query = '梅西'
result = news_searcher.search(query)
if result:
    for doc,url in result:
        display(HTML('<a href="{}" target="_blank">{}</a>'.format(url,doc)))
else:
    print('No result.')

#### tomorrow多线程采集示例

In [26]:
urls = [
    'http://google.com',
    'http://facebook.com',
    'http://youtube.com',
    'http://baidu.com',
    'http://yahoo.com',
]

import time
import requests

from tomorrow import threads

@threads(5)
def download(url):
    return requests.get(url)

if __name__ == "__main__":

    start = time.time()
    responses = [download(url) for url in urls]
    html = [response.text for response in responses]
    end = time.time()
    print("Time: %f seconds" % (end - start))

Time: 4.428255 seconds


#### API采集示例 (参考 https://www.jianshu.com/p/c54e25349b77 )

In [65]:
for i in range(0,30,10):
    url = 'https://3g.163.com/touch/reconstruct/article/list/BA8D4A3Rwangning/{}-10.html'.format(i)
    r = requests.get(url)
    results = json.loads(r.text[9:-1])
    for result in results['BA8D4A3Rwangning']:
        print(result['title'], result['url'])

态℃|不，你不想成为李子柒 http://3g.163.com/tech/19/1211/17/F04NR2FT000999D8.html
"腿骨折了想要跑步机”社会扶贫App被指奇葩需求多 http://3g.163.com/tech/19/1211/14/F04D2RA400097U82.html
加拿大非法逮捕孟晚舟文件曝光 手机信息给了FBI http://3g.163.com/tech/19/1211/11/F045TTL400097U7S.html
重返月球要花多少钱？一颗火箭可能要百亿 http://3g.163.com/tech/19/1211/11/F045M34P00097U81.html
不是iPhone，库克谈苹果对人类最大贡献是啥 http://3g.163.com/tech/19/1211/10/F040UFT900097U7S.html
德国电信运营商宣布将采用华为设备建设5G网络 http://3g.163.com/tech/19/1211/19/F050V2TE000999LD.html
iOS 13.3发布：可以过滤iMessage垃圾信息了 http://3g.163.com/tech/19/1211/07/F03M5VFI00097U7T.html
蔚来又在硅谷裁员141人 大砍自动驾驶团队 http://3g.163.com/tech/19/1211/08/F03QLD8V00097U7T.html
VLOG25#：一起看荣耀V30 Pro镜头里的TeamLab有多美 http://3g.163.com/mobile/19/1211/17/F04NG6LM00119821.html
英特尔公布技术路线图：10年后推1.4纳米工艺 http://3g.163.com/tech/19/1211/10/F03VTN4N00097U7T.html
亚马逊向中国卖家开放新加坡站点 全球站点增至13个 http://3g.163.com/tech/19/1211/21/F055KTSN00097U7R.html
苹果iOS更新：升级垃圾短信过滤能力 增家长控制功能 http://3g.163.com/tech/19/1211/20/F051PHDO00097U7S.html
报名|5G·未来沙龙第二弹！这次聊应用、谈模式、讲落地 htt

#### 有些页面用默认设置无法采集

In [63]:
r = requests.get('https://book.douban.com/tag/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C')

In [64]:
r.status_code

418

#### 采集有反爬虫机制的网页
浏览器正常访问 -> Copy as Curl -> 粘贴至 https://curl.trillworks.com/ 转成Requests请求代码

In [58]:
cookies = {
    
}

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Sec-Fetch-User': '?1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Referer': 'https://book.douban.com/',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}

response = requests.get('https://book.douban.com/tag/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C', headers=headers, cookies=cookies)

In [62]:
sel = etree.HTML(response.text) #解析
nodes = sel.xpath("//li[@class='subject-item']")
for node in nodes:
    title = node.xpath('.//h2/a/text()')[0].strip()
    print(title)

神经网络与深度学习
Python深度学习
Python神经网络编程
深度学习入门
深度学习的数学
Neural Networks and Deep Learning
神经网络与机器学习（原书第3版）
神经网络与机器学习
连接组：造就独一无二的你
深入理解神经网络
意识的宇宙
人工智能 （第2版）
神经网络在应用科学和工程中的应用
Neural Networks and Learning Machines
Hands-On Machine Learning with Scikit-Learn and TensorFlow
Neural Network Methods in Natural Language Processing
神经网络设计
Make Your Own Neural Network
图解深度学习与神经网络：从张量到TensorFlow实现
深度学习
