In [8]:
import requests  
from lxml import etree 
import pickle
import os
from IPython.core.display import display, HTML
import timeit
import jieba

class MySearcherC6V0:
    """
    第五次课升级的搜索类版本：
    1、增加初始化参数scale，用于倍增文档集
    2、增加缓存机制，避免重复匹配相同关键词
    3、增加线下缓存预填充机制，用猜测得到的用户查询词预填充
    4、用文档分词得到的词表进行缓存预填充
    """
    def __init__(self, scale=1):
        self.docs = []
        self.load_data()
        self.docs *= scale
        self.cache = {}
        self.vocab = set()
        self.build_cache()
        
    def build_cache(self):
        for doc in self.docs:
            for word in jieba.cut(
                doc[1] +' ' + doc[2]
            ):
                r = self.search(word)
                self.vocab.add(word)
    
    def load_data(self):
        data_filename = 'news_list.dat'
        if os.path.exists(data_filename):
            with open(data_filename,'rb') as f:
                self.docs += pickle.load(f)
#                 self.docs = self.docs + pickle.load(f)
        else:
            url = 'http://news.163.com/special/0001386F/rank_tech.html'  
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.63'}
            r = requests.get(url, headers=headers)  
            sel = etree.HTML(r.text) 
            link_set = set()
            news_list = []
            count = 0
            for item in sel.xpath('//td/a'):  
                title = item.text
                link = item.attrib['href']
            #     print(link, title)
                if link not in link_set:
                    r = requests.get(link, headers=headers)  
                    sel = etree.HTML(r.text)  
                    text_block = sel.xpath('//div[@id="endText"]') 
                #     print(''.join(text_block[0].itertext()))
                    if text_block:
                        content = ''.join(text_block[0].xpath('./p/text()'))
                        title = sel.xpath('//h1/text()')[0]
                        self.docs.append([link, title, content])
                    link_set.add(link)
                count += 1
                if count % 15 == 0:
                    print(count, 'processed.')
            with open(data_filename,'wb') as f:
                pickle.dump(self.docs, f)
    
    def search(self, keyword):
        keyword_l = keyword.lower()
        if keyword_l in self.cache:
            sorted_result = self.cache[keyword_l] 
        else:
            count = 0
            sorted_result = []
            for item in self.docs:
                if keyword_l in (item[1] + item[2]).lower():
            #         count += 1
            #         print(count, highlight(title, keyword))
                    sorted_result.append([count, self.score(item, keyword)])
                count += 1
            sorted_result.sort(key=lambda x: x[1], reverse=True)
            self.cache[keyword_l] = sorted_result
        return sorted_result
    
    def highlight(self, text, keyword):
        idx = text.lower().find(keyword.lower())
        result = text
        if idx >= 0:
            ori_word = text[idx:idx+(len(keyword))]
            result = text.replace(ori_word, '<span style="color:red";>{}</span>'.format(ori_word))
        return result
    
    def score(self, item, keyword):
        return (item[1].lower().count(keyword.lower()) * 5 
          + item[2].lower().count(keyword.lower()) * 3)
    
    def render_search_result(self, keyword):
        count = 0
        for item in self.search(keyword):
            count += 1
        #     print(count, '[{}] {}'.format(item[1], 
        #         highlight(news_list[item[0]][1], keyword)))
            display(HTML('{} [{}] {}'.format(count, item[1], 
                self.highlight(self.docs[item[0]][1], keyword))))

In [3]:
%%time
searcher_1x = MySearcherC6V0()

Wall time: 38.7 s


In [9]:
%prun searcher_1x = MySearcherC6V0()

 

         11591540 function calls in 138.802 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  6071604   93.981    0.000   93.981    0.000 {method 'lower' of 'str' objects}
   215165   31.666    0.000  127.703    0.001 <ipython-input-8-9c1e8c6ac724>:67(search)
    31598    2.139    0.000    2.319    0.000 __init__.py:180(get_DAG)
   805368    1.407    0.000    1.992    0.000 __init__.py:177(<genexpr>)
   308868    0.911    0.000    0.911    0.000 {method 'count' of 'str' objects}
   203750    0.904    0.000    9.289    0.000 __init__.py:249(__cut_DAG)
    31598    0.849    0.000    3.617    0.000 __init__.py:172(calc)
   425487    0.837    0.000    2.909    0.000 {built-in method builtins.max}
    18019    0.790    0.000    1.507    0.000 __init__.py:37(viterbi)
   154434    0.776    0.000    5.769    0.000 <ipython-input-8-9c1e8c6ac724>:92(score)
   215454    0.643    0.000   10.262    0.000 __init__.py:289(cut)
        1  

In [6]:
class MySearcherC6V1(MySearcherC6V0):
    """
    避免重复查询相同词
    """
    def build_cache(self):
        word_set = set()
        for doc in self.docs:
            for word in jieba.cut(
                doc[1] +' ' + doc[2]
            ):
                if word not in word_set:
                    r = self.search(word)
                    self.vocab.add(word)
                    word_set.add(word)

In [7]:
%%time
searcher_1x = MySearcherC6V1()

Wall time: 2min 11s


In [38]:
class MySearcherC6V2(MySearcherC6V1):
    """
    尽量减少lower()的运行次数
    """
    def __init__(self, scale=1):
        self.docs = []
        self.load_data()
        self.docs *= scale
        self.cache = {}
        self.vocab = set()
        self.lower_preprocess()
        self.build_cache()
        self.simple_test()
    
    def build_cache(self):
        word_set = set()
        for doc in self.docs:
            for word in jieba.cut(
                doc[1] +' ' + doc[2]
            ):
                if word not in word_set:
                    r = self.search(word)
                    self.vocab.add(word)
                    word_set.add(word)
                    
    def lower_preprocess(self):
        for doc_id in range(len(self.docs)):
            self.docs[doc_id].append(
                (self.docs[doc_id][1] 
                 + ' ' 
                 + self.docs[doc_id][2]).lower()
            )
        
    def search(self, keyword):
        keyword_l = keyword.lower()
        if keyword_l in self.cache:
            sorted_result = self.cache[keyword_l] 
        else:
            count = 0
            sorted_result = []
            for item in self.docs:
                if keyword_l in item[3]:
            #         count += 1
            #         print(count, highlight(title, keyword))
                    sorted_result.append([count, self.score(item, keyword)])
                count += 1
            sorted_result.sort(key=lambda x: x[1], reverse=True)
            self.cache[keyword_l] = sorted_result
        return sorted_result
    
    def simple_test(self):
        assert(len(self.search('tiktok')) > 1)

In [39]:
%%time
searcherv2_1x = MySearcherC6V2()

Wall time: 29 s


In [31]:
class MySearcherC6V3(MySearcherC6V2):
    """
    用文档刷词构建缓存
    """
    def build_cache(self):
        doc_id = 0
        for doc in self.docs:
            doc_word_set = set()
            for word in jieba.cut(
                doc[3]
            ):
                if word not in doc_word_set:
                    result_item = [doc_id, self.score(doc, word)]
                    if word not in self.cache:
                        self.cache[word] = [result_item]
                    else:
                        self.cache[word].append(result_item)
                    self.vocab.add(word)
                    doc_word_set.add(word)
            doc_id += 1
        
        for word in self.cache:
            self.cache[word].sort(key=lambda x: x[1], reverse=True)
                    
    def __init__(self, scale=1):
        self.docs = []
        self.load_data()
        self.docs *= scale
        self.cache = {}
        self.vocab = set()
        self.lower_preprocess()
        self.build_cache()

In [32]:
%%time
searcherv3_1x = MySearcherC6V3()

Wall time: 10.1 s


In [53]:
%%time
searcherv3_1x.render_search_result('B站')

Wall time: 15.5 ms


In [46]:
class MySearcherC6V4(MySearcherC6V3):
    """
    去掉search里的文档扫描过程
    """
    def search(self, keyword):
        keyword_l = keyword.lower()
        if keyword_l in self.cache:
            sorted_result = self.cache[keyword_l] 
        else:
            sorted_result = []
        return sorted_result

In [47]:
%%time
searcherv4_1x = MySearcherC6V4()

Wall time: 10 s


In [51]:
%%time
searcherv4_1x.search('b站')

Wall time: 0 ns


[]

In [54]:
' '.join(jieba.cut('B站CEO陈睿：5G时代视频将是绝对的主流'))

'B 站 CEO 陈睿 ： 5G 时代 视频 将 是 绝对 的 主流'