In [15]:
import requests  
from lxml import etree 
import pickle
import os
from IPython.core.display import display, HTML

class MySearcherC4V0:
    """
    第四次课集成的搜索类初始版本
    """
    def __init__(self):
        self.docs = []
        self.load_data()
    
    def load_data(self):
        data_filename = 'news_list.dat'
        if os.path.exists(data_filename):
            with open(data_filename,'rb') as f:
                self.docs += pickle.load(f)
#                 self.docs = self.docs + pickle.load(f)
        else:
            url = 'http://news.163.com/special/0001386F/rank_tech.html'  
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.63'}
            r = requests.get(url, headers=headers)  
            sel = etree.HTML(r.text) 
            link_set = set()
            news_list = []
            count = 0
            for item in sel.xpath('//td/a'):  
                title = item.text
                link = item.attrib['href']
            #     print(link, title)
                if link not in link_set:
                    r = requests.get(link, headers=headers)  
                    sel = etree.HTML(r.text)  
                    text_block = sel.xpath('//div[@id="endText"]') 
                #     print(''.join(text_block[0].itertext()))
                    if text_block:
                        content = ''.join(text_block[0].xpath('./p/text()'))
                        title = sel.xpath('//h1/text()')[0]
                        self.docs.append([link, title, content])
                    link_set.add(link)
                count += 1
                if count % 15 == 0:
                    print(count, 'processed.')
            with open(data_filename,'wb') as f:
                pickle.dump(self.docs, f)
    
    def search(self, keyword):
        count = 0
        sorted_result = []
        keyword_l = keyword.lower()
        for item in self.docs:
            if keyword_l in (item[1] + item[2]).lower():
        #         count += 1
        #         print(count, highlight(title, keyword))
                sorted_result.append([count, self.score(item, keyword)])
            count += 1
        sorted_result.sort(key=lambda x: x[1], reverse=True)
        return sorted_result
    
    def highlight(self, text, keyword):
        idx = text.lower().find(keyword.lower())
        result = text
        if idx >= 0:
            ori_word = text[idx:idx+(len(keyword))]
            result = text.replace(ori_word, '<span style="color:red";>{}</span>'.format(ori_word))
        return result
    
    def score(self, item, keyword):
        return (item[1].lower().count(keyword.lower()) * 5 
          + item[2].lower().count(keyword.lower()) * 3)
    
    def render_search_result(self, keyword):
        count = 0
        for item in self.search(keyword):
            count += 1
        #     print(count, '[{}] {}'.format(item[1], 
        #         highlight(news_list[item[0]][1], keyword)))
            display(HTML('{} [{}] {}'.format(count, item[1], 
                self.highlight(self.docs[item[0]][1], keyword))))

In [8]:
searcher = MySearcherC4V0()

In [10]:
searcher.load_data()

In [11]:
len(searcher.docs)

578

In [34]:
searcher.render_search_result('华为')

In [24]:
class MySearcherC5V1(MySearcherC4V0):
    """
    增加初始化参数scale，用于倍增文档集
    """
    def __init__(self, scale=1):
        self.docs = []
        self.load_data()
        self.docs *= scale

In [37]:
%%time
searcher = MySearcherC5V1(200)

Wall time: 12.5 ms


In [31]:
len(searcher.docs)

578000

In [38]:
%%time
r = searcher.search('华为')

Wall time: 1.6 s


In [39]:
import timeit

In [44]:
"%0.3f" % timeit.timeit(
    stmt="r = searcher.search('华为')",
    number=1,
    globals=globals()
             )

'1.618'

In [49]:
searcher_1x = MySearcherC5V1(1)
searcher_10x = MySearcherC5V1(10)
searcher_100x = MySearcherC5V1(100)
searcher_1000x = MySearcherC5V1(1000)

In [51]:
print('1x\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_1x.search('华为')",
    number=1,
    globals=globals()
             ))
print('10x\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_10x.search('华为')",
    number=1,
    globals=globals()
             ))
print('100x\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_100x.search('华为')",
    number=1,
    globals=globals()
             ))
print('1000x\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_1000x.search('华为')",
    number=1,
    globals=globals()
             ))

1x	 0.011
10x	 0.106
100x	 0.807
1000x	 7.794


In [52]:
print('10x1\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_10x.search('华为')",
    number=1,
    globals=globals()
             ))
print('10x100\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_10x.search('华为')",
    number=100,
    globals=globals()
             ))
print('10x1000\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_10x.search('华为')",
    number=1000,
    globals=globals()
             ))

10x1	 0.098
10x100	 7.868
10x1000	 77.826


In [55]:
class MySearcherC5V2(MySearcherC5V1):
    """
    增加缓存机制，避免重复匹配相同关键词
    """
    def __init__(self, scale=1):
        self.docs = []
        self.load_data()
        self.docs *= scale
        self.cache = {}
        
    def search(self, keyword):
        keyword_l = keyword.lower()
        if keyword_l in self.cache:
            sorted_result = self.cache[keyword_l] 
        else:
            count = 0
            sorted_result = []
            for item in self.docs:
                if keyword_l in (item[1] + item[2]).lower():
            #         count += 1
            #         print(count, highlight(title, keyword))
                    sorted_result.append([count, self.score(item, keyword)])
                count += 1
            sorted_result.sort(key=lambda x: x[1], reverse=True)
            self.cache[keyword_l] = sorted_result
        return sorted_result

In [56]:
%%time
searcher_1x    = MySearcherC5V2(1)
searcher_10x   = MySearcherC5V2(10)
searcher_100x  = MySearcherC5V2(100)
searcher_1000x = MySearcherC5V2(1000)

Wall time: 31.8 ms


In [58]:
print('10000x1\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_1000x.search('华为')",
    number=1,
    globals=globals()
             ))
print('10000x100\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_1000x.search('华为')",
    number=100,
    globals=globals()
             ))
print('10000x1000\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_1000x.search('华为')",
    number=1000,
    globals=globals()
             ))

10000x1	 6.985
10000x100	 0.000
10000x1000	 0.001


In [62]:
class MySearcherC5V3(MySearcherC5V2):
    """
    增加线下缓存预填充机制，用猜测得到的用户查询词预填充
    """
    def __init__(self, scale=1):
        self.docs = []
        self.load_data()
        self.docs *= scale
        self.cache = {}
        self.vocab = set(['华为', '苹果', 'tiktok'])
        self.build_cache()
        
    def build_cache(self):
        for word in self.vocab:
            r = self.search(word)

In [63]:
%%time
searcher_1000x = MySearcherC5V3(1000)

Wall time: 19.7 s


In [65]:
print('10000x1\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_1000x.search('手机')",
    number=1,
    globals=globals()
             ))
print('10000x100\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_1000x.search('手机')",
    number=100,
    globals=globals()
             ))
print('10000x1000\t', "%0.3f" % timeit.timeit(
    stmt="r = searcher_1000x.search('手机')",
    number=1000,
    globals=globals()
             ))

10000x1	 7.964
10000x100	 0.000
10000x1000	 0.001


In [66]:
searcher_1000x.search('阿仙奴')

[]

In [68]:
searcher_1x.docs[0]

['https://tech.163.com/20/1013/15/FOR1CSJ8000999LD.html',
 '日媒拆解华为5G基站：中企零部件约占一半 美零部件占3成',
 '\n                        （原标题：日媒拆解华为5G基站：中企设计零部件约占一半 美国零部件占3成）\n                    原标题：日媒拆解华为5G基站：美国零部件占3成IT之家10月13日消息 据日经中文网报道，美国政府9月15日强化了禁止向华为供应使用美国技术的半导体，这也对华为全球份额居首的通信基站产生了影响。近日日本经济新闻在专业调查公司Fomalhaut Techno Solutions（东京都江东区）的协助下，拆解并分析了华为的最新5G基站中被称为基带的核心装置。IT之家了解到，拆解发现，在基站的1320美元估算成本中，。由于美国加强管制，这些零部件有可能无法使用。另外，。其中“FPGA”半导体为美国莱迪思半导体（Lattice Semiconductor）和赛灵思（Xilinx）公司的产品。对基站不可缺少的电源进行控制的半导体是美国德州仪器（TI）和安森美半导体（ONSemiconductor）等的产品。此外，韩国零部件的使用数量仅次于美国，内存由三星电子制造，日本企业的零部件只有TDK和精工爱普生等的产品。']

In [None]:
日媒 拆解 华为 5G 基站 ： 中企 零部件 约 占 一半 美 零部件 占 3 成

In [69]:
import jieba

In [71]:
' '.join(jieba.cut(
    '日媒拆解华为5G基站：中企零部件约占一半 美零部件占3成'))

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\CHENGU~1\AppData\Local\Temp\jieba.cache
Loading model cost 2.859 seconds.
Prefix dict has been built successfully.


'日媒 拆解 华为 5G 基站 ： 中企 零部件 约 占 一半   美 零部件 占 3 成'

In [74]:
class MySearcherC5V4(MySearcherC5V3):
    """
    用文档分词得到的词表进行缓存预填充
    """
    def __init__(self, scale=1):
        self.docs = []
        self.load_data()
        self.docs *= scale
        self.cache = {}
        self.vocab = set()
        self.build_cache()
        
    def build_cache(self):
        for doc in self.docs:
            for word in jieba.cut(
                doc[1] +' ' + doc[2]
            ):
                r = self.search(word)
                self.vocab.add(word)


In [75]:
%%time
searcher_1x = MySearcherC5V4(1)

Wall time: 1min 56s
