In [13]:
import os
import pickle
import re
import timeit

import jieba
import requests
from IPython.core.display import display, HTML
from lxml import etree

class MySearcherC7V0:
    """
    Single-keyword search over a small crawled news corpus, backed by an
    inverted index that is built once at construction time.

    Search class as upgraded in lesson 6:
    1. Avoid querying the same word repeatedly.
    2. Run lower() as few times as possible.
    3. Build the cache by scanning the words of every document.
    4. Remove the document scan from search().

    Each entry of ``self.docs`` is a list ``[link, title, content, lowered]``;
    the fourth element is added by lower_preprocess().
    """

    # Seconds to wait for each HTTP request made by load_data().
    REQUEST_TIMEOUT = 10

    def __init__(self, scale=1):
        """
        Load the corpus, replicate it ``scale`` times and build the index.

        scale: how many times the corpus is repeated (used to benchmark
               indexing and search on a larger collection).
        """
        self.docs = []
        self.load_data()
        # ``self.docs *= scale`` would repeat references to the SAME inner
        # lists, so lower_preprocess() would append the lowered text ``scale``
        # times to every document. Give each replica its own list instead;
        # the order matches ``docs * scale``.
        self.docs = [list(doc) for _ in range(scale) for doc in self.docs]
        self.cache = {}
        self.vocab = set()
        self.lower_preprocess()
        self.build_cache()

    def build_cache(self):
        """
        Build the inverted index: ``self.cache[word]`` becomes a list of
        ``[doc_id, score]`` pairs sorted by descending score, and every
        indexed word is recorded in ``self.vocab``.
        """
        for doc_id, doc in enumerate(self.docs):
            seen = set()  # words already indexed for this document
            for word in jieba.cut(doc[3]):
                if word in seen:
                    continue
                seen.add(word)
                self.cache.setdefault(word, []).append(
                    [doc_id, self.score(doc, word)]
                )
                self.vocab.add(word)

        for postings in self.cache.values():
            postings.sort(key=lambda entry: entry[1], reverse=True)

    def lower_preprocess(self):
        """
        Store the lowercased ``title + ' ' + content`` as element 3 of every
        document. Safe to call more than once: an existing fourth element is
        overwritten instead of a new one being appended.
        """
        for doc in self.docs:
            lowered = (doc[1] + ' ' + doc[2]).lower()
            if len(doc) > 3:
                doc[3] = lowered
            else:
                doc.append(lowered)

    def search(self, keyword):
        """
        Return the cached ``[doc_id, score]`` list for ``keyword``
        (case-insensitive), or an empty list when the word is not indexed.
        """
        return self.cache.get(keyword.lower(), [])

    def simple_test(self):
        """Smoke test: the query 'tiktok' must match more than one document."""
        assert len(self.search('tiktok')) > 1

    def load_data(self):
        """
        Fill ``self.docs`` with ``[link, title, content]`` entries.

        Reads the local cache file 'news_list.dat' when it exists; otherwise
        crawls the ranking page, downloads every linked article once and
        writes the cache file for later runs.
        """
        data_filename = 'news_list.dat'
        if os.path.exists(data_filename):
            # NOTE: pickle.load can execute arbitrary code embedded in the
            # file. Only load a cache file this notebook produced itself.
            with open(data_filename, 'rb') as f:
                self.docs += pickle.load(f)
            return

        url = 'http://news.163.com/special/0001386F/rank_tech.html'
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.63'}
        # Without a timeout a stalled server blocks the notebook forever.
        r = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        index_page = etree.HTML(r.text)
        link_set = set()
        count = 0
        for item in index_page.xpath('//td/a'):
            link = item.attrib.get('href')  # anchors without href are skipped
            if link and link not in link_set:
                link_set.add(link)
                try:
                    r = requests.get(link, headers=headers,
                                     timeout=self.REQUEST_TIMEOUT)
                except requests.RequestException as exc:
                    # One unreachable article should not abort the crawl.
                    print('skipped', link, exc)
                else:
                    page = etree.HTML(r.text)
                    text_block = page.xpath('//div[@id="endText"]')
                    if text_block:
                        content = ''.join(text_block[0].xpath('./p/text()'))
                        headings = page.xpath('//h1/text()')
                        # Fall back to the anchor text when the article page
                        # has no <h1>.
                        title = headings[0] if headings else (item.text or '')
                        self.docs.append([link, title, content])
            count += 1
            if count % 15 == 0:
                print(count, 'processed.')
        with open(data_filename, 'wb') as f:
            pickle.dump(self.docs, f)

    def highlight(self, text, keyword):
        """
        Return ``text`` with every case-insensitive occurrence of ``keyword``
        wrapped in a red ``<span>``; each occurrence keeps its own casing.

        An empty keyword returns ``text`` unchanged.
        NOTE: ``text`` is inserted as-is (not HTML-escaped).
        """
        if not keyword:
            # An empty pattern matches between every character.
            return text
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        # A callable replacement keeps backslashes in the matched text from
        # being interpreted as group references.
        return pattern.sub(
            lambda m: '<span style="color:red;">{}</span>'.format(m.group(0)),
            text,
        )

    def score(self, item, keyword):
        """
        Relevance of document ``item`` for ``keyword``: 5 points per
        case-insensitive occurrence in the title (item[1]) plus 3 points per
        occurrence in the content (item[2]).
        """
        keyword_l = keyword.lower()
        return (item[1].lower().count(keyword_l) * 5
                + item[2].lower().count(keyword_l) * 3)

    def render_search_result(self, keyword):
        """
        Display one HTML line per hit: rank, score and the document title
        with the keyword highlighted.
        """
        for rank, item in enumerate(self.search(keyword), 1):
            display(HTML('{} [{}] {}'.format(
                rank, item[1],
                self.highlight(self.docs[item[0]][1], keyword))))

In [14]:
%%time
# Build the lesson-6 searcher with the default scale (corpus used once);
# the %%time magic above reports how long loading and indexing take.
searcherv0_1x = MySearcherC7V0()

Wall time: 11.1 s


In [18]:
# Display the ranked hits for a single-word query, keyword highlighted in each title.
searcherv0_1x.render_search_result('斗鱼')

In [16]:
# Segment a sample headline with jieba.cut and join the tokens with spaces for inspection.
' '.join(jieba.cut('日媒拆解华为5G基站：中企零部件约占一半 美零部件占3成'))

'日媒 拆解 华为 5G 基站 ： 中企 零部件 约 占 一半   美 零部件 占 3 成'

In [23]:
# Segment a sample headline with jieba.cut_for_search (the tokenizer MySearcherC7V1 indexes with).
' '.join(jieba.cut_for_search('腾讯系虎牙斗鱼终于合并，但快手抖音B站已杀来'))

'腾讯 系 虎牙 斗鱼 终于 合并 ， 但 快手 抖音 B站 已 杀 来'

In [22]:
# Register the custom words in dict.txt with jieba.
# NOTE: dict.txt is written by the %%writefile cell that appears BELOW this one
# (its execution count is lower, i.e. it was run first). On a fresh kernel run
# that cell before this one.
jieba.load_userdict('dict.txt')

In [20]:
%%writefile dict.txt
B站
抖音
快手

Writing dict.txt


In [21]:
# Print the dictionary file written above.
# NOTE(review): the captured output looks mis-decoded - presumably a console
# encoding mismatch rather than a corrupt file; confirm.
!cat dict.txt

B绔�
鎶栭煶
蹇�鎵�


In [47]:
class MySearcherC7V1(MySearcherC7V0):
    """
    Multi-word (AND) search built on MySearcherC7V0.

    1. Load the custom segmentation dictionary during initialisation.
    2. Segment documents with cut_for_search.

    3. Segment the query.
    4. Fetch the posting set of every query token.
    5. Intersect the posting sets.
    6. Store postings as bare doc_ids; scores are computed at query time.
    """

    def __init__(self, scale=1):
        """
        Register the user dictionary, then load, replicate and index the
        corpus exactly as the parent class does.

        scale: how many times the corpus is repeated.
        """
        # The dictionary must be registered BEFORE build_cache() segments the
        # documents; loading it afterwards leaves the index built without the
        # custom words.
        jieba.load_userdict('dict.txt')
        super().__init__(scale)

    def build_cache(self):
        """
        Build the inverted index: ``self.cache[word]`` is the set of doc_ids
        whose lowered text contains ``word`` according to cut_for_search.
        """
        for doc_id, doc in enumerate(self.docs):
            for word in set(jieba.cut_for_search(doc[3])):
                self.cache.setdefault(word, set()).add(doc_id)
                self.vocab.add(word)

    def _query_terms(self, query):
        """Lowercased jieba tokens of ``query`` with whitespace-only tokens dropped."""
        # jieba emits separators such as ' ' as tokens of their own; treating
        # them as required terms can empty the result or skew the score.
        return [term for term in jieba.cut(query.lower()) if term.strip()]

    def search(self, query):
        """
        Return ``[doc_id, score]`` pairs, best score first, for the documents
        that contain every token of ``query``. A query with no usable token,
        or with a token that is not indexed, yields an empty list.
        """
        result = None
        for keyword in self._query_terms(query):
            postings = self.cache.get(keyword)
            if not postings:
                result = set()
                break
            # The first posting set is used as-is (it is never mutated here);
            # ``&`` always produces a new set.
            result = postings if result is None else result & postings

        if result is None:
            result = set()

        return self.rank(query, result)

    def rank(self, query, result_set):
        """
        Score every doc_id in ``result_set`` against ``query`` and return the
        ``[doc_id, score]`` pairs sorted by descending score.
        """
        result = [[doc_id, self.score(self.docs[doc_id], query)]
                  for doc_id in result_set]
        result.sort(key=lambda entry: entry[1], reverse=True)
        return result

    def score(self, item, query):
        """
        Sum, over the tokens of ``query``, 5 points per occurrence in the
        title (item[1]) and 3 points per occurrence in the content (item[2]),
        compared case-insensitively.
        """
        # Lowercase the document fields once instead of once per token.
        title = item[1].lower()
        content = item[2].lower()
        score = 0
        for keyword in self._query_terms(query):
            score += title.count(keyword) * 5 + content.count(keyword) * 3
        return score

In [24]:
# Two small sets for trying out the set operators; MySearcherC7V1.search
# combines posting sets with `&`.
a = set([1,2,3])
b = set([2,3,4])

In [25]:
# Intersection: elements present in both sets.
a & b

{2, 3}

In [26]:
# Union: elements present in either set.
a | b

{1, 2, 3, 4}

In [27]:
# Difference: elements of a that are not in b.
a - b

{1}

In [48]:
%%time
# Build the multi-word searcher with the default scale; timed by the %%time magic above.
searcherv1_1x = MySearcherC7V1()

Wall time: 6.7 s


In [50]:
# Multi-word query: returns [doc_id, score] pairs for the documents that
# contain every query token, highest score first.
searcherv1_1x.search('华为手机')

[[63, 260],
 [134, 134],
 [53, 77],
 [3, 69],
 [23, 69],
 [83, 57],
 [146, 46],
 [123, 46],
 [80, 41],
 [133, 40],
 [231, 39],
 [50, 26],
 [77, 21],
 [114, 21],
 [57, 17],
 [104, 12],
 [108, 12],
 [58, 9]]

In [51]:
# Same query rendered as HTML. The inherited highlight() looks for the whole
# query string in each title, not for the individual tokens.
searcherv1_1x.render_search_result('华为手机')