In [1]:
import requests  
from lxml import etree 
import pickle
import os
from IPython.core.display import display, HTML
import timeit
import jieba

class MySearcherC9V0:
    """
    第七次课升级的搜索类版本：
    1、__init__()初始化过程加载自定义分词词典
    2、build_cache()改用cut_for_search进行分词
    3、search()对查询分词
    4、search()对分词结果取posting
    5、search()对posting lists进行合并(交集)
    6、build_cache()将posting保存格式改成只用doc_id(方便集合运算)
    7、rank()实现对候选文档打分排序
    8、score()实现对查询中包含的多词统计词频计分
    """
    def __init__(self, scale=1):
        self.docs = []
        self.load_data()
        self.docs *= scale
        self.cache = {}
        self.vocab = set()
        self.lower_preprocess()
        self.build_cache()
        jieba.load_userdict('dict.txt')
    
    def build_cache(self):
        doc_id = 0
        for doc in self.docs:
            doc_word_set = set()
            for word in jieba.cut_for_search(
                doc[3]
            ):
                if word not in doc_word_set:
                    result_item = doc_id
                    if word not in self.cache:
                        self.cache[word] = set([result_item])
                    else:
                        self.cache[word].add(result_item)
                    self.vocab.add(word)
                    doc_word_set.add(word)
            doc_id += 1
    
    def search(self, query):
        result = None
        for keyword in jieba.cut(query.lower()):
            if keyword in self.cache:
                if result is None:
                    result = self.cache[keyword]
                else:
                    result = result & self.cache[keyword]
            else:
                result = set([])
                break
                
        if result is None:
            result = set([])
        
        sorted_result = self.rank(query, result)
        return sorted_result
                    
    def lower_preprocess(self):
        for doc_id in range(len(self.docs)):
            self.docs[doc_id].append(
                (self.docs[doc_id][1] 
                 + ' ' 
                 + self.docs[doc_id][2]).lower()
            )
    
    def simple_test(self):
        assert(len(self.search('tiktok')) > 1)
    
    def load_data(self):
        data_filename = 'news_list.dat'
        if os.path.exists(data_filename):
            with open(data_filename,'rb') as f:
                self.docs += pickle.load(f)
#                 self.docs = self.docs + pickle.load(f)
        else:
            url = 'http://news.163.com/special/0001386F/rank_tech.html'  
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.63'}
            r = requests.get(url, headers=headers)  
            sel = etree.HTML(r.text) 
            link_set = set()
            news_list = []
            count = 0
            for item in sel.xpath('//td/a'):  
                title = item.text
                link = item.attrib['href']
            #     print(link, title)
                if link not in link_set:
                    r = requests.get(link, headers=headers)  
                    sel = etree.HTML(r.text)  
                    text_block = sel.xpath('//div[@id="endText"]') 
                #     print(''.join(text_block[0].itertext()))
                    if text_block:
                        content = ''.join(text_block[0].xpath('./p/text()'))
                        title = sel.xpath('//h1/text()')[0]
                        self.docs.append([link, title, content])
                    link_set.add(link)
                count += 1
                if count % 15 == 0:
                    print(count, 'processed.')
            with open(data_filename,'wb') as f:
                pickle.dump(self.docs, f)
    
    def highlight(self, text, keyword):
        idx = text.lower().find(keyword.lower())
        result = text
        if idx >= 0:
            ori_word = text[idx:idx+(len(keyword))]
            result = text.replace(ori_word, '<span style="color:red";>{}</span>'.format(ori_word))
        return result
    
    def rank(self, query, result_set):
        result = []
        for doc_id in result_set:
            result.append([doc_id, 
                self.score(self.docs[doc_id],
                          query)])
        result.sort(key=lambda x: x[1], reverse=True)
        return result       
    
    def score(self, item, query):
        score = 0
        #todo cut
        for keyword in jieba.cut(query):
            score += item[1].lower().count(keyword.lower()) * 5 \
                      + item[2].lower().count(keyword.lower()) * 3
        return score
    
    def render_search_result(self, keyword):
        count = 0
        for item in self.search(keyword):
            count += 1
            display(HTML('{} [{}] {}'.format(count, item[1], 
                self.highlight(self.docs[item[0]][1], keyword) + '<br/>' + self.docs[item[0]][2])))

In [60]:
class MySearcherC9V1(MySearcherC9V0):
    """
    多关键词摘要和高亮
    """
    def highlight(self, item, query, sidelen=12):
        result = ''
        positions = []
        content_lower = item[2].lower()
        word_start_map = []
        word_end_map = []
        last_word_end = -1
        query_words = list(jieba.cut(query))
        for keyword in query_words:
            idx = content_lower.find(keyword.lower())
            positions.append(idx)

        for keyword in jieba.cut(content_lower):
            cur_word_start = last_word_end + 1
            cur_word_end = cur_word_start + len(keyword) - 1
            for i in range(cur_word_start, cur_word_end + 1):
                word_start_map.append(cur_word_start)
                word_end_map.append(cur_word_end)
            last_word_end = cur_word_end

    #     print(word_start_map)
    #     print(word_end_map)
    #     print(positions)
        positions.sort()
    #     print(positions)
        segments = []
        i = 0
        while i < len(positions):
            start_pos = max(positions[i] - sidelen, 0)
            end_pos = min(positions[i] + sidelen, len(content_lower) - 1)
            while (i < len(positions) - 1 
                and positions[i+1] - positions[i] <= 2 * sidelen):
                end_pos = min(positions[i+1] + sidelen, len(content_lower) - 1)
                i += 1  
            start_ddd = '...' if start_pos > 0 else ''
            end_ddd = '...' if end_pos < len(content_lower) else ''
#             print(len(content_lower), end_pos)
            segments.append(start_ddd 
                            + item[2][word_start_map[start_pos]:word_end_map[end_pos] + 1]
                            + end_ddd)
            i += 1
    #     print(segments)
        result = text = item[1] + '<br/>' + ''.join(segments)
        text_lower = text.lower()
        for keyword in query_words:
    #         print(keyword)
            idx = text_lower.find(keyword.lower())
            if idx >= 0:
                ori_word = text[idx:idx+(len(keyword))]
    #             print(ori_word)
                result = result.replace(ori_word, '<span style="color:red";>{}</span>'.format(ori_word))
        return result
    
    def render_search_result(self, query):
        count = 0
        for item in self.search(query):
            count += 1
            display(HTML('{} [{}] {}'.format(count, item[1], 
                self.highlight(self.docs[item[0]], query))))
            print(self.highlight(self.docs[item[0]], query))

In [61]:
searcher_v1 = MySearcherC9V1()

In [62]:
searcher_v1.render_search_result('手机iPhone')

没大惊喜，但<span style="color:red";>iPhone</span>12这次诚意满满，我心动了<br/>...发布会，带来了姗姗来迟的<span style="color:red";>iPhone</span>12系列。虽然......华为超越苹果成为全球第二大<span style="color:red";>手机</span>厂商；然而，由于受到美国...


没大惊喜，但<span style="color:red";>iPhone</span>12这次诚意满满，我心动了<br/>...发布会，带来了姗姗来迟的<span style="color:red";>iPhone</span>12系列。虽然......华为超越苹果成为全球第二大<span style="color:red";>手机</span>厂商；然而，由于受到美国...


华为Mate40来了 硬刚<span style="color:red";>iPhone</span>12！买哪个？网友吵起来了<br/>...华为Mate40来了，硬刚<span style="color:red";>iPhone</span>12！该买哪一个......，华为Mate 40系列<span style="color:red";>手机</span>将与<span style="color:red";>iPhone</span> 12...


<span style="color:red";>iPhone</span> 12大猜想！直角边框、5G、蓝色、不送耳机<br/>...       （原标题：<span style="color:red";>iPhone</span> 12大猜想：......微博数码博主详细介绍了4款<span style="color:red";>手机</span>的具体参数和颜色：顶配...


<span style="color:red";>iPhone</span> 12系列摄影能力有多强？夜间拍照更强了<br/>...	作者：陈思学 梁桂海【<span style="color:red";>手机</span>中国新闻】介绍完<span style="color:red";>iPhone</span> 12后，苹果...


<span style="color:red";>iPhone</span> 12系列摄影能力有多强？夜间拍照更强了<br/>...	作者：陈思学 梁桂海【<span style="color:red";>手机</span>中国新闻】介绍完<span style="color:red";>iPhone</span> 12后，苹果...


富士康万元招人，新<span style="color:red";>iPhone</span>要和华为"绝版"正面对决<br/>...：富士康重奖万元招人，新<span style="color:red";>iPhone</span>要和华为“绝版......等事业群。iDPBG是做<span style="color:red";>手机</span>组装的部门，目前正在组装...


<span style="color:red";>iPhone</span> 12今天没来，苹果早就告诉你要迟到，不过A14芯片提前来了<br/>...正在举办今年秋季新品盛宴，<span style="color:red";>iPhone</span>12缺席，却热度......苹果高管确认苹果的首代5G<span style="color:red";>手机</span>，即秋季发布的<span style="color:red";>iPhone</span>12...


苹果<span style="color:red";>iPhone</span> 12或掀起"超级周期" 定价至关重要<br/>苹果将举行<span style="color:red";>iPhone</span> 12系列新品......售价远低于1000美元的<span style="color:red";>手机</span>。近1000美元是2017...


苹果<span style="color:red";>iPhone</span> 12或掀起"超级周期" 定价至关重要<br/>苹果将举行<span style="color:red";>iPhone</span> 12系列新品......售价远低于1000美元的<span style="color:red";>手机</span>。近1000美元是2017...


<span style="color:red";>iPhone</span> 12再曝新料：搭载智能数据模式 4G/5G智能切换<br/>...       （原标题：<span style="color:red";>iPhone</span> 12发布前再......全新<span style="color:red";>iPhone</span> 12系列<span style="color:red";>手机</span>等多款产品将正式亮相。...


<span style="color:red";>iPhone</span> 12最新爆料：首批产品10月5日向经销商发货<br/>...5.4英寸苹果新机被曝命名为<span style="color:red";>iPhone</span> 12 mini......是两款<span style="color:red";>iPhone</span> 12<span style="color:red";>手机</span>，最小的5.4英寸<span style="color:red";>iPhone</span>...


又为苹果"打工"？美国两大运营商补贴送<span style="color:red";>iPhone</span>12<br/>...Verizon宣布将免费赠送5G版<span style="color:red";>iPhone</span>，显示他们正为......2017年以来对这款智能<span style="color:red";>手机</span>进行的首次重大重新设计...


<span style="color:red";>iPhone</span>12恐缺席9月发布会 苹果回应：敬请期待<br/>...       （原标题：<span style="color:red";>iPhone</span>12恐缺席9月......市场高度关注的苹果首款5G<span style="color:red";>手机</span><span style="color:red";>iPhone</span> 12的发布...


iOS迎重大更新，偷偷搜集你信息的APP瑟瑟发抖！<br/>...进行大改，适用于包括初代<span style="color:red";>iPhone</span> SE和<span style="color:red";>iPhone</span>......<span style="color:red";>iPhone</span> 6S在内的旧款智能<span style="color:red";>手机</span>。这次的更新中有个真正...


发布会前仅数小时，疑似<span style="color:red";>iPhone</span> 12官方渲染图曝光<br/>...标题：发布会前数小时，疑似<span style="color:red";>iPhone</span> 12官方渲染......举世瞩目的<span style="color:red";>iPhone</span> 12<span style="color:red";>手机</span>产品线和HomePod...


<span style="color:red";>iPhone</span> 12细节：支持大量5G频段，mini仅支持单SIM卡<br/>...    （原标题：准备买<span style="color:red";>iPhone</span> 12了？让我们......以及<span style="color:red";>iPhone</span> 12系列<span style="color:red";>手机</span>。产品发布后，苹果也第一...


<span style="color:red";>iPhone</span> 12细节：支持大量5G频段，mini仅支持单SIM卡<br/>...    （原标题：准备买<span style="color:red";>iPhone</span> 12了？让我们......以及<span style="color:red";>iPhone</span> 12系列<span style="color:red";>手机</span>。产品发布后，苹果也第一...


发布会前仅数小时，疑似<span style="color:red";>iPhone</span> 12官方渲染图曝光<br/>...标题：发布会前数小时，疑似<span style="color:red";>iPhone</span> 12官方渲染......举世瞩目的<span style="color:red";>iPhone</span> 12<span style="color:red";>手机</span>产品线和HomePod...


<span style="color:red";>iPhone</span> 12要来了！新<span style="color:red";>手机</span>之外还有什么值得期待？<br/>...特别发布会，预计届时将推出<span style="color:red";>iPhone</span> 12智能<span style="color:red";>手机</span>以及跟踪设备AirTags...


传<span style="color:red";>iPhone</span> 12Mini于11月中旬发货 HomePod mini卖99美元<br/>...最新泄露的消息称，苹果新款<span style="color:red";>iPhone</span> 12 mini......12系列，并列出了这些新款<span style="color:red";>手机</span>的价格，最低的两款都比...


下周苹果5G<span style="color:red";>手机</span>要来了 但美国运营商还没准备好<br/>... （原标题：下周苹果5G<span style="color:red";>手机</span>要来了 但美国运营商还......苹果公司预计将在下周发布新一代<span style="color:red";>iPhone</span><span style="color:red";>手机</span>（<span style="color:red";>iPhone</span>...


周周更新不停歇，iOS 14.0.1系统发布：修复部分bug<br/>...官方说明，本次更新包括针对<span style="color:red";>iPhone</span>的错误修复，如果......现在就可以备份好数据，并将<span style="color:red";>手机</span>升级至最新的系统。iOS...


拼多多插入京东、苏宁腹地<br/>...的数据来看，拼多多已经是<span style="color:red";>手机</span>数码3C 市场最大的经销......，新补贴政策不仅拉低了 <span style="color:red";>iphone</span>、戴森的价格，...


消息称<span style="color:red";>iPhone</span> 12能实现5G和4G之间的无缝过渡：提高续航<br/>...    （原标题：消息称<span style="color:red";>iPhone</span> 12能实现5G......12本周即将发布，对于这款<span style="color:red";>手机</span>，目前可以肯定的是，其...


华为抢购芯片+<span style="color:red";>手机</span>热卖，三星Q3营收或达570亿美元<br/>...大幅度增长，原因可能是智能<span style="color:red";>手机</span>销售强劲复苏，以及华为......下降，因为其大客户苹果新款<span style="color:red";>iPhone</span>的发布时间晚于...


1.7万元！腾讯奖励万名员工每人一台华为折叠屏<span style="color:red";>手机</span><br/>...员工还收到一款华为折叠屏<span style="color:red";>手机</span>Mate Xs礼物。这是......一名员工都获得了一部顶配版<span style="color:red";>iPhone</span> X<span style="color:red";>手机</span>。而在...


杀进100美元 苹果开始认真做智能音箱<br/>...HomePod Mini将和<span style="color:red";>iPhone</span> 12一同在苹果......。总结苹果带头重塑了智能<span style="color:red";>手机</span>行业，但它也不得不承认...


罗永浩回归！本人主持的秋季旧机发布会来了 与苹果撞期<br/>...直播之后，本人就暂时告别了<span style="color:red";>手机</span>发布会，不少人期待下一场......：罗永浩发布会时间与苹果<span style="color:red";>iPhone</span> 12发布日期...


罗永浩回归！本人主持的秋季旧机发布会来了 与苹果撞期<br/>...直播之后，本人就暂时告别了<span style="color:red";>手机</span>发布会，不少人期待下一场......：罗永浩发布会时间与苹果<span style="color:red";>iPhone</span> 12发布日期...


苹果A14仿生芯片有多香？目前安卓类<span style="color:red";>手机</span>芯片看了都慌<br/>...非常关键的硬件，因为新款<span style="color:red";>iPhone</span> 12智能<span style="color:red";>手机</span>、首款使用苹果自主研发...


传苹果下周不会发布自研芯片Mac 11月才亮相<br/>...在很大程度上将集中在新款<span style="color:red";>iPhone</span> 12设备上，......发布会上似乎将重点放在智能<span style="color:red";>手机</span>和音频产品上，而在春季...


Win应用程序商店公开怼苹果：一视同仁，不屏蔽对手<br/>...的数量远远不及个人电脑和<span style="color:red";>手机</span>。游戏机的商业模式与围绕......商业模式完全不同，后者在<span style="color:red";>iPhone</span>和iPad设备...


特斯拉系统遭全球性宕机！有车主被困在沙漠两小时<br/>...表示，“特斯拉的App在<span style="color:red";>iPhone</span>上已经‘冻结’......系统的宕机，由于特斯拉的<span style="color:red";>手机</span>App正成为其生态系统...


In [38]:
def highlight(item, query, sidelen=12):
    result = ''
    positions = []
    content_lower = item[2].lower()
    word_start_map = []
    word_end_map = []
    last_word_end = -1
    query_words = list(jieba.cut(query))
    for keyword in query_words:
        idx = content_lower.find(keyword.lower())
        positions.append(idx)
        
    for keyword in jieba.cut(content_lower):
        cur_word_start = last_word_end + 1
        cur_word_end = cur_word_start + len(keyword) - 1
        for i in range(cur_word_start, cur_word_end + 1):
            word_start_map.append(cur_word_start)
            word_end_map.append(cur_word_end)
        last_word_end = cur_word_end
        
#     print(word_start_map)
#     print(word_end_map)
#     print(positions)
    positions.sort()
#     print(positions)
    segments = []
    i = 0
    while i < len(positions):
        start_pos = max(positions[i] - sidelen, 0)
        end_pos = min(positions[i] + sidelen, len(content_lower) - 1)
        while (i < len(positions) - 1 
            and positions[i+1] - positions[i] <= 2 * sidelen):
            end_pos = min(positions[i+1] + sidelen, len(content_lower) - 1)
            i += 1  
        start_ddd = '...' if start_pos > 0 else ''
        end_ddd = '...' if end_pos < len(content_lower) else ''
        print(len(content_lower), end_pos)
        segments.append(start_ddd 
                        + item[2][word_start_map[start_pos]:word_end_map[end_pos] + 1]
                        + end_ddd)
        i += 1
#     print(segments)
    result = text = item[1] + '<br/>' + ''.join(segments)
    text_lower = text.lower()
    for keyword in query_words:
#         print(keyword)
        idx = text_lower.find(keyword.lower())
        if idx >= 0:
            ori_word = text[idx:idx+(len(query))]
#             print(ori_word)
            result = result.replace(ori_word, '<span style="color:red";>{}</span>'.format(ori_word))
    return result

display(HTML(highlight([
    '',
    '华为Mate40来了 硬刚iPhone12！买哪个？网友吵起来了', 
    '华为Mate40来了，硬刚iPhone12！该买哪一个？网友吵起来了） 每经编辑 何小桃这个10月，华为Mate 40系列手机将与iPhone 12系列手机正面对决'
], '对决华为系列手机')))

82 12
82 81


In [19]:
keywords = list(jieba.cut('华为Mate40来了 硬刚iPhone12！'))

In [20]:
keywords

['华为', 'Mate40', '来', '了', ' ', '硬刚', 'iPhone12', '！']

In [23]:
for keyword in keywords:
    print(keyword)

华为
Mate40
来
了
 
硬刚
iPhone12
！


In [29]:
for i in range(1,3+1):
    print(i)

1
2
3


In [None]:
'abcdef'