In [126]:
from IPython.core.display import display, HTML
import bisect
from collections import defaultdict
import jieba
import pickle

class MySearchC5V0():
    """
    Boolean-query search engine backed by an in-memory inverted index.

    Documents are stored both in original and lower-cased form.  The index
    (``search_cache``) maps each jieba token to the set of ids of the
    documents containing it; boolean queries are translated into a Python
    set expression over those posting sets.

    Version history
    ---------------
    C3V0: Base class for Search Engine.
    C3V1: Data multiplication added.
    C3V2: Sorting optimization.
    C3V3: Add lowered version of docs.
    C3V4: For long doc.
    C3V5: Caching search results.
    C3V6: Pre-caching all words in docs.
    C3V7: Add Serialize/UnSerialize.
    C4V1: Add basic Bool query support
    C4V2: Add wordseg to get_word_match()
    ----------------C5V0-----------------

    Attributes
    ----------
    docs : list of str
        documents as loaded
    docs_lower : list of str
        lower-cased documents, parallel to ``docs``
    search_cache : defaultdict(set)
        token -> set of document ids
    multi_factor : int
        data multiplication factor (default 1)

    Methods
    -------
    load_data(filename):
        load data from file.
    save_data(filename):
        save data to file
    pre_cache_all():
        Pre-caching all words in docs.
    highlight(text, keyword, ori_text):
        highlight keyword inside ori_text.
    score(text, keyword):
        get score of text for a query.
    get_word_match(word):
        get doc set containing every token of word.
    search(query, num=15):
        get top num search results of a query.
    render(result_list, keyword):
        output search results with highlight.
    query_to_set_expression(query):
        convert bool query to set expression (for eval process).
    """

    def __init__(self, filename, multi_factor=1):
        """Create the engine and immediately load ``filename``."""
        self.docs = []
        self.docs_lower = []
        self.search_cache = defaultdict(set)
        self.multi_factor = multi_factor
        self.load_data(filename)

    def highlight(self, text, keyword, ori_text):
        """
        Wrap occurrences of the keyword in ``ori_text`` with a red <span>.

        ``text`` is the lower-cased document and ``keyword`` the lower-cased
        query; the first match position in ``text`` is used to recover the
        original-case spelling from ``ori_text``, and every occurrence of
        that spelling is highlighted.

        Returns ``ori_text`` unchanged when the keyword is empty or absent
        (the original-case text, not the lower-cased one).
        """
        idx = text.find(keyword) if keyword else -1
        if idx < 0:
            return ori_text
        ori_keyword = ori_text[idx:idx + len(keyword)]
        return ori_text.replace(
            ori_keyword, f'<span style="color:red">{ori_keyword}</span>'
        )

    def score(self, text, keyword):
        """Return the number of non-overlapping occurrences of keyword in text."""
        return text.count(keyword)

    def query_to_set_expression(self, query):
        """
        Translate a boolean query into a Python set expression string.

        ``and``/``AND``/``+`` become ``&``, ``or``/``OR`` become ``|`` and
        ``not``/``NOT``/``-`` become ``-``; parentheses are kept.  Consecutive
        plain words are joined with one space and emitted as a single
        ``self.get_word_match(<literal>)`` call.

        The literal is produced with ``repr()`` so that quotes or backslashes
        in a query word cannot break out of the string when the expression
        is evaluated.
        """
        operators = {
            'and': '&', 'AND': '&', '+': '&',
            'or': '|', 'OR': '|',
            'not': '-', 'NOT': '-', '-': '-',
        }
        tokens = query.replace('(', ' ( ').replace(')', ' ) ').split()
        parts = []
        phrase = []  # plain words collected since the last operator/parenthesis
        for token in tokens:
            if token in ('(', ')') or token in operators:
                if phrase:
                    parts.append(f"self.get_word_match({' '.join(phrase)!r})")
                    phrase = []
                parts.append(operators.get(token, token))
            else:
                phrase.append(token)
        if phrase:
            parts.append(f"self.get_word_match({' '.join(phrase)!r})")
        return ''.join(parts)

    def get_word_match(self, word):
        """
        Return the set of document ids containing every jieba token of word.

        Unknown tokens are looked up with ``.get`` so a miss does not add an
        empty entry to the index.  An empty ``word`` yields an empty set.
        For a single known token the index's own set object is returned, so
        callers must not mutate the result.
        """
        result = None
        for term in jieba.cut(word):
            postings = self.search_cache.get(term, set())
            result = postings if result is None else result & postings
            if not result:
                break  # intersection is already empty
        return set() if result is None else result

    def search(self, query, num=15):
        """
        Return up to ``num`` ids of documents matching the boolean query.

        Every match currently receives the constant score 1, so the result
        is the first ``num`` matches in reverse iteration order.  An empty
        query or a non-positive ``num`` yields an empty list.
        """
        query_lower = query.lower()
        expression = self.query_to_set_expression(query_lower)
        if not expression:
            return []
        # NOTE(security): the expression is evaluated with eval().  It only
        # contains set operators, parentheses and get_word_match() calls on
        # repr()-quoted literals, and builtins are withheld from the
        # evaluation namespace.
        matches = eval(expression, {'__builtins__': {}}, {'self': self})
        if num <= 0:
            return []

        result_list = []  # sorted ascending by score once it holds num items
        min_score = 0
        for tid in matches:
            score = 1  # placeholder; self.score() is not used for ranking here
            if len(result_list) < num:
                result_list.append((tid, score))
                if len(result_list) == num:
                    result_list.sort(key=lambda x: x[1])
                    min_score = result_list[0][1]
            elif score > min_score:
                insert_idx = bisect.bisect(
                    [doc_score[1] for doc_score in result_list],
                    score
                )
                # Drop the current minimum and insert the new entry in order.
                result_list = result_list[1:insert_idx] + \
                                [(tid, score)] + \
                                result_list[insert_idx:]
                # Refresh the threshold from the list as it is now.
                min_score = result_list[0][1]
        return [doc_id for doc_id, _ in result_list[::-1]]

    def render(self, result_list, keyword):
        """
        Display each result as numbered HTML with the keyword highlighted.

        ``$$$`` separators in a document are turned into ``<br/>`` and the
        rendered text is cut to its first 150 characters.
        """
        for count, item in enumerate(result_list, start=1):
            result = self.highlight(
                self.docs_lower[item],
                keyword.lower(),
                self.docs[item]
            ).replace('$$$', '<br/>')
            display(HTML(f"{count}„ÄÅ{result[:150]}......"))

    def pre_cache_all(self):
        """Index every lower-cased document by its jieba search-mode tokens."""
        for tid, doc in enumerate(self.docs_lower):
            for word in jieba.cut_for_search(doc):
                self.search_cache[word].add(tid)

    def load_data(self, filename):
        """
        Load documents from ``filename``.

        A name ending in ``txt`` is read as one document per line, repeated
        ``multi_factor`` times and indexed.  A name ending in ``dat`` is read
        as a pickle of ``(docs, docs_lower, search_cache)``.  Any other name
        leaves the engine empty.
        """
        if filename[-3:] == 'txt':
            # NOTE(review): the file is opened with the platform default
            # encoding; confirm the corpus encoding before pinning one.
            with open(filename, 'r') as f:
                self.docs = f.read().split('\n')
            self.docs_lower = [doc.lower() for doc in self.docs]
            self.docs = self.docs * self.multi_factor
            self.docs_lower = self.docs_lower * self.multi_factor
            self.pre_cache_all()
        elif filename[-3:] == 'dat':
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load index files from a trusted source.
            with open(filename, 'rb') as f:
                self.docs, self.docs_lower, self.search_cache = pickle.load(f)

    def save_data(self, filename):
        """Pickle ``(docs, docs_lower, search_cache)`` to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump((self.docs, self.docs_lower, self.search_cache), f)
            

## score()函数的改进 ——> 查询/文档相似度打分(Ranking)

![Image on 2021-10-14 09.45.06 AM.jpg](attachment:2148bafb-1486-446d-b363-15de6b205ec6.jpg)

### 如何评价查询和文档之间的相关度？
#### 语义、意图、类型等多种`隐性`维度

#### 最直接的想法：对`查询`和`文档`之间`共同出现词`进行计数
\begin{equation}
\operatorname{score}(Q, D)=|Q \bigcap D|
\end{equation}

### * 词袋模型

In [125]:
import jieba

def score_intersection(doc1, doc2):
    """Return how many distinct jieba tokens two lower-cased texts share."""
    tokens_a, tokens_b = (
        set(jieba.cut(text.lower())) for text in (doc1, doc2)
    )
    return len(tokens_a.intersection(tokens_b))

In [127]:
# Sample query string that the scoring demos compare against doc_1 and doc_2.
q = 'iphone ÊâãÊú∫'

doc_1 = 'iPhone 13Á≥ªÂàóÂ§ßÈôç‰ª∑ÔºÅÂõΩ‰∫ßÊâãÊú∫ÈöæÂèó‰∫Ü$$$Âçé‰∏∫ÊâãÊú∫ÈÄêÊ∏êÈÄÄÂá∫‰∏ªÊµÅÂ∏ÇÂú∫ÂêéÔºåÂÖ∂‰ΩôÂõΩ‰∫ßÊâãÊú∫ÂìÅÁâåÊäì‰ΩèÊú∫‰ºöÂä™ÂäõÂÜ≤ÂáªÈ´òÁ´ØÂ∏ÇÂú∫„ÄÇ‰ΩÜÂèØÊÉúÁöÑÊòØÔºåÈöèÁùÄËãπÊûúiPhone‚ÄÇ13ÁöÑÂèëÂ∏ÉÔºåÂõΩ‰∫ßÊâãÊú∫ÂìÅÁâåÈ´òÁ´ØÂåñËøõÁ®ãÂèóÂà∞‰∏•ÈáçÂÜ≤Âáª„ÄÇÁî±‰∫éÂÖ®ÁêÉÁº∫ËäØÂíåÈÉ®ÂàÜÂéüÊùêÊñôÊ∂®‰ª∑ÔºåËãπÊûúiPhone‚ÄÇ13Á≥ªÂàóÂú®ÂèëÂ∏É‰πãÂâçË¢´ËÆ§‰∏∫‰ºöÁï•ÂæÆÊ∂®‰ª∑„ÄÇ‰ΩÜÂÆûÈôÖÊÉÖÂÜµÊòØÔºåÁΩëÂèã‰ª¨‰∏ÄÁõ¥Ë∞É‰æÉÁöÑ‚ÄúÂçÅ‰∏âÈ¶ô‚ÄùÁ°ÆÂÆûÂ∫îÈ™å‰∫ÜÔºåiPhone‚ÄÇ13Á≥ªÂàóÁöÑÂõΩË°åÂîÆ‰ª∑Áõ∏ÊØîÂâç‰ª£iPhone‰æøÂÆú‰∫Ü300ÂÖÉ-800ÂÖÉÔºåËÄåÁæéÁâà‰ª∑Ê†ºÂàô‰øùÊåÅ‰∏çÂèò„ÄÇÂØπ‰∫é‰∏≠ÂõΩÊ∂àË¥πËÄÖÊù•ËØ¥ÔºåËãπÊûúiPhone‚ÄÇ13Á≥ªÂàóÂä†Èáè‰∏çÂä†‰ª∑Ëá™ÁÑ∂ÊòØ‰∏Ä‰ª∂Â§ßÂ•Ω‰∫ã„ÄÇ‰ΩÜÊòØÂØπ‰∫éÂõΩÂÜÖÁöÑÊô∫ËÉΩÊâãÊú∫ÂéÇÂïÜÊù•ËØ¥ÔºåiPhone‚ÄÇ13Á≥ªÂàóÁöÑÈôç‰ª∑ÂèØËÉΩ‰ºöÁ†¥ÂùèÂõΩ‰∫ßÊâãÊú∫ÂéÇÂïÜÁöÑÈ´òÁ´ØÂåñËøõÁ®ã„ÄÇÊàë‰ª¨Êù•ÁúãÁúãËãπÊûúiPhone‚ÄÇ13Á≥ªÂàóÁöÑÂîÆ‰ª∑ÔºöiPhone‚ÄÇ13‚ÄÇmini‚ÄÇ128G‚ÄÇÁâà‰ª∑Ê†º‰∏∫5199ÂÖÉÔºå256GBÁâà5999ÂÖÉ„ÄÅ512GBÁâà7599ÂÖÉÔºåiPhone‚ÄÇ13‚ÄÇ128GBÁâà5999ÂÖÉ„ÄÅ256GBÁâà6799ÂÖÉ„ÄÅ512GBÁâà8399ÂÖÉ„ÄÇiPhone‚ÄÇ13‚ÄÇPro‚ÄÇ128GBÁâà7999ÂÖÉ„ÄÅ256GBÁâà8799ÂÖÉ„ÄÅ512GBÁâà10399ÂÖÉ„ÄÅ1TÁâà11999ÂÖÉÔºåiPhone‚ÄÇ13‚ÄÇPro‚ÄÇMax‚ÄÇ128GBÁâà8999ÂÖÉ„ÄÅ256GBÁâà9799ÂÖÉ„ÄÅ512GBÁâà11399ÂÖÉ„ÄÅ1TÁâà12999ÂÖÉ„ÄÇÈô§‰∫Ü‰ª∑Ê†º‰∏ãÈôç‰ª•Â§ñÔºåiPhone‚ÄÇ13Á≥ªÂàóÁöÑÂêÑÈ°πÂèÇÊï∞‰πüÊúâ‰∏çÂ∞èÁöÑÊèêÂçáÔºåÂ∞§ÂÖ∂ÊòØÂú®Áª≠Ëà™ÊñπÈù¢„ÄÇÊ†πÊçÆÁü•ÂêçÊï∞Á†ÅËØÑÊµãÂçö‰∏ª@Â∞èÁôΩÊµãËØÑÁöÑÊï∞ÊçÆÔºåiPhone‚ÄÇ13‚ÄÇPro‚ÄÇMaxÂíåiPhone‚ÄÇ13‰∏§Ê¨æÊú∫ÂûãÈáçÂ∫¶‰ΩøÁî®5Â∞èÊó∂ÂêéÔºå‰ªçÊóßÂâ©‰Ωô‰∏çÂ∞ëÁîµÈáè„ÄÇÂ∞§ÂÖ∂ÊòØiPhone‚ÄÇ13‚ÄÇPro‚ÄÇMaxÂú®‰∫îÂ∞èÊó∂Ê∑±Â∫¶Áª≠Ëà™ÊµãËØï‰∏≠Ë°®Áé∞ÂçÅÂàÜ‰ºòÁßÄÔºåÂâ©‰ΩôÁîµÈáè35%ÔºåËøúË∂ÖÂÖ∂‰ªñÊô∫ËÉΩÊâãÊú∫Ôºå‰ΩçÂàóÊéíË°åÊ¶úÁ¨¨‰∏Ä„ÄÇÈô§Ê≠§‰πãÂ§ñÔºåiPhone‚ÄÇ13Á≥ªÂàóÂè¶‰∏Ä‰∏™ÊúÄÊòéÊòæÁöÑÂçáÁ∫ßÊòØÊîØÊåÅ120HzÈ´òÂà∑ÔºåËøõ‰∏ÄÊ≠•ÊèêÂçá‰∫ÜÊµÅÁïÖÂ∫¶„ÄÇËÄåÂú®ÊãçÁÖß„ÄÅÊëÑÂΩ±ÊñπÈù¢ÔºåÊ≠§Ê¨°iPhone‚ÄÇ13Á≥ªÂàó‰πüÂêåÊ†∑ÊúâËæÉÂ§ßÂπÖÂ∫¶Âú∞ÂçáÁ∫ßÔºåÁâπÂà´ÊòØÁîµÂΩ±Ê®°ÂºèÁöÑÂä†ÂÖ•ÔºåËÆ©ÊâãÊú∫‰∏ì‰∏öÊëÑÂΩ±Êàê‰∏∫ÂèØËÉΩ„ÄÇÂú®Âä†Èáè‰∏çÂä†‰ª∑ÂêéÔºåËãπÊûúiPhone‚ÄÇ13Áõ∏ËæÉÂõΩ‰∫ßÂÆâÂçìÈ´òÁ´ØÊâãÊú∫ÁöÑ‰ºòÂäøÂ∞±Êõ¥Â§ß‰∫Ü„ÄÇÁõÆÂâçÂõΩ‰∫ßÈ´òÁ´ØÊâãÊú∫ÁöÑ‰ª∑‰
ΩçÊôÆÈÅçÊù•Âà∞5000-8000ÂÖÉÔºå‰ΩÜÊòØÂú®ÊÄßËÉΩ„ÄÅÂΩïÂΩ±„ÄÅÁª≠Ëà™„ÄÅÁ≥ªÁªüÁîüÊÄÅ‰ΩìÈ™åÁ≠âÊñπÈù¢‰∏éiPhone‚ÄÇ13Â≠òÂú®ËæÉÂ§ßÂ∑ÆË∑ùÔºåËøôÂæàÂèØËÉΩÂØºËá¥ÂõΩÂÜÖÈ´òÁ´ØÂ∏ÇÂú∫Ë¢´ËãπÊûú‚ÄúÂûÑÊñ≠‚Äù„ÄÇÈöèÁùÄËãπÊûúiPhone‚ÄÇ13Á≥ªÂàóÈôç‰ª∑ÔºåÂπ∂‰∏îË°•ÈΩê‰∫ÜÁª≠Ëà™„ÄÅÈ´òÂà∑Êñ∞ÁéáÁ≠âÁü≠ÊùøÔºåÂÆâÂçìÊóóËà∞ÊâãÊú∫ÊòØÂê¶ËøòÂÄºÂæóË¥≠‰π∞ÔºüÊõæÂá†‰ΩïÊó∂ÔºåÂçé‰∏∫Âá≠ÂÄüÂú®ÂΩ±ÂÉèÊñπÈù¢ÁöÑÂàõÊñ∞‰∏ÄÊ≠•Ê≠•Á´ôÁ®≥È´òÁ´ØÂ∏ÇÂú∫ÔºåËÄåÂçé‰∏∫ÁöÑÊàêÂäüÂØπ‰∫éÂÖ∂‰ªñÂõΩ‰∫ßÊâãÊú∫ÂéÇÂïÜÂÜ≤ÂáªÈ´òÁ´ØÂ∏ÇÂú∫ÂÖ∑ÊúâÊûÅÂÖ∂Ê∑±ËøúÁöÑÂÄüÈâ¥ÊÑè‰πâ„ÄÇÂõ†Ê≠§ÔºåÂΩ±ÂÉèÊàê‰∏∫‰∫ÜÂÆâÂçìÊóóËà∞ÊâãÊú∫ÂØπÊØîiPhone‚ÄÇ13‚ÄÇPro‰∏∫Êï∞‰∏çÂ§öÁöÑ‰ºòÂäø‰πã‰∏Ä„ÄÇ‰ªäÂπ¥ËãπÊûúiPhone‚ÄÇ13Á≥ªÂàóÂú®Áõ∏Êú∫ÂèÇÊï∞ÊñπÈù¢Âπ∂Ê≤°ÊúâÂ§ßÂπÖÂçáÁ∫ßÔºå‰∏ªÊëÑ‰æùÁÑ∂ÊòØ1200‰∏áÂÉèÁ¥†Ôºå‰ΩÜÊòØÂçáÁ∫ß‰∫ÜCMOSÂõæÂÉè‰º†ÊÑüÂô®ÂíåÂÖâÂúàÔºåÊèêÂçá‰∫ÜËøõÂÖâÈáèÔºåÊï¥‰ΩìÁ°¨‰ª∂Â∑ÆË∑ù‰ªçÁÑ∂ÂíåÂÆâÂçìÈòµËê•ÊúâËæÉÂ§ßÁöÑÂ∑ÆË∑ù„ÄÇËôΩÁÑ∂ÁõÆÂâçÊâãÊú∫ÂΩ±ÂÉèÂ∑≤ÁªèËøõÂÖ•ËÆ°ÁÆóÊëÑÂΩ±Êó∂‰ª£ÔºåÁÆóÊ≥ïÂèØËÉΩÊØîÁ°¨‰ª∂Êõ¥Âä†ÈáçË¶ÅÔºå‰ΩÜÊòØÂçé‰∏∫„ÄÅ‰∏âÊòüÁ≠â‰∏Ä‰ºóÈ°∂Á∫ßÂéÇÂïÜÁöÑÁÆóÊ≥ï‰πüÂπ∂‰∏çÂ∑Æ„ÄÇÂõ†Ê≠§ÔºåÂá≠ÂÄüÊõ¥Âº∫ÁöÑÁ°¨‰ª∂Á¥†Ë¥®ÔºåÂÆâÂçìÈ´òÁ´ØÊâãÊú∫Âú®ÊâãÊú∫ÊàêÂÉèË¥®ÈáèÊñπÈù¢‰æùÁÑ∂ÊòØÈ¢ÜÂÖàËãπÊûúiPhone„ÄÇÂè¶Â§ñÔºåÂÆâÂçìÈ´òÁ´ØÊâãÊú∫ÁöÑÂÖÖÁîµÂäüÁéáÁõÆÂâçÂ∑≤ÁªèËøõÂÖ•ÁôæÁì¶Êó∂‰ª£ÔºåÈÉ®ÂàÜÂÆâÂçìÊú∫Âûã‰ªÖÈúÄÂçäÂ∞èÊó∂‰∏çÂà∞Êó∂Èó¥Â∞±ÂèØ‰ª•ÂÖÖÊª°ÁîµÈáè„ÄÇËÄåiPhone‚ÄÇ13Á≥ªÂàóÁöÑÂÖÖÁîµÂäüÁéá‰ªÖÊúâ20WÔºåÂÖÖÊª°ÁîµÈáèÈúÄË¶Å‰∏Ä‰∏™Â§öÂ∞èÊó∂„ÄÇÂõ†Ê≠§ÔºåÂ¶ÇÊûú‰Ω†ÁâπÂà´Âú®ÊÑèÂÖÖÁîµÈÄüÂ∫¶ÂíåÊãçÁÖßÊàêÁâáË¥®ÈáèÔºåÈÇ£‰πàËøòÊòØÂª∫ËÆÆÈ¶ñÈÄâÂÆâÂçìÊóóËà∞ÊâãÊú∫„ÄÇ‰ΩÜÂ¶ÇÊûú‰ªéÁªºÂêà‰ΩìÈ™å‰∏äÊù•ÁúãÔºåÁ¨îËÄÖËÆ§‰∏∫ËãπÊûúiPhone‚ÄÇ13Á≥ªÂàóÁöÑ‰ºòÂäøÊõ¥Â§ßÔºåÈô§‰∫ÜÂú®ÊÄßËÉΩ„ÄÅÂΩïÂΩ±„ÄÅÁª≠Ëà™„ÄÅÁ≥ªÁªüÁîüÊÄÅ‰ΩìÈ™åÁ≠âÊñπÈù¢ÁöÑÈ¢ÜÂÖà‰ºòÂäøÈùûÂ∏∏ÊòéÊòæ‰ª•Â§ñÔºåiPhone‚ÄÇ13Á≥ªÂàóÁöÑÊãçÊëÑ‰ΩìÈ™å‰πüÈùûÂ∏∏Ê£í„ÄÇËôΩÁÑ∂ÊàêÂÉèË¥®Èáè‰∏çÂ¶ÇÂÆâÂçìÊóóËà∞Ôºå‰ΩÜÊòØÂá≠ÂÄüÊõ¥Âº∫ÁöÑA15Â§ÑÁêÜÂô®ÔºåËãπÊûúiPhone‚ÄÇ13Á≥ªÂàóÂ∏¶Êù•Êõ¥Âä†ÊµÅÁïÖÁöÑÊãçÊëÑ‰ΩìÈ™åÔºåÊØîÂ¶ÇÊöóÂÖâÁéØÂ¢É‰∏ãÔºå‰∏ÄÁßíÂ∞±ËÉΩÊàêÁâáÔºåËÄåÂÆâÂçìÊóóËà∞ÂàôÈúÄË¶ÅÂ§ßÈáèÊó∂Èó¥ËøõË°åËÆ°ÁÆó„ÄÇÂèàÊØîÂ¶ÇiPhone‚ÄÇ13Á≥ªÂàóÊãçÁÖßÊó∂ÔºåÈ¢ÑËßàÊ°ÜÂíåÊàêÁâáÂá†‰πéÂèØ‰ª•‰øùÊåÅ‰∏ÄËá¥ÔºåÂÅöÂà∞ÊâÄËßÅÂç≥Ê
âÄÂæó„ÄÇËÄåÂÆâÂçìÊóóËà∞ÊâãÊú∫ÁöÑËÆ°ÁÆóÊÄßËÉΩÂàôÊó†Ê≥ïÊîØÊåÅËøôÊ†∑ÁöÑÊãçÊëÑ‰ΩìÈ™å„ÄÇÊõ¥‰ΩéÁöÑÂîÆ‰ª∑ÔºåÊõ¥Â•ΩÁöÑ‰ΩøÁî®‰ΩìÈ™åÔºåËãπÊûúiPhone‚ÄÇ13ÂØπ‰∫éÂÆâÂçìÊóóËà∞ÊâãÊú∫Êù•ËØ¥Á°ÆÂÆûÊòØ‰∏Ä‰∏™È¢á‰∏∫Âç±Èô©ÁöÑ‰ø°Âè∑„ÄÇÊëÜÂú®ÂõΩ‰∫ßÊâãÊú∫ÂéÇÂïÜÈù¢ÂâçÂè™Êúâ‰∏§Êù°Ë∑ØÔºö‰∏ÄÊù°Ë∑ØÊòØÈôç‰ΩéÊóóËà∞Êú∫ÂûãÂîÆ‰ª∑Ôºå‰ΩÜËøô‰ºö‰∏•ÈáçÂΩ±ÂìçÂõΩ‰∫ßÂìÅÁâåËøõÂáªÈ´òÁ´ØÂ∏ÇÂú∫ÁöÑÂÜ≥ÂøÉ;Âè¶‰∏ÄÊù°Ë∑ØÊòØÂä†Â§ßÁ†îÂèëÂàõÊñ∞ÊäïÂÖ•ÔºåÂØªÊâæÊñ∞ÁöÑÂ∑ÆÂºÇÂåñ‰ºòÂäøÔºåÂª∫ÈÄ†Êä§ÂüéÊ≤≥„ÄÇ - THE END -     $$$https://news.mydrivers.com/1/785/785198.htm'

doc_2 = 'Â∞èÁ±≥Âè≤‰∏äÊúÄÁ™Ñ‚Äú‰∏ãÂ∑¥‚ÄùÔºÅXiaomi CiviÂºÄÁÆ±ÂõæËµè$$$9Êúà22Êó•ÔºåÂ∞èÁ±≥ÂÆòÂÆ£ÂÖ®Êñ∞ÊâãÊú∫Á≥ªÂàó‚Äî‚ÄîXiaomi CiviÔºåÂÆö‰Ωç‰∏ì‰∏∫Âπ¥ËΩª‰∫∫ÊâìÈÄ†ÊΩÆÊµÅÊâãÊú∫ÔºåÈ¶ñÊ¨æÊú∫ÂûãÂÆöÊ°£‰∫é9Êúà27Êó•ÂèëÂ∏É„ÄÇÁé∞Âú®ËøôÊ¨æÊâãÊú∫Â∑≤ÁªèÊä¢ÂÖàÊù•Âà∞Êàë‰ª¨ËØÑÊµãÂÆ§Ôºå‰∏ãÈù¢‰∏∫Â§ßÂÆ∂Â∏¶Êù•ÂõæËµè„ÄÇÔºåÂú®‰øùÊåÅ‰∏ùÊªëÊâãÊÑüÁöÑ‰ΩìÈ™å‰πã‰∏äÔºåËìùËâ≤ÂíåÈªëËâ≤ËøòÂ∏¶Êù•‰∫ÜBlingBlingÁöÑÈó™‰∫ÆÂ§ñËßÇÔºåÂêåÊó∂‰∏çÁïôÊåáÁ∫πÔºåËÄåÁã¨ÁâπÁöÑC‰ΩçÁ≤âËâ≤ÁâàÊú¨ËøòÂ∏¶ÊúâÂÖ®Êñ∞ÁöÑÁªíÊØõÁ∫πÁêÜ„ÄÇÔºõÂØπÊâãÈÉ®ÁöÑË¥üÊãÖÊõ¥Â∞èÔºåËøôÂØπÂ•≥ÊÄßÁî®Êà∑Êù•ËØ¥Â∞§ÂÖ∂ÈáçË¶Å„ÄÇÊñ∞Êú∫ÈÖçÂ§á6.55Ëã±ÂØ∏Â±èÂπïÔºåÂÅöÂà∞‰∫Ü‰∏é6.1Ëã±ÂØ∏Â±èÂπïÁöÑiPhone 13Á≠âÂÆΩ„ÄÇXiaomi CiviÁöÑÁ≤æËá¥ÊÑü‰∏ç‰ªÖÊù•Ëá™ÂêéÁõñÁöÑÂ∑•Ëâ∫Âíå‰∏≠Ê°ÜÁöÑÂºßÂ∫¶ÔºåÂêåÊó∂ËøòÊúâÊûÅËá¥Á™ÑËæπÊ°ÜÔºå - THE END -ËΩ¨ËΩΩËØ∑Ê≥®ÊòéÂá∫Â§ÑÔºöÂø´ÁßëÊäÄ     $$$https://news.mydrivers.com/1/785/785197.htm'

# Print the shared-token count of the query against each sample document.
print(f"score of doc_1: {score_intersection(q, doc_1)}\nscore of doc_2: {score_intersection(q, doc_2)}")

score of doc_1: 3
score of doc_2: 3


In [128]:
# Inspect the token set jieba produces for the query text; the cell output
# shows that the space between the two words is kept as a token.
set(jieba.cut('iPhone ÊâãÊú∫'))

{' ', 'iPhone', 'ÊâãÊú∫'}

### * 停用词  
![Image on 2021-10-14 10.48.08 AM.jpg](attachment:7b5d170f-d0f5-4741-9405-54a012d50e82.jpg)

In [129]:
def score_intersection(doc1, doc2):
    """
    Return how many distinct non-stop-word jieba tokens two texts share.

    Both texts are lower-cased before tokenizing.
    """
    stop_word_set = {' ', '$', '-', '.', '/', 'ÔºÅ', 'Ôºå', 'Ôºö', 'Ôºü', 'ÊòØ', 'ËÆ©', '‰∫Ü', 'ÁöÑ', 'Âïä', 'Âêß'}

    shared_tokens = set(jieba.cut(doc1.lower())) & set(jieba.cut(doc2.lower()))
    return len(shared_tokens - stop_word_set)

In [130]:
# Re-run the comparison with the stop-word-aware score_intersection.
print(f"score of doc_1: {score_intersection(q, doc_1)}\nscore of doc_2: {score_intersection(q, doc_2)}")

score of doc_1: 2
score of doc_2: 2


#### JaccardÁ≥ªÊï∞(Jaccard index)  -> JaccardË∑ùÁ¶ª
![Image on 2021-10-14 09.51.22 AM.jpg](attachment:8fde4141-8beb-431b-8178-5d3cf2bed0b9.jpg)

### * 相似度(Similarity) vs. 距离(Distance)

#### JaccardÁõ∏‰ººÂ∫¶
#### ËÄÉËôë‰∫ÜÊñáÊ°£ÈïøÂ∫¶ÂØπÁõ∏‰ººÂ∫¶ÁöÑÂΩ±ÂìçÔºåÊñáÊ°£Ë∂äÈïøÔºåÁõ∏‰ººÂ∫¶Ë∂ä‰Ωé
\begin{equation}
\operatorname{score}(Q, D)=\frac{|Q \cap D|}{|Q|+|D|-|Q \cap D|}
\end{equation}

In [131]:
def score_jaccard(doc1, doc2):
    """
    Return the Jaccard similarity of the jieba token sets of two texts.

    Both texts are lower-cased before tokenizing.  The score is
    ``|A & B| / |A | B|``; when neither text yields any token the union is
    empty and 0.0 is returned instead of dividing by zero.
    """
    word_set_doc1 = set(jieba.cut(doc1.lower()))
    word_set_doc2 = set(jieba.cut(doc2.lower()))
    union_size = len(word_set_doc1 | word_set_doc2)
    if union_size == 0:
        return 0.0
    return len(word_set_doc1 & word_set_doc2) / union_size

In [132]:
# Print the Jaccard similarity of the query against each sample document.
print(f"score of doc_1: {score_jaccard(q, doc_1)}\nscore of doc_2: {score_jaccard(q, doc_2)}")

score of doc_1: 0.007936507936507936
score of doc_2: 0.022900763358778626


### 改进方向……

## 向量空间模型

![Image on 2021-10-14 02.03.03 PM.jpg](attachment:99d0a869-07e8-4432-ba1c-ae52bc71bc3d.jpg)

![Image on 2021-10-14 02.07.38 PM.jpg](attachment:ba523685-a70e-4cea-99bd-1b04f387bf28.jpg)

![Image on 2021-10-14 07.15.52 PM.jpg](attachment:26e6bbf3-bedd-413a-8a12-7e387e51a43a.jpg)

In [133]:
from math import sqrt

def score_vsm(doc1, doc2):
    """
    Return the cosine similarity of the one-hot token vectors of two texts.

    The vector space is the sorted union of both texts' jieba tokens
    (lower-cased) minus the stop words.  Returns 0.0 when either vector has
    zero norm (for example a text made only of stop words) instead of
    raising ZeroDivisionError.
    """
    stop_word_set = {' ', '$', '-', '.', '/', 'ÔºÅ', 'Ôºå', 'Ôºö', 'Ôºü', '‚Äú', '‚Äù', '„ÄÅ', '„ÄÇ', ':', ';', '@', 'ÊòØ', 'ËÆ©', '‰∫Ü', 'ÁöÑ', 'Âïä', 'Âêß'}

    word_set_doc1 = set(jieba.cut(doc1.lower()))
    word_set_doc2 = set(jieba.cut(doc2.lower()))

    # Vocabulary: every distinct token of either text, stop words removed.
    vocabulary = sorted((word_set_doc1 | word_set_doc2) - stop_word_set)

    # One-hot document vectors: 1 if the word occurs in the text, else 0.
    vector_doc1 = [1 if word in word_set_doc1 else 0 for word in vocabulary]
    vector_doc2 = [1 if word in word_set_doc2 else 0 for word in vocabulary]

    # cosine = (V1 . V2) / (||V1|| * ||V2||)
    dot = sum(a * b for a, b in zip(vector_doc1, vector_doc2))
    norm = sqrt(sum(a * a for a in vector_doc1)) \
        * sqrt(sum(b * b for b in vector_doc2))
    if norm == 0:
        return 0.0
    return dot / norm

In [134]:
# Print the cosine similarity of the query against each sample document.
print(f"score of doc_1: {score_vsm(q, doc_1)}\nscore of doc_2: {score_vsm(q, doc_2)}")

score of doc_1: 0.07474350927519358
score of doc_2: 0.13074409009212268


### 面向文档比较的改进：统一向量空间(维度)

In [136]:
from math import sqrt

def score_vsm(doc1, doc2, vocabulary):
    """
    Return the cosine similarity of two texts as one-hot vectors.

    ``vocabulary`` is the ordered list of words that defines the shared
    vector space.  Both texts are lower-cased and tokenized with jieba.
    Returns 0.0 when either vector has zero norm (no vocabulary word occurs
    in the text) instead of raising ZeroDivisionError.
    """
    word_set_doc1 = set(jieba.cut(doc1.lower()))
    word_set_doc2 = set(jieba.cut(doc2.lower()))

    # One-hot document vectors: 1 if the word occurs in the text, else 0.
    vector_doc1 = [1 if word in word_set_doc1 else 0 for word in vocabulary]
    vector_doc2 = [1 if word in word_set_doc2 else 0 for word in vocabulary]

    # cosine = (V1 . V2) / (||V1|| * ||V2||)
    dot = sum(a * b for a, b in zip(vector_doc1, vector_doc2))
    norm = sqrt(sum(a * a for a in vector_doc1)) \
        * sqrt(sum(b * b for b in vector_doc2))
    if norm == 0:
        return 0.0
    return dot / norm

In [137]:
# Build one vocabulary from the query and both documents (stop words
# removed) so that both comparisons use the same vector space.
stop_word_set = {' ', '$', '-', '.', '/', 'ÔºÅ', 'Ôºå', 'Ôºö', 'Ôºü', '‚Äú', '‚Äù', '„ÄÅ', '„ÄÇ', ':', ';', '@', 'ÊòØ', 'ËÆ©', '‰∫Ü', 'ÁöÑ', 'Âïä', 'Âêß'}
word_set_q, word_set_doc_1, word_set_doc_2 = (
    set(jieba.cut(text.lower())) for text in (q, doc_1, doc_2)
)
vocabulary = sorted((word_set_q | word_set_doc_1 | word_set_doc_2) - stop_word_set)
print(f"score of doc_1: {score_vsm(q, doc_1, vocabulary)}\nscore of doc_2: {score_vsm(q, doc_2, vocabulary)}")

score of doc_1: 0.07474350927519358
score of doc_2: 0.13074409009212268


### 加入词频信息

In [138]:
from math import sqrt

def score_vsm(doc1, doc2, vocabulary):
    """
    Return the cosine similarity of two texts as term-count vectors.

    ``vocabulary`` is the ordered list of words that defines the shared
    vector space; each dimension holds how many times that word occurs in
    the lower-cased, jieba-tokenized text.  Returns 0.0 when either vector
    has zero norm instead of raising ZeroDivisionError.
    """
    from collections import Counter

    # Count every token once instead of scanning the list per vocabulary word.
    counts_doc1 = Counter(jieba.cut(doc1.lower()))
    counts_doc2 = Counter(jieba.cut(doc2.lower()))

    # Term-count document vectors (a Counter returns 0 for absent words).
    vector_doc1 = [counts_doc1[word] for word in vocabulary]
    vector_doc2 = [counts_doc2[word] for word in vocabulary]

    # cosine = (V1 . V2) / (||V1|| * ||V2||)
    dot = sum(a * b for a, b in zip(vector_doc1, vector_doc2))
    norm = sqrt(sum(a * a for a in vector_doc1)) \
        * sqrt(sum(b * b for b in vector_doc2))
    if norm == 0:
        return 0.0
    return dot / norm

In [139]:
# Rebuild the shared vocabulary (stop words removed) and score both
# documents with the score_vsm currently defined.
stop_word_set = {' ', '$', '-', '.', '/', 'ÔºÅ', 'Ôºå', 'Ôºö', 'Ôºü', '‚Äú', '‚Äù', '„ÄÅ', '„ÄÇ', ':', ';', '@', 'ÊòØ', 'ËÆ©', '‰∫Ü', 'ÁöÑ', 'Âïä', 'Âêß'}
word_set_q, word_set_doc_1, word_set_doc_2 = (
    set(jieba.cut(text.lower())) for text in (q, doc_1, doc_2)
)
vocabulary = sorted((word_set_q | word_set_doc_1 | word_set_doc_2) - stop_word_set)
print(f"score of doc_1: {score_vsm(q, doc_1, vocabulary)}\nscore of doc_2: {score_vsm(q, doc_2, vocabulary)}")

score of doc_1: 0.3922077957854423
score of doc_2: 0.20739033894608505


### 关于相似度得分的思考

### 向量空间维度是否正交？重要性是否均等？

![Image on 2021-10-14 03.05.13 PM.jpg](attachment:d33b36ba-e4e3-4563-97ae-12e201d562ff.jpg)

In [None]:
from math import sqrt
    
class VSMOneHot():
    """
    Vector-space model with one-hot (presence/absence) term weights.

    The vocabulary is the sorted set of jieba tokens of all documents in
    ``doc_list`` (lower-cased), with the stop words removed.
    """

    def __init__(self, doc_list):
        """Build the vocabulary from the given list of document strings."""
        self.stop_word_set = set([' ', '$', '-', '.', '/', 'ÔºÅ', 'Ôºå', 'Ôºö', 'Ôºü', '‚Äú', '‚Äù', '„ÄÅ', '„ÄÇ', ':', ';', '@', 'ÊòØ', 'ËÆ©', '‰∫Ü', 'ÁöÑ', 'Âïä', 'Âêß'])
        words = set()
        for doc in doc_list:
            words.update(jieba.cut(doc.lower()))
        self.vocabulary = sorted(words - self.stop_word_set)

    def score(self, q, doc):
        """Return the cosine similarity between query ``q`` and ``doc``."""
        vector_q = self.vectorize(q)
        vector_doc = self.vectorize(doc)
        return self.cosine(vector_q, vector_doc)

    def vectorize(self, doc):
        """Return a 0/1 list, one entry per vocabulary word, for ``doc``."""
        word_set = set(jieba.cut(doc.lower()))
        return [1 if word in word_set else 0 for word in self.vocabulary]

    def cosine(self, vec1, vec2):
        """
        Return ``(vec1 . vec2) / (||vec1|| * ||vec2||)``.

        Returns 0.0 when either vector has zero norm (for example a text
        that contains no vocabulary word) instead of dividing by zero.
        """
        dot = sum(a * b for a, b in zip(vec1, vec2))
        norm = sqrt(sum(a * a for a in vec1)) * sqrt(sum(b * b for b in vec2))
        if norm == 0:
            return 0.0
        return dot / norm

In [None]:
# Build the one-hot model; its vocabulary comes from the two sample documents.
vsm_model = VSMOneHot([doc_1, doc_2])

In [None]:
# Print the model's cosine score of the query against each sample document.
print(f"score of doc_1: {vsm_model.score(q, doc_1)}\nscore of doc_2: {vsm_model.score(q, doc_2)}")

### 加入：
#### TF: 词频（前景）
#### DF: 文档频（背景）

In [None]:
from math import sqrt, log10
from collections import defaultdict    
    
class VSMTFIDF():
    """
    Vector-space model with term-frequency / document-frequency weights.

    The vocabulary is the sorted set of jieba tokens of all documents in
    ``doc_list`` (lower-cased), with the stop words removed.  ``df`` holds,
    for every token, the number of documents that contain it.  A word's
    weight in a text is ``tf * sqrt(doc_count / df[word])``.
    """

    def __init__(self, doc_list):
        """Collect document frequencies and the vocabulary from ``doc_list``."""
        self.stop_word_set = set([' ', '$', '-', '.', '/', 'ÔºÅ', 'Ôºå', 'Ôºö', 'Ôºü', '‚Äú', '‚Äù', '„ÄÅ', '„ÄÇ', ':', ';', '@', 'ÊòØ', 'ËÆ©', '‰∫Ü', 'ÁöÑ', 'Âïä', 'Âêß'])
        self.doc_count = len(doc_list)
        self.df = defaultdict(int)
        words = set()
        for doc in doc_list:
            doc_word_set = set(jieba.cut(doc.lower()))
            for word in doc_word_set:
                self.df[word] += 1
            words |= doc_word_set
        self.vocabulary = sorted(words - self.stop_word_set)

    def score(self, q, doc):
        """Return the cosine similarity between query ``q`` and ``doc``."""
        vector_q = self.vectorize(q)
        vector_doc = self.vectorize(doc)
        return self.cosine(vector_q, vector_doc)

    def vectorize(self, doc):
        """
        Return the weighted vector of ``doc``, one entry per vocabulary word.

        Tokens are counted once with a Counter rather than scanning the
        token list for every vocabulary word.
        """
        from collections import Counter

        term_counts = Counter(jieba.cut(doc.lower()))
        # Every vocabulary word came from a document, so df[word] >= 1.
        return [
            term_counts[word] * sqrt(self.doc_count / self.df[word])
            for word in self.vocabulary
        ]

    def cosine(self, vec1, vec2):
        """
        Return ``(vec1 . vec2) / (||vec1|| * ||vec2||)``.

        Returns 0.0 when either vector has zero norm (for example a query
        that contains no vocabulary word) instead of dividing by zero.
        """
        dot = sum(a * b for a, b in zip(vec1, vec2))
        norm = sqrt(sum(a * a for a in vec1)) * sqrt(sum(b * b for b in vec2))
        if norm == 0:
            return 0.0
        return dot / norm

In [None]:
# Build the TF/DF-weighted model from the two sample documents.
vsm_model = VSMTFIDF([doc_1, doc_2])

In [None]:
# Print the model's cosine score of the query against each sample document.
print(f"score of doc_1: {vsm_model.score(q, doc_1)}\nscore of doc_2: {vsm_model.score(q, doc_2)}")

### TFÔºöËØçÈ¢ë ‚Äî‚Äî **ËØçÁöÑ`ÂåπÈÖçÊÄß`**
### DFÔºöÊñáÊ°£È¢ë ‚Äî‚Äî **ËØçÁöÑ`Âå∫ÂàÜÊÄß`**
### ÊÄùËÄÉÔºöDFÂ∫îËØ•Âú®‰ªÄ‰πàËåÉÂõ¥ÂÜÖËøõË°åÁªüËÆ°Ôºü

![Image on 2021-10-14 08.52.40 PM.jpg](attachment:c1470f1c-2295-4368-b7c4-2aafb19efece.jpg)

In [None]:
from IPython.core.display import display, HTML
import bisect
from collections import defaultdict
import jieba
import pickle
from math import sqrt, log10
from collections import defaultdict    
    
class VSMTFIDF():
    """
    Vector-space model with term-frequency / document-frequency weights.

    The vocabulary is the sorted set of jieba tokens of all documents in
    ``doc_list`` (lower-cased), with the stop words removed.  ``df`` holds,
    for every token, the number of documents that contain it.  A word's
    weight in a text is ``tf * sqrt(doc_count / df[word])``.
    """

    def __init__(self, doc_list):
        """Collect document frequencies and the vocabulary from ``doc_list``."""
        self.stop_word_set = set([' ', '$', '-', '.', '/', 'ÔºÅ', 'Ôºå', 'Ôºö', 'Ôºü', '‚Äú', '‚Äù', '„ÄÅ', '„ÄÇ', ':', ';', '@', 'ÊòØ', 'ËÆ©', '‰∫Ü', 'ÁöÑ', 'Âïä', 'Âêß'])
        self.doc_count = len(doc_list)
        self.df = defaultdict(int)
        words = set()
        for doc in doc_list:
            doc_word_set = set(jieba.cut(doc.lower()))
            for word in doc_word_set:
                self.df[word] += 1
            words |= doc_word_set
        self.vocabulary = sorted(words - self.stop_word_set)

    def score(self, q, doc):
        """Return the cosine similarity between query ``q`` and ``doc``."""
        vector_q = self.vectorize(q)
        vector_doc = self.vectorize(doc)
        return self.cosine(vector_q, vector_doc)

    def vectorize(self, doc):
        """
        Return the weighted vector of ``doc``, one entry per vocabulary word.

        Tokens are counted once with a Counter rather than scanning the
        token list for every vocabulary word.
        """
        from collections import Counter

        term_counts = Counter(jieba.cut(doc.lower()))
        # Every vocabulary word came from a document, so df[word] >= 1.
        return [
            term_counts[word] * sqrt(self.doc_count / self.df[word])
            for word in self.vocabulary
        ]

    def cosine(self, vec1, vec2):
        """
        Return ``(vec1 . vec2) / (||vec1|| * ||vec2||)``.

        Returns 0.0 when either vector has zero norm (for example a query
        that contains no vocabulary word) instead of dividing by zero.
        """
        dot = sum(a * b for a, b in zip(vec1, vec2))
        norm = sqrt(sum(a * a for a in vec1)) * sqrt(sum(b * b for b in vec2))
        if norm == 0:
            return 0.0
        return dot / norm

class MySearchC5V1(MySearchC5V0):
    """
    Search engine that ranks boolean-query matches with VSMTFIDF.

    Version history
    ---------------
    C3V0: Base class for Search Engine.
    C3V1: Data multiplication added.
    C3V2: Sorting optimization.
    C3V3: Add lowered version of docs.
    C3V4: For long doc.
    C3V5: Caching search results.
    C3V6: Pre-caching all words in docs.
    C3V7: Add Serialize/UnSerialize.
    C4V1: Add basic Bool query support
    C4V2: Add wordseg to get_word_match()
    ----------------C5V0-----------------
    C5V1: Use VSMTFIDF.score() as score

    Methods
    -------
    search(query, num=15):
        get the top num matches of a query, best VSMTFIDF score first.

    Everything else is inherited from MySearchC5V0.
    """

    def search(self, query, num=15):
        """
        Return the ids of the ``num`` best-scoring documents for ``query``.

        The boolean query selects the candidate documents; a VSMTFIDF model
        built from just those candidates scores each one against the query
        words (operators and parentheses removed).
        """
        query_lower = query.lower()
        expression = self.query_to_set_expression(query_lower)
        # The set expression is executed with eval(); it calls
        # self.get_word_match() for every query word.
        match_tid_list = list(eval(expression))
        vsm_model = VSMTFIDF([self.docs_lower[tid] for tid in match_tid_list])

        operator_tokens = {'(', ')', 'and', 'AND', '+', 'or', 'OR', 'NOT', 'not', '-', ' ', ''}
        query_terms = ' '.join(set(jieba.cut(query_lower)) - operator_tokens)

        scored = [
            (tid, vsm_model.score(query_terms, self.docs_lower[tid]))
            for tid in match_tid_list
        ]
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return [tid for tid, _ in ranked[:num]]

In [None]:
# Create the C5V1 engine from 'titles_l.dat'; a name ending in 'dat' makes
# load_data unpickle previously saved documents and index.
searcher = MySearchC5V1('titles_l.dat', 1)

In [None]:
# Run a two-term AND query, keep the 10 best-ranked ids and render them.
query = 'ÊâãÊú∫ AND ÊÄßËÉΩ'
search_result = searcher.search(query, num=10)
searcher.render(search_result, query)

### BM25(Okapi BM25)  
#### BM25ÊòØ‰ø°ÊÅØÁ¥¢ÂºïÈ¢ÜÂüüÁî®Êù•ËÆ°ÁÆóquery‰∏éÊñáÊ°£Áõ∏‰ººÂ∫¶ÂæóÂàÜÁöÑÁªèÂÖ∏ÁÆóÊ≥ï„ÄÇ  

\begin{equation}
\operatorname{score}(D, Q)=\sum_{i=1}^{n} \operatorname{IDF}\left(q_{i}\right) \cdot \frac{f\left(q_{i}, D\right) \cdot\left(k_{1}+1\right)}{f\left(q_{i}, D\right)+k_{1} \cdot\left(1-b+b \cdot \frac{|D|}{\text { avgdl }}\right)}
\end{equation}  
\begin{equation}
\operatorname{IDF}\left(q_{i}\right)=\ln \left(\frac{N-n\left(q_{i}\right)+0.5}{n\left(q_{i}\right)+0.5}+1\right)
\end{equation}

In [None]:
from math import log
from collections import defaultdict

class BM25():
    """
    Okapi BM25 scorer over a list of document strings.

    ``df`` maps each lower-cased jieba token to the number of documents
    containing it.  Document length (``dl``) and the average length
    (``avgdl``) are measured in characters (``len(doc)``).
    """

    def __init__(self, doc_list):
        """
        Collect document frequencies and the average document length.

        Documents are lower-cased before tokenizing so that ``df`` keys match
        the lower-cased terms looked up in ``score``.  An empty ``doc_list``
        leaves ``avgdl`` at 0 instead of raising ZeroDivisionError.
        """
        self.doc_count = len(doc_list)
        self.avgdl = 0
        self.df = defaultdict(int)
        for doc in doc_list:
            for word in set(jieba.cut(doc.lower())):
                self.df[word] += 1
            self.avgdl += len(doc)
        if self.doc_count:
            self.avgdl /= self.doc_count

    def score(self, q, doc):
        """
        Return the BM25 score of ``doc`` for query ``q`` (k1=1.5, b=0.75).

        The query is lower-cased and tokenized; boolean operators,
        parentheses and blanks are dropped.  Each distinct remaining term
        contributes ``idf * f * (k1 + 1) / (f + k1 * (1 - b + b * dl / avgdl))``
        where ``f`` is the term's count in the document.
        """
        from collections import Counter

        k1 = 1.5
        b = 0.75
        result = 0
        query_new = set(jieba.cut(q.lower())) - set(['(', ')', 'and', 'AND', '+', 'or', 'OR', 'NOT', 'not', '+', '-', ' ', ''])
        term_counts = Counter(jieba.cut(doc.lower()))
        dl = len(doc)
        # Length normalisation is the same for every term; when avgdl is 0
        # (empty corpus or only empty documents) treat the length ratio as 1.
        length_norm = k1 * (1 - b + (b * dl / self.avgdl if self.avgdl else b))
        for keyword in query_new:
            f = term_counts[keyword]
            # .get() keeps a lookup of an unseen term from growing self.df.
            n = self.df.get(keyword, 0)
            idf = log((self.doc_count - n + 0.5) / (n + 0.5) + 1)
            result += idf * ((f * (k1 + 1)) / (f + length_norm))
        return result

In [None]:
# Build the BM25 statistics from the two sample documents.
bm25_model = BM25([doc_1, doc_2])

In [None]:
# Print the BM25 score of the query against each sample document.
print(f"score of doc_1: {bm25_model.score(q, doc_1)}\nscore of doc_2: {bm25_model.score(q, doc_2)}")

In [None]:
from IPython.core.display import display, HTML
import bisect
from collections import defaultdict
import jieba
import pickle
from math import sqrt, log
from collections import defaultdict    
    
class BM25():
    """
    Okapi BM25 scorer over a list of document strings.

    ``df`` maps each lower-cased jieba token to the number of documents
    containing it.  Document length (``dl``) and the average length
    (``avgdl``) are measured in characters (``len(doc)``).
    """

    def __init__(self, doc_list):
        """
        Collect document frequencies and the average document length.

        Documents are lower-cased before tokenizing so that ``df`` keys match
        the lower-cased terms looked up in ``score``.  An empty ``doc_list``
        leaves ``avgdl`` at 0 instead of raising ZeroDivisionError.
        """
        self.doc_count = len(doc_list)
        self.avgdl = 0
        self.df = defaultdict(int)
        for doc in doc_list:
            for word in set(jieba.cut(doc.lower())):
                self.df[word] += 1
            self.avgdl += len(doc)
        if self.doc_count:
            self.avgdl /= self.doc_count

    def score(self, q, doc):
        """
        Return the BM25 score of ``doc`` for query ``q`` (k1=1.5, b=0.75).

        The query is lower-cased and tokenized; boolean operators,
        parentheses and blanks are dropped.  Each distinct remaining term
        contributes ``idf * f * (k1 + 1) / (f + k1 * (1 - b + b * dl / avgdl))``
        where ``f`` is the term's count in the document.
        """
        from collections import Counter

        k1 = 1.5
        b = 0.75
        result = 0
        query_new = set(jieba.cut(q.lower())) - set(['(', ')', 'and', 'AND', '+', 'or', 'OR', 'NOT', 'not', '+', '-', ' ', ''])
        term_counts = Counter(jieba.cut(doc.lower()))
        dl = len(doc)
        # Length normalisation is the same for every term; when avgdl is 0
        # (empty corpus or only empty documents) treat the length ratio as 1.
        length_norm = k1 * (1 - b + (b * dl / self.avgdl if self.avgdl else b))
        for keyword in query_new:
            f = term_counts[keyword]
            # .get() keeps a lookup of an unseen term from growing self.df.
            n = self.df.get(keyword, 0)
            idf = log((self.doc_count - n + 0.5) / (n + 0.5) + 1)
            result += idf * ((f * (k1 + 1)) / (f + length_norm))
        return result
    

class MySearchC5V2(MySearchC5V0):
    """
    Search engine that ranks boolean-query matches with BM25.

    Version history
    ---------------
    C3V0: Base class for Search Engine.
    C3V1: Data multiplication added.
    C3V2: Sorting optimization.
    C3V3: Add lowered version of docs.
    C3V4: For long doc.
    C3V5: Caching search results.
    C3V6: Pre-caching all words in docs.
    C3V7: Add Serialize/UnSerialize.
    C4V1: Add basic Bool query support
    C4V2: Add wordseg to get_word_match()
    ----------------C5V0-----------------
    C5V1: Use VSMTFIDF.score() as score
    C5V2: Use BM25.score() as score

    Methods
    -------
    search(query, num=15):
        get the top num matches of a query, best BM25 score first.

    Everything else is inherited from MySearchC5V0.
    """

    def search(self, query, num=15):
        """
        Return the ids of the ``num`` best-scoring documents for ``query``.

        The boolean query selects the candidate documents; a BM25 model built
        from just those candidates scores each one against the query words
        (operators and parentheses removed).  Returns an empty list when no
        document matches.
        """
        query_lower = query.lower()
        query_new = self.query_to_set_expression(query_lower)
        # NOTE(security): the generated set expression is executed with
        # eval(); do not feed it queries from an untrusted source unless the
        # expression builder quotes query words safely.
        match_tid_list = list(eval(query_new))
        if not match_tid_list:
            # Nothing to rank; avoids building BM25 statistics over an empty
            # corpus, whose average length is undefined.
            return []
        bm25_model = BM25([self.docs_lower[tid] for tid in match_tid_list])
        query_new = ' '.join(set(jieba.cut(query_lower)) - set(['(', ')', 'and', 'AND', '+', 'or', 'OR', 'NOT', 'not', '+', '-', ' ', '']))
        result_list = [(tid, bm25_model.score(query_new, self.docs_lower[tid])) for tid in match_tid_list]
        result_list.sort(key=lambda x: x[1], reverse=True)
        return [doc_id for doc_id, _ in result_list[:num]]

In [None]:
# Create the C5V2 engine from 'titles_l.dat'; a name ending in 'dat' makes
# load_data unpickle previously saved documents and index.
searcher = MySearchC5V2('titles_l.dat', 1)

In [None]:
# Run a two-term AND query, keep the 10 best-ranked ids and render them.
query = 'ÊâãÊú∫ AND ÊÄßËÉΩ'
search_result = searcher.search(query, num=10)
searcher.render(search_result, query)