In [2]:
# simple use of the class
class Document:
    """A minimal document record used to demonstrate basic class usage.

    ``title`` and ``author`` are public attributes. The body text is held in
    a double-underscore (name-mangled) attribute and is only exposed through
    :meth:`get_context_length`.
    """

    def __init__(self, title, author, context):
        """Announce the call on stdout, then store the metadata and body text."""
        print('init function called')
        self.title, self.author = title, author
        # A leading double underscore triggers name mangling, i.e. a
        # "private" attribute by convention.
        self.__context = context

    def get_context_length(self):
        """Return the length of the stored body text."""
        body = self.__context
        return len(body)

# Build a sample document, then print its body length followed by its title.
book1 = Document(
    title="Python",
    author="zhenzhenli",
    context="this is a guide to useing python",
)
print(book1.get_context_length())
print(book1.title)

init function called
32
Python


In [3]:
# introduction to classmethod and staticmethod
class Document():
    """Document record that also demonstrates class and static methods.

    ``title`` and ``author`` are public attributes; the body text is kept in
    a double-underscore (name-mangled) attribute.
    """

    # Template used by get_welcome(); the placeholder receives the text to show.
    WELCOME_STR = "welcome，the content of this book is {}."

    def __init__(self, title, author, context):
        """Announce the call on stdout, then store the metadata and body text."""
        print('init function called')
        self.title = title
        self.author = author
        self.__context = context  # double underscore marks the attribute as private

    def get_context_length(self):
        """Return the length of the stored body text."""
        return len(self.__context)

    @classmethod
    def create_empty_book(cls):
        """Alternative constructor: return an instance with empty title, author and text.

        Instantiates ``cls`` rather than ``Document`` so that calling this on
        a subclass yields an instance of that subclass.
        """
        return cls('', '', '')

    @staticmethod
    def get_welcome(context):
        """Return the welcome message with ``context`` substituted into WELCOME_STR."""
        return Document.WELCOME_STR.format(context)

# Class methods are typically used to provide alternative constructors.
empty_book = Document.create_empty_book()
print(empty_book.get_context_length())

# Static methods suit simple, self-contained tasks that need no instance state.
print(Document.get_welcome("indeed nothing"))


init function called
0
welcome，the content of this book is indeed nothing.


In [None]:
# class example: search engine

import re
import ipdb;  # ipdb.set_trace() breakpoint

# the base class of search engine
class SearchEngineBase():
    """Base class defining the search-engine interface.

    Subclasses are expected to override :meth:`process_corpus` and
    :meth:`search`. :meth:`add_corpus` is shared: it reads a file and hands
    the text to :meth:`process_corpus`.
    """

    def __init__(self):
        print("super init")

    def add_corpus(self, file_path):
        """Read the whole file at ``file_path`` and index its contents.

        The file path itself is passed to :meth:`process_corpus` as the
        document id.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        with open(file_path, 'r') as fin:
            text = fin.read()
        self.process_corpus(file_path, text)

    def process_corpus(self, id, text):
        """Index ``text`` under ``id``; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplementedError is the conventional signal for an abstract
        # method and is still an Exception subclass, so handlers that catch
        # Exception keep working.
        raise NotImplementedError('process_corpus not implemented.')

    def search(self, query):
        """Return the ids matching ``query``; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('the query method not implemented.')

# simple search engine
class SimpleSearchEngine(SearchEngineBase):
    """Naive engine: stores each document's raw text and matches by substring."""

    def __init__(self):
        super().__init__()
        print("sub init")
        # Private (name-mangled) mapping of document id -> full text.
        self.__id_to_texts = dict()

    def process_corpus(self, id, text):
        """Store ``text`` under ``id``, replacing any earlier entry for that id."""
        self.__id_to_texts[id] = text

    def search(self, query):
        """Return the ids of all stored documents whose text contains ``query``."""
        return [
            doc_id
            for doc_id, body in self.__id_to_texts.items()
            if query in body
        ]

# Bag of Words Model, Word Segmentation-based Search Engine
class BOWSearchEngine(SearchEngineBase):
    """Bag-of-words engine: each document is reduced to its set of words.

    A document matches a query when every word of the query occurs in the
    document's word set; word order and repetition are ignored.
    """

    def __init__(self):
        super().__init__()
        print('sub init')
        # Private (name-mangled) mapping of document id -> set of words.
        self.__id_to_words = dict()

    def process_corpus(self, id, text):
        """Tokenize ``text`` and store the resulting word set under ``id``."""
        self.__id_to_words[id] = self.parse_text_to_words(text)

    def search(self, query):
        """Return the ids of documents containing every word of ``query``."""
        wanted = self.parse_text_to_words(query)
        return [
            doc_id
            for doc_id, vocabulary in self.__id_to_words.items()
            if self.query_match(wanted, vocabulary)
        ]

    @staticmethod
    def parse_text_to_words(text):
        """Split ``text`` into a set of lower-cased words.

        Every character that is neither a word character nor a space (so
        punctuation and newlines too) is replaced by a space before splitting.
        """
        # Use a regular expression to blank out punctuation and newlines,
        # then normalize case.
        cleaned = re.sub(r'[^\w ]', ' ', text).lower()
        # Splitting on single spaces yields empty strings for runs of
        # spaces; drop them while building the set.
        return {token for token in cleaned.split(' ') if token}

    @staticmethod
    def query_match(query_words, words):
        """Return True when every item of ``query_words`` is in ``words``."""
        return all(term in words for term in query_words)
        
def main(search_engine, file_paths=None):
    """Index the corpus files, then answer queries interactively.

    Args:
        search_engine: object providing ``add_corpus(file_path)`` and
            ``search(query)``; ``search`` must return a sized iterable.
        file_paths: iterable of corpus file paths to index. When omitted,
            the original fixed set ``1.txt`` .. ``4.txt`` is used.

    The loop reads queries with ``input()`` and stops when the user enters
    exactly ``q``. For every other query it prints the number of results
    followed by one result per line.
    """
    if file_paths is None:
        # Default corpus, kept for backward compatibility with main(engine).
        file_paths = ('1.txt', '2.txt', '3.txt', '4.txt')

    for file_path in file_paths:
        search_engine.add_corpus(file_path)

    while True:
        query = input('please enter the search tern, ending with q')
        if query == 'q':
            break
        results = search_engine.search(query)
        print("found {} result(s):".format(len(results)))

        for result in results:
            print(result)

# Entry point: runs only when this code executes as the main module.
if __name__ == '__main__': 
    # Alternative engine (substring matching); uncomment to use it instead.
    # search_engine = SimpleSearchEngine()
    search_engine = BOWSearchEngine()
    # Index the corpus files and start the interactive query loop.
    main(search_engine)