Crawling

In [1]:
from pathlib import Path

In [2]:
data = {}
for doc in Path('../data/documents').iterdir():
    if doc.suffix != '.txt':
        continue

    with open(doc) as f:
        data[doc.stem] = f.read()

Indexing

In [3]:
from string import punctuation
from abc import ABC, abstractmethod


class TextProcessor(ABC):
    @abstractmethod
    def transform(self, text):
        pass

    # transformer 1:

class ConvertCase(TextProcessor):
    def __init__(self, casing='lower'):
        self.casing = casing

    def transform(self, text):
        if self.casing == 'lower':
            return text.lower()
        elif self.casing == 'upper':
            return text.upper()
        elif self.casing == 'title':
            return text.title()


# transformer 2:

class RemoveDigit(TextProcessor):
    def transform(self, text):
        text = ''.join(filter(lambda char: not char.isdigit(), text))
        return text


# transformer 3:

class RemoveSpace(TextProcessor):
    def transform(self, text):
        return ' '.join(text.split())


# transformer 4:

class RemovePunkt(TextProcessor):
    def __init__(self, replace_char=' '):
        self.replace_char = replace_char

    def transform(self, text):
        return ''.join(char if char not in punctuation else self.replace_char for char in text)


# pipeline:

class StringPipeline:
    def __init__(self, *args):
        self.transformers = args

    def transform(self, text):
        for i in self.transformers:
            text = i.transform(text)
        return text

    def __str__(self):
        return '''Transforms text with transformers:
        ConvertCase(lower,upper,title) RemoveDigit RemovePunkt RemoveSpace'''


In [4]:
pipeline = StringPipeline(
        ConvertCase('lower'),
        RemoveSpace(),
        RemovePunkt(),
    )

In [5]:
stop_words = open('../data/stop_words.txt').readlines()
stop_words = list(map(str.strip, stop_words))
stop_words = set(map(pipeline.transform, stop_words))

In [6]:
indexing = {}
for doc_name, doc_content in data.items():

    words = pipeline.transform(doc_content).split()

    for i in words:
        if i in stop_words:
            continue
        elif i not in indexing:
            indexing[i] = {doc_name}
        else:
            indexing[i].add(doc_name)

Search

In [7]:
while True:
    search_input = input('search here or "q" to quit')
    search_input = pipeline.transform(search_input)
    if search_input == 'q':
        break

    search_tokens = search_input.split()
    result = []
    for token in search_tokens:
        result.extend(indexing.get(token, []))

    from termcolor import colored

    for i in result:
        print(colored(f' - {i}','light_blue'))



[94m - adam sandler[0m
[94m - michael jackson[0m
[94m - jennifer lopez[0m
