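# Indexer

Builds a Whoosh full-text index over the scraped `.txt` documents and queries it by text, returning the matching file paths with their relevance scores.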

In [4]:
import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser
from whoosh.writing import AsyncWriter

In [5]:
class Indexer:
    """Full-text index over a directory of ``.txt`` documents, backed by Whoosh.

    On construction the index is built if ``index_dir`` does not exist yet;
    otherwise the directory is left untouched and reused by later searches.
    """

    def __init__(
        self,
        index_dir: str = "../data/index_directory",
        documents_dir: str = "../data/scrapped/class_data_function__1_1",
    ) -> None:
        """Store the paths and build the index when it is missing.

        Args:
            index_dir: Directory that holds (or will hold) the Whoosh index.
            documents_dir: Directory containing the ``.txt`` files to index.

        Raises:
            OSError: If the index has to be built and ``documents_dir``
                cannot be listed (e.g. ``FileNotFoundError`` when missing).
        """
        self._index_dir = index_dir
        self._documents_dir = documents_dir

        if not os.path.exists(self._index_dir):
            print("Index is not found, creating new...")
            # build_index creates the directory itself, and only after the
            # documents directory has been listed successfully.
            self.build_index()
            print("Complete!")

    def build_index(self) -> None:
        """Create a fresh index in ``index_dir`` from every ``.txt`` document.

        Each file is stored under its bare filename (``filename`` field) and
        its UTF-8 text is indexed in the ``content`` field.

        Raises:
            OSError: If ``documents_dir`` cannot be listed or a file cannot
                be read.
        """
        # List the documents BEFORE touching the index directory: if the
        # documents directory is missing, nothing is created on disk, so the
        # next construction will retry the build instead of silently reusing
        # an empty index. Sorting makes the build order reproducible.
        filenames = sorted(
            name
            for name in os.listdir(self._documents_dir)
            if name.endswith(".txt")
        )

        # makedirs also creates missing parents and tolerates an existing
        # directory, so build_index can be called directly to rebuild.
        os.makedirs(self._index_dir, exist_ok=True)

        schema = Schema(filename=ID(stored=True), content=TEXT)
        # create_in already returns the new index; no need to reopen it.
        index = create_in(self._index_dir, schema=schema)

        with AsyncWriter(index) as writer:
            for filename in filenames:
                filepath = os.path.join(self._documents_dir, filename)
                with open(filepath, "r", encoding="utf-8") as file:
                    content = file.read()
                writer.add_document(filename=filename, content=content)

    def files_by_text(
        self, search_text: str, limit: "int | None" = 10
    ) -> list[tuple[str, float]]:
        """Search the ``content`` field and return matching files with scores.

        Args:
            search_text: Query string, parsed with Whoosh's ``QueryParser``.
            limit: Maximum number of hits to return; ``None`` asks Whoosh for
                every match. The default of 10 matches Whoosh's own default,
                which is what applied before this parameter existed.

        Returns:
            ``(path, score)`` tuples, where ``path`` is ``documents_dir``
            joined with the stored filename, ordered by descending score.
        """
        index = open_dir(self._index_dir)

        with index.searcher() as searcher:
            query = QueryParser("content", index.schema).parse(search_text)
            hits = searcher.search(query, limit=limit, scored=True)
            # Materialise inside the `with`: hits read from the open searcher.
            files = [
                (os.path.join(self._documents_dir, hit["filename"]), hit.score)
                for hit in hits
            ]

        # Sort explicitly so the descending-score contract does not depend on
        # the order in which the searcher yields hits.
        return sorted(files, key=lambda item: -item[1])

    @property
    def documents_dir(self) -> str:
        """Directory whose ``.txt`` files are indexed."""
        return self._documents_dir

    @property
    def index_dir(self) -> str:
        """Directory that holds the Whoosh index."""
        return self._index_dir


# Instantiate with the default paths; if the index directory does not exist
# yet, the constructor builds the index (see the printed messages below).
indexer = Indexer()

Index is not found, creating new...


  elif fixedsize is 0:


Complete!


In [6]:
%%time
# Single-term query; the result list is capped by the search limit
# (Whoosh's default is 10 hits).
indexer.files_by_text("math")

CPU times: total: 78.1 ms
Wall time: 124 ms


[('../data/scrapped/class_data_function__1_1\\errno.ERANGE.txt',
  8.378559488424658),
 ('../data/scrapped/class_data_function__1_1\\math.cos.txt',
  8.265495899901422),
 ('../data/scrapped/class_data_function__1_1\\math.cosh.txt',
  8.265495899901422),
 ('../data/scrapped/class_data_function__1_1\\math.fabs.txt',
  8.265495899901422),
 ('../data/scrapped/class_data_function__1_1\\math.sin.txt',
  8.265495899901422),
 ('../data/scrapped/class_data_function__1_1\\math.sinh.txt',
  8.265495899901422),
 ('../data/scrapped/class_data_function__1_1\\math.sqrt.txt',
  8.265495899901422),
 ('../data/scrapped/class_data_function__1_1\\math.tan.txt',
  8.265495899901422),
 ('../data/scrapped/class_data_function__1_1\\math.tanh.txt',
  8.265495899901422),
 ('../data/scrapped/class_data_function__1_1\\errno.EDOM.txt',
  8.15544313155565)]

In [7]:
%%time
# Multi-word query. NOTE(review): Whoosh's QueryParser ANDs terms by default,
# so presumably only documents containing both words match — confirm.
indexer.files_by_text("math sine")

CPU times: total: 46.9 ms
Wall time: 15 ms


[('../data/scrapped/class_data_function__1_1\\math.sin.txt',
  19.515744970897224),
 ('../data/scrapped/class_data_function__1_1\\math.sinh.txt',
  19.515744970897224),
 ('../data/scrapped/class_data_function__1_1\\math.asinh.txt',
  19.25589827973826),
 ('../data/scrapped/class_data_function__1_1\\math.asin.txt',
  18.2822082995113)]

In [8]:
%%time
# Another timed single-term lookup, this time for "constant".
indexer.files_by_text("constant")

CPU times: total: 15.6 ms
Wall time: 14 ms


[('../data/scrapped/class_data_function__1_1\\mmap.MAP_ALIGNED_SUPER.txt',
  8.983226992585767),
 ('../data/scrapped/class_data_function__1_1\\mmap.MAP_ANON.txt',
  8.983226992585767),
 ('../data/scrapped/class_data_function__1_1\\mmap.MAP_ANONYMOUS.txt',
  8.983226992585767),
 ('../data/scrapped/class_data_function__1_1\\mmap.MAP_CONCEAL.txt',
  8.983226992585767),
 ('../data/scrapped/class_data_function__1_1\\mmap.MAP_DENYWRITE.txt',
  8.983226992585767),
 ('../data/scrapped/class_data_function__1_1\\mmap.MAP_EXECUTABLE.txt',
  8.983226992585767),
 ('../data/scrapped/class_data_function__1_1\\mmap.MAP_POPULATE.txt',
  8.983226992585767),
 ('../data/scrapped/class_data_function__1_1\\mmap.MAP_PRIVATE.txt',
  8.983226992585767),
 ('../data/scrapped/class_data_function__1_1\\mmap.MAP_SHARED.txt',
  8.983226992585767),
 ('../data/scrapped/class_data_function__1_1\\mmap.MAP_STACK.txt',
  8.983226992585767)]