In [81]:
from dataclasses import dataclass
from typing import Iterator
from collections import Counter
import re



@dataclass
class Document:
    """Plain record holding an integer id, a title and a text."""

    id: int  # numeric identifier of the record
    title: str  # title string of the record
    text: str  # text string of the record

# Load the stop-word list: every line of the file becomes one entry,
# lower-cased and stripped of surrounding whitespace. The context manager
# closes the file handle as soon as the lines have been read instead of
# leaving it open until garbage collection.
with open("../data/stopwords.txt") as stopwords_file:
    stopwords = {word.lower().strip() for word in stopwords_file}


In [89]:
def get_cisi_generator(filename:str) -> Iterator[Document]:
    """Lazily parse a CISI-format file into ``Document`` objects.

    The file is read as a sequence of sections, each introduced by a line
    starting with one of the markers ``.I`` (followed by the integer id),
    ``.T`` (title), ``.A``, ``.W`` (text) or ``.X``. One ``Document`` is
    yielded each time a ``.W`` section ends, i.e. at the next marker line or
    at end of file.

    Args:
        filename: Path of the file to read.

    Yields:
        ``Document(id, title, text)``. ``title`` and ``text`` are the raw
        lines of the ``.T`` and ``.W`` sections concatenated, line endings
        included. ``id`` is 0 and ``title`` is ``""`` when no ``.I`` / ``.T``
        section preceded the ``.W`` section.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If an ``.I`` line carries no id.
        ValueError: If the id on an ``.I`` line is not an integer.
    """
    # Literal prefixes; a plain prefix test avoids the regex pitfall where an
    # unescaped "." lets ordinary lines such as "AT ..." look like markers.
    markers = (".I", ".T", ".A", ".W", ".X")

    doc_id = 0
    title_lines = []
    text_lines = []
    section = None  # marker of the section currently being read

    with open(filename, "r") as file:
        for line in file:
            if not line.startswith(markers):
                # Content line: keep it only for the sections we care about.
                if section == ".T":
                    title_lines.append(line)
                elif section == ".W":
                    text_lines.append(line)
                continue

            # A marker closes the current section; closing a text section
            # completes a document.
            if section == ".W":
                yield Document(doc_id, "".join(title_lines), "".join(text_lines))
                doc_id = 0
                title_lines = []
                text_lines = []

            # The closing marker is processed here as well, so an ``.I`` line
            # that directly follows a text section is not lost.
            section = line[:2]
            if section == ".I":
                doc_id = int(line.split()[1])
            elif section == ".T":
                title_lines = []

        # End of file inside a text section: emit the last document instead
        # of waiting for a marker that never comes.
        if section == ".W":
            yield Document(doc_id, "".join(title_lines), "".join(text_lines))
class SimpleCISITokenizer:
    """Tokenizer that lower-cases text, extracts runs of ASCII letters and
    drops the tokens contained in a stop-word set."""

    def __init__(self, stopwords: set[str]):
        # Tokens found in this set are removed from tokenize() results.
        self.stopwords = stopwords
        # A token is a maximal run of the letters a-z / A-Z.
        self.regex = re.compile(r"[a-zA-Z]+")

    def tokenize(self, text: str) -> list[str]:
        """Return the lower-cased letter-only tokens of *text*, in order of
        appearance, without those present in ``self.stopwords``."""
        kept = []
        for candidate in self.regex.findall(text.lower()):
            if candidate in self.stopwords:
                continue
            kept.append(candidate)
        return kept


#### Read documents

In [98]:
# Stream the collection file and index each parsed document by its id.
cisi_generator = get_cisi_generator("../data/CISI.ALL")
documents = {doc.id: doc for doc in cisi_generator}

#### Get document term counts

In [105]:
# Count how often each token occurs in the text of the document with id 1.
tokenizer = SimpleCISITokenizer(stopwords)
document_term_counts = Counter()
document_term_counts.update(tokenizer.tokenize(documents[1].text))