In [None]:
from pyscripts.inverted_file import InvertedFile
from pyscripts.formatted_document import FormattedDocument
from pyscripts.tokenizer import Tokenizer
from pyscripts.query import NaiveQuery
from pyscripts.query import FaginQuery
import glob
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ParseError

In [None]:
def read_files(paths, n=-1):
    """
    Read up to n files from an iterable of paths and convert them to xml trees. A root node <RAC> is wrapped around
    the content of every file to avoid some ParseError.
    Paths that cannot be parsed, or that point to a directory, are reported on stdout and skipped; they do not count
    toward n.
    parameters :
        - paths : iterable of strings, the paths where data has to be read (data must be xml files). It is consumed
                  lazily: no path beyond the last one needed is pulled from it
        - n : number of files that have to be read, if -1, every possible file will be read, if 0, none is read
    return :
        - a list of len=(min(n, number of parsable files) if n != -1, else number of parsable files) of xml root
          elements (tag 'RAC'), one per parsed document
    """
    output = []
    # Without this guard the countdown below would step from 0 to -1 and never stop.
    if n == 0:
        return output
    for path in paths:
        try:
            # The context manager closes the handle even when parsing fails afterwards.
            with open(path, 'r') as xml_file:
                txt = xml_file.read()
            output.append(ET.fromstring('<RAC>'+txt+'</RAC>'))
        except ParseError:
            print('Can\'t parse document <{}>. Doesn\'t matter, skip'.format(path))
            continue
        except IsADirectoryError:
            print('Can\'t parse directory <{}>. Doesn\'t matter, skip'.format(path))
            continue
        print('Successfully parsed document <{}>'.format(path))
        # Only successful parses count; a negative n never reaches 0, so it reads everything.
        n -= 1
        if n == 0:
            break
    return output

In [None]:
def score(token, document):
    """
    Count how many times a token occurs in a document.
    parameters :
        - token : the value compared (with ==) against every word of the document
        - document : mapping with a 'text' entry (a list, copied so the caller's list is left untouched) and a
                     'title' entry; each element of 'text' and the title itself are iterated word by word
                     (presumably lists of tokens - TODO confirm against FormattedDocument)
    return :
        - the number of words equal to token over all elements of 'text' plus the title
    """
    # Work on a copy so that appending the title does not modify document['text'].
    sections = document['text'].copy()
    sections.append(document['title'])
    return sum(1 for section in sections for term in section if term == token)

# Index under construction; `score` is handed to InvertedFile at creation time.
inverted_file = InvertedFile(score)

LATIMES_PATH = './latimes'
# glob gives no ordering guarantee (it depends on the file system), so sort the paths to
# load the same document on every run. `files` stays an iterator, as iglob returned one.
files = iter(sorted(glob.glob(LATIMES_PATH + '/*')))
# Only the first parsable file is loaded.
xml_files = read_files(files, 1)
if not xml_files:
    # Fail with an explicit message instead of an IndexError on xml_files[0].
    raise FileNotFoundError('No parsable document found in <{}>'.format(LATIMES_PATH))
fd = FormattedDocument(xml_root_doc=xml_files[0], tokenizer=Tokenizer())

In [None]:
# Add every entry of fd.matches (the articles of the loaded document) to the inverted file.
# Note: re-running this cell calls add_document again for the same entries.
for doc in fd.matches:
    inverted_file.add_document(doc)

In [None]:
def sort_by_score(posting_list):
    """
    Return a new list with the entries of a posting list ordered by decreasing score.
    parameters :
        - posting_list : iterable of indexable entries whose element at index 1 is used as the sort key
                         (presumably (document, score) pairs - TODO confirm against InvertedFile)
    return :
        - a new list, highest index-1 value first; entries with equal keys keep their original relative order
          and the input is left unmodified
    """
    ranked = list(posting_list)
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked

In [None]:
# Fetch two posting lists from the index's `map` and print them, first as stored and then
# ordered by sort_by_score.
# NOTE(review): the variable names suggest the tokens "house" and "divorce", but the keys
# actually looked up are "the" and "it" - confirm which tokens are intended.
house_pl = inverted_file.map["the"]
divorce_pl = inverted_file.map["it"]
print(house_pl, "\n\n", divorce_pl, "\n\n")
print(sort_by_score(house_pl), "\n\n", sort_by_score(divorce_pl))

In [None]:
# Run the query "divorce house" against the index with FaginQuery.
# As the last expression of the cell, the value returned by execute is the cell's output.
query = FaginQuery("divorce house", Tokenizer())
query.execute(inverted_file)

In [None]:
# Run the same query text with NaiveQuery, for comparison with the FaginQuery cell.
# NOTE(review): the second argument (5) is presumably the number of results to return -
# confirm against NaiveQuery.execute.
naive_query = NaiveQuery("divorce house", Tokenizer())
naive_query.execute(inverted_file, 5)