In [1]:
from pyscripts.inverted_file import InvertedFile
from pyscripts.formatted_document import FormattedDocument
import glob
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ParseError

[nltk_data] Downloading package punkt to /home/nicolas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def read_files(paths, n=-1):
    """
    Read up to n files from an iterable of paths and parse each one into an xml tree.

    The content of every file is wrapped in a synthetic <RAC> root element before parsing, so
    that files holding several top-level elements do not raise a ParseError.

    parameters :
        - paths : iterable of strings, the paths of the xml files to read. It is consumed
          lazily: iteration stops as soon as n files have been parsed.
        - n : maximum number of files to parse. -1 (the default) parses every readable file,
          0 parses nothing.
    return :
        - a list of xml.etree.ElementTree.Element objects (the added <RAC> roots), one per
          successfully parsed file, in the order the paths were visited. Paths that cannot
          be parsed, or that are directories, are reported on stdout and skipped.
    """
    output = []
    if n == 0:
        # Nothing was requested. Without this guard the countdown below would go
        # 0 -> -1 on the first success and never stop.
        return output
    for path in paths:
        try:
            # The context manager closes the handle even when reading or parsing fails.
            with open(path, 'r') as handle:
                txt = handle.read()
            output.append(ET.fromstring('<RAC>'+txt+'</RAC>'))
            n -= 1
            print('Successfully parsed document <{}>'.format(path))
        except ParseError:
            print('Can\'t parse document <{}>. Doesn\'t matter, skip'.format(path))
        except IsADirectoryError:
            print('Can\'t parse directory <{}>. Doesn\'t matter, skip'.format(path))
        if n == 0:
            # Quota reached: stop without pulling further items from `paths`.
            return output
    return output

In [3]:
def score(token, document):
    """
    Count how many times a token occurs in a document's text and title.

    parameters :
        - token : the value compared (with ==) against every word of the document
        - document : mapping-like object exposing 'text' (an iterable of paragraphs) and
          'title'; the title is scanned as one extra paragraph.
          Assumes each paragraph and the title are sequences of tokens - TODO confirm
          against FormattedDocument.
    return :
        - the number of words equal to token across all paragraphs and the title
    """
    # Work on a private list: appending to document['text'] directly would add the title
    # to the document again on every call and inflate later counts.
    paragraph_tokens = list(document['text'])
    paragraph_tokens.append(document['title'])
    return sum(
        1
        for paragraph in paragraph_tokens
        for word in paragraph
        if word == token
    )

# Index under construction; `score` is handed to InvertedFile as its scoring callback.
inverted_file = InvertedFile(score)

# Directory holding the LA Times dump, relative to the notebook's working directory.
LATIMES_PATH = './latimes'

# Lazy listing of the directory entries; read_files stops pulling from it once one
# file has been parsed.
files = glob.iglob(LATIMES_PATH + '/*')
xml_files = read_files(files, n=1)

# Wrap the single parsed tree for the indexing cells below.
fd = FormattedDocument(xml_root_doc=xml_files[0])

Successfully parsed document <./latimes/la012290>


In [7]:
# Add every article of the loaded document (the entries of fd.matches) to the inverted file.
for doc in fd.matches:
    inverted_file.add_document(doc)

In [8]:
# Dump the inverted file's `map` attribute to inspect what the indexing step produced.
# NOTE(review): presumably a token -> postings mapping; verify against InvertedFile.
print(inverted_file.map)

