# Inverted File Test

The purpose of this notebook is to test the basic functionality of the class InvertedFile. It does not benchmark the performance of the method. It does, however, give a good idea of how the system basically works and how it should be used.  
  
Constants :  
  
LATIMES_PATH : string, the path to the xml files to read  
NUMBER_OF_FILE_TO_READ : integer, the number of files to be read during the test for each inverted file

In [None]:
from pyscripts.inverted_file import InvertedFile
from pyscripts.formatted_document import FormattedDocument
import glob
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ParseError

# Directory containing the xml files to read (relative, Windows-style path).
LATIMES_PATH = '..\\latimes'
# Number of files to read for each inverted file built during the test.
NUMBER_OF_FILE_TO_READ = 1

## Reading articles from files

Read the files that contain the articles and convert them into JSON. Split them between two inverted files so that all the functionalities can be tested.

In [None]:
def read_files(paths, n=-1):
    """
    Read up to n files from an iterable of paths and parse each of them as an xml tree.
    A root node <RAC> is wrapped around the content of every file to avoid some ParseError.
    Paths that are directories, or whose content can't be parsed, are reported and skipped;
    they do not count towards n.
    parameters :
        - paths : iterable of strings, the paths where data has to be read (data must be xml files)
        - n : number of files that need to be read; if -1, every possible file will be read,
              if 0, nothing is read and an empty list is returned
    return :
        - a list of len=(min(n, number of readable files) if n != -1, else number of readable files)
          of xml tree representations of the documents
    """
    # Local import so this cell stays self-contained.
    import os

    output = []
    # Asking for zero files must not consume anything (the countdown below
    # would otherwise step over 0 and read every file).
    if n == 0:
        return output
    for path in paths:
        # open() on a directory raises IsADirectoryError on POSIX but
        # PermissionError on Windows, so directories are detected explicitly.
        if os.path.isdir(path):
            print('Can\'t parse directory <{}>. Doesn\'t matter, skip'.format(path))
            continue
        try:
            # The context manager guarantees the file handle is closed.
            with open(path, 'r') as handle:
                txt = handle.read()
            output.append(ET.fromstring('<RAC>'+txt+'</RAC>'))
        except ParseError:
            print('Can\'t parse document <{}>. Doesn\'t matter, skip'.format(path))
            continue
        except IsADirectoryError:
            print('Can\'t parse directory <{}>. Doesn\'t matter, skip'.format(path))
            continue
        n -= 1
        print('Successfully parsed document <{}>'.format(path))
        if n == 0:
            break
    return output

In [None]:
def score(token, document):
    """
    Basic score function to make the inverted files work.
    Doesn't have any computational interest.
    Counts how many words equal to token appear in the paragraphs of document['text']
    and in document['title'].
    parameters :
        - token : the word to count
        - document : mapping with a 'text' entry (iterable of paragraphs, each an iterable of words)
                     and a 'title' entry (iterable of words)
    return :
        - integer, the number of occurrences of token in the text and the title
    """
    # Work on a copy: appending the title to document['text'] directly would
    # modify the caller's document and inflate the count on every later call.
    paragraph_tokens = list(document['text'])
    paragraph_tokens.append(document['title'])
    token_count = 0
    for paragraph in paragraph_tokens:
        for word in paragraph:
            if word == token:
                token_count += 1
    return token_count

def load_inverted_file(xml_files):
    """
    Create an inverted file and wrap every xml document into a FormattedDocument.
    The documents are only returned alongside the inverted file; nothing is added to it here.
    parameters :
        - xml_files : iterable of xml documents, each one passed as xml_root_doc to FormattedDocument
    return :
        - a tuple (inverted file built with the score function, list of FormattedDocument)
    """
    # The inverted file is created first, then the documents, in input order.
    fresh_inverted_file = InvertedFile(score)
    wrapped_documents = [FormattedDocument(xml_root_doc=tree) for tree in xml_files]
    return fresh_inverted_file, wrapped_documents

# Lazily list every entry found directly under LATIMES_PATH (Windows path separator).
files = glob.iglob(LATIMES_PATH + '\\*')
# Read twice NUMBER_OF_FILE_TO_READ files, since they are split between two inverted files.
xml_files = read_files(files, NUMBER_OF_FILE_TO_READ*2)

# Even-indexed trees are paired with the first inverted file, odd-indexed ones with the second.
inverted_file_1, formatted_documents_1 = load_inverted_file(xml_files[::2])
inverted_file_2, formatted_documents_2 = load_inverted_file(xml_files[1::2])

## Filling the inverted files

In [None]:
# Add every item of each formatted document's `matches` to the inverted file
# it is paired with: first inverted_file_1, then inverted_file_2.
for target, docs in (
    (inverted_file_1, formatted_documents_1),
    (inverted_file_2, formatted_documents_2),
):
    for doc in docs:
        for article in doc.matches:
            target.add_document(article)

## Saving the inverted files to disk

In [None]:
# Save each inverted file under its own name; 'test_1.sav' and 'test_2.sav'
# are reused by the load and merge cells further down.
inverted_file_1.save('test_1.sav')
inverted_file_2.save('test_2.sav')

## Load Inverted File

In [None]:
# Build a fresh InvertedFile and read the posting lists back from 'test_1.sav'.
# NOTE(review): the meaning of the first argument (None) is not visible here —
# confirm against InvertedFile.read_posting_lists.
inverted_file_loaded = InvertedFile(score)
inverted_file_loaded.read_posting_lists(None, 'test_1.sav')

In [None]:
# Print the first 14 keys of the original and of the reloaded inverted file so
# they can be compared by eye.
# list(...) is required before slicing: if `map` is a plain dict, its keys()
# view is not subscriptable in Python 3 and `keys()[:14]` raises TypeError.
# NOTE(review): the type of `map` is not visible here; list() is also valid
# for keys views that already support slicing.
print("reference : {}".format(list(inverted_file_1.map.keys())[:14]))
print("loaded    : {}".format(list(inverted_file_loaded.map.keys())[:14]))

## Merge two inverted files

In [None]:
# Merge the two files saved earlier.
# NOTE(review): presumably the first argument is the output path and the next
# two are the inputs — confirm against InvertedFile.merge_inverted_files.
InvertedFile.merge_inverted_files('test_merged.sav', 'test_1.sav', 'test_2.sav')