In [136]:
import re
"""
return annotationDict{
    <start_index>: {
        string: <named entity string>,
        end_index: <end_index>,
        tag: <named entity tag>
    },
    ...
}
"""
def get_annotations(annotationFileName):
    annDict = {}
    with open(annotationFileName) as f:
        for line in f:
            parts = re.split(" |\t", line.strip())
            annDict[int(parts[2])] = {
                'string': " ".join(parts[4:]),
                'end_index': int(parts[3]),
                'tag': parts[1]
            }
            
    return annDict

In [162]:
def create_line(token, tag):
    return token + "\t" + tag + "\n"

def write_empty_line(file):
    with open(file, "a+") as f:
        f.write("\n")
        
def write_tokens(expression, tag, file):
    with open(file, "a+") as f:
        tokens = expression.strip().split(" ")
        if len(tokens) == 1 and len(tokens[0]) > 0:
            f.write(create_line(tokens[0], tag))
            return
        for i, j in enumerate(tokens):
            if len(j) == 0:
                continue
            if i == 0:
                f.write(create_line(j, "B-"+tag if tag != "O" else tag))
            else:
                f.write(create_line(j, "I-"+tag if tag != "O" else tag))

In [163]:
def handle_line(line, start_index, output_file, annotations):
    delimiters = set([".", "?", "!", ";", ":", "\"", "\n", ","])
    in_chunk = False
    endi = None
    starti = 0
    tag = "O"
    for i, c in enumerate(line):
        if not in_chunk and annotations.__contains__(start_index+i):
            if i - starti > 0:
                write_tokens(line[starti:i], tag, output_file)
            in_chunk = True
            starti = i
            endi = annotations.get(start_index+i).get('end_index')
            tag = annotations.get(start_index+i).get('tag')
        if in_chunk and (start_index+i == endi):
            write_tokens(line[starti:i], tag, output_file)
            in_chunk = False
            endi = None
            starti = i+1
            tag = "O"
        if in_chunk and c in delimiters:
            write_tokens(line[starti:i], tag, output_file)
            in_chunk = False
            endi = None
            starti = i+1
            tag = "O"
        if not in_chunk and c in delimiters:
            write_tokens(line[starti:i], tag, output_file)
            endi = None
            starti = i+1
            tag = "O"
        if c in delimiters and c != "\n":
            write_tokens(c, "O", output_file)
            
def create_corpus(input_file, annotations, en_output_file, ewo_output_file):
    current_index = 0
    with open(input_file) as in_file:
        line = in_file.readline()
        current_index += len(line)
        line = in_file.readline()
        current_index += len(line)
        line = in_file.readline()
        while line:
            verset = line.strip()
            write_empty_line(en_output_file)
            write_empty_line(ewo_output_file)
            current_index += len(line)
            line = in_file.readline()
            while line != "\n":
                handle_line(line, current_index, en_output_file, annotations)
                current_index += len(line)
                line = in_file.readline()
            current_index += len(line)
            line = in_file.readline()
            while line and line != "\n":
                handle_line(line, current_index, ewo_output_file, annotations)
                current_index += len(line)
                line = in_file.readline()
            current_index += len(line)
            line = in_file.readline()

In [140]:
!ls

Acts1-en-ewo.ann  Acts3-en-ewo.ann  corpus-en.txt     Matthew1-en-ewo.ann
Acts1-en-ewo.txt  Acts3-en-ewo.txt  corpus-ewo.txt    Matthew1-en-ewo.txt
Acts2-en-ewo.ann  Acts4-en-ewo.ann  Luke1-en-ewo.ann  Untitled.ipynb
Acts2-en-ewo.txt  Acts4-en-ewo.txt  Luke1-en-ewo.txt


In [164]:
import os
input_file_names = ["Acts1-en-ewo", "Acts3-en-ewo", "Acts4-en-ewo", "Luke1-en-ewo", "Matthew1-en-ewo"]
# input_file_names = ["Acts2-en-ewo"]
en_side = "corpus-en.txt"
ewo_side = "corpus-ewo.txt"
print(list(annotations.keys())[:5])
try:
    os.remove(en_side)
    os.remove(ewo_side)
except:
    pass
for file_name in input_file_names:
    print("file> ", file_name)
    annotations = get_annotations(file_name+".ann")
    create_corpus(file_name+".txt", annotations, en_side, ewo_side)

[30, 102, 127, 145, 214]
file>  Acts1-en-ewo
file>  Acts3-en-ewo
file>  Acts4-en-ewo
file>  Luke1-en-ewo
file>  Matthew1-en-ewo
