## Folia2Conll

convertFoliaClass2ConllTag: a helper that maps a FoLiA entity (its set and class) to the CoNLL tag written next to each token (I-PER, I-LOC, I-ORG or O).

NOTE: 'MISC' tag is not used. 

In [16]:
import re
def convertFoliaClass2ConllTag(e):
    """Map a FoLiA entity to the CoNLL tag used for its tokens.

    ``e`` only needs two attributes: ``set`` (a string that is matched
    against the ``Target`` / ``Organizer`` patterns) and ``cls`` (the
    entity class).

    Resolution order:
      1. class ``name`` in a set matching ``Target``    -> ``I-PER``
      2. class ``name`` in a set matching ``Organizer`` -> ``I-ORG``
         (only when the set does not also match ``Target``)
      3. class ``loc`` / ``place`` / ``place_pub``      -> ``I-LOC``
      4. class ``pname``                                -> ``I-PER``
      5. class ``fname``                                -> ``I-ORG``
      6. anything else                                  -> ``O``
    """
    entity_class = e.cls
    entity_set = e.set

    # The set is always matched, whatever the class is, so a non-string
    # set fails loudly here rather than being silently ignored.
    in_target_set = re.match('^.*Target.*$', entity_set) is not None
    in_organizer_set = (
        not in_target_set
        and re.match('^.*Organizer.*$', entity_set) is not None
    )

    # The generic class 'name' is disambiguated by the set it belongs to.
    if entity_class == 'name':
        if in_target_set:
            return 'I-PER'
        if in_organizer_set:
            return 'I-ORG'

    # Every other class maps to a tag regardless of the set.
    if entity_class in ('loc', 'place', 'place_pub'):
        return 'I-LOC'
    if entity_class == 'pname':
        return 'I-PER'
    if entity_class == 'fname':
        return 'I-ORG'
    return 'O'

doc2conll: an intermediate function that converts a single folia file to conll.

In [13]:
from pynlpl.formats import folia
def doc2conll(fp, sentences, ids, id2token, id2tag, idx, conllfile):
    """Append the CoNLL rendering of one FoLiA document to ``conllfile``.

    For every sentence, one ``"<token> <tag>\\n"`` line is written per
    newly seen word, followed by a blank line that terminates the sentence.

    Args:
        fp: path of the FoLiA file to load.
        sentences: list that receives, once per sentence, the list of word
            ids emitted for that sentence (sentences that contribute no new
            word are not added).
        ids: list of word ids already emitted; words whose id is in it are
            skipped, and every emitted id is appended to it.
        id2token: dict updated with ``word id -> token text``.
        id2tag: dict updated with ``word id -> CoNLL tag``; ``'O'`` unless
            an entity covering the word maps to another tag.
        idx: running token counter. It is a plain int and is not returned,
            so increments made here are not visible to the caller.
        conllfile: writable text file object.

    NOTE(review): the ``idx == 16307`` / ``'<P>'`` check drops one specific
    paragraph marker; it looks like a corpus-specific workaround and is
    kept exactly as it was -- confirm whether it is still needed.
    """
    doc = folia.Document(file=fp)

    # Set mirror of ``ids`` so the duplicate check is O(1) per word instead
    # of a linear scan of the ever-growing list.
    seen_ids = set(ids)

    for sentence in doc.sentences():
        sentence_tokens = []  # ids of the words emitted for this sentence
        for word in sentence.select(folia.Word):
            w_id = word.id
            w_text = word.text()
            if w_id in seen_ids:
                continue
            idx += 1
            if idx == 16307 and w_text == '<P>':
                idx -= 1
                continue
            sentence_tokens.append(w_id)
            id2token[w_id] = w_text
            id2tag[w_id] = 'O'
            ids.append(w_id)
            seen_ids.add(w_id)

        # Record the sentence once; previously the same list was appended
        # again for every token it contained.
        if sentence_tokens:
            sentences.append(sentence_tokens)

        for layer in sentence.select(folia.EntitiesLayer):
            for entity in layer.select(folia.Entity):
                entity_words = list(entity.wrefs())
                if not entity_words:
                    continue
                # The tag depends only on the entity, so compute it once
                # rather than once per referenced word.
                conll_tag = convertFoliaClass2ConllTag(entity)
                for word in entity_words:
                    id2tag[word.id] = conll_tag

        for _id in sentence_tokens:
            conllfile.write(id2token[_id] + " " + id2tag[_id] + "\n")

        # Blank line = sentence boundary in CoNLL.
        conllfile.write("\n")

folia2conll: accepts either a folder of FoLiA documents or a single FoLiA file, and writes a single CoNLL file.

In [14]:
import os
def folia2conll(flpath, opath):
    """Convert one FoLiA file, or every file in a folder, into one CoNLL file.

    Args:
        flpath: path of a single FoLiA document, or of a directory whose
            entries are all passed to ``doc2conll``.
        opath: path of the CoNLL file to create (overwritten if present).

    Word ids are shared across all documents, so a word id that was already
    written for an earlier document is not written again.
    """
    sentences = []  # A sentence is a list of token ids.
    ids = []
    id2token = {}
    id2tag = {}
    idx = -1

    if os.path.isdir(flpath):
        # os.listdir returns entries in arbitrary order; sorting makes the
        # output reproducible across machines and runs.
        doc_paths = [
            os.path.join(flpath, filename)
            for filename in sorted(os.listdir(flpath))
        ]
    else:
        doc_paths = [flpath]

    # ``with`` closes (and flushes) the output even if a document fails.
    with open(opath, 'w') as conll_file:
        for doc_path in doc_paths:
            doc2conll(doc_path, sentences, ids, id2token, id2tag, idx,
                      conll_file)


To run, set the variables below and call the folia2conll function.

In [17]:
# Input: folia2conll accepts either a folder of FoLiA documents or a single
# FoLiA file, so both variants are defined here.
folia_folder = './foliadocs/alladjudicated'
folia_file = (
    './foliadocs/alladjudicated/'
    'https__timesofindia.indiatimes.com_business_india-business_BSNL-Employees-Union-protests-against-disinvestment_articleshow_972751.folia.xml'
)

# Output: path of the merged CoNLL file.
outfile = './folia_as_conll_test.txt'

folia2conll(folia_folder, outfile)
print('Folia docs are converted to conll format')

Folia docs are converted to conll format


## Read Folia into sentences

In [32]:
def readFoliaFileIntoSentences(filepath, ids, outfile):
    """Write the sentences of one FoLiA document to ``outfile`` as plain text.

    Each sentence becomes one line of tokens, every token followed by a
    single space, and the line is terminated by a blank line (``'\\n\\n'``).

    Args:
        filepath: path of the FoLiA file to load.
        ids: list of word ids already processed; words whose id is in it
            are skipped, and every newly seen id is appended to it
            (including ids of ``'<P>'`` markers, which are not written).
        outfile: writable text file object.
    """
    doc = folia.Document(file=filepath)

    # Set mirror of ``ids`` so the duplicate check is O(1) per word instead
    # of a linear scan of the ever-growing list.
    seen_ids = set(ids)

    for sentence in doc.sentences():
        pieces = []
        for word in sentence.select(folia.Word):
            w_id = word.id
            w_text = word.text()
            if w_id in seen_ids:
                continue
            ids.append(w_id)
            seen_ids.add(w_id)
            # '<P>' tokens are remembered as seen but never written out.
            if w_text == '<P>':
                continue
            pieces.append(w_text + ' ')
        pieces.append('\n\n')
        # One write per sentence instead of one per token.
        outfile.write(''.join(pieces))
            

In [33]:
def readFoliaIntoSentences(path, outfile):
    """Write the sentences of one FoLiA file, or of a whole folder, to ``outfile``.

    Args:
        path: path of a single FoLiA document, or of a directory whose
            entries are all passed to ``readFoliaFileIntoSentences``.
        outfile: writable text file object shared by all documents.

    Word ids are shared across all documents, so a word id that was already
    handled for an earlier document is not written again.
    """
    ids = []
    if os.path.isdir(path):
        # os.listdir returns entries in arbitrary order; sorting makes the
        # output reproducible across machines and runs.
        filepaths = [
            os.path.join(path, filename)
            for filename in sorted(os.listdir(path))
        ]
    else:
        filepaths = [path]

    for filepath in filepaths:
        readFoliaFileIntoSentences(filepath, ids, outfile)

In [None]:
# Input: readFoliaIntoSentences accepts either a folder of FoLiA documents
# or a single FoLiA file, so both variants are defined here.
folia_folder = './foliadocs/alladjudicated'
folia_file = './foliadocs/alladjudicated/' \
              'https__timesofindia.indiatimes.com_business_india-business_BSNL-Employees-Union-protests-against-disinvestment_articleshow_972751.folia.xml'

outfilepath = './folia_as_sentences.txt'
# ``with`` closes the file when the conversion ends (or fails), so the
# buffered text is flushed to disk; the handle was previously never closed.
with open(outfilepath, 'w') as outfile:
    readFoliaIntoSentences(folia_folder, outfile)
print('Folia docs are converted to sentences.')