Skip to content

Commit

Permalink
Added mappers to Mandinka and Toolbox conversion to Typecraft
Browse files Browse the repository at this point in the history
  • Loading branch information
pmanha committed Apr 8, 2014
1 parent 41b3f3d commit 963429c
Show file tree
Hide file tree
Showing 17 changed files with 1,624 additions and 504 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,3 +1,4 @@
include LICENSE README.rst MANIFEST.in requirements.txt
include setup.py distribute_setup.py
include src/poioapi/VERSION

36 changes: 23 additions & 13 deletions examples/poio_converter.py
Expand Up @@ -10,6 +10,7 @@
import sys
import optparse
import codecs
import os

import poioapi.annotationgraph
import poioapi.data
Expand All @@ -20,15 +21,19 @@ def main(argv):
usage = "usage: %prog [options] inputfile outputfile"
parser = optparse.OptionParser(usage=usage)
parser.add_option("-i", "--inputtype", dest="inputtype",
help="Type of the input file (elan|toolbox|shoebox)")
help="Type of the input file (elan|toolbox|shoebox|mandinka)")
parser.add_option("-o", "--outputtype", dest="outputtype",
help="Type of the output file (html|graf|typecraft)")
parser.add_option("-r", "--roottier", dest="roottier",
help="Root tier for html output, is the record marker in Toolbox")
parser.add_option("-t", "--tags", dest="tags",
help="Tag set")
parser.add_option("-m", "--more", dest="more",
help="Add extra information")
parser.add_option("-t", "--map-file", dest="mapping",
help="A JSON file containing the tier and tag mapping.")
parser.add_option("-m", "--missing-tags", action='store_true', dest="missing_tags", default=False,
help="If any missing tags are found, writes them to the output file, in JSON format. "
"If this flag is omitted, but missing tags are found, they are ignored.")
parser.add_option('-l', '--language-code', dest='language_code', default='und',
help='The language of the source text. Use the ISO 639-3 code for the language as the value'
' of this parameter.')
(options, files) = parser.parse_args()

if len(files) != 2:
Expand All @@ -42,6 +47,14 @@ def main(argv):
if options.outputtype not in ['html', 'graf', 'typecraft']:
parser.print_usage()
sys.exit(0)
mapping = None
if options.mapping:
if os.path.exists(options.mapping):
mapping = options.mapping
else:
print('The file {0} does not exist.'.format(options.mapping))
parser.print_help()
sys.exit(0)

# Load the data from files
ag = None
Expand Down Expand Up @@ -79,16 +92,13 @@ def main(argv):
writer = poioapi.io.graf.Writer()
writer.write(files[1], ag)
elif options.outputtype == "typecraft":
more_info = None
tags = None

if options.more:
more_info = options.more
if options.tags:
tags = options.tags
missing_tags = options.missing_tags

typecraft = poioapi.io.typecraft.Writer()
typecraft.write(files[1], ag, more_info=more_info, tags=tags)
if missing_tags:
typecraft.missing_tags(files[1], ag, additional_map_path=mapping)
else:
typecraft.write(files[1], ag, extra_tag_map=mapping, language=options.language_code)

if __name__ == "__main__":
main(sys.argv)
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -57,7 +57,7 @@
],
packages = [ 'poioapi', 'poioapi.io' ],
package_dir = { '': 'src' },
package_data = { 'poioapi': ['VERSION'] },
package_data = { 'poioapi': ['VERSION', 'mappings/*.json'] },
#install_requires=['PyYAML>=3.09'],
#test_suite = 'graf.test.simple',
)
30 changes: 20 additions & 10 deletions src/poioapi/annotationgraph.py
Expand Up @@ -28,6 +28,7 @@
import poioapi.io.typecraft

import poioapi.data
import poioapi.mapper

import graf

Expand All @@ -53,11 +54,13 @@ def __init__(self, data_structure_type = None):
self.meta_information = None
self.root_tiers = []
self.primary_data = None
self.from_file_type = None
self.source_type = None

self.filters = []
self.filtered_node_ids = []

self.tier_mapper = poioapi.mapper.TierMapper()

@classmethod
def from_elan(cls, stream):
"""This method generates a GrAF object
Expand All @@ -67,12 +70,13 @@ def from_elan(cls, stream):
return cls._from_file(stream, poioapi.data.EAF)

@classmethod
def from_mandinka(cls, stream, tier_map_file_path=''):
    """Generate a GrAF object from a Mandinka file.

    Parameters
    ----------
    stream : str or file-like object
        The Mandinka input, handed on unchanged to ``_from_file``.
    tier_map_file_path : str
        Optional path to a file with additional tier labels. An empty
        string (the default) or None means no extra mapping is loaded.

    Returns
    -------
    Whatever ``_from_file`` returns for the ``poioapi.data.MANDINKA``
    stream type (presumably the populated annotation graph; confirm
    against ``_from_file``).

    """
    # NOTE(review): this assigns a class-level attribute, while __init__
    # also sets ``self.tier_mapper`` on every new instance and _from_file
    # reads ``ag.tier_mapper``. The instance attribute appears to shadow
    # this default mapping; confirm that is intended.
    cls.tier_mapper = poioapi.io.mandinka.tier_mapping()

    # _from_file names this parameter ``tier_labels_file_path``. Passing it
    # under any other keyword drops it into **kwargs, where it is silently
    # ignored and the mapping file is never loaded.
    return cls._from_file(stream, poioapi.data.MANDINKA,
                          tier_labels_file_path=tier_map_file_path)

@classmethod
def from_obt(cls, stream):
Expand Down Expand Up @@ -115,12 +119,13 @@ def from_toolboxxml(cls, stream):
return cls._from_file(stream, poioapi.data.TOOLBOXXML)

@classmethod
def from_toolbox(cls, stream, tier_map_file_path=''):
    """Generate a GrAF object from a Toolbox file.

    Toolbox XML input is handled separately by ``from_toolboxxml``.

    Parameters
    ----------
    stream : str or file-like object
        The Toolbox input. If it has no ``read`` attribute, ``_from_file``
        treats it as a file name and opens it in binary mode.
    tier_map_file_path : str
        Optional path to a file with additional tier labels. An empty
        string (the default) or None means no extra mapping is loaded.

    Returns
    -------
    Whatever ``_from_file`` returns for the ``poioapi.data.TOOLBOX``
    stream type (presumably the populated annotation graph; confirm
    against ``_from_file``).

    """
    # NOTE(review): this assigns a class-level attribute, while __init__
    # also sets ``self.tier_mapper`` on every new instance and _from_file
    # reads ``ag.tier_mapper``. The instance attribute appears to shadow
    # this default mapping; confirm that is intended.
    cls.tier_mapper = poioapi.io.toolbox.tier_mapping()

    # _from_file names this parameter ``tier_labels_file_path``. Passing it
    # under any other keyword drops it into **kwargs, where it is silently
    # ignored and the mapping file is never loaded.
    return cls._from_file(stream, poioapi.data.TOOLBOX,
                          tier_labels_file_path=tier_map_file_path)

@classmethod
def from_graf(cls, stream):
Expand Down Expand Up @@ -149,8 +154,13 @@ def _open_file_(self, filename):
return codecs.open(filename, "r", "utf-8")

@classmethod
def _from_file(cls, stream, stream_type, **kwargs):
def _from_file(cls, stream, stream_type, tier_labels_file_path='', **kwargs):
ag = cls()

# load additional tier labels if supplied
if tier_labels_file_path != '' and tier_labels_file_path is not None:
ag.tier_mapper.load_mapping(tier_labels_file_path)

# TODO: move the stream opening to the parser classes
if stream_type != poioapi.data.TOOLBOX:
if not hasattr(stream, 'read'):
Expand All @@ -160,7 +170,7 @@ def _from_file(cls, stream, stream_type, **kwargs):
if stream_type == poioapi.data.EAF:
parser = poioapi.io.elan.Parser(stream)
elif stream_type == poioapi.data.MANDINKA:
parser = poioapi.io.mandinka.Parser(stream)
parser = poioapi.io.mandinka.Parser(stream, tier_label_map=ag.tier_mapper)
elif stream_type == poioapi.data.OBT:
parser = poioapi.io.obt.Parser(stream)
elif stream_type == poioapi.data.TYPECRAFT:
Expand All @@ -174,7 +184,7 @@ def _from_file(cls, stream, stream_type, **kwargs):
elif stream_type == poioapi.data.TOOLBOX:
if not hasattr(stream, 'read'):
stream = codecs.open(stream, "rb")
parser = poioapi.io.toolbox.Parser(stream)
parser = poioapi.io.toolbox.Parser(stream, mapper=ag.tier_mapper)

converter = poioapi.io.graf.GrAFConverter(parser)
converter.parse()
Expand All @@ -185,7 +195,7 @@ def _from_file(cls, stream, stream_type, **kwargs):
ag.graf = converter.graf
ag.primary_data = converter.primary_data

ag.from_file_type = stream_type
ag.source_type = stream_type

# set the first tier hierarchy as the default data_structure_type
ag.structure_type_handler = \
Expand Down Expand Up @@ -220,7 +230,7 @@ def root_nodes(self):

def nodes_for_tier(self, tier_name, parent_node = None):
"""Retreive all nodes for a given tier name. The parameter
tier_name specifies the type if the neigbours. For example if
tier_name specifies the type of the neighbours. For example if
the parent node is an utterance the tier name "word" specifies that
all "word" nodes that are connected to the utterance node should
be returned. The tier name must be a child of the parent node's
Expand Down
17 changes: 17 additions & 0 deletions src/poioapi/data.py
Expand Up @@ -16,6 +16,8 @@
from __future__ import unicode_literals

import sys
import json
import os.path

# Set the type of string
if sys.version_info[:2] >= (3, 0):
Expand All @@ -27,6 +29,20 @@
(EAF, EAFFROMTOOLBOX, KURA, TOOLBOX, TOOLBOXXML, SHOEBOX, TREEPICKLE,
TYPECRAFT, OBT, GRAF, MANDINKA) = range(11)

type_names = {
EAF: 'EAF',
EAFFROMTOOLBOX: 'EAFFROMTOOLBOX',
KURA: 'KURA',
TOOLBOX: 'TOOLBOX',
TOOLBOXXML: 'TOOLBOXXML',
SHOEBOX: 'SHOEBOX',
TREEPICKLE: 'TREEPICKLE',
TYPECRAFT: 'TYPECRAFT',
OBT: 'OBT',
GRAF: 'GRAF',
MANDINKA: 'MANDINKA'
}

# Tier types
(TIER_UTTERANCE, TIER_WORD, TIER_MORPHEME, TIER_POS, TIER_GLOSS, TIER_GRAID1,
TIER_GRAID2, TIER_TRANSLATION, TIER_COMMENT) = range(9)
Expand Down Expand Up @@ -400,3 +416,4 @@ class DataStructureTypeMorphsynt(DataStructureType):
[ TIER_MORPHEME, [ TIER_GLOSS ] ],
TIER_POS ],
TIER_TRANSLATION, TIER_COMMENT ]

62 changes: 45 additions & 17 deletions src/poioapi/io/mandinka.py
Expand Up @@ -12,6 +12,8 @@
import collections

import poioapi.io.graf
import poioapi.data
import poioapi.mapper

re_last_quote = re.compile("[^\"]*$")

Expand Down Expand Up @@ -65,13 +67,26 @@
'^«\s+Bisímilláahí ',
'^[\r\n]']


def tier_mapping():
    """Build the default tier-label mapping for Mandinka data.

    Creates a ``poioapi.mapper.TierMapper`` and registers one label per
    tier type: utterance, word, translation, morpheme, gloss and POS.

    Returns
    -------
    poioapi.mapper.TierMapper
        The mapper with the default Mandinka labels appended.

    """
    mapper = poioapi.mapper.TierMapper()

    # (tier type, label) pairs, registered in this order.
    default_labels = (
        (poioapi.data.TIER_UTTERANCE, 'phrase'),
        (poioapi.data.TIER_WORD, 'word'),
        (poioapi.data.TIER_TRANSLATION, 'translation'),
        (poioapi.data.TIER_MORPHEME, 'morpheme'),
        (poioapi.data.TIER_GLOSS, 'gloss'),
        (poioapi.data.TIER_POS, 'pos'),
    )
    for tier_type, label in default_labels:
        # Each tier gets its own fresh single-element list of labels.
        mapper.append_to_tier_labels(tier_type, [label])

    return mapper


class Parser(poioapi.io.graf.BaseParser):
"""
Class that will handle the parsing of Mandinka data.
"""

def __init__(self, input_stream):
def __init__(self, input_stream, tier_label_map):
"""Class's constructor.
Parameters
Expand All @@ -82,6 +97,17 @@ def __init__(self, input_stream):
"""
self._input_stream = None

if tier_label_map is None:
self._tier_labels = tier_mapping()
else:
self._tier_labels = tier_label_map

# self._utterance_label = self._mapper.tier_label(poioapi.data.TIER_UTTERANCE)
# self._word_label = self._mapper.tier_label(poioapi.data.TIER_WORD)
# self._morpheme_label = self._mapper.tier_label(poioapi.data.TIER_MORPHEME)
# self._gloss_label = self._mapper.tier_label(poioapi.data.TIER_GLOSS)
# self._translation_label = self._mapper.tier_label(poioapi.data.TIER_TRANSLATION)

self.input_stream = input_stream
self.parse()

Expand Down Expand Up @@ -119,9 +145,6 @@ def parse(self):
line_count = 0
phrase_ended = False

#blocks = self.load_annotation_blocks()
#blocks = self.normalize_blocks(blocks)

# compile the regular expressions built from the pattern lists
separate = re.compile(r'\b(?:%s)\b' % '|'.join(word_line_separators))
ignore_lines = re.compile('|'.join(ignore_these))
Expand Down Expand Up @@ -163,7 +186,7 @@ def parse(self):
if phrase_ended:
#adding the annotations for phrase
current_phrase_id = current_id
self._annotations_for_parent[(None, 'phrase')].append(
self._annotations_for_parent[(None, poioapi.data.tier_labels[poioapi.data.TIER_UTTERANCE])].append(
poioapi.io.graf.Annotation('a{0}'.format(current_phrase_id),
re.sub('[-]+', '', block['phrase'])))

Expand All @@ -176,7 +199,8 @@ def parse(self):
current_id += 1
current_word_id = current_id
#add the word tier annotations
self._annotations_for_parent[('a{0}'.format(current_phrase_id), 'word')].append(
self._annotations_for_parent[('a{0}'.format(current_phrase_id),
poioapi.data.tier_labels[poioapi.data.TIER_WORD])].append(
poioapi.io.graf.Annotation('a{0}'.format(current_word_id),
re.sub('[-]+', '', word_tokens[i].strip())))
morphemes_for_word = word_tokens[i].split('-')
Expand All @@ -189,7 +213,8 @@ def parse(self):
gloss_word = glosses_for_word.pop(0)
current_id += 1
current_morpheme_id = current_id
self._annotations_for_parent['a{0}'.format(current_word_id), 'morpheme'].append(
self._annotations_for_parent[('a{0}'.format(current_word_id),
poioapi.data.tier_labels[poioapi.data.TIER_MORPHEME])].append(
poioapi.io.graf.Annotation('a{0}'.format(current_morpheme_id), morpheme.strip()))

#if the morpheme and gloss counts for this word don't match,
Expand All @@ -202,13 +227,15 @@ def parse(self):
for gloss in glosses_for_morpheme:
current_id += 1
current_gloss_id = current_id
self._annotations_for_parent['a{0}'.format(current_morpheme_id), 'gloss'].append(
self._annotations_for_parent[('a{0}'.format(current_morpheme_id),
poioapi.data.tier_labels[poioapi.data.TIER_GLOSS])].append(
poioapi.io.graf.Annotation('a{0}'.format(current_gloss_id), gloss.strip()))

#finally, add the translation annotation
current_id += 1
current_translation_id = current_id
self._annotations_for_parent[('a{0}'.format(current_phrase_id), 'translation')].append(
self._annotations_for_parent[('a{0}'.format(current_phrase_id),
poioapi.data.tier_labels[poioapi.data.TIER_TRANSLATION])].append(
poioapi.io.graf.Annotation('a{0}'.format(current_translation_id), block['translation']))

#increment the current annotation id for the next phrase
Expand All @@ -219,7 +246,7 @@ def parse(self):
block['gloss'] = ''
block['translation'] = ''

print('Total processed blocks: {0}'.format(current_block))
# print('Total processed blocks: {0}'.format(current_block))

def sanitize_line(self, line):
""" Function to remove unwanted character(s) from the line.
Expand Down Expand Up @@ -251,7 +278,7 @@ def get_root_tiers(self):
"""

return [poioapi.io.graf.Tier("phrase")]
return [poioapi.io.graf.Tier(poioapi.data.tier_labels[poioapi.data.TIER_UTTERANCE])]

def get_child_tiers_for_tier(self, tier):
"""This method retrieves all the child tiers
Expand All @@ -269,12 +296,13 @@ def get_child_tiers_for_tier(self, tier):
"""

if tier.name == "phrase":
return [poioapi.io.graf.Tier("word"), poioapi.io.graf.Tier("translation")]
elif tier.name == "word":
return [poioapi.io.graf.Tier("morpheme")]
elif tier.name == "morpheme":
return [poioapi.io.graf.Tier("gloss")]
if tier.name == poioapi.data.tier_labels[poioapi.data.TIER_UTTERANCE]:
return [poioapi.io.graf.Tier(poioapi.data.tier_labels[poioapi.data.TIER_WORD]),
poioapi.io.graf.Tier(poioapi.data.tier_labels[poioapi.data.TIER_TRANSLATION])]
elif tier.name == poioapi.data.tier_labels[poioapi.data.TIER_WORD]:
return [poioapi.io.graf.Tier(poioapi.data.tier_labels[poioapi.data.TIER_MORPHEME])]
elif tier.name == poioapi.data.tier_labels[poioapi.data.TIER_MORPHEME]:
return [poioapi.io.graf.Tier(poioapi.data.tier_labels[poioapi.data.TIER_GLOSS])]

def get_annotations_for_tier(self, tier, annotation_parent=None):
"""This method retrieves all the child tiers
Expand Down

0 comments on commit 963429c

Please sign in to comment.