diff --git a/NEWS b/NEWS index e0acb7f..d18e8da 100644 --- a/NEWS +++ b/NEWS @@ -2,3 +2,8 @@ Sparrowhawk - Release 0.1 This is the alpha version. +Sparrowhawk - Release 1.0 + +* Added new verbalizer serialization, with accompanying grammars. + + diff --git a/README b/README index 225277a..ccb4f42 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -Sparrowhawk - Release 0.1 +Sparrowhawk - Release 1.0 Sparrowhawk is an open-source implementation of Google's Kestrel text-to-speech text normalization system. It follows the discussion of the Kestrel system as @@ -34,6 +34,11 @@ INSTALLATION: recommend configuring with --enable-static=no for faster compiles. + NOTE: In some versions of Mac OS-X we have noticed a problem with configure + whereby it fails to find fst.h. If this occurs, try configuring as follows: + + CPPFLAGS=-I/usr/local/include LDFLAGS=-L/usr/local/lib ./configure + USAGE: Assuming you've installed under the default /usr/local, the library will be in /usr/local/lib, and the headers in /usr/local/include/sparrowhawk. diff --git a/configure b/configure index 5826b97..f5bcf5f 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for Sparrowhawk 0.1.0. +# Generated by GNU Autoconf 2.69 for Sparrowhawk 1.0.0. # # Report bugs to . # @@ -590,8 +590,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='Sparrowhawk' PACKAGE_TARNAME='sparrowhawk' -PACKAGE_VERSION='0.1.0' -PACKAGE_STRING='Sparrowhawk 0.1.0' +PACKAGE_VERSION='1.0.0' +PACKAGE_STRING='Sparrowhawk 1.0.0' PACKAGE_BUGREPORT='rws@google.com' PACKAGE_URL='' @@ -1325,7 +1325,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures Sparrowhawk 0.1.0 to adapt to many kinds of systems. 
+\`configure' configures Sparrowhawk 1.0.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1395,7 +1395,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of Sparrowhawk 0.1.0:";; + short | recursive ) echo "Configuration of Sparrowhawk 1.0.0:";; esac cat <<\_ACEOF @@ -1504,7 +1504,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -Sparrowhawk configure 0.1.0 +Sparrowhawk configure 1.0.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -1994,7 +1994,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by Sparrowhawk $as_me 0.1.0, which was +It was created by Sparrowhawk $as_me 1.0.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2857,7 +2857,7 @@ fi # Define the identity of the package. PACKAGE='sparrowhawk' - VERSION='0.1.0' + VERSION='1.0.0' cat >>confdefs.h <<_ACEOF @@ -4162,6 +4162,7 @@ unknown) esac +CPPFLAGS="$CPPFLAGS -funsigned-char" CXXFLAGS="$CXXFLAGS -std=c++11" ac_ext=cpp @@ -16052,7 +16053,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by Sparrowhawk $as_me 0.1.0, which was +This file was extended by Sparrowhawk $as_me 1.0.0, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -16109,7 +16110,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -Sparrowhawk config.status 0.1.0 +Sparrowhawk config.status 1.0.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 90b4358..6ecb78e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,8 +1,9 @@ -AC_INIT([Sparrowhawk], [0.1.0], [rws@google.com]) +AC_INIT([Sparrowhawk], [1.0.0], [rws@google.com]) AM_INIT_AUTOMAKE([foreign nostdinc -Wall -Werror]) AM_PROG_AR +CPPFLAGS="$CPPFLAGS -funsigned-char" CXXFLAGS="$CXXFLAGS -std=c++11" AC_PROG_CXX diff --git a/documentation/README.md b/documentation/README.md index 3cf10e9..8f653aa 100644 --- a/documentation/README.md +++ b/documentation/README.md @@ -357,6 +357,77 @@ the token as a sequence of characters: 3_character :_character 3_character 0_character +### Verbalizer grammars: new serialization format (Sparrowhawk 1.0 and above) + +With Sparrowhawk 1.0, we introduce a simpler format for verbalizer +grammars. The upside of this is that it makes writing the verbalizer grammars +quite a bit simpler. The downside is that it requires a serialization +specification proto instance (see below). This new format has no relevance to +the classifier grammars, which should be written as described above in any case. + +The main salient differences between the previous format and the new +serialization format are first that the representation that is passed by the +serialization to the verbalizer is more compact. Instead of + +
+money { amount { integer_part: "3" } currency: "usd" }
+
+ +what gets passed is + +
+money|integer_part:3|currency:usd|
+
+ +For both major and minor currencies the verbalizer sees, e.g.: + +
+money|integer_part:3|currency:usd|fractional_part:50|currency:usd|
+
+ +The second major difference is that a REDUP rule is no longer needed. Rather, the +serialization, and possible copying of elements, is done in code, controlled by +the serialization specification, itself an ASCII protocol buffer representation +that is referenced by an additional optional specification in the Sparrowhawk +configuration file. An example is given in +"verbalizer_serialization_spec.ascii_proto". This specifies +the serialization possibilities for the different classes. For money, the +specification: + +
+class_spec {
+  semiotic_class: "money"
+  style_spec {
+    record_spec {
+      field_path: "money.amount.integer_part"
+      suffix_spec {
+        field_path: "money.currency"
+      }
+    }
+    record_spec {
+      field_path: "money.amount.fractional_part"
+      suffix_spec {
+        field_path: "money.currency"
+      }
+    }
+  }
+}
+
+ +means that the integer part of the money expression and the fractional part are +verbalized in that order, and the repetition of the "money.currency" field has +the effect of duplicating the expression for the currency itself. Again, the +verbalizer grammar is responsible for determining that the first instance would +be read as the major currency expression, and the second as the minor currency +expression. + +The protocol buffer definition of the serialization specification is found in +"src/proto/serialization_spec.proto", which is also documented with +comments on the functions of the various fields. + +The parallel English toy grammar in the new serializer format can be found in +"grammars/en_toy/verbalize_serialization". + ### Sentence boundary detection Sparrowhawk provides some simple support for sentence boundary detection. One @@ -454,6 +525,12 @@ For example in the "grammars" directory, assuming one has built all the grammars normalizer_main --config=sparrowhawk_configuration.ascii_proto --multi_line_text < test.txt 2>/dev/null +For the new serialization specification, the invocation is as follows: + +
+normalizer_main --config=sparrowhawk_configuration_serialization.ascii_proto --multi_line_text < test.txt 2>/dev/null
+
+ Integrating Sparrowhawk with Festival ------------------------- @@ -509,7 +586,7 @@ festival/examples/sparrowhawk_test_us_null.scm Sparrowhawk will perform tokenization and text normalization and leave you with a sequence of words in Festival's 'Word' relation. You need to take it from there. - + How to cite Sparrowhawk ------------------------- diff --git a/documentation/grammars/en_toy/byte.far b/documentation/grammars/en_toy/byte.far new file mode 100644 index 0000000..a09d56a Binary files /dev/null and b/documentation/grammars/en_toy/byte.far differ diff --git a/documentation/grammars/en_toy/util.far b/documentation/grammars/en_toy/util.far new file mode 100644 index 0000000..0a4e4c4 Binary files /dev/null and b/documentation/grammars/en_toy/util.far differ diff --git a/documentation/grammars/en_toy/verbalize_serialization/CARDINAL_NUMBER_NAME b/documentation/grammars/en_toy/verbalize_serialization/CARDINAL_NUMBER_NAME new file mode 100644 index 0000000..2ec6825 Binary files /dev/null and b/documentation/grammars/en_toy/verbalize_serialization/CARDINAL_NUMBER_NAME differ diff --git a/documentation/grammars/en_toy/verbalize_serialization/ORDINAL_NUMBER_NAME b/documentation/grammars/en_toy/verbalize_serialization/ORDINAL_NUMBER_NAME new file mode 100644 index 0000000..7f71b25 Binary files /dev/null and b/documentation/grammars/en_toy/verbalize_serialization/ORDINAL_NUMBER_NAME differ diff --git a/documentation/grammars/en_toy/verbalize_serialization/date.grm b/documentation/grammars/en_toy/verbalize_serialization/date.grm new file mode 100644 index 0000000..ff9a1df --- /dev/null +++ b/documentation/grammars/en_toy/verbalize_serialization/date.grm @@ -0,0 +1,69 @@ +import '../byte.grm' as b; +import '../util.grm' as u; +import 'numbers.grm' as n; + +# quotation mark +q = u.q; + +# Used to allow for different numbers of spaces coming out of the serializer. 
+s = u.s; + +month = b.kAlpha+; + +day = n.ORDINAL; + +d = b.kDigit; +D = b.kDigit - "0"; + +two_digit = + ((D d) @ n.CARDINAL) + | ("0" : "oh ") (D @ n.CARDINAL) + | ("00" : "hundred") +; + +# Years are not read as cardinals, generally: +year = + (("19" @ n.CARDINAL) u.I[" "] two_digit) + | (("20" @ n.CARDINAL) u.I[" "] ((D d) @ two_digit)) + | (("200" d) @ n.CARDINAL) +; + +# Remove these if they occur + +field = (b.kAlpha | "_")+; +preserve_order = "preserve_order:true"; +field_order = "field_order:" field; +field_order_specs = (preserve_order | field_order)*; + +# Verbalization for MDY +mdy = + u.D["date"] + u.D["|month:"] + month + u.I[" the "] + u.D["|day:"] + day + u.I[" "] + u.D["|year:"] + year + u.D[field_order_specs]? + u.D["|"] +; + +# Verbalization for DMY +dmy = + u.D["date"] + u.I["the "] + u.D["|day:"] + day + u.I[" of "] + u.D["|month:"] + month + u.D["|year:"] + u.I[" "] + year + u.D[field_order_specs]? + u.D["|"] +; + +export DATE = Optimize[mdy | dmy]; diff --git a/documentation/grammars/en_toy/verbalize_serialization/measure.grm b/documentation/grammars/en_toy/verbalize_serialization/measure.grm new file mode 100644 index 0000000..3fb2839 --- /dev/null +++ b/documentation/grammars/en_toy/verbalize_serialization/measure.grm @@ -0,0 +1,39 @@ +import '../byte.grm' as b; +import '../util.grm' as u; +import 'numbers.grm' as n; + +# Except with exactly 1, the plural form is used, so we map to that form, and +# then singularize below. +measures = + ("centimeter" : "centimeters") + | ("kilogram" : "kilograms") + | ("degree" : "degrees") +; + +# quotation mark +q = u.q; + +# Used to allow for different numbers of spaces coming out of the serializer. +s = u.s; + +# Removes the markup (allowing for various spacing possibilities in the +# serialization) and verbalizes the remainder. +measure = + u.D["measure"] + u.D["|integer_part:"] + n.CARDINAL + (u.D["|fractional_part:"] + u.I[" point "] + n.DIGITS)? 
+ u.I[" "] + u.D["|units:"] + measures + u.D["|"] +; + +sigstar = b.kBytes*; + +# Uses the singular form after exactly "one". +singularize = CDRewrite[Invert[measures], "[BOS]one ", "", sigstar]; + +export MEASURE = Optimize[measure @ singularize]; diff --git a/documentation/grammars/en_toy/verbalize_serialization/money.grm b/documentation/grammars/en_toy/verbalize_serialization/money.grm new file mode 100644 index 0000000..594f1c1 --- /dev/null +++ b/documentation/grammars/en_toy/verbalize_serialization/money.grm @@ -0,0 +1,64 @@ +import '../byte.grm' as b; +import '../util.grm' as u; +import 'numbers.grm' as n; + +q = u.q; + +# Used to allow for different numbers of spaces coming out of the serializer. +s = u.s; + +d = b.kDigit; + +currencies = StringFile['money.tsv']; + +# Simple currency amounts such as: +# money { amount { integer_part: "3"} currency: "usd" } + +sigstar = b.kBytes*; + +# Rules to insert "_maj" and "_min" at the end of the currency terms. +ins_maj = CDRewrite[u.I["_maj"], "", "[EOS]", sigstar]; +ins_min = CDRewrite[u.I["_min"], "", "[EOS]", sigstar]; + +del_zero = CDRewrite[u.D["0"], "[BOS]", "", sigstar]; + +# money { amount { integer_part: "3" fractional_part: "50"} currency: "usd" } +# Here we assume that the input has been reduplicated (see REDUP), and then on +# the lefthand side we delete the minor currency and on the righthand side the +# major currency. The reduplication is done IN CODE (see +# RuleSystem::ApplyRules() in rule_system.cc). +# +# Removes the markup (allowing for various spacing possibilities in the +# serialization) and verbalizes the remainder. + +money = + u.D["money"] + u.D["|integer_part:"] + n.CARDINAL + u.D["|currency:"] + u.I[" "] + (ins_maj @ currencies) + (u.I[" and "] + u.D["|fractional_part:"] + (del_zero @ n.CARDINAL) + u.D["|currency:"] + u.I[" "] + (ins_min @ currencies))? + u.D[s "|"] +; + +# Singularize after "one" (as in measures). 
+ +singulars = + ("dollars" : "dollar") + | ("cents" : "cent") + | ("pounds" : "pound") + | ("pence" : "penny") + | ("euros" : "euro") +; + +singularize = + CDRewrite[singulars, "[BOS]one " | "and one ", "", sigstar] +; + +export MONEY = Optimize[money @ singularize]; diff --git a/documentation/grammars/en_toy/verbalize_serialization/money.tsv b/documentation/grammars/en_toy/verbalize_serialization/money.tsv new file mode 100644 index 0000000..21a067a --- /dev/null +++ b/documentation/grammars/en_toy/verbalize_serialization/money.tsv @@ -0,0 +1,6 @@ +usd_maj dollars +usd_min cents +gbp_maj pounds +gbp_min pence +eur_maj euros +eur_min cents diff --git a/documentation/grammars/en_toy/verbalize_serialization/numbers.grm b/documentation/grammars/en_toy/verbalize_serialization/numbers.grm new file mode 100644 index 0000000..2e7eb9e --- /dev/null +++ b/documentation/grammars/en_toy/verbalize_serialization/numbers.grm @@ -0,0 +1,36 @@ +import '../byte.grm' as b; +import '../util.grm' as u; + +# English cardinal and ordinal number names are FSTs that are trained using the +# algorithm reported in: +# +# Kyle Gorman and Richard Sproat. "Minimally supervised models for number +# normalization." Transactions of the Association for Computational Linguistics. 2016. +cardinal = LoadFst['CARDINAL_NUMBER_NAME']; + +ordinal = LoadFst['ORDINAL_NUMBER_NAME']; + +d = b.kDigit; + +digit = d @ cardinal; + +export CARDINAL = cardinal; + +export ORDINAL = ordinal; + +export DIGITS = Optimize[digit (u.I[" "] digit)*]; + +q = u.q; + +# Used to allow for different numbers of spaces coming out of the serializer. +s = u.s; + +# Removes the markup (allowing for various spacing possibilities in the +# serialization) and verbalizes the remainder. 
+cardinal_markup = + u.D["cardinal|integer:"] + cardinal + u.D[s "|"] +; + +export CARDINAL_MARKUP = Optimize[cardinal_markup]; diff --git a/documentation/grammars/en_toy/verbalize_serialization/time.grm b/documentation/grammars/en_toy/verbalize_serialization/time.grm new file mode 100644 index 0000000..56f0e02 --- /dev/null +++ b/documentation/grammars/en_toy/verbalize_serialization/time.grm @@ -0,0 +1,42 @@ +import '../byte.grm' as b; +import '../util.grm' as u; +import 'numbers.grm' as n; + +q = u.q; + +# Used to allow for different numbers of spaces coming out of the serializer. +s = u.s; + +d = b.kDigit; + +hour = (u.D["0"]? d | (d - "0") d) @ n.CARDINAL; + +sigstar = b.kBytes*; + +# Various renditions of minutes: +# +# 03 -> oh three +# 13 -> thirteen +# 00 -> o'clock +# +# Note that leading 0 is removed so that 3:03 comes in as +# +# hours: 3 minutes: 3 +minute = + ( (("" : "oh ") (d @ n.CARDINAL)) + | (d d) @ n.CARDINAL) + @ CDRewrite["oh zero" : "o'clock", "", "", sigstar]; + +# Removes the markup (allowing for various spacing possibilities in the +# serialization) and verbalizes the remainder. +time = + u.D["time"] + u.D["|hours:"] + hour + u.D["|minutes:"] + u.I[" "] + minute + u.D["|"] +; + +export TIME = Optimize[time]; diff --git a/documentation/grammars/en_toy/verbalize_serialization/verbalize.grm b/documentation/grammars/en_toy/verbalize_serialization/verbalize.grm new file mode 100644 index 0000000..d59a3ec --- /dev/null +++ b/documentation/grammars/en_toy/verbalize_serialization/verbalize.grm @@ -0,0 +1,11 @@ +import 'date.grm' as d; +import 'measure.grm' as M; +import 'money.grm' as m; +import 'numbers.grm' as n; +import 'time.grm' as t; +import 'verbatim.grm' as v; + +# Combines all of the semiotic classes together. 
+ +export ALL = Optimize[ + d.DATE | M.MEASURE | m.MONEY | n.CARDINAL_MARKUP | t.TIME | v.VERBATIM]; diff --git a/documentation/grammars/en_toy/verbalize_serialization/verbatim.grm b/documentation/grammars/en_toy/verbalize_serialization/verbatim.grm new file mode 100644 index 0000000..549f1da --- /dev/null +++ b/documentation/grammars/en_toy/verbalize_serialization/verbatim.grm @@ -0,0 +1,18 @@ +import '../byte.grm' as b; +import '../util.grm' as u; + +# A verbatim grammar is needed as a backoff since if for some reason +# verbalization fails, it backs off to reading the string as the literal +# sequence of characters. + +q = u.q; +# Used to allow for different numbers of spaces coming out of the serializer. +s = u.s; + +char = b.kNotSpace u.I["_character"]; + +chars = char (u.I[" "] char)*; + +# Removes the markup (allowing for various spacing possibilities in the +# serialization) and verbalizes the remainder. +export VERBATIM = Optimize[u.D["verbatim:" s q] chars u.D[q]]; diff --git a/documentation/grammars/sparrowhawk_configuration_serialization.ascii_proto b/documentation/grammars/sparrowhawk_configuration_serialization.ascii_proto new file mode 100644 index 0000000..daa1621 --- /dev/null +++ b/documentation/grammars/sparrowhawk_configuration_serialization.ascii_proto @@ -0,0 +1,9 @@ +tokenizer_grammar: "tokenizer.ascii_proto" + +verbalizer_grammar: "verbalizer_serialization.ascii_proto" + +sentence_boundary_regexp: "[\\.:!\\?] 
" + +sentence_boundary_exceptions_file: "sentence_boundary_exceptions.txt" + +serialization_spec: "verbalizer_serialization_spec.ascii_proto" diff --git a/documentation/grammars/verbalizer_serialization.ascii_proto b/documentation/grammars/verbalizer_serialization.ascii_proto new file mode 100644 index 0000000..ad7c59e --- /dev/null +++ b/documentation/grammars/verbalizer_serialization.ascii_proto @@ -0,0 +1,5 @@ +grammar_file: "en_toy/verbalize_serialization/verbalize.far" + +grammar_name: "Verbalizer" + +rules { main: "ALL" } diff --git a/documentation/grammars/verbalizer_serialization_spec.ascii_proto b/documentation/grammars/verbalizer_serialization_spec.ascii_proto new file mode 100644 index 0000000..c0f98ee --- /dev/null +++ b/documentation/grammars/verbalizer_serialization_spec.ascii_proto @@ -0,0 +1,84 @@ +class_spec { + semiotic_class: "measure" + style_spec { + record_spec { + field_path: "measure.decimal.integer_part" + } + record_spec { + field_path: "measure.decimal.fractional_part" + } + record_spec { + field_path: "measure.units" + } + required_fields: "measure.decimal.integer_part" + } +} +class_spec { + semiotic_class: "money" + style_spec { + record_spec { + field_path: "money.amount.integer_part" + suffix_spec { + field_path: "money.currency" + } + } + record_spec { + field_path: "money.amount.fractional_part" + suffix_spec { + field_path: "money.currency" + } + } + } +} +class_spec { + semiotic_class: "cardinal" + style_spec { + record_spec { + field_path: "cardinal.integer" + } + } +} +class_spec { + semiotic_class: "time" + style_spec { + record_spec { + field_path: "time.hours" + } + record_spec { + field_path: "time.minutes" + } + } +} +class_spec { + semiotic_class: "date" + style_spec { + record_spec { + field_path: "date.day" + } + record_spec { + field_path: "date.month" + } + record_spec { + field_path: "date.year" + } + } + style_spec { + record_spec { + field_path: "date.month" + } + record_spec { + field_path: "date.day" + } + 
record_spec { + field_path: "date.year" + } + } +} +class_spec { + semiotic_class: "verbatim" + style_spec { + record_spec { + field_path: "verbatim" + } + } +} diff --git a/src/bin/normalizer_main.cc b/src/bin/normalizer_main.cc index 1a85865..841702c 100644 --- a/src/bin/normalizer_main.cc +++ b/src/bin/normalizer_main.cc @@ -46,7 +46,7 @@ DEFINE_string(path_prefix, "./", "Optional path prefix if not relative."); void NormalizeInput(const string& input, speech::sparrowhawk::Normalizer *normalizer) { - const vector sentences = normalizer->SentenceSplitter(input); + const std::vector sentences = normalizer->SentenceSplitter(input); for (const auto& sentence : sentences) { string output; normalizer->Normalize(sentence, &output); diff --git a/src/include/Makefile.am b/src/include/Makefile.am index 0fe1abb..bd40640 100644 --- a/src/include/Makefile.am +++ b/src/include/Makefile.am @@ -3,12 +3,21 @@ BUILT_SOURCES = $(srcdir)/sparrowhawk/items.pb.h $(srcdir)/sparrowhawk/links.pb. $(srcdir)/sparrowhawk/semiotic_classes.pb.h \ $(srcdir)/sparrowhawk/sparrowhawk_configuration.pb.h -nobase_include_HEADERS = sparrowhawk/io_utils.h sparrowhawk/normalizer.h \ - sparrowhawk/protobuf_parser.h sparrowhawk/regexp.h \ - sparrowhawk/sentence_boundary.h sparrowhawk/logger.h \ - sparrowhawk/numbers.h sparrowhawk/protobuf_serializer.h \ - sparrowhawk/rule_system.h sparrowhawk/string_utils.h \ - $(BUILT_SOURCES) +nobase_include_HEADERS = sparrowhawk/field_path.h \ + sparrowhawk/io_utils.h \ + sparrowhawk/logger.h \ + sparrowhawk/normalizer.h \ + sparrowhawk/numbers.h \ + sparrowhawk/protobuf_parser.h \ + sparrowhawk/protobuf_serializer.h \ + sparrowhawk/record_serializer.h \ + sparrowhawk/regexp.h \ + sparrowhawk/rule_system.h \ + sparrowhawk/sentence_boundary.h \ + sparrowhawk/spec_serializer.h \ + sparrowhawk/string_utils.h \ + sparrowhawk/style_serializer.h \ + $(BUILT_SOURCES) sparrowhawk/items.pb.h: $(MAKE) -C $(srcdir)/../proto/ items.pb.h @@ -22,6 +31,9 @@ 
sparrowhawk/rule_order.pb.h: sparrowhawk/semiotic_classes.pb.h: $(MAKE) -C $(srcdir)/../proto/ semiotic_classes.pb.h +sparrowhawk/serialization_spec.pb.h: + $(MAKE) -C $(srcdir)/../proto/ serialization_spec.pb.h + sparrowhawk/sparrowhawk_configuration.pb.h: $(MAKE) -C $(srcdir)/../proto/ sparrowhawk_configuration.pb.h diff --git a/src/include/Makefile.in b/src/include/Makefile.in index 8121e9f..b8546b5 100644 --- a/src/include/Makefile.in +++ b/src/include/Makefile.in @@ -284,12 +284,21 @@ BUILT_SOURCES = $(srcdir)/sparrowhawk/items.pb.h $(srcdir)/sparrowhawk/links.pb. $(srcdir)/sparrowhawk/semiotic_classes.pb.h \ $(srcdir)/sparrowhawk/sparrowhawk_configuration.pb.h -nobase_include_HEADERS = sparrowhawk/io_utils.h sparrowhawk/normalizer.h \ - sparrowhawk/protobuf_parser.h sparrowhawk/regexp.h \ - sparrowhawk/sentence_boundary.h sparrowhawk/logger.h \ - sparrowhawk/numbers.h sparrowhawk/protobuf_serializer.h \ - sparrowhawk/rule_system.h sparrowhawk/string_utils.h \ - $(BUILT_SOURCES) +nobase_include_HEADERS = sparrowhawk/field_path.h \ + sparrowhawk/io_utils.h \ + sparrowhawk/logger.h \ + sparrowhawk/normalizer.h \ + sparrowhawk/numbers.h \ + sparrowhawk/protobuf_parser.h \ + sparrowhawk/protobuf_serializer.h \ + sparrowhawk/record_serializer.h \ + sparrowhawk/regexp.h \ + sparrowhawk/rule_system.h \ + sparrowhawk/sentence_boundary.h \ + sparrowhawk/spec_serializer.h \ + sparrowhawk/string_utils.h \ + sparrowhawk/style_serializer.h \ + $(BUILT_SOURCES) all: $(BUILT_SOURCES) $(MAKE) $(AM_MAKEFLAGS) all-am @@ -573,6 +582,9 @@ sparrowhawk/rule_order.pb.h: sparrowhawk/semiotic_classes.pb.h: $(MAKE) -C $(srcdir)/../proto/ semiotic_classes.pb.h +sparrowhawk/serialization_spec.pb.h: + $(MAKE) -C $(srcdir)/../proto/ serialization_spec.pb.h + sparrowhawk/sparrowhawk_configuration.pb.h: $(MAKE) -C $(srcdir)/../proto/ sparrowhawk_configuration.pb.h diff --git a/src/include/sparrowhawk/field_path.h b/src/include/sparrowhawk/field_path.h new file mode 100644 index 
0000000..0470b1c --- /dev/null +++ b/src/include/sparrowhawk/field_path.h @@ -0,0 +1,91 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2015 and onwards Google, Inc. +// Utility to access specific subfields within a protocol buffer. FieldPath +// objects make subfields available via Follow(). +// + +#ifndef SPARROWHAWK_FIELD_PATH_H_ +#define SPARROWHAWK_FIELD_PATH_H_ + +#include +#include +using std::string; +#include +using std::vector; + +#include +#include +#include + +namespace speech { +namespace sparrowhawk { + +class FieldPath { + public: + // Creates and returns a FieldPath using a descriptor for the type of + // messages we intend to Follow(). + // Returns a null value if the input pointer is null. + static std::unique_ptr Create(const google::protobuf::Descriptor *root_type); + + // Replaces this field_path with input path_string of type: + // (message_name.)*scalar_field_name + // Returns false if an error occurs with either the format of the string or + // with mismatches of type (e.g. a subfield of an integer) or label (i.e. an + // index is supplied when the field is not repeated.) + bool Parse(const string& path_string); + + // Clear all fields from path. + void Clear(); + + inline const google::protobuf::Descriptor *GetRootType() const { return root_type_; } + + // Number of fields on this path. Does not count the root as a field. + inline int GetLength() const { return path_.size(); } + + // True if GetLength() == 0. 
+ inline bool IsEmpty() const { return GetLength() == 0; } + + // Follows the path starting from the given base message. *parent is filled + // in with the immediate parent of the field at the end of the path and *field + // is filled in with the terminal field's descriptor. + // You can then use reflection to query the field value. + // + // Returns false only if the base message is incorrect (the only error that + // can't be detected at parsing time); in this case *parent and *field are + // unchanged. + bool Follow(const google::protobuf::Message& base, const google::protobuf::Message **parent, + const google::protobuf::FieldDescriptor **field) const; + + private: + // Only used by the factory function Create. + explicit FieldPath(const google::protobuf::Descriptor *root_type) + : root_type_(root_type) {} + + // Parse intermediate message fields from input path. The parent is initially + // root_type_ and is finally set to the penultimate field's descriptor. + bool TraverseIntermediateFields(std::vector path, + const google::protobuf::Descriptor **parent); + + // Parse terminal field "field" with given parent descriptor into path_. 
+ bool ParseTerminalField(const string &terminal_field_name, + const google::protobuf::Descriptor *parent); + + std::vector path_; + const google::protobuf::Descriptor *root_type_; +}; + +} // namespace sparrowhawk +} // namespace speech + +#endif // SPARROWHAWK_FIELD_PATH_H_ diff --git a/src/include/sparrowhawk/io_utils.h b/src/include/sparrowhawk/io_utils.h index 44fd749..09b047b 100644 --- a/src/include/sparrowhawk/io_utils.h +++ b/src/include/sparrowhawk/io_utils.h @@ -18,6 +18,7 @@ #include using std::string; +#include namespace speech { namespace sparrowhawk { diff --git a/src/include/sparrowhawk/logger.h b/src/include/sparrowhawk/logger.h index 1f323f5..ce63b1b 100644 --- a/src/include/sparrowhawk/logger.h +++ b/src/include/sparrowhawk/logger.h @@ -18,6 +18,7 @@ // TODO(rws): Write a more respectable logging system or link to some // open-source substitute. +#include namespace speech { namespace sparrowhawk { diff --git a/src/include/sparrowhawk/normalizer.h b/src/include/sparrowhawk/normalizer.h index 8dafe71..c363be1 100644 --- a/src/include/sparrowhawk/normalizer.h +++ b/src/include/sparrowhawk/normalizer.h @@ -36,6 +36,7 @@ using std::vector; #include #include #include +#include namespace speech { namespace sparrowhawk { @@ -77,7 +78,7 @@ class Normalizer { // Preprocessor to use the sentence splitter to break up text into // sentences. An application would normally call this first, and then // normalize each of the resulting sentences. 
- vector SentenceSplitter(const string &input) const; + std::vector SentenceSplitter(const string &input) const; private: // normalizer.cc @@ -139,7 +140,8 @@ class Normalizer { std::unique_ptr tokenizer_classifier_rules_; std::unique_ptr verbalizer_rules_; std::unique_ptr sentence_boundary_; - set sentence_boundary_exceptions_; + std::unique_ptr spec_serializer_; + std::set sentence_boundary_exceptions_; DISALLOW_COPY_AND_ASSIGN(Normalizer); }; diff --git a/src/include/sparrowhawk/numbers.h b/src/include/sparrowhawk/numbers.h index 4360dfe..1fcbc26 100644 --- a/src/include/sparrowhawk/numbers.h +++ b/src/include/sparrowhawk/numbers.h @@ -19,6 +19,7 @@ #include using std::string; +#include namespace speech { namespace sparrowhawk { diff --git a/src/include/sparrowhawk/protobuf_parser.h b/src/include/sparrowhawk/protobuf_parser.h index 7966c61..21879b5 100644 --- a/src/include/sparrowhawk/protobuf_parser.h +++ b/src/include/sparrowhawk/protobuf_parser.h @@ -109,7 +109,7 @@ class ProtobufParser { // Records the field orders if there is a preserve_order field and it's true bool RecordFieldOrder(google::protobuf::Message *message, - const vector &field_order); + const std::vector &field_order); // Applies fixes to the token names caused by lookahead FSTs. 
void FixLookahead(Utterance *utt); diff --git a/src/include/sparrowhawk/protobuf_serializer.h b/src/include/sparrowhawk/protobuf_serializer.h index 3864bba..c89db5f 100644 --- a/src/include/sparrowhawk/protobuf_serializer.h +++ b/src/include/sparrowhawk/protobuf_serializer.h @@ -57,7 +57,7 @@ class ProtobufSerializer { protected: typedef google::protobuf::FieldDescriptor FieldDescriptor; - typedef vector FieldDescriptorVector; + typedef std::vector FieldDescriptorVector; typedef fst::StateIterator StateIterator; typedef fst::ArcIterator ArcIterator; typedef MutableTransducer::Arc Arc; diff --git a/src/include/sparrowhawk/record_serializer.h b/src/include/sparrowhawk/record_serializer.h new file mode 100644 index 0000000..835f395 --- /dev/null +++ b/src/include/sparrowhawk/record_serializer.h @@ -0,0 +1,131 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2015 and onwards Google, Inc. +// Recursively serializes a single record in the spec and concatenates onto a +// transducer. +// +// Typically the serialized field content looks like +// :| +// Note that nothing is serialized if the field corresponding to the record_spec +// field_path is missing in the token. +// +// This is used by the StyleSerializer for serializing all the records in a +// given style. It constructs the RecordSerializer for each record in the +// style_spec. Given a token it sequentially invokes the Serialize function of +// the records in the style being serialized. 
+ +#ifndef SPARROWHAWK_RECORD_SERIALIZER_H_ +#define SPARROWHAWK_RECORD_SERIALIZER_H_ + +#include +#include +using std::vector; + +#include +#include +#include +#include +#include +#include + +namespace speech { +namespace sparrowhawk { + +class RecordSerializer { + public: + typedef fst::StdVectorFst MutableTransducer; + + // Creates and returns a RecordSerializer from the record_spec by noting the + // field path and path label the record and recursively building + // record_serializers for prefix and suffix specs. + // Returns a null value if the spec is not well-formed. + static std::unique_ptr Create( + const RecordSpec &record_spec); + + // Serializes a token using the record spec, returns true only if the token + // serializes correctly as per the record spec. For the input token, it + // recursively traverses field_paths noted in the record_serializer and its + // affix_serializers and concatenates serialized field content onto the + // input fst. + bool Serialize(const Token &token, MutableTransducer *fst) const; + + private: + typedef MutableTransducer::Arc Arc; + typedef Arc::StateId StateId; + typedef Arc::Weight Weight; + typedef fst::StringCompiler StringCompiler; + + // Only used by the factory function Create. + RecordSerializer(); + + // Serializers for prefix specs in the specification. + std::vector> prefix_serializers_; + + // Serializers for suffix specs in the specification. + std::vector> suffix_serializers_; + + // Field path for the record_spec field. + std::unique_ptr field_path_; + + // String denoting the terminating field's name for the record spec. + string field_name_; + + // Default value to be emitted when field is not set. + string default_value_; + + // Pattern to be escaped - record_separator or escape_character. + RE2 escape_re_; + + // Replacement string for escape pattern - prepended escape_character. + string escape_replacement_; + + // String Compiler for making fsts from strings. 
+ StringCompiler string_compiler_; + + // Serializes a record, escaping record_separator and escape_character. + // Also serializes various factorizations as parallel arcs into the FST. + void SerializeRecord(string *value, + MutableTransducer *fst) const; + + // Assumes that the (non-repeated) field is set for the parent, and checks + // that it corresponds to a scalar value. Also, in this case, adds an arc to + // fst between states start and end, optionally adding a new state for end if + // a sentinel is passed for end. It is an error to invoke this with a + // repeated field. + bool SerializeToFst(const google::protobuf::Message &parent, + const google::protobuf::FieldDescriptor &field, + MutableTransducer *fst) const; + + // Assumes that the (repeated) field is set for the parent, and checks that it + // corresponds to a scalar value. Also, in this case, adds an arc to + // fst between states start and end, optionally adding a new state for end if + // a sentinel is passed for end. It is an error to invoke this with a + // non-repeated field. + bool SerializeToFstRepeated(const google::protobuf::Message &parent, + const google::protobuf::FieldDescriptor &field, + const int index, + MutableTransducer *fst) const; + + // Recursively serializes prefix and suffix records into respective + // transducers using appropriate record serializers. 
+ bool SerializeAffixes(const Token &token, + MutableTransducer *prefix_fst, + MutableTransducer *suffix_fst) const; + + DISALLOW_COPY_AND_ASSIGN(RecordSerializer); +}; + +} // namespace sparrowhawk +} // namespace speech + +#endif // SPARROWHAWK_RECORD_SERIALIZER_H_ diff --git a/src/include/sparrowhawk/regexp.h b/src/include/sparrowhawk/regexp.h index 3cc5324..e6e1f95 100644 --- a/src/include/sparrowhawk/regexp.h +++ b/src/include/sparrowhawk/regexp.h @@ -38,9 +38,9 @@ struct RegMatch { int n_sub; int len; // if the regexp contained subexpressions - vector sub_str; - vector sub_start; - vector sub_end; + std::vector sub_str; + std::vector sub_start; + std::vector sub_end; }; class Regexp { @@ -66,7 +66,7 @@ class Regexp { // Gets vector of start and end chars for all matching string parts // returns number of matches. Fills the matches vector with RegMatch objects. int GetAllMatches(const string &input, - vector *matches) const; + std::vector *matches) const; // Accessor for boolean whether this has been successfully compiled bool ok() const; diff --git a/src/include/sparrowhawk/rule_system.h b/src/include/sparrowhawk/rule_system.h index 0b52314..3b8db0c 100644 --- a/src/include/sparrowhawk/rule_system.h +++ b/src/include/sparrowhawk/rule_system.h @@ -75,7 +75,7 @@ class RuleSystem { string grammar_name_; std::unique_ptr grm_; // Precomputed lookahead transducers - mutable map lookaheads_; + mutable std::map lookaheads_; }; } // namespace sparrowhawk diff --git a/src/include/sparrowhawk/sentence_boundary.h b/src/include/sparrowhawk/sentence_boundary.h index 746230e..5fc93fb 100644 --- a/src/include/sparrowhawk/sentence_boundary.h +++ b/src/include/sparrowhawk/sentence_boundary.h @@ -40,7 +40,7 @@ class SentenceBoundary { // differently. 
bool LoadSentenceBoundaryExceptions(const string &filename); - vector ExtractSentences(const string &input_text) const; + std::vector ExtractSentences(const string &input_text) const; // If true, then prefixes each exception in the exception list with a space, // so that it when matching against a potential end-of-sentence position, it @@ -58,7 +58,7 @@ class SentenceBoundary { bool EvaluateCandidate(const string &input_text, const string &marker) const; std::unique_ptr regexp_; - vector sentence_boundary_exceptions_; + std::vector sentence_boundary_exceptions_; bool pad_exceptions_with_space_prefix_; DISALLOW_COPY_AND_ASSIGN(SentenceBoundary); }; diff --git a/src/include/sparrowhawk/sparrowhawk_configuration.pb.h b/src/include/sparrowhawk/sparrowhawk_configuration.pb.h index 3da88ec..0405b06 100644 --- a/src/include/sparrowhawk/sparrowhawk_configuration.pb.h +++ b/src/include/sparrowhawk/sparrowhawk_configuration.pb.h @@ -140,6 +140,18 @@ class SparrowhawkConfiguration : public ::google::protobuf::Message { inline ::std::string* release_sentence_boundary_exceptions_file(); inline void set_allocated_sentence_boundary_exceptions_file(::std::string* sentence_boundary_exceptions_file); + // optional string serialization_spec = 5; + inline bool has_serialization_spec() const; + inline void clear_serialization_spec(); + static const int kSerializationSpecFieldNumber = 5; + inline const ::std::string& serialization_spec() const; + inline void set_serialization_spec(const ::std::string& value); + inline void set_serialization_spec(const char* value); + inline void set_serialization_spec(const char* value, size_t size); + inline ::std::string* mutable_serialization_spec(); + inline ::std::string* release_serialization_spec(); + inline void set_allocated_serialization_spec(::std::string* serialization_spec); + // @@protoc_insertion_point(class_scope:speech.sparrowhawk.SparrowhawkConfiguration) private: inline void set_has_tokenizer_grammar(); @@ -150,6 +162,8 @@ class 
SparrowhawkConfiguration : public ::google::protobuf::Message { inline void clear_has_sentence_boundary_regexp(); inline void set_has_sentence_boundary_exceptions_file(); inline void clear_has_sentence_boundary_exceptions_file(); + inline void set_has_serialization_spec(); + inline void clear_has_serialization_spec(); ::google::protobuf::UnknownFieldSet _unknown_fields_; @@ -157,9 +171,10 @@ class SparrowhawkConfiguration : public ::google::protobuf::Message { ::std::string* verbalizer_grammar_; ::std::string* sentence_boundary_regexp_; ::std::string* sentence_boundary_exceptions_file_; + ::std::string* serialization_spec_; mutable int _cached_size_; - ::google::protobuf::uint32 _has_bits_[(4 + 31) / 32]; + ::google::protobuf::uint32 _has_bits_[(5 + 31) / 32]; friend void protobuf_AddDesc_sparrowhawk_5fconfiguration_2eproto(); friend void protobuf_AssignDesc_sparrowhawk_5fconfiguration_2eproto(); @@ -455,6 +470,76 @@ inline void SparrowhawkConfiguration::set_allocated_sentence_boundary_exceptions } } +// optional string serialization_spec = 5; +inline bool SparrowhawkConfiguration::has_serialization_spec() const { + return (_has_bits_[0] & 0x00000010u) != 0; +} +inline void SparrowhawkConfiguration::set_has_serialization_spec() { + _has_bits_[0] |= 0x00000010u; +} +inline void SparrowhawkConfiguration::clear_has_serialization_spec() { + _has_bits_[0] &= ~0x00000010u; +} +inline void SparrowhawkConfiguration::clear_serialization_spec() { + if (serialization_spec_ != &::google::protobuf::internal::kEmptyString) { + serialization_spec_->clear(); + } + clear_has_serialization_spec(); +} +inline const ::std::string& SparrowhawkConfiguration::serialization_spec() const { + return *serialization_spec_; +} +inline void SparrowhawkConfiguration::set_serialization_spec(const ::std::string& value) { + set_has_serialization_spec(); + if (serialization_spec_ == &::google::protobuf::internal::kEmptyString) { + serialization_spec_ = new ::std::string; + } + 
serialization_spec_->assign(value); +} +inline void SparrowhawkConfiguration::set_serialization_spec(const char* value) { + set_has_serialization_spec(); + if (serialization_spec_ == &::google::protobuf::internal::kEmptyString) { + serialization_spec_ = new ::std::string; + } + serialization_spec_->assign(value); +} +inline void SparrowhawkConfiguration::set_serialization_spec(const char* value, size_t size) { + set_has_serialization_spec(); + if (serialization_spec_ == &::google::protobuf::internal::kEmptyString) { + serialization_spec_ = new ::std::string; + } + serialization_spec_->assign(reinterpret_cast(value), size); +} +inline ::std::string* SparrowhawkConfiguration::mutable_serialization_spec() { + set_has_serialization_spec(); + if (serialization_spec_ == &::google::protobuf::internal::kEmptyString) { + serialization_spec_ = new ::std::string; + } + return serialization_spec_; +} +inline ::std::string* SparrowhawkConfiguration::release_serialization_spec() { + clear_has_serialization_spec(); + if (serialization_spec_ == &::google::protobuf::internal::kEmptyString) { + return NULL; + } else { + ::std::string* temp = serialization_spec_; + serialization_spec_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); + return temp; + } +} +inline void SparrowhawkConfiguration::set_allocated_serialization_spec(::std::string* serialization_spec) { + if (serialization_spec_ != &::google::protobuf::internal::kEmptyString) { + delete serialization_spec_; + } + if (serialization_spec) { + set_has_serialization_spec(); + serialization_spec_ = serialization_spec; + } else { + clear_has_serialization_spec(); + serialization_spec_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); + } +} + // @@protoc_insertion_point(namespace_scope) diff --git a/src/include/sparrowhawk/spec_serializer.h b/src/include/sparrowhawk/spec_serializer.h new file mode 100644 index 0000000..a5a586a --- /dev/null +++ 
b/src/include/sparrowhawk/spec_serializer.h @@ -0,0 +1,73 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2015 and onwards Google, Inc. +// Serializes a token based on a given spec for simple, fast verbalization. +// Iteratively serializes the styles in a class_spec which are concatenated as +// parallel arcs onto a transducer, which is returned as output. + +#ifndef SPARROWHAWK_SPEC_SERIALIZER_H_ +#define SPARROWHAWK_SPEC_SERIALIZER_H_ + +#include +#include +#include +using std::vector; + +#include +#include +#include +#include +#include +#include + +namespace speech { +namespace sparrowhawk { + +class Serializer { + public: + typedef fst::StdVectorFst MutableTransducer; + + // Creates and returns a Serializer from the serialize_spec by creating + // style_serializers for all its style_specs and storing the name of the + // semiotic class. + // Returns a null value if the spec is not well-formed. + static std::unique_ptr Create( + const SerializeSpec &serialize_spec); + + // Serializes a token using the serialization spec, i.e. builds an fst + // corresponding to the serialization of the token. Appends a label for the + // semiotic class name at the front and then adds parallel arcs for the + // different valid style_specs. + MutableTransducer Serialize(const Token &token) const; + + private: + typedef MutableTransducer::Arc Arc; + typedef fst::StringCompiler StringCompiler; + + // Only used by the factory function Create. 
+ Serializer() : string_compiler_(fst::StringTokenType::BYTE) {} + + // String Compiler for making fsts from strings. + StringCompiler string_compiler_; + + // Map to store the serialization indexed by field descriptors. + std::map>> serializers_; + + DISALLOW_COPY_AND_ASSIGN(Serializer); +}; + +} // namespace sparrowhawk +} // namespace speech + +#endif // SPARROWHAWK_SPEC_SERIALIZER_H_ diff --git a/src/include/sparrowhawk/string_utils.h b/src/include/sparrowhawk/string_utils.h index 06da352..58523ea 100644 --- a/src/include/sparrowhawk/string_utils.h +++ b/src/include/sparrowhawk/string_utils.h @@ -20,14 +20,15 @@ using std::string; #include using std::vector; +#include namespace speech { namespace sparrowhawk { // Splits string s by sep and returns a vector of strings. -vector SplitString(const string &s, const string &delims); +std::vector SplitString(const string &s, const string &delims); // Splits string s by sep and returns a vector of strings, skipping empties. -vector SplitString(const string &s, +std::vector SplitString(const string &s, const string &delims, bool skip_empty); diff --git a/src/include/sparrowhawk/style_serializer.h b/src/include/sparrowhawk/style_serializer.h new file mode 100644 index 0000000..3cd4af5 --- /dev/null +++ b/src/include/sparrowhawk/style_serializer.h @@ -0,0 +1,104 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2015 and onwards Google, Inc. 
+// Iteratively serializes the records in a style_spec which are serially +// concatenated onto a transducer. +// +// Typically the serialized field content looks like +// (:|)* +// where each unit is the serialization of a record. +// +// This is used by the Serializer for serializing all the styles in a given +// semiotic class. It constructs the StyleSerializer for each style in the +// class_spec permitted by the prohibited/requested values. Given a token it +// sequentially invokes the Serialize function of the styles in the class being +// serialized. + +#ifndef SPARROWHAWK_STYLE_SERIALIZER_H_ +#define SPARROWHAWK_STYLE_SERIALIZER_H_ + +#include +#include +using std::vector; + +#include +#include +#include +#include +#include +#include +#include + +namespace speech { +namespace sparrowhawk { + +class StyleSerializer { + public: + typedef fst::StdVectorFst MutableTransducer; + + // Creates and returns a StyleSerializer from the style_spec by creating + // record_serializers for all its record_specs and storing field_paths of + // required and prohibited fields. + // Returns a null value if the spec is not well-formed. + static std::unique_ptr Create(const StyleSpec &style_spec); + + // Serializes a token using the style spec, returns true only for valid + // styles satisfying required/prohibited field constraints. If so, all the + // records in the style are serialized onto the input fst. + bool Serialize(const Token &token, MutableTransducer *serialization) const; + + private: + // Only used by the factory function Create. + StyleSerializer() {} + + // Populates record_serializers_ using style_spec. + static bool CreateRecordSerializers(const StyleSpec &style_spec, + const std::unique_ptr &style_serializer); + + // Populates required_fields_ using style_spec. + static bool SetRequiredFieldPaths(const StyleSpec &style_spec, + const std::unique_ptr &style_serializer); + + // Populates prohibited_fields_ using style_spec. 
+ static bool SetProhibitedFieldPaths(const StyleSpec &style_spec, + const std::unique_ptr &style_serializer); + + // Checks required_fields_ in token. + bool CheckRequiredFields(const Token &token) const; + + // Checks prohibited_fields_ in token. + bool CheckProhibitedFields(const Token &token) const; + + // FieldPaths to required fields in the specification. + std::vector> required_fields_; + + // FieldPaths to prohibited fields in the specification. + std::vector prohibited_fields_; + + // Record serializers for the record specs in the style. + std::vector> record_serializers_; + + // Takes as input a message and a target field path ending in a scalar field + // to within the input message and returns true if the field at the end of the + // path is set. It further assumes that all the intermediate messages are + // non-repeated, although the terminating field itself may be repeated. + bool IsFieldSet(const google::protobuf::Message &root, + const FieldPath &field_path) const; + + DISALLOW_COPY_AND_ASSIGN(StyleSerializer); +}; + +} // namespace sparrowhawk +} // namespace speech + +#endif // SPARROWHAWK_STYLE_SERIALIZER_H_ diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 058dd04..be5a826 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -6,19 +6,25 @@ proto_sources = items.pb.cc \ links.pb.cc \ rule_order.pb.cc \ semiotic_classes.pb.cc \ + serialization_spec.pb.cc \ sparrowhawk_configuration.pb.cc -libsparrowhawk_la_SOURCES = io_utils.cc \ - normalizer_utils.cc \ - protobuf_parser.cc \ - regexp.cc \ - sentence_boundary.cc \ +libsparrowhawk_la_SOURCES = field_path.cc \ + io_utils.cc \ normalizer.cc \ + normalizer_utils.cc \ numbers.cc \ + protobuf_parser.cc \ protobuf_serializer.cc \ + record_serializer.cc \ + regexp.cc \ rule_system.cc \ + sentence_boundary.cc \ + spec_serializer.cc \ string_utils.cc \ + style_serializer.cc \ $(proto_sources) + libsparrowhawk_la_LDFLAGS = -version-info 0:0:0 items.pb.cc: @@ -33,6 +39,9 @@ 
rule_order.pb.cc: semiotic_classes.pb.cc: $(MAKE) -C $(srcdir)/../proto/ semiotic_classes.pb.cc +serialization_spec.pb.cc: + $(MAKE) -C $(srcdir)/../proto/ serialization_spec.pb.cc + sparrowhawk_configuration.pb.cc: $(MAKE) -C $(srcdir)/../proto/ sparrowhawk_configuration.pb.cc diff --git a/src/lib/Makefile.in b/src/lib/Makefile.in index d45ffa8..8248b8f 100644 --- a/src/lib/Makefile.in +++ b/src/lib/Makefile.in @@ -122,11 +122,13 @@ am__installdirs = "$(DESTDIR)$(libdir)" LTLIBRARIES = $(lib_LTLIBRARIES) libsparrowhawk_la_LIBADD = am__objects_1 = items.pb.lo links.pb.lo rule_order.pb.lo \ - semiotic_classes.pb.lo sparrowhawk_configuration.pb.lo -am_libsparrowhawk_la_OBJECTS = io_utils.lo normalizer_utils.lo \ - protobuf_parser.lo regexp.lo sentence_boundary.lo \ - normalizer.lo numbers.lo protobuf_serializer.lo rule_system.lo \ - string_utils.lo $(am__objects_1) + semiotic_classes.pb.lo serialization_spec.pb.lo \ + sparrowhawk_configuration.pb.lo +am_libsparrowhawk_la_OBJECTS = field_path.lo io_utils.lo normalizer.lo \ + normalizer_utils.lo numbers.lo protobuf_parser.lo \ + protobuf_serializer.lo record_serializer.lo regexp.lo \ + rule_system.lo sentence_boundary.lo spec_serializer.lo \ + string_utils.lo style_serializer.lo $(am__objects_1) libsparrowhawk_la_OBJECTS = $(am_libsparrowhawk_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -325,18 +327,23 @@ proto_sources = items.pb.cc \ links.pb.cc \ rule_order.pb.cc \ semiotic_classes.pb.cc \ + serialization_spec.pb.cc \ sparrowhawk_configuration.pb.cc -libsparrowhawk_la_SOURCES = io_utils.cc \ - normalizer_utils.cc \ - protobuf_parser.cc \ - regexp.cc \ - sentence_boundary.cc \ +libsparrowhawk_la_SOURCES = field_path.cc \ + io_utils.cc \ normalizer.cc \ + normalizer_utils.cc \ numbers.cc \ + protobuf_parser.cc \ protobuf_serializer.cc \ + record_serializer.cc \ + regexp.cc \ rule_system.cc \ + sentence_boundary.cc \ + spec_serializer.cc \ string_utils.cc \ + style_serializer.cc \ 
$(proto_sources) libsparrowhawk_la_LDFLAGS = -version-info 0:0:0 @@ -419,6 +426,7 @@ mostlyclean-compile: distclean-compile: -rm -f *.tab.c +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/field_path.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/io_utils.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/items.pb.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/links.pb.Plo@am__quote@ @@ -427,13 +435,17 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/numbers.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/protobuf_parser.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/protobuf_serializer.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/record_serializer.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/regexp.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rule_order.pb.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rule_system.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/semiotic_classes.pb.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sentence_boundary.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serialization_spec.pb.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sparrowhawk_configuration.pb.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/spec_serializer.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/string_utils.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/style_serializer.Plo@am__quote@ .cc.o: @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @@ -681,6 +693,9 @@ rule_order.pb.cc: semiotic_classes.pb.cc: $(MAKE) -C $(srcdir)/../proto/ semiotic_classes.pb.cc +serialization_spec.pb.cc: + $(MAKE) -C $(srcdir)/../proto/ serialization_spec.pb.cc + sparrowhawk_configuration.pb.cc: $(MAKE) -C $(srcdir)/../proto/ 
sparrowhawk_configuration.pb.cc diff --git a/src/lib/field_path.cc b/src/lib/field_path.cc new file mode 100644 index 0000000..51be311 --- /dev/null +++ b/src/lib/field_path.cc @@ -0,0 +1,127 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2015 and onwards Google, Inc. +#include + +#include +#include +using std::string; +#include +using std::vector; + +#include +#include +#include + +namespace speech { +namespace sparrowhawk { + +using google::protobuf::Descriptor; +using google::protobuf::FieldDescriptor; +using google::protobuf::Message; + +std::unique_ptr FieldPath::Create( + const Descriptor *root_type) { + if (root_type == nullptr) { + return nullptr; + } else { + std::unique_ptr field_path(new FieldPath(root_type)); + return field_path; + } +} + +void FieldPath::Clear() { + path_.clear(); +} + +bool FieldPath::Follow(const Message &base, const Message **parent, + const FieldDescriptor **field) const { + if (base.GetDescriptor() != root_type_) { + LOG(ERROR) << "Input Message to Follow is of type " + << base.GetDescriptor()->name() + << " while the field_path root type is " << root_type_->name(); + return false; + } + const Message *inner_message = &base; + int size = path_.size(); + for (int i = 0; i < size - 1; ++i) { + // Iterating over singular messages. 
+ inner_message = &inner_message->GetReflection()->GetMessage(*inner_message, + path_[i]); + } + *parent = inner_message; + *field = path_[size - 1]; + return true; +} + +// Helper function to go through the intermediate message fields. +bool FieldPath::TraverseIntermediateFields( + std::vector path, + const google::protobuf::Descriptor **parent) { + for (int i = 0; i < path.size() - 1; ++i) { + string &field_name = path[i]; + const FieldDescriptor *field = (*parent)->FindFieldByName(field_name); + if (field == nullptr) { + LOG(ERROR) << (*parent)->full_name() + << " does not contain a field named '" + << field_name << "'."; + return false; + } + if (field->type() != FieldDescriptor::TYPE_MESSAGE) { + LOG(ERROR) << "Non-terminal field " << field->full_name() + << " is not a message."; + return false; + } + path_.push_back(field); + *parent = field->message_type(); + } + return true; +} + +// Helper function to parse the terminal scalar field. +bool FieldPath::ParseTerminalField(const string &terminal_field_name, + const Descriptor *parent) { + const FieldDescriptor *terminal_field = + parent->FindFieldByName(terminal_field_name); + if (terminal_field == nullptr) { + LOG(ERROR) << parent->full_name() << " does not contain a field named '" + << terminal_field_name << "'."; + return false; + } else if (terminal_field->type() == FieldDescriptor::TYPE_MESSAGE) { + LOG(ERROR) << "Terminal field " << terminal_field->full_name() + << " is a message."; + return false; + } else { + path_.push_back(terminal_field); + } + return true; +} + +bool FieldPath::Parse(const string &path_string) { + // Overwriting without clearing the field_path is illegal. + if (!IsEmpty()) { + LOG(ERROR) << "Cannot overwrite field_path. 
Use Clear() to reset."; + return false; + } + std::vector path = SplitString(path_string, "."); + const Descriptor *parent = root_type_; + if (TraverseIntermediateFields(path, &parent) && + ParseTerminalField(path.back(), parent)) { + return true; + } + Clear(); + return false; +} + +} // namespace sparrowhawk +} // namespace speech diff --git a/src/lib/io_utils.cc b/src/lib/io_utils.cc index f0687a5..24f664d 100644 --- a/src/lib/io_utils.cc +++ b/src/lib/io_utils.cc @@ -24,7 +24,7 @@ namespace speech { namespace sparrowhawk { string IOStream::LoadFileToString(const string &filename) { - ifstream strm(filename.c_str(), std::ios_base::in); + std::ifstream strm(filename.c_str(), std::ios_base::in); if (!strm) { LoggerFatal("Error opening file %s", filename.c_str()); } diff --git a/src/lib/normalizer.cc b/src/lib/normalizer.cc index b2516a9..ac1fdf6 100644 --- a/src/lib/normalizer.cc +++ b/src/lib/normalizer.cc @@ -20,11 +20,13 @@ using std::string; #include #include #include +#include #include #include #include #include #include +#include #include namespace speech { @@ -71,6 +73,18 @@ bool Normalizer::Setup(const string &configuration_proto, configuration.sentence_boundary_exceptions_file().c_str()); } } + if (configuration.has_serialization_spec()) { + string spec_string = IOStream::LoadFileToString( + pathname_prefix + "/" + configuration.serialization_spec()); + SerializeSpec spec; + if (spec_string.empty() || + !google::protobuf::TextFormat::ParseFromString(spec_string, &spec) || + (spec_serializer_ = Serializer::Create(spec)) == nullptr) { + LoggerError("Failed to load a valid serialization spec from file: %s", + configuration.serialization_spec().c_str()); + return false; + } + } return true; } @@ -187,8 +201,12 @@ bool Normalizer::VerbalizeSemioticClass(const Token &markup, Token local(markup); CleanFields(&local); MutableTransducer input_fst; - ProtobufSerializer serializer(&local, &input_fst); - serializer.SerializeToFst(); + if (spec_serializer_ == nullptr) 
{ + ProtobufSerializer serializer(&local, &input_fst); + serializer.SerializeToFst(); + } else { + input_fst = spec_serializer_->Serialize(local); + } if (!verbalizer_rules_->ApplyRules(input_fst, words, false /* use_lookahead */)) { @@ -198,7 +216,7 @@ bool Normalizer::VerbalizeSemioticClass(const Token &markup, return true; } -vector Normalizer::SentenceSplitter(const string &input) const { +std::vector Normalizer::SentenceSplitter(const string &input) const { return sentence_boundary_->ExtractSentences(input); } diff --git a/src/lib/normalizer_utils.cc b/src/lib/normalizer_utils.cc index 49964f0..ffa25c2 100644 --- a/src/lib/normalizer_utils.cc +++ b/src/lib/normalizer_utils.cc @@ -75,7 +75,7 @@ Word* Normalizer::AddWord(Utterance* utt, Word* Normalizer::AddWords(Utterance* utt, Token* token, const string& words) const { - vector word_names = SplitString(words, " \t\n"); + std::vector word_names = SplitString(words, " \t\n"); Word* word = NULL; for (int i = 0; i < word_names.size(); ++i) { diff --git a/src/lib/protobuf_parser.cc b/src/lib/protobuf_parser.cc index 89a0df9..1d46fc5 100644 --- a/src/lib/protobuf_parser.cc +++ b/src/lib/protobuf_parser.cc @@ -139,7 +139,7 @@ bool ProtobufParser::ParseMessage(bool eof_allowed, Message *message) { const Reflection *reflection = message->GetReflection(); string label; // Record of the order in which the fields came in - vector field_order; + std::vector field_order; while (true) { if (!ConsumeLabel(&label)) { if (eof_allowed) { @@ -234,7 +234,7 @@ bool ProtobufParser::ParseQuotedFieldValue(bool ignore_backslashes, } bool ProtobufParser::RecordFieldOrder(Message *message, - const vector &field_order) { + const std::vector &field_order) { const Descriptor *descriptor = message->GetDescriptor(); const Reflection *reflection = message->GetReflection(); const FieldDescriptor *preserve_field = diff --git a/src/lib/record_serializer.cc b/src/lib/record_serializer.cc new file mode 100644 index 0000000..07c723c --- /dev/null 
+++ b/src/lib/record_serializer.cc @@ -0,0 +1,330 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2015 and onwards Google, Inc. +#include + +#include +#include +using std::string; +#include +using std::vector; + +#include +#include +#include +#include +#include +#include +#include + +namespace speech { +namespace sparrowhawk { + +using google::protobuf::Descriptor; +using google::protobuf::FieldDescriptor; +using google::protobuf::Reflection; +using google::protobuf::Message; + +namespace { + +const char kLabelSeparator[] = ":"; +const char kEscapedEscape[] = R"(\\)"; +const char kRecordSeparator[] = "|"; + +} // namespace + +RecordSerializer::RecordSerializer() + : escape_re_(string("(") + + string(kEscapedEscape) + + string(R"()|(\)") + + string(kRecordSeparator) + + string(")")), + escape_replacement_(string(kEscapedEscape) + string(R"(\0)")), + string_compiler_(fst::StringTokenType::BYTE) {} + +std::unique_ptr RecordSerializer::Create( + const RecordSpec &record_spec) { + std::unique_ptr record_serializer(new RecordSerializer()); + + // Adds field path, label and default from the spec. 
+ record_serializer->field_path_ = FieldPath::Create(Token::descriptor()); + if (!record_serializer->field_path_->Parse(record_spec.field_path())) { + LOG(ERROR) << "FieldPath failed to parse for record spec: " + << record_spec.field_path(); + return nullptr; + } + if (record_spec.has_label()) { + record_serializer->field_name_ = record_spec.label(); + } else { + std::vector vector_path = + SplitString(record_spec.field_path(), "."); + record_serializer->field_name_ = vector_path.back(); + } + if (record_spec.has_default_value()) { + record_serializer->default_value_ = record_spec.default_value(); + if (record_serializer->default_value_.empty()) { + LOG(ERROR) << "Empty default value for record spec: " + << record_spec.field_path(); + return nullptr; + } + } + + // Adds record serializers for prefix and suffix records. + for (const RecordSpec &prefix_spec : record_spec.prefix_spec()) { + auto prefix_serializer = RecordSerializer::Create(prefix_spec); + if (prefix_serializer) { + record_serializer->prefix_serializers_.push_back( + std::move(prefix_serializer)); + } else { + return nullptr; + } + } + for (const RecordSpec &suffix_spec : record_spec.suffix_spec()) { + auto suffix_serializer = RecordSerializer::Create(suffix_spec); + if (suffix_serializer) { + record_serializer->suffix_serializers_.push_back( + std::move(suffix_serializer)); + } else { + return nullptr; + } + } + return record_serializer; +} + +void RecordSerializer::SerializeRecord(string *value, + MutableTransducer *fst) const { + // Adds a label for the field_name. + string_compiler_(field_name_ + kLabelSeparator, fst); + // Escapes record_separator and escape_character in value. + RE2::GlobalReplace(value, escape_re_, escape_replacement_); + MutableTransducer fst_value; + string_compiler_(*value, &fst_value); + Concat(fst, fst_value); + // Adds a record_separator to terminate the record. 
+ MutableTransducer record_separator_fst; + string_compiler_(kRecordSeparator, &record_separator_fst); + Concat(fst, record_separator_fst); +} + +bool RecordSerializer::SerializeToFstRepeated(const Message &parent, + const FieldDescriptor &field, + const int index, + MutableTransducer *fst) const { + const Reflection *parent_reflection = parent.GetReflection(); + string value; + // TODO(drasha) Add better test coverage for different cases. + switch (field.type()) { + case FieldDescriptor::TYPE_BYTES: + case FieldDescriptor::TYPE_STRING: { + value = parent_reflection->GetRepeatedString(parent, &field, index); + break; + } + case FieldDescriptor::TYPE_BOOL: { + value = std::to_string( + parent_reflection->GetRepeatedBool(parent, &field, index)); + break; + } + case FieldDescriptor::TYPE_DOUBLE: { + value = std::to_string( + parent_reflection->GetRepeatedDouble(parent, &field, index)); + break; + } + case FieldDescriptor::TYPE_FLOAT: + value = std::to_string( + parent_reflection->GetRepeatedFloat(parent, &field, index)); + break; + case FieldDescriptor::TYPE_SFIXED32: + case FieldDescriptor::TYPE_SINT32: + case FieldDescriptor::TYPE_INT32: { + value = std::to_string( + parent_reflection->GetRepeatedInt32(parent, &field, index)); + break; + } + case FieldDescriptor::TYPE_SFIXED64: + case FieldDescriptor::TYPE_SINT64: + case FieldDescriptor::TYPE_INT64: { + value = std::to_string( + parent_reflection->GetRepeatedInt64(parent, &field, index)); + break; + } + case FieldDescriptor::TYPE_FIXED32: + case FieldDescriptor::TYPE_UINT32: { + value = std::to_string( + parent_reflection->GetRepeatedUInt32(parent, &field, index)); + break; + } + case FieldDescriptor::TYPE_FIXED64: + case FieldDescriptor::TYPE_UINT64: { + value = std::to_string( + parent_reflection->GetRepeatedUInt64(parent, &field, index)); + break; + } + case FieldDescriptor::TYPE_ENUM: { // Per-element read; GetEnum() is singular-only. + value = parent_reflection->GetRepeatedEnum(parent, &field, index)->name(); + break; + } + case FieldDescriptor::TYPE_GROUP: + case
FieldDescriptor::TYPE_MESSAGE: { + LOG(ERROR) << "Scalar expected for: " << field.full_name(); + return false; + } + } + SerializeRecord(&value, fst); + return true; +} + +bool RecordSerializer::SerializeToFst(const Message &parent, + const FieldDescriptor &field, + MutableTransducer *fst) const { + const Reflection *parent_reflection = parent.GetReflection(); + string value; + // TODO(drasha) Add better test coverage for different cases. + switch (field.type()) { + case FieldDescriptor::TYPE_BYTES: + case FieldDescriptor::TYPE_STRING: { + value = parent_reflection->GetString(parent, &field); + break; + } + case FieldDescriptor::TYPE_BOOL: { + value = std::to_string(parent_reflection->GetBool(parent, &field)); + break; + } + case FieldDescriptor::TYPE_DOUBLE: { + value = std::to_string(parent_reflection->GetDouble(parent, &field)); + break; + } + case FieldDescriptor::TYPE_FLOAT: { + value = std::to_string(parent_reflection->GetFloat(parent, &field)); + break; + } + case FieldDescriptor::TYPE_SFIXED32: + case FieldDescriptor::TYPE_SINT32: + case FieldDescriptor::TYPE_INT32: { + value = std::to_string(parent_reflection->GetInt32(parent, &field)); + break; + } + case FieldDescriptor::TYPE_SFIXED64: + case FieldDescriptor::TYPE_SINT64: + case FieldDescriptor::TYPE_INT64: { + value = std::to_string(parent_reflection->GetInt64(parent, &field)); + break; + } + case FieldDescriptor::TYPE_FIXED32: + case FieldDescriptor::TYPE_UINT32: { + value = std::to_string(parent_reflection->GetUInt32(parent, &field)); + break; + } + case FieldDescriptor::TYPE_FIXED64: + case FieldDescriptor::TYPE_UINT64: { + value = std::to_string(parent_reflection->GetUInt64(parent, &field)); + break; + } + case FieldDescriptor::TYPE_ENUM: { + value = parent_reflection->GetEnum(parent, &field)->name(); + break; + } + case FieldDescriptor::TYPE_GROUP: + case FieldDescriptor::TYPE_MESSAGE: { + LOG(ERROR) << "Scalar expected for: " << field.full_name(); + return false; + } + } + SerializeRecord(&value, 
fst); + return true; +} + +bool RecordSerializer::SerializeAffixes(const Token &token, + MutableTransducer *prefix_fst, + MutableTransducer *suffix_fst) const { + prefix_fst->SetStart(prefix_fst->AddState()); + prefix_fst->SetFinal(0, 1); + for (const auto &prefix_serializer : prefix_serializers_) { + if (!prefix_serializer->Serialize(token, prefix_fst)) { + return false; + } + } + suffix_fst->SetStart(suffix_fst->AddState()); + suffix_fst->SetFinal(0, 1); + for (const auto &suffix_serializer : suffix_serializers_) { + if (!suffix_serializer->Serialize(token, suffix_fst)) { + return false; + } + } + return true; +} + +bool RecordSerializer::Serialize(const Token &token, + MutableTransducer *fst) const { + const Message *parent; + const FieldDescriptor *field; + if (!field_path_->Follow(token, &parent, &field)) { + LOG(ERROR) << "FieldPath traversal failed for input Message " + << token.DebugString(); + return false; + } + + // Checks whether the field being serialized is not set (it is known that it + // must be a valid field as it parses) in the token, and returns without + // modifying the fst in this case. 
+ int field_size; + bool repeated_field = field->label() == FieldDescriptor::LABEL_REPEATED; + if (repeated_field) { + field_size = parent->GetReflection()->FieldSize(*parent, field); + if (field_size == 0) { + return true; + } + } else if (!parent->GetReflection()->HasField(*parent, field)) { + if (!default_value_.empty()) { + string value = default_value_; + MutableTransducer serialization; + SerializeRecord(&value, &serialization); + Concat(fst, serialization); + } + return true; + } + + MutableTransducer prefix_fst, suffix_fst; + if (!SerializeAffixes(token, &prefix_fst, &suffix_fst)) { + return false; + } + std::vector serializations; + if (repeated_field) { + if (field->type() == FieldDescriptor::TYPE_MESSAGE) { + LOG(ERROR) << "Intermediate repeated message not allowed in field_path, " + << "found: " << field->full_name(); + return false; + } else { + for (int i = 0; i < field_size; ++i) { + Concat(fst, prefix_fst); + MutableTransducer serialization; + if (!SerializeToFstRepeated(*parent, *field, i, &serialization)) { + return false; + } + Concat(fst, serialization); + Concat(fst, suffix_fst); + } + } + } else { + Concat(fst, prefix_fst); + MutableTransducer serialization; + if (!SerializeToFst(*parent, *field, &serialization)) { + return false; + } + Concat(fst, serialization); + Concat(fst, suffix_fst); + } + return true; +} + +} // namespace sparrowhawk +} // namespace speech diff --git a/src/lib/regexp.cc b/src/lib/regexp.cc index 4227017..d882f80 100644 --- a/src/lib/regexp.cc +++ b/src/lib/regexp.cc @@ -85,7 +85,7 @@ bool Regexp::CheckMatch(const string &input, const string &pattern) { } int Regexp::GetAllMatches(const string &input, - vector *matches) const { + std::vector *matches) const { if (!ok()) { return 0; } diff --git a/src/lib/rule_system.cc b/src/lib/rule_system.cc index 072d41c..af4344c 100644 --- a/src/lib/rule_system.cc +++ b/src/lib/rule_system.cc @@ -24,7 +24,7 @@ using fst::LabelLookAheadRelabeler; using fst::StdArc; 
RuleSystem::~RuleSystem() { - map::iterator iter; + std::map::iterator iter; for (iter = lookaheads_.begin(); iter != lookaheads_.end(); iter++) { delete iter->second; } @@ -89,7 +89,8 @@ bool RuleSystem::ApplyRules(const Transducer& input, bool success = true; if (parens_rule.empty() && use_lookahead) { - map::iterator iter = lookaheads_.find(rule_name); + std::map::iterator iter = + lookaheads_.find(rule_name); LookaheadFst *lookahead_rule_fst; if (iter == lookaheads_.end()) { const Transducer *rule_fst = grm_->GetFst(rule_name); diff --git a/src/lib/sentence_boundary.cc b/src/lib/sentence_boundary.cc index 9b12582..cdee264 100644 --- a/src/lib/sentence_boundary.cc +++ b/src/lib/sentence_boundary.cc @@ -37,7 +37,7 @@ SentenceBoundary::SentenceBoundary(const string ®exp) : bool SentenceBoundary::LoadSentenceBoundaryExceptions(const string &filename) { string raw = IOStream::LoadFileToString(filename); - vector tokens = SplitString(raw, "\n", true /* skip_empty */); + std::vector tokens = SplitString(raw, "\n", true /* skip_empty */); for (auto token : tokens) { token = StripWhitespace(token); // Having it as an unordered list is of course not very efficient for @@ -53,11 +53,11 @@ bool SentenceBoundary::LoadSentenceBoundaryExceptions(const string &filename) { return true; } -vector SentenceBoundary::ExtractSentences( +std::vector SentenceBoundary::ExtractSentences( const string &input_text) const { - vector potentials; + std::vector potentials; regexp_->GetAllMatches(input_text, &potentials); - vector cutpoints; + std::vector cutpoints; int last = 0, i; for (i = 0; i < potentials.size(); ++i) { const int start = potentials[i].start_char; @@ -70,7 +70,7 @@ vector SentenceBoundary::ExtractSentences( last = end; } } - vector result; + std::vector result; last = 0; string sentence; for (int i = 0; i < cutpoints.size(); ++i) { diff --git a/src/lib/serialization_spec.pb.cc b/src/lib/serialization_spec.pb.cc new file mode 100644 index 0000000..f1fb23e --- /dev/null +++ 
b/src/lib/serialization_spec.pb.cc @@ -0,0 +1,1380 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: serialization_spec.proto + +#define INTERNAL_SUPPRESS_PROTOBUF_FIELD_DEPRECATION +#include "serialization_spec.pb.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +// @@protoc_insertion_point(includes) + +namespace speech { +namespace sparrowhawk { + +namespace { + +const ::google::protobuf::Descriptor* RecordSpec_descriptor_ = NULL; +const ::google::protobuf::internal::GeneratedMessageReflection* + RecordSpec_reflection_ = NULL; +const ::google::protobuf::Descriptor* StyleSpec_descriptor_ = NULL; +const ::google::protobuf::internal::GeneratedMessageReflection* + StyleSpec_reflection_ = NULL; +const ::google::protobuf::Descriptor* ClassSpec_descriptor_ = NULL; +const ::google::protobuf::internal::GeneratedMessageReflection* + ClassSpec_reflection_ = NULL; +const ::google::protobuf::Descriptor* SerializeSpec_descriptor_ = NULL; +const ::google::protobuf::internal::GeneratedMessageReflection* + SerializeSpec_reflection_ = NULL; + +} // namespace + + +void protobuf_AssignDesc_serialization_5fspec_2eproto() { + protobuf_AddDesc_serialization_5fspec_2eproto(); + const ::google::protobuf::FileDescriptor* file = + ::google::protobuf::DescriptorPool::generated_pool()->FindFileByName( + "serialization_spec.proto"); + GOOGLE_CHECK(file != NULL); + RecordSpec_descriptor_ = file->message_type(0); + static const int RecordSpec_offsets_[5] = { + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(RecordSpec, prefix_spec_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(RecordSpec, suffix_spec_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(RecordSpec, field_path_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(RecordSpec, label_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(RecordSpec, default_value_), + }; + RecordSpec_reflection_ = + new ::google::protobuf::internal::GeneratedMessageReflection( + 
RecordSpec_descriptor_, + RecordSpec::default_instance_, + RecordSpec_offsets_, + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(RecordSpec, _has_bits_[0]), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(RecordSpec, _unknown_fields_), + -1, + ::google::protobuf::DescriptorPool::generated_pool(), + ::google::protobuf::MessageFactory::generated_factory(), + sizeof(RecordSpec)); + StyleSpec_descriptor_ = file->message_type(1); + static const int StyleSpec_offsets_[3] = { + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(StyleSpec, record_spec_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(StyleSpec, required_fields_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(StyleSpec, prohibited_fields_), + }; + StyleSpec_reflection_ = + new ::google::protobuf::internal::GeneratedMessageReflection( + StyleSpec_descriptor_, + StyleSpec::default_instance_, + StyleSpec_offsets_, + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(StyleSpec, _has_bits_[0]), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(StyleSpec, _unknown_fields_), + -1, + ::google::protobuf::DescriptorPool::generated_pool(), + ::google::protobuf::MessageFactory::generated_factory(), + sizeof(StyleSpec)); + ClassSpec_descriptor_ = file->message_type(2); + static const int ClassSpec_offsets_[2] = { + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(ClassSpec, semiotic_class_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(ClassSpec, style_spec_), + }; + ClassSpec_reflection_ = + new ::google::protobuf::internal::GeneratedMessageReflection( + ClassSpec_descriptor_, + ClassSpec::default_instance_, + ClassSpec_offsets_, + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(ClassSpec, _has_bits_[0]), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(ClassSpec, _unknown_fields_), + -1, + ::google::protobuf::DescriptorPool::generated_pool(), + ::google::protobuf::MessageFactory::generated_factory(), + sizeof(ClassSpec)); + SerializeSpec_descriptor_ = file->message_type(3); + static const int SerializeSpec_offsets_[1] = 
{ + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SerializeSpec, class_spec_), + }; + SerializeSpec_reflection_ = + new ::google::protobuf::internal::GeneratedMessageReflection( + SerializeSpec_descriptor_, + SerializeSpec::default_instance_, + SerializeSpec_offsets_, + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SerializeSpec, _has_bits_[0]), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SerializeSpec, _unknown_fields_), + -1, + ::google::protobuf::DescriptorPool::generated_pool(), + ::google::protobuf::MessageFactory::generated_factory(), + sizeof(SerializeSpec)); +} + +namespace { + +GOOGLE_PROTOBUF_DECLARE_ONCE(protobuf_AssignDescriptors_once_); +inline void protobuf_AssignDescriptorsOnce() { + ::google::protobuf::GoogleOnceInit(&protobuf_AssignDescriptors_once_, + &protobuf_AssignDesc_serialization_5fspec_2eproto); +} + +void protobuf_RegisterTypes(const ::std::string&) { + protobuf_AssignDescriptorsOnce(); + ::google::protobuf::MessageFactory::InternalRegisterGeneratedMessage( + RecordSpec_descriptor_, &RecordSpec::default_instance()); + ::google::protobuf::MessageFactory::InternalRegisterGeneratedMessage( + StyleSpec_descriptor_, &StyleSpec::default_instance()); + ::google::protobuf::MessageFactory::InternalRegisterGeneratedMessage( + ClassSpec_descriptor_, &ClassSpec::default_instance()); + ::google::protobuf::MessageFactory::InternalRegisterGeneratedMessage( + SerializeSpec_descriptor_, &SerializeSpec::default_instance()); +} + +} // namespace + +void protobuf_ShutdownFile_serialization_5fspec_2eproto() { + delete RecordSpec::default_instance_; + delete RecordSpec_reflection_; + delete StyleSpec::default_instance_; + delete StyleSpec_reflection_; + delete ClassSpec::default_instance_; + delete ClassSpec_reflection_; + delete SerializeSpec::default_instance_; + delete SerializeSpec_reflection_; +} + +void protobuf_AddDesc_serialization_5fspec_2eproto() { + static bool already_here = false; + if (already_here) return; + already_here = true; + 
GOOGLE_PROTOBUF_VERIFY_VERSION; + + ::google::protobuf::DescriptorPool::InternalAddGeneratedFile( + "\n\030serialization_spec.proto\022\022speech.sparr" + "owhawk\"\260\001\n\nRecordSpec\0223\n\013prefix_spec\030\001 \003" + "(\0132\036.speech.sparrowhawk.RecordSpec\0223\n\013su" + "ffix_spec\030\002 \003(\0132\036.speech.sparrowhawk.Rec" + "ordSpec\022\022\n\nfield_path\030\003 \001(\t\022\r\n\005label\030\004 \001" + "(\t\022\025\n\rdefault_value\030\005 \001(\t\"t\n\tStyleSpec\0223" + "\n\013record_spec\030\001 \003(\0132\036.speech.sparrowhawk" + ".RecordSpec\022\027\n\017required_fields\030\002 \003(\t\022\031\n\021" + "prohibited_fields\030\003 \003(\t\"V\n\tClassSpec\022\026\n\016" + "semiotic_class\030\001 \001(\t\0221\n\nstyle_spec\030\002 \003(\013" + "2\035.speech.sparrowhawk.StyleSpec\"B\n\rSeria" + "lizeSpec\0221\n\nclass_spec\030\001 \003(\0132\035.speech.sp" + "arrowhawk.ClassSpec", 499); + ::google::protobuf::MessageFactory::InternalRegisterGeneratedFile( + "serialization_spec.proto", &protobuf_RegisterTypes); + RecordSpec::default_instance_ = new RecordSpec(); + StyleSpec::default_instance_ = new StyleSpec(); + ClassSpec::default_instance_ = new ClassSpec(); + SerializeSpec::default_instance_ = new SerializeSpec(); + RecordSpec::default_instance_->InitAsDefaultInstance(); + StyleSpec::default_instance_->InitAsDefaultInstance(); + ClassSpec::default_instance_->InitAsDefaultInstance(); + SerializeSpec::default_instance_->InitAsDefaultInstance(); + ::google::protobuf::internal::OnShutdown(&protobuf_ShutdownFile_serialization_5fspec_2eproto); +} + +// Force AddDescriptors() to be called at static initialization time. 
+struct StaticDescriptorInitializer_serialization_5fspec_2eproto { + StaticDescriptorInitializer_serialization_5fspec_2eproto() { + protobuf_AddDesc_serialization_5fspec_2eproto(); + } +} static_descriptor_initializer_serialization_5fspec_2eproto_; + +// =================================================================== + +#ifndef _MSC_VER +const int RecordSpec::kPrefixSpecFieldNumber; +const int RecordSpec::kSuffixSpecFieldNumber; +const int RecordSpec::kFieldPathFieldNumber; +const int RecordSpec::kLabelFieldNumber; +const int RecordSpec::kDefaultValueFieldNumber; +#endif // !_MSC_VER + +RecordSpec::RecordSpec() + : ::google::protobuf::Message() { + SharedCtor(); +} + +void RecordSpec::InitAsDefaultInstance() { +} + +RecordSpec::RecordSpec(const RecordSpec& from) + : ::google::protobuf::Message() { + SharedCtor(); + MergeFrom(from); +} + +void RecordSpec::SharedCtor() { + _cached_size_ = 0; + field_path_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); + label_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); + default_value_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); + ::memset(_has_bits_, 0, sizeof(_has_bits_)); +} + +RecordSpec::~RecordSpec() { + SharedDtor(); +} + +void RecordSpec::SharedDtor() { + if (field_path_ != &::google::protobuf::internal::kEmptyString) { + delete field_path_; + } + if (label_ != &::google::protobuf::internal::kEmptyString) { + delete label_; + } + if (default_value_ != &::google::protobuf::internal::kEmptyString) { + delete default_value_; + } + if (this != default_instance_) { + } +} + +void RecordSpec::SetCachedSize(int size) const { + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); +} +const ::google::protobuf::Descriptor* RecordSpec::descriptor() { + protobuf_AssignDescriptorsOnce(); + return RecordSpec_descriptor_; +} + +const RecordSpec& RecordSpec::default_instance() { + if 
(default_instance_ == NULL) protobuf_AddDesc_serialization_5fspec_2eproto(); + return *default_instance_; +} + +RecordSpec* RecordSpec::default_instance_ = NULL; + +RecordSpec* RecordSpec::New() const { + return new RecordSpec; +} + +void RecordSpec::Clear() { + if (_has_bits_[2 / 32] & (0xffu << (2 % 32))) { + if (has_field_path()) { + if (field_path_ != &::google::protobuf::internal::kEmptyString) { + field_path_->clear(); + } + } + if (has_label()) { + if (label_ != &::google::protobuf::internal::kEmptyString) { + label_->clear(); + } + } + if (has_default_value()) { + if (default_value_ != &::google::protobuf::internal::kEmptyString) { + default_value_->clear(); + } + } + } + prefix_spec_.Clear(); + suffix_spec_.Clear(); + ::memset(_has_bits_, 0, sizeof(_has_bits_)); + mutable_unknown_fields()->Clear(); +} + +bool RecordSpec::MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) { +#define DO_(EXPRESSION) if (!(EXPRESSION)) return false + ::google::protobuf::uint32 tag; + while ((tag = input->ReadTag()) != 0) { + switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) { + // repeated .speech.sparrowhawk.RecordSpec prefix_spec = 1; + case 1: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_prefix_spec: + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, add_prefix_spec())); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(10)) goto parse_prefix_spec; + if (input->ExpectTag(18)) goto parse_suffix_spec; + break; + } + + // repeated .speech.sparrowhawk.RecordSpec suffix_spec = 2; + case 2: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_suffix_spec: + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, 
add_suffix_spec())); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(18)) goto parse_suffix_spec; + if (input->ExpectTag(26)) goto parse_field_path; + break; + } + + // optional string field_path = 3; + case 3: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_field_path: + DO_(::google::protobuf::internal::WireFormatLite::ReadString( + input, this->mutable_field_path())); + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->field_path().data(), this->field_path().length(), + ::google::protobuf::internal::WireFormat::PARSE); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(34)) goto parse_label; + break; + } + + // optional string label = 4; + case 4: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_label: + DO_(::google::protobuf::internal::WireFormatLite::ReadString( + input, this->mutable_label())); + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->label().data(), this->label().length(), + ::google::protobuf::internal::WireFormat::PARSE); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(42)) goto parse_default_value; + break; + } + + // optional string default_value = 5; + case 5: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_default_value: + DO_(::google::protobuf::internal::WireFormatLite::ReadString( + input, this->mutable_default_value())); + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->default_value().data(), this->default_value().length(), + ::google::protobuf::internal::WireFormat::PARSE); + } else { + goto handle_uninterpreted; + } + if (input->ExpectAtEnd()) return true; + break; + } + + default: { + 
handle_uninterpreted: + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_END_GROUP) { + return true; + } + DO_(::google::protobuf::internal::WireFormat::SkipField( + input, tag, mutable_unknown_fields())); + break; + } + } + } + return true; +#undef DO_ +} + +void RecordSpec::SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const { + // repeated .speech.sparrowhawk.RecordSpec prefix_spec = 1; + for (int i = 0; i < this->prefix_spec_size(); i++) { + ::google::protobuf::internal::WireFormatLite::WriteMessageMaybeToArray( + 1, this->prefix_spec(i), output); + } + + // repeated .speech.sparrowhawk.RecordSpec suffix_spec = 2; + for (int i = 0; i < this->suffix_spec_size(); i++) { + ::google::protobuf::internal::WireFormatLite::WriteMessageMaybeToArray( + 2, this->suffix_spec(i), output); + } + + // optional string field_path = 3; + if (has_field_path()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->field_path().data(), this->field_path().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + ::google::protobuf::internal::WireFormatLite::WriteString( + 3, this->field_path(), output); + } + + // optional string label = 4; + if (has_label()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->label().data(), this->label().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + ::google::protobuf::internal::WireFormatLite::WriteString( + 4, this->label(), output); + } + + // optional string default_value = 5; + if (has_default_value()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->default_value().data(), this->default_value().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + ::google::protobuf::internal::WireFormatLite::WriteString( + 5, this->default_value(), output); + } + + if (!unknown_fields().empty()) { + 
::google::protobuf::internal::WireFormat::SerializeUnknownFields( + unknown_fields(), output); + } +} + +::google::protobuf::uint8* RecordSpec::SerializeWithCachedSizesToArray( + ::google::protobuf::uint8* target) const { + // repeated .speech.sparrowhawk.RecordSpec prefix_spec = 1; + for (int i = 0; i < this->prefix_spec_size(); i++) { + target = ::google::protobuf::internal::WireFormatLite:: + WriteMessageNoVirtualToArray( + 1, this->prefix_spec(i), target); + } + + // repeated .speech.sparrowhawk.RecordSpec suffix_spec = 2; + for (int i = 0; i < this->suffix_spec_size(); i++) { + target = ::google::protobuf::internal::WireFormatLite:: + WriteMessageNoVirtualToArray( + 2, this->suffix_spec(i), target); + } + + // optional string field_path = 3; + if (has_field_path()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->field_path().data(), this->field_path().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + target = + ::google::protobuf::internal::WireFormatLite::WriteStringToArray( + 3, this->field_path(), target); + } + + // optional string label = 4; + if (has_label()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->label().data(), this->label().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + target = + ::google::protobuf::internal::WireFormatLite::WriteStringToArray( + 4, this->label(), target); + } + + // optional string default_value = 5; + if (has_default_value()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->default_value().data(), this->default_value().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + target = + ::google::protobuf::internal::WireFormatLite::WriteStringToArray( + 5, this->default_value(), target); + } + + if (!unknown_fields().empty()) { + target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray( + unknown_fields(), target); + } + return target; +} + +int RecordSpec::ByteSize() const { + int 
total_size = 0; + + if (_has_bits_[2 / 32] & (0xffu << (2 % 32))) { + // optional string field_path = 3; + if (has_field_path()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::StringSize( + this->field_path()); + } + + // optional string label = 4; + if (has_label()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::StringSize( + this->label()); + } + + // optional string default_value = 5; + if (has_default_value()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::StringSize( + this->default_value()); + } + + } + // repeated .speech.sparrowhawk.RecordSpec prefix_spec = 1; + total_size += 1 * this->prefix_spec_size(); + for (int i = 0; i < this->prefix_spec_size(); i++) { + total_size += + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + this->prefix_spec(i)); + } + + // repeated .speech.sparrowhawk.RecordSpec suffix_spec = 2; + total_size += 1 * this->suffix_spec_size(); + for (int i = 0; i < this->suffix_spec_size(); i++) { + total_size += + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + this->suffix_spec(i)); + } + + if (!unknown_fields().empty()) { + total_size += + ::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize( + unknown_fields()); + } + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = total_size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); + return total_size; +} + +void RecordSpec::MergeFrom(const ::google::protobuf::Message& from) { + GOOGLE_CHECK_NE(&from, this); + const RecordSpec* source = + ::google::protobuf::internal::dynamic_cast_if_available( + &from); + if (source == NULL) { + ::google::protobuf::internal::ReflectionOps::Merge(from, this); + } else { + MergeFrom(*source); + } +} + +void RecordSpec::MergeFrom(const RecordSpec& from) { + GOOGLE_CHECK_NE(&from, this); + prefix_spec_.MergeFrom(from.prefix_spec_); + suffix_spec_.MergeFrom(from.suffix_spec_); + if (from._has_bits_[2 / 32] & (0xffu << (2 % 32))) { + if 
(from.has_field_path()) { + set_field_path(from.field_path()); + } + if (from.has_label()) { + set_label(from.label()); + } + if (from.has_default_value()) { + set_default_value(from.default_value()); + } + } + mutable_unknown_fields()->MergeFrom(from.unknown_fields()); +} + +void RecordSpec::CopyFrom(const ::google::protobuf::Message& from) { + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +void RecordSpec::CopyFrom(const RecordSpec& from) { + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +bool RecordSpec::IsInitialized() const { + + return true; +} + +void RecordSpec::Swap(RecordSpec* other) { + if (other != this) { + prefix_spec_.Swap(&other->prefix_spec_); + suffix_spec_.Swap(&other->suffix_spec_); + std::swap(field_path_, other->field_path_); + std::swap(label_, other->label_); + std::swap(default_value_, other->default_value_); + std::swap(_has_bits_[0], other->_has_bits_[0]); + _unknown_fields_.Swap(&other->_unknown_fields_); + std::swap(_cached_size_, other->_cached_size_); + } +} + +::google::protobuf::Metadata RecordSpec::GetMetadata() const { + protobuf_AssignDescriptorsOnce(); + ::google::protobuf::Metadata metadata; + metadata.descriptor = RecordSpec_descriptor_; + metadata.reflection = RecordSpec_reflection_; + return metadata; +} + + +// =================================================================== + +#ifndef _MSC_VER +const int StyleSpec::kRecordSpecFieldNumber; +const int StyleSpec::kRequiredFieldsFieldNumber; +const int StyleSpec::kProhibitedFieldsFieldNumber; +#endif // !_MSC_VER + +StyleSpec::StyleSpec() + : ::google::protobuf::Message() { + SharedCtor(); +} + +void StyleSpec::InitAsDefaultInstance() { +} + +StyleSpec::StyleSpec(const StyleSpec& from) + : ::google::protobuf::Message() { + SharedCtor(); + MergeFrom(from); +} + +void StyleSpec::SharedCtor() { + _cached_size_ = 0; + ::memset(_has_bits_, 0, sizeof(_has_bits_)); +} + +StyleSpec::~StyleSpec() { + SharedDtor(); +} + +void StyleSpec::SharedDtor() { 
+ if (this != default_instance_) { + } +} + +void StyleSpec::SetCachedSize(int size) const { + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); +} +const ::google::protobuf::Descriptor* StyleSpec::descriptor() { + protobuf_AssignDescriptorsOnce(); + return StyleSpec_descriptor_; +} + +const StyleSpec& StyleSpec::default_instance() { + if (default_instance_ == NULL) protobuf_AddDesc_serialization_5fspec_2eproto(); + return *default_instance_; +} + +StyleSpec* StyleSpec::default_instance_ = NULL; + +StyleSpec* StyleSpec::New() const { + return new StyleSpec; +} + +void StyleSpec::Clear() { + record_spec_.Clear(); + required_fields_.Clear(); + prohibited_fields_.Clear(); + ::memset(_has_bits_, 0, sizeof(_has_bits_)); + mutable_unknown_fields()->Clear(); +} + +bool StyleSpec::MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) { +#define DO_(EXPRESSION) if (!(EXPRESSION)) return false + ::google::protobuf::uint32 tag; + while ((tag = input->ReadTag()) != 0) { + switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) { + // repeated .speech.sparrowhawk.RecordSpec record_spec = 1; + case 1: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_record_spec: + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, add_record_spec())); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(10)) goto parse_record_spec; + if (input->ExpectTag(18)) goto parse_required_fields; + break; + } + + // repeated string required_fields = 2; + case 2: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_required_fields: + DO_(::google::protobuf::internal::WireFormatLite::ReadString( + input, this->add_required_fields())); + 
::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->required_fields(this->required_fields_size() - 1).data(), + this->required_fields(this->required_fields_size() - 1).length(), + ::google::protobuf::internal::WireFormat::PARSE); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(18)) goto parse_required_fields; + if (input->ExpectTag(26)) goto parse_prohibited_fields; + break; + } + + // repeated string prohibited_fields = 3; + case 3: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_prohibited_fields: + DO_(::google::protobuf::internal::WireFormatLite::ReadString( + input, this->add_prohibited_fields())); + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->prohibited_fields(this->prohibited_fields_size() - 1).data(), + this->prohibited_fields(this->prohibited_fields_size() - 1).length(), + ::google::protobuf::internal::WireFormat::PARSE); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(26)) goto parse_prohibited_fields; + if (input->ExpectAtEnd()) return true; + break; + } + + default: { + handle_uninterpreted: + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_END_GROUP) { + return true; + } + DO_(::google::protobuf::internal::WireFormat::SkipField( + input, tag, mutable_unknown_fields())); + break; + } + } + } + return true; +#undef DO_ +} + +void StyleSpec::SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const { + // repeated .speech.sparrowhawk.RecordSpec record_spec = 1; + for (int i = 0; i < this->record_spec_size(); i++) { + ::google::protobuf::internal::WireFormatLite::WriteMessageMaybeToArray( + 1, this->record_spec(i), output); + } + + // repeated string required_fields = 2; + for (int i = 0; i < this->required_fields_size(); i++) { + 
::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->required_fields(i).data(), this->required_fields(i).length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + ::google::protobuf::internal::WireFormatLite::WriteString( + 2, this->required_fields(i), output); + } + + // repeated string prohibited_fields = 3; + for (int i = 0; i < this->prohibited_fields_size(); i++) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->prohibited_fields(i).data(), this->prohibited_fields(i).length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + ::google::protobuf::internal::WireFormatLite::WriteString( + 3, this->prohibited_fields(i), output); + } + + if (!unknown_fields().empty()) { + ::google::protobuf::internal::WireFormat::SerializeUnknownFields( + unknown_fields(), output); + } +} + +::google::protobuf::uint8* StyleSpec::SerializeWithCachedSizesToArray( + ::google::protobuf::uint8* target) const { + // repeated .speech.sparrowhawk.RecordSpec record_spec = 1; + for (int i = 0; i < this->record_spec_size(); i++) { + target = ::google::protobuf::internal::WireFormatLite:: + WriteMessageNoVirtualToArray( + 1, this->record_spec(i), target); + } + + // repeated string required_fields = 2; + for (int i = 0; i < this->required_fields_size(); i++) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->required_fields(i).data(), this->required_fields(i).length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + target = ::google::protobuf::internal::WireFormatLite:: + WriteStringToArray(2, this->required_fields(i), target); + } + + // repeated string prohibited_fields = 3; + for (int i = 0; i < this->prohibited_fields_size(); i++) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->prohibited_fields(i).data(), this->prohibited_fields(i).length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + target = ::google::protobuf::internal::WireFormatLite:: + WriteStringToArray(3, 
this->prohibited_fields(i), target); + } + + if (!unknown_fields().empty()) { + target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray( + unknown_fields(), target); + } + return target; +} + +int StyleSpec::ByteSize() const { + int total_size = 0; + + // repeated .speech.sparrowhawk.RecordSpec record_spec = 1; + total_size += 1 * this->record_spec_size(); + for (int i = 0; i < this->record_spec_size(); i++) { + total_size += + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + this->record_spec(i)); + } + + // repeated string required_fields = 2; + total_size += 1 * this->required_fields_size(); + for (int i = 0; i < this->required_fields_size(); i++) { + total_size += ::google::protobuf::internal::WireFormatLite::StringSize( + this->required_fields(i)); + } + + // repeated string prohibited_fields = 3; + total_size += 1 * this->prohibited_fields_size(); + for (int i = 0; i < this->prohibited_fields_size(); i++) { + total_size += ::google::protobuf::internal::WireFormatLite::StringSize( + this->prohibited_fields(i)); + } + + if (!unknown_fields().empty()) { + total_size += + ::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize( + unknown_fields()); + } + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = total_size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); + return total_size; +} + +void StyleSpec::MergeFrom(const ::google::protobuf::Message& from) { + GOOGLE_CHECK_NE(&from, this); + const StyleSpec* source = + ::google::protobuf::internal::dynamic_cast_if_available( + &from); + if (source == NULL) { + ::google::protobuf::internal::ReflectionOps::Merge(from, this); + } else { + MergeFrom(*source); + } +} + +void StyleSpec::MergeFrom(const StyleSpec& from) { + GOOGLE_CHECK_NE(&from, this); + record_spec_.MergeFrom(from.record_spec_); + required_fields_.MergeFrom(from.required_fields_); + prohibited_fields_.MergeFrom(from.prohibited_fields_); + mutable_unknown_fields()->MergeFrom(from.unknown_fields()); 
+} + +void StyleSpec::CopyFrom(const ::google::protobuf::Message& from) { + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +void StyleSpec::CopyFrom(const StyleSpec& from) { + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +bool StyleSpec::IsInitialized() const { + + return true; +} + +void StyleSpec::Swap(StyleSpec* other) { + if (other != this) { + record_spec_.Swap(&other->record_spec_); + required_fields_.Swap(&other->required_fields_); + prohibited_fields_.Swap(&other->prohibited_fields_); + std::swap(_has_bits_[0], other->_has_bits_[0]); + _unknown_fields_.Swap(&other->_unknown_fields_); + std::swap(_cached_size_, other->_cached_size_); + } +} + +::google::protobuf::Metadata StyleSpec::GetMetadata() const { + protobuf_AssignDescriptorsOnce(); + ::google::protobuf::Metadata metadata; + metadata.descriptor = StyleSpec_descriptor_; + metadata.reflection = StyleSpec_reflection_; + return metadata; +} + + +// =================================================================== + +#ifndef _MSC_VER +const int ClassSpec::kSemioticClassFieldNumber; +const int ClassSpec::kStyleSpecFieldNumber; +#endif // !_MSC_VER + +ClassSpec::ClassSpec() + : ::google::protobuf::Message() { + SharedCtor(); +} + +void ClassSpec::InitAsDefaultInstance() { +} + +ClassSpec::ClassSpec(const ClassSpec& from) + : ::google::protobuf::Message() { + SharedCtor(); + MergeFrom(from); +} + +void ClassSpec::SharedCtor() { + _cached_size_ = 0; + semiotic_class_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); + ::memset(_has_bits_, 0, sizeof(_has_bits_)); +} + +ClassSpec::~ClassSpec() { + SharedDtor(); +} + +void ClassSpec::SharedDtor() { + if (semiotic_class_ != &::google::protobuf::internal::kEmptyString) { + delete semiotic_class_; + } + if (this != default_instance_) { + } +} + +void ClassSpec::SetCachedSize(int size) const { + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); +} +const 
::google::protobuf::Descriptor* ClassSpec::descriptor() { + protobuf_AssignDescriptorsOnce(); + return ClassSpec_descriptor_; +} + +const ClassSpec& ClassSpec::default_instance() { + if (default_instance_ == NULL) protobuf_AddDesc_serialization_5fspec_2eproto(); + return *default_instance_; +} + +ClassSpec* ClassSpec::default_instance_ = NULL; + +ClassSpec* ClassSpec::New() const { + return new ClassSpec; +} + +void ClassSpec::Clear() { + if (_has_bits_[0 / 32] & (0xffu << (0 % 32))) { + if (has_semiotic_class()) { + if (semiotic_class_ != &::google::protobuf::internal::kEmptyString) { + semiotic_class_->clear(); + } + } + } + style_spec_.Clear(); + ::memset(_has_bits_, 0, sizeof(_has_bits_)); + mutable_unknown_fields()->Clear(); +} + +bool ClassSpec::MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) { +#define DO_(EXPRESSION) if (!(EXPRESSION)) return false + ::google::protobuf::uint32 tag; + while ((tag = input->ReadTag()) != 0) { + switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) { + // optional string semiotic_class = 1; + case 1: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + DO_(::google::protobuf::internal::WireFormatLite::ReadString( + input, this->mutable_semiotic_class())); + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->semiotic_class().data(), this->semiotic_class().length(), + ::google::protobuf::internal::WireFormat::PARSE); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(18)) goto parse_style_spec; + break; + } + + // repeated .speech.sparrowhawk.StyleSpec style_spec = 2; + case 2: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_style_spec: + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, 
add_style_spec())); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(18)) goto parse_style_spec; + if (input->ExpectAtEnd()) return true; + break; + } + + default: { + handle_uninterpreted: + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_END_GROUP) { + return true; + } + DO_(::google::protobuf::internal::WireFormat::SkipField( + input, tag, mutable_unknown_fields())); + break; + } + } + } + return true; +#undef DO_ +} + +void ClassSpec::SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const { + // optional string semiotic_class = 1; + if (has_semiotic_class()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->semiotic_class().data(), this->semiotic_class().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + ::google::protobuf::internal::WireFormatLite::WriteString( + 1, this->semiotic_class(), output); + } + + // repeated .speech.sparrowhawk.StyleSpec style_spec = 2; + for (int i = 0; i < this->style_spec_size(); i++) { + ::google::protobuf::internal::WireFormatLite::WriteMessageMaybeToArray( + 2, this->style_spec(i), output); + } + + if (!unknown_fields().empty()) { + ::google::protobuf::internal::WireFormat::SerializeUnknownFields( + unknown_fields(), output); + } +} + +::google::protobuf::uint8* ClassSpec::SerializeWithCachedSizesToArray( + ::google::protobuf::uint8* target) const { + // optional string semiotic_class = 1; + if (has_semiotic_class()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->semiotic_class().data(), this->semiotic_class().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + target = + ::google::protobuf::internal::WireFormatLite::WriteStringToArray( + 1, this->semiotic_class(), target); + } + + // repeated .speech.sparrowhawk.StyleSpec style_spec = 2; + for (int i = 0; i < this->style_spec_size(); i++) { + target = 
::google::protobuf::internal::WireFormatLite:: + WriteMessageNoVirtualToArray( + 2, this->style_spec(i), target); + } + + if (!unknown_fields().empty()) { + target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray( + unknown_fields(), target); + } + return target; +} + +int ClassSpec::ByteSize() const { + int total_size = 0; + + if (_has_bits_[0 / 32] & (0xffu << (0 % 32))) { + // optional string semiotic_class = 1; + if (has_semiotic_class()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::StringSize( + this->semiotic_class()); + } + + } + // repeated .speech.sparrowhawk.StyleSpec style_spec = 2; + total_size += 1 * this->style_spec_size(); + for (int i = 0; i < this->style_spec_size(); i++) { + total_size += + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + this->style_spec(i)); + } + + if (!unknown_fields().empty()) { + total_size += + ::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize( + unknown_fields()); + } + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = total_size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); + return total_size; +} + +void ClassSpec::MergeFrom(const ::google::protobuf::Message& from) { + GOOGLE_CHECK_NE(&from, this); + const ClassSpec* source = + ::google::protobuf::internal::dynamic_cast_if_available( + &from); + if (source == NULL) { + ::google::protobuf::internal::ReflectionOps::Merge(from, this); + } else { + MergeFrom(*source); + } +} + +void ClassSpec::MergeFrom(const ClassSpec& from) { + GOOGLE_CHECK_NE(&from, this); + style_spec_.MergeFrom(from.style_spec_); + if (from._has_bits_[0 / 32] & (0xffu << (0 % 32))) { + if (from.has_semiotic_class()) { + set_semiotic_class(from.semiotic_class()); + } + } + mutable_unknown_fields()->MergeFrom(from.unknown_fields()); +} + +void ClassSpec::CopyFrom(const ::google::protobuf::Message& from) { + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +void ClassSpec::CopyFrom(const ClassSpec& from) { 
+ if (&from == this) return; + Clear(); + MergeFrom(from); +} + +bool ClassSpec::IsInitialized() const { + + return true; +} + +void ClassSpec::Swap(ClassSpec* other) { + if (other != this) { + std::swap(semiotic_class_, other->semiotic_class_); + style_spec_.Swap(&other->style_spec_); + std::swap(_has_bits_[0], other->_has_bits_[0]); + _unknown_fields_.Swap(&other->_unknown_fields_); + std::swap(_cached_size_, other->_cached_size_); + } +} + +::google::protobuf::Metadata ClassSpec::GetMetadata() const { + protobuf_AssignDescriptorsOnce(); + ::google::protobuf::Metadata metadata; + metadata.descriptor = ClassSpec_descriptor_; + metadata.reflection = ClassSpec_reflection_; + return metadata; +} + + +// =================================================================== + +#ifndef _MSC_VER +const int SerializeSpec::kClassSpecFieldNumber; +#endif // !_MSC_VER + +SerializeSpec::SerializeSpec() + : ::google::protobuf::Message() { + SharedCtor(); +} + +void SerializeSpec::InitAsDefaultInstance() { +} + +SerializeSpec::SerializeSpec(const SerializeSpec& from) + : ::google::protobuf::Message() { + SharedCtor(); + MergeFrom(from); +} + +void SerializeSpec::SharedCtor() { + _cached_size_ = 0; + ::memset(_has_bits_, 0, sizeof(_has_bits_)); +} + +SerializeSpec::~SerializeSpec() { + SharedDtor(); +} + +void SerializeSpec::SharedDtor() { + if (this != default_instance_) { + } +} + +void SerializeSpec::SetCachedSize(int size) const { + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); +} +const ::google::protobuf::Descriptor* SerializeSpec::descriptor() { + protobuf_AssignDescriptorsOnce(); + return SerializeSpec_descriptor_; +} + +const SerializeSpec& SerializeSpec::default_instance() { + if (default_instance_ == NULL) protobuf_AddDesc_serialization_5fspec_2eproto(); + return *default_instance_; +} + +SerializeSpec* SerializeSpec::default_instance_ = NULL; + +SerializeSpec* SerializeSpec::New() const { + return new 
SerializeSpec; +} + +void SerializeSpec::Clear() { + class_spec_.Clear(); + ::memset(_has_bits_, 0, sizeof(_has_bits_)); + mutable_unknown_fields()->Clear(); +} + +bool SerializeSpec::MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) { +#define DO_(EXPRESSION) if (!(EXPRESSION)) return false + ::google::protobuf::uint32 tag; + while ((tag = input->ReadTag()) != 0) { + switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) { + // repeated .speech.sparrowhawk.ClassSpec class_spec = 1; + case 1: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_class_spec: + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, add_class_spec())); + } else { + goto handle_uninterpreted; + } + if (input->ExpectTag(10)) goto parse_class_spec; + if (input->ExpectAtEnd()) return true; + break; + } + + default: { + handle_uninterpreted: + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_END_GROUP) { + return true; + } + DO_(::google::protobuf::internal::WireFormat::SkipField( + input, tag, mutable_unknown_fields())); + break; + } + } + } + return true; +#undef DO_ +} + +void SerializeSpec::SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const { + // repeated .speech.sparrowhawk.ClassSpec class_spec = 1; + for (int i = 0; i < this->class_spec_size(); i++) { + ::google::protobuf::internal::WireFormatLite::WriteMessageMaybeToArray( + 1, this->class_spec(i), output); + } + + if (!unknown_fields().empty()) { + ::google::protobuf::internal::WireFormat::SerializeUnknownFields( + unknown_fields(), output); + } +} + +::google::protobuf::uint8* SerializeSpec::SerializeWithCachedSizesToArray( + ::google::protobuf::uint8* target) const { + // repeated .speech.sparrowhawk.ClassSpec class_spec = 1; + for 
(int i = 0; i < this->class_spec_size(); i++) { + target = ::google::protobuf::internal::WireFormatLite:: + WriteMessageNoVirtualToArray( + 1, this->class_spec(i), target); + } + + if (!unknown_fields().empty()) { + target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray( + unknown_fields(), target); + } + return target; +} + +int SerializeSpec::ByteSize() const { + int total_size = 0; + + // repeated .speech.sparrowhawk.ClassSpec class_spec = 1; + total_size += 1 * this->class_spec_size(); + for (int i = 0; i < this->class_spec_size(); i++) { + total_size += + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + this->class_spec(i)); + } + + if (!unknown_fields().empty()) { + total_size += + ::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize( + unknown_fields()); + } + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = total_size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); + return total_size; +} + +void SerializeSpec::MergeFrom(const ::google::protobuf::Message& from) { + GOOGLE_CHECK_NE(&from, this); + const SerializeSpec* source = + ::google::protobuf::internal::dynamic_cast_if_available( + &from); + if (source == NULL) { + ::google::protobuf::internal::ReflectionOps::Merge(from, this); + } else { + MergeFrom(*source); + } +} + +void SerializeSpec::MergeFrom(const SerializeSpec& from) { + GOOGLE_CHECK_NE(&from, this); + class_spec_.MergeFrom(from.class_spec_); + mutable_unknown_fields()->MergeFrom(from.unknown_fields()); +} + +void SerializeSpec::CopyFrom(const ::google::protobuf::Message& from) { + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +void SerializeSpec::CopyFrom(const SerializeSpec& from) { + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +bool SerializeSpec::IsInitialized() const { + + return true; +} + +void SerializeSpec::Swap(SerializeSpec* other) { + if (other != this) { + class_spec_.Swap(&other->class_spec_); + std::swap(_has_bits_[0], 
other->_has_bits_[0]); + _unknown_fields_.Swap(&other->_unknown_fields_); + std::swap(_cached_size_, other->_cached_size_); + } +} + +::google::protobuf::Metadata SerializeSpec::GetMetadata() const { + protobuf_AssignDescriptorsOnce(); + ::google::protobuf::Metadata metadata; + metadata.descriptor = SerializeSpec_descriptor_; + metadata.reflection = SerializeSpec_reflection_; + return metadata; +} + + +// @@protoc_insertion_point(namespace_scope) + +} // namespace sparrowhawk +} // namespace speech + +// @@protoc_insertion_point(global_scope) diff --git a/src/lib/sparrowhawk_configuration.pb.cc b/src/lib/sparrowhawk_configuration.pb.cc index 0052fac..a849ae0 100644 --- a/src/lib/sparrowhawk_configuration.pb.cc +++ b/src/lib/sparrowhawk_configuration.pb.cc @@ -35,11 +35,12 @@ void protobuf_AssignDesc_sparrowhawk_5fconfiguration_2eproto() { "sparrowhawk_configuration.proto"); GOOGLE_CHECK(file != NULL); SparrowhawkConfiguration_descriptor_ = file->message_type(0); - static const int SparrowhawkConfiguration_offsets_[4] = { + static const int SparrowhawkConfiguration_offsets_[5] = { GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SparrowhawkConfiguration, tokenizer_grammar_), GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SparrowhawkConfiguration, verbalizer_grammar_), GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SparrowhawkConfiguration, sentence_boundary_regexp_), GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SparrowhawkConfiguration, sentence_boundary_exceptions_file_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SparrowhawkConfiguration, serialization_spec_), }; SparrowhawkConfiguration_reflection_ = new ::google::protobuf::internal::GeneratedMessageReflection( @@ -83,11 +84,12 @@ void protobuf_AddDesc_sparrowhawk_5fconfiguration_2eproto() { ::google::protobuf::DescriptorPool::InternalAddGeneratedFile( "\n\037sparrowhawk_configuration.proto\022\022speec" - "h.sparrowhawk\"\236\001\n\030SparrowhawkConfigurati" + 
"h.sparrowhawk\"\272\001\n\030SparrowhawkConfigurati" "on\022\031\n\021tokenizer_grammar\030\001 \001(\t\022\032\n\022verbali" "zer_grammar\030\002 \001(\t\022 \n\030sentence_boundary_r" "egexp\030\003 \001(\t\022)\n!sentence_boundary_excepti" - "ons_file\030\004 \001(\t", 214); + "ons_file\030\004 \001(\t\022\032\n\022serialization_spec\030\005 \001" + "(\t", 242); ::google::protobuf::MessageFactory::InternalRegisterGeneratedFile( "sparrowhawk_configuration.proto", &protobuf_RegisterTypes); SparrowhawkConfiguration::default_instance_ = new SparrowhawkConfiguration(); @@ -109,6 +111,7 @@ const int SparrowhawkConfiguration::kTokenizerGrammarFieldNumber; const int SparrowhawkConfiguration::kVerbalizerGrammarFieldNumber; const int SparrowhawkConfiguration::kSentenceBoundaryRegexpFieldNumber; const int SparrowhawkConfiguration::kSentenceBoundaryExceptionsFileFieldNumber; +const int SparrowhawkConfiguration::kSerializationSpecFieldNumber; #endif // !_MSC_VER SparrowhawkConfiguration::SparrowhawkConfiguration() @@ -131,6 +134,7 @@ void SparrowhawkConfiguration::SharedCtor() { verbalizer_grammar_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); sentence_boundary_regexp_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); sentence_boundary_exceptions_file_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); + serialization_spec_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString); ::memset(_has_bits_, 0, sizeof(_has_bits_)); } @@ -151,6 +155,9 @@ void SparrowhawkConfiguration::SharedDtor() { if (sentence_boundary_exceptions_file_ != &::google::protobuf::internal::kEmptyString) { delete sentence_boundary_exceptions_file_; } + if (serialization_spec_ != &::google::protobuf::internal::kEmptyString) { + delete serialization_spec_; + } if (this != default_instance_) { } } @@ -198,6 +205,11 @@ void SparrowhawkConfiguration::Clear() { sentence_boundary_exceptions_file_->clear(); } } 
+ if (has_serialization_spec()) { + if (serialization_spec_ != &::google::protobuf::internal::kEmptyString) { + serialization_spec_->clear(); + } + } } ::memset(_has_bits_, 0, sizeof(_has_bits_)); mutable_unknown_fields()->Clear(); @@ -272,6 +284,23 @@ bool SparrowhawkConfiguration::MergePartialFromCodedStream( } else { goto handle_uninterpreted; } + if (input->ExpectTag(42)) goto parse_serialization_spec; + break; + } + + // optional string serialization_spec = 5; + case 5: { + if (::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED) { + parse_serialization_spec: + DO_(::google::protobuf::internal::WireFormatLite::ReadString( + input, this->mutable_serialization_spec())); + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->serialization_spec().data(), this->serialization_spec().length(), + ::google::protobuf::internal::WireFormat::PARSE); + } else { + goto handle_uninterpreted; + } if (input->ExpectAtEnd()) return true; break; } @@ -330,6 +359,15 @@ void SparrowhawkConfiguration::SerializeWithCachedSizes( 4, this->sentence_boundary_exceptions_file(), output); } + // optional string serialization_spec = 5; + if (has_serialization_spec()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + this->serialization_spec().data(), this->serialization_spec().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + ::google::protobuf::internal::WireFormatLite::WriteString( + 5, this->serialization_spec(), output); + } + if (!unknown_fields().empty()) { ::google::protobuf::internal::WireFormat::SerializeUnknownFields( unknown_fields(), output); @@ -378,6 +416,16 @@ ::google::protobuf::uint8* SparrowhawkConfiguration::SerializeWithCachedSizesToA 4, this->sentence_boundary_exceptions_file(), target); } + // optional string serialization_spec = 5; + if (has_serialization_spec()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8String( + 
this->serialization_spec().data(), this->serialization_spec().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE); + target = + ::google::protobuf::internal::WireFormatLite::WriteStringToArray( + 5, this->serialization_spec(), target); + } + if (!unknown_fields().empty()) { target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray( unknown_fields(), target); @@ -417,6 +465,13 @@ int SparrowhawkConfiguration::ByteSize() const { this->sentence_boundary_exceptions_file()); } + // optional string serialization_spec = 5; + if (has_serialization_spec()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::StringSize( + this->serialization_spec()); + } + } if (!unknown_fields().empty()) { total_size += @@ -456,6 +511,9 @@ void SparrowhawkConfiguration::MergeFrom(const SparrowhawkConfiguration& from) { if (from.has_sentence_boundary_exceptions_file()) { set_sentence_boundary_exceptions_file(from.sentence_boundary_exceptions_file()); } + if (from.has_serialization_spec()) { + set_serialization_spec(from.serialization_spec()); + } } mutable_unknown_fields()->MergeFrom(from.unknown_fields()); } @@ -483,6 +541,7 @@ void SparrowhawkConfiguration::Swap(SparrowhawkConfiguration* other) { std::swap(verbalizer_grammar_, other->verbalizer_grammar_); std::swap(sentence_boundary_regexp_, other->sentence_boundary_regexp_); std::swap(sentence_boundary_exceptions_file_, other->sentence_boundary_exceptions_file_); + std::swap(serialization_spec_, other->serialization_spec_); std::swap(_has_bits_[0], other->_has_bits_[0]); _unknown_fields_.Swap(&other->_unknown_fields_); std::swap(_cached_size_, other->_cached_size_); diff --git a/src/lib/spec_serializer.cc b/src/lib/spec_serializer.cc new file mode 100644 index 0000000..101cdd8 --- /dev/null +++ b/src/lib/spec_serializer.cc @@ -0,0 +1,89 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2015 and onwards Google, Inc. +#include + +#include +#include +using std::vector; + +#include +#include +#include +#include +#include + +namespace speech { +namespace sparrowhawk { + +using google::protobuf::Descriptor; +using google::protobuf::FieldDescriptor; +using google::protobuf::Reflection; + +namespace { + +typedef Serializer::MutableTransducer MutableTransducer; +const char kClassSeparator[] = "|"; + +} // namespace + +std::unique_ptr Serializer::Create( + const SerializeSpec &serialize_spec) { + std::unique_ptr serializer(new Serializer()); + const Descriptor *token_descriptor = Token::descriptor(); + for (const ClassSpec &class_spec : serialize_spec.class_spec()) { + const FieldDescriptor *class_descriptor = + token_descriptor->FindFieldByName(class_spec.semiotic_class()); + if (class_descriptor == nullptr) { + LOG(ERROR) << "Cannot find " << class_spec.semiotic_class() + << " field in Token proto"; + return nullptr; + } + std::vector> &styles = + serializer->serializers_[class_descriptor]; + for (const StyleSpec &style_spec : class_spec.style_spec()) { + auto style_serializer = StyleSerializer::Create(style_spec); + if (style_serializer) { + styles.push_back(std::move(style_serializer)); + } else { + return nullptr; + } + } + } + return serializer; +} + +MutableTransducer Serializer::Serialize(const Token &token) const { + MutableTransducer fst; + const Reflection *reflection = token.GetReflection(); + for (const auto &candidate_class : serializers_) { + if (reflection->HasField(token, candidate_class.first)) { + 
string_compiler_(candidate_class.first->name() + kClassSeparator, + &fst); + MutableTransducer fst_styles; + for (const auto &candidate_style : candidate_class.second) { + MutableTransducer fst_style; + fst_style.SetStart(fst_style.AddState()); + fst_style.SetFinal(0, 1); + if (candidate_style->Serialize(token, &fst_style)) { + Union(&fst_styles, fst_style); + } + } + Concat(&fst, fst_styles); + } + } + return fst; +} + +} // namespace sparrowhawk +} // namespace speech diff --git a/src/lib/string_utils.cc b/src/lib/string_utils.cc index 484bfd9..224975e 100644 --- a/src/lib/string_utils.cc +++ b/src/lib/string_utils.cc @@ -21,14 +21,14 @@ using std::vector; namespace speech { namespace sparrowhawk { -vector SplitString(const string &s, const string &delims) { +std::vector SplitString(const string &s, const string &delims) { return SplitString(s, delims, false); } -vector SplitString(const string &s, +std::vector SplitString(const string &s, const string &delims, bool skip_empty) { - vector out; + std::vector out; if (s.empty()) { return out; } diff --git a/src/lib/style_serializer.cc b/src/lib/style_serializer.cc new file mode 100644 index 0000000..3fa1675 --- /dev/null +++ b/src/lib/style_serializer.cc @@ -0,0 +1,169 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2015 and onwards Google, Inc. 
+#include + +#include +#include +using std::string; +#include +using std::vector; + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace speech { +namespace sparrowhawk { + +using google::protobuf::Descriptor; +using google::protobuf::FieldDescriptor; +using google::protobuf::Reflection; +using google::protobuf::TextFormat; +using google::protobuf::Message; + +bool StyleSerializer::CreateRecordSerializers( + const StyleSpec &style_spec, + const std::unique_ptr &style_serializer) { + for (const RecordSpec &record_spec : style_spec.record_spec()) { + auto record_serializer = RecordSerializer::Create(record_spec); + if (record_serializer) { + style_serializer->record_serializers_.push_back( + std::move(record_serializer)); + } else { + return false; + } + } + return true; +} + +bool StyleSerializer::SetRequiredFieldPaths( + const StyleSpec &style_spec, + const std::unique_ptr &style_serializer) { + const Descriptor *token_descriptor = Token::descriptor(); + for (const string &required_fields : style_spec.required_fields()) { + std::vector any_of; + for (const auto &required_field : + SplitString(required_fields, "|")) { + std::unique_ptr field_path = + FieldPath::Create(token_descriptor); + any_of.push_back(*field_path); + if (!any_of.back().Parse(required_field)) { + LOG(ERROR) << "FieldPath failed to parse for required field: " + << required_field; + return false; + } + } + style_serializer->required_fields_.push_back(std::move(any_of)); + } + return true; +} + +bool StyleSerializer::SetProhibitedFieldPaths( + const StyleSpec &style_spec, + const std::unique_ptr &style_serializer) { + const Descriptor *token_descriptor = Token::descriptor(); + for (const string &prohibited_field : style_spec.prohibited_fields()) { + std::vector &prohibited_fields = + style_serializer->prohibited_fields_; + std::unique_ptr field_path = + FieldPath::Create(token_descriptor); + prohibited_fields.push_back(*field_path); + if 
(!prohibited_fields.back().Parse(prohibited_field)) { + LOG(ERROR) << "FieldPath failed to parse for prohibited field: " + << prohibited_field; + return false; + } + } + return true; +} + +std::unique_ptr StyleSerializer::Create( + const StyleSpec &style_spec) { + std::unique_ptr style_serializer(new StyleSerializer()); + if (!CreateRecordSerializers(style_spec, style_serializer) || + !SetRequiredFieldPaths(style_spec, style_serializer) || + !SetProhibitedFieldPaths(style_spec, style_serializer)) { + return nullptr; + } + return style_serializer; +} + +bool StyleSerializer::IsFieldSet(const Message &root, + const FieldPath &field_path) const { + const Message *parent; + const FieldDescriptor *field; + if (!field_path.Follow(root, &parent, &field)) { + LOG(ERROR) << "FieldPath traversal failed for input Message " + << root.DebugString(); + return false; + } + const Reflection *parent_reflection = parent->GetReflection(); + if (field->label() == FieldDescriptor::LABEL_REPEATED) { + // The field is assumed to be a scalar here. 
+ if (parent_reflection->FieldSize(*parent, field) == 0) { + return false; + } + } else if (!parent_reflection->HasField(*parent, field)) { + return false; + } + return true; +} + +bool StyleSerializer::CheckRequiredFields(const Token &token) const { + for (const std::vector &field_paths : required_fields_) { + bool found = false; + for (const FieldPath &field_path : field_paths) { + if (IsFieldSet(token, field_path)) { + found = true; + break; + } + } + if (!found) { + return false; + } + } + return true; +} + +bool StyleSerializer::CheckProhibitedFields(const Token &token) const { + for (const FieldPath &field_path : prohibited_fields_) { + if (IsFieldSet(token, field_path)) { + return false; + } + } + return true; +} + +bool StyleSerializer::Serialize(const Token &token, + MutableTransducer *serialization) const { + if (!CheckRequiredFields(token) || !CheckProhibitedFields(token)) { + return false; + } + for (const auto &record_serializer : record_serializers_) { + if (!record_serializer->Serialize(token, serialization)) { + LOG(ERROR) << "Record serialization failure for token " + token.name(); + return false; + } + } + return true; +} + + +} // namespace sparrowhawk +} // namespace speech diff --git a/src/proto/Makefile.am b/src/proto/Makefile.am index facd6d4..6387c1f 100644 --- a/src/proto/Makefile.am +++ b/src/proto/Makefile.am @@ -2,6 +2,7 @@ dist_noinst_DATA = items.proto \ links.proto \ rule_order.proto \ semiotic_classes.proto \ + serialization_spec.proto \ sparrowhawk_configuration.proto CC_OUT = $(srcdir)/../lib @@ -16,6 +17,7 @@ MOSTLYCLEANFILES = items.pb.h items.pb.cc \ links.pb.h links.pb.cc \ rule_order.pb.h rule_order.pb.cc \ semiotic_classes.pb.h semiotic_classes.pb.cc \ + serialization_spec.pb.h serialization_spec.pb.cc \ sparrowhawk_configuration.pb.h sparrowhawk_configuration.pb.cc all: $(MOSTLYCLEANFILES) diff --git a/src/proto/Makefile.in b/src/proto/Makefile.in index e4e51de..f163b44 100644 --- a/src/proto/Makefile.in +++ 
b/src/proto/Makefile.in @@ -237,6 +237,7 @@ dist_noinst_DATA = items.proto \ links.proto \ rule_order.proto \ semiotic_classes.proto \ + serialization_spec.proto \ sparrowhawk_configuration.proto CC_OUT = $(srcdir)/../lib @@ -245,6 +246,7 @@ MOSTLYCLEANFILES = items.pb.h items.pb.cc \ links.pb.h links.pb.cc \ rule_order.pb.h rule_order.pb.cc \ semiotic_classes.pb.h semiotic_classes.pb.cc \ + serialization_spec.pb.h serialization_spec.pb.cc \ sparrowhawk_configuration.pb.h sparrowhawk_configuration.pb.cc all: all-am diff --git a/src/proto/serialization_spec.proto b/src/proto/serialization_spec.proto new file mode 100644 index 0000000..e3f3939 --- /dev/null +++ b/src/proto/serialization_spec.proto @@ -0,0 +1,101 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2015 and onwards Google, Inc. +// Proto messages describing specifications for serializing semiotic classes. +// These serializations determine the input to the verbalization grammars. +// TODO(drasha) consider changing the name to serialize_spec.proto for +// consistency. + +syntax = "proto2"; + +package speech.sparrowhawk; + +// Specification for serializing a sub-part of a semiotic class. RecordSpecs may +// be simple, such as a single field, or recursively combine additional +// RecordSpecs to specify more elaborate formats. +// For a repeated scalar field, we simply serialize all the values in the +// token for this field in an identical fashion, respecting the original +// order. 
+// NB. Assumes there are no repeated embedded messages in semiotic_classes.proto +message RecordSpec { + // The serialization for these RecordSpecs will be emitted prior to every + // instance of the main field for this spec. + repeated RecordSpec prefix_spec = 1; + + // The serialization for these RecordSpecs will be emitted after every + // instance of the main field for this spec. + repeated RecordSpec suffix_spec = 2; + + // Field serialization specification: the fields below are used to include a + // value from the input proto in the serialization. This record will only be + // included in the output serialization if this field is present in the input, + // a default value is supplied, or a one_of field is given. + + // The path (from the top-level token, in proto_path.h format) to this field. + // If the label field is not set, the terminal portion of this will be used as + // the label in the serialized output. + optional string field_path = 3; + + // Defines the record label in the serialization. This should be set only to + // override the use of the terminal field name from the field path as the + // default label. + optional string label = 4; + + // String defining the value to be used for the field in case it is not set. + // Note that prefix and suffix records with default values will not be + // serialized if the parent record is missing. The default value is + // well-defined only for singular fields and is ignored otherwise. + optional string default_value = 5; +} + +// Specification for serializing a semiotic class in a particular style. +// StyleSpecs provide required and prohibited fields to help determine the style +// to be used for verbalization. +message StyleSpec { + // Gives the specification for how tokens should be serialized in this style. + // The serialization components for this style will be emitted in the same + // order as the record specs in this field. 
+ repeated RecordSpec record_spec = 1; + + // When more than one serialization style is used for a semiotic class, it may + // be possible to infer that a serialization is inappropriate due to the + // presence or absence of a particular field. The following fields provide a + // mechanism to do this. + + // This serialization will not be emitted unless all of the fields referred to + // here are present. A single instance can have multiple fields (separated by + // "|") from which at least one field is required for serialization. + repeated string required_fields = 2; + + // This serialization will not be emitted if any of the fields referred to + // here are present. + repeated string prohibited_fields = 3; +} + +// Specification of a serialization format for a particular semiotic class. +message ClassSpec { + // Indicates the type of token that may be serialized by this spec: those with + // this field present, e.g. "cardinal" or "measure". + optional /* required */ string semiotic_class = 1; + + // Denotes the style within the semiotic class. StyleSpecs augment ClassSpec + // by enabling multiple ways of verbalizing the same semiotic class. + repeated StyleSpec style_spec = 2; +} + +// Collection of all serialization specs for a language. A single semiotic class +// may have more than one specification, and all matching serializations for +// that class will be included as paths in the output. +message SerializeSpec { + repeated ClassSpec class_spec = 1; +} diff --git a/src/proto/sparrowhawk_configuration.proto b/src/proto/sparrowhawk_configuration.proto index 9b9cac3..903d5df 100644 --- a/src/proto/sparrowhawk_configuration.proto +++ b/src/proto/sparrowhawk_configuration.proto @@ -31,4 +31,8 @@ message SparrowhawkConfiguration { // marker that should *not* usually induce an end-of-sentence decision // e.g. “Mr.” optional string sentence_boundary_exceptions_file = 4; + + // Optional file with SerializeSpec for verbalizer as a text proto. 
If the + // field is not set, we resort to the protobuf serializer. + optional string serialization_spec = 5; }