Update to Sparrowhawk 1.0, with new serialization.

google · Jan 9, 2017 · 0d2fe13 · 0d2fe13
1 parent eb97411
commit 0d2fe13
Show file tree

Hide file tree

Showing 58 changed files with 3,368 additions and 76 deletions.
diff --git a/NEWS b/NEWS
@@ -2,3 +2,8 @@ Sparrowhawk - Release 0.1
 
 This is the alpha version.
 
+Sparrowhawk - Release 1.0
+
+* Added new verbalizer serialization, with accompanying grammars.
+
+
diff --git a/README b/README
@@ -1,4 +1,4 @@
-Sparrowhawk - Release 0.1
+Sparrowhawk - Release 1.0
 
 Sparrowhawk is an open-source implementation of Google's Kestrel text-to-speech
 text normalization system.  It follows the discussion of the Kestrel system as
@@ -34,6 +34,11 @@ INSTALLATION:
   recommend configuring with --enable-static=no for faster
   compiles. 
 
+  NOTE: On some versions of Mac OS X we have noticed a problem with configure
+  whereby it fails to find fst.h. If this occurs, try configuring as follows:
+
+  CPPFLAGS=-I/usr/local/include LDFLAGS=-L/usr/local/lib ./configure
+
 USAGE:
   Assuming you've installed under the default /usr/local, the library will be
   in /usr/local/lib, and the headers in /usr/local/include/sparrowhawk.

diff --git a/configure b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for Sparrowhawk 0.1.0.
+# Generated by GNU Autoconf 2.69 for Sparrowhawk 1.0.0.
 #
 # Report bugs to <rws@google.com>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='Sparrowhawk'
 PACKAGE_TARNAME='sparrowhawk'
-PACKAGE_VERSION='0.1.0'
-PACKAGE_STRING='Sparrowhawk 0.1.0'
+PACKAGE_VERSION='1.0.0'
+PACKAGE_STRING='Sparrowhawk 1.0.0'
 PACKAGE_BUGREPORT='rws@google.com'
 PACKAGE_URL=''
 
@@ -1325,7 +1325,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures Sparrowhawk 0.1.0 to adapt to many kinds of systems.
+\`configure' configures Sparrowhawk 1.0.0 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1395,7 +1395,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of Sparrowhawk 0.1.0:";;
+     short | recursive ) echo "Configuration of Sparrowhawk 1.0.0:";;
    esac
   cat <<\_ACEOF
 
@@ -1504,7 +1504,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-Sparrowhawk configure 0.1.0
+Sparrowhawk configure 1.0.0
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1994,7 +1994,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by Sparrowhawk $as_me 0.1.0, which was
+It was created by Sparrowhawk $as_me 1.0.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2857,7 +2857,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='sparrowhawk'
- VERSION='0.1.0'
+ VERSION='1.0.0'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -4162,6 +4162,7 @@ unknown)
 esac
 
 
+CPPFLAGS="$CPPFLAGS -funsigned-char"
 CXXFLAGS="$CXXFLAGS -std=c++11"
 
 ac_ext=cpp
@@ -16052,7 +16053,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by Sparrowhawk $as_me 0.1.0, which was
+This file was extended by Sparrowhawk $as_me 1.0.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -16109,7 +16110,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-Sparrowhawk config.status 0.1.0
+Sparrowhawk config.status 1.0.0
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 

diff --git a/configure.ac b/configure.ac
@@ -1,8 +1,9 @@
-AC_INIT([Sparrowhawk], [0.1.0], [rws@google.com])
+AC_INIT([Sparrowhawk], [1.0.0], [rws@google.com])
 AM_INIT_AUTOMAKE([foreign nostdinc -Wall -Werror])
 
 AM_PROG_AR
 
+CPPFLAGS="$CPPFLAGS -funsigned-char"
 CXXFLAGS="$CXXFLAGS -std=c++11"
 
 AC_PROG_CXX

diff --git a/documentation/README.md b/documentation/README.md
@@ -357,6 +357,77 @@ the token as a sequence of characters:
 3_character :_character 3_character 0_character
 </pre>
 
+### Verbalizer grammars: new serialization format (Sparrowhawk 1.0 and above)
+
+With Sparrowhawk 1.0, we introduce a simpler format for verbalizer
+grammars. The upside of this is that it makes writing the verbalizer grammars
+quite a bit simpler. The downside is that it requires a serialization
+specification proto instance (see below). This new format has no relevance to
+the classifier grammars, which should be written as described above in any case.
+
+There are two main differences between the previous format and the new
+serialization format. The first is that the representation passed by the
+serializer to the verbalizer is more compact. Instead of
+
+<pre>
+money { amount { integer_part: "3" } currency: "usd" }
+</pre>
+
+what gets passed is
+
+<pre>
+money|integer_part:3|currency:usd|
+</pre>
+
+With both major and minor currency amounts, the verbalizer sees, e.g.:
+
+<pre>
+money|integer_part:3|currency:usd|fractional_part:50|currency:usd|
+</pre>
+
+The second major difference is that a REDUP rule is no longer needed. Instead,
+serialization, including any copying of elements, is done in code, controlled by
+the serialization specification, itself an ASCII protocol buffer representation
+that is referenced by an additional optional specification in the Sparrowhawk
+configuration file. An example is given in
+"verbalizer&#x005f;serialization&#x005f;spec.ascii&#x005f;proto". This specifies
+the serialization possibilities for the different classes. For money, the
+specification:
+
+<pre>
+class_spec {
+  semiotic_class: "money"
+  style_spec {
+    record_spec {
+      field_path: "money.amount.integer_part"
+      suffix_spec {
+        field_path: "money.currency"
+      }
+    }
+    record_spec {
+      field_path: "money.amount.fractional_part"
+      suffix_spec {
+        field_path: "money.currency"
+      }
+    }
+  }
+}
+</pre>
+
+means that the integer part of the money expression and the fractional part are
+verbalized in that order, and the repetition of the "money.currency" field has
+the effect of duplicating the expression for the currency itself. Again, the
+verbalizer grammar is responsible for determining that the first instance would
+be read as the major currency expression, and the second as the minor currency
+expression.
+
+The protocol buffer definition of the serialization specification is found in
+"src/proto/serialization&#x005f;spec.proto", which is also documented with
+comments on the functions of the various fields.
+
+The parallel English toy grammar in the new serializer format can be found in
+"grammars/en&#x005f;toy/verbalize&#x005f;serialization".
+
 ### Sentence boundary detection
 
 Sparrowhawk provides some simple support for sentence boundary detection. One
@@ -454,6 +525,12 @@ For example in the "grammars" directory, assuming one has built all the grammars
 normalizer_main --config=sparrowhawk_configuration.ascii_proto --multi_line_text < test.txt  2>/dev/null
 </pre>
 
+For the new serialization specification, the invocation is as follows:
+
+<pre>
+normalizer_main --config=sparrowhawk_configuration_serialization.ascii_proto --multi_line_text < test.txt 2>/dev/null
+</pre>
+
 Integrating Sparrowhawk with Festival
 -------------------------
 
@@ -509,7 +586,7 @@ festival/examples/sparrowhawk_test_us_null.scm
 Sparrowhawk will perform tokenization and text normalization and leave you with
 a sequence of words in Festival's 'Word' relation. You need to take it from
 there.
- 
+
 How to cite Sparrowhawk
 -------------------------
 

diff --git a/documentation/grammars/en_toy/byte.far b/documentation/grammars/en_toy/byte.far
diff --git a/documentation/grammars/en_toy/util.far b/documentation/grammars/en_toy/util.far
diff --git a/documentation/grammars/en_toy/verbalize_serialization/CARDINAL_NUMBER_NAME b/documentation/grammars/en_toy/verbalize_serialization/CARDINAL_NUMBER_NAME
diff --git a/documentation/grammars/en_toy/verbalize_serialization/ORDINAL_NUMBER_NAME b/documentation/grammars/en_toy/verbalize_serialization/ORDINAL_NUMBER_NAME
diff --git a/documentation/grammars/en_toy/verbalize_serialization/date.grm b/documentation/grammars/en_toy/verbalize_serialization/date.grm
@@ -0,0 +1,69 @@
+import '../byte.grm' as b;
+import '../util.grm' as u;
+import 'numbers.grm' as n;
+
+# quotation mark
+# NOTE(review): `q` is not referenced anywhere else in this file.
+q = u.q;
+
+# Used to allow for different numbers of spaces coming out of the serializer.
+# NOTE(review): `s` is not referenced anywhere else in this file, so none of
+# the rules below apply it.
+s = u.s;
+
+# The month field: one or more symbols from b.kAlpha, passed through as-is.
+# NOTE(review): byte.grm is not shown here -- confirm what kAlpha covers.
+month = b.kAlpha+;
+
+# The day field is read through the ORDINAL rule of numbers.grm.
+day = n.ORDINAL;
+
+# d: one symbol from b.kDigit; D: the same set with "0" removed.
+d = b.kDigit;
+D = b.kDigit - "0";
+
+# Reads a two-symbol digit string, in three cases:
+#   * first digit non-zero: the pair is composed with n.CARDINAL;
+#   * "0" then a non-zero digit: "oh " plus n.CARDINAL applied to that digit;
+#   * "00": "hundred".
+two_digit =
+    ((D d) @ n.CARDINAL)
+  | ("0" : "oh ") (D @ n.CARDINAL)
+  | ("00" : "hundred")
+;
+
+# Years are not read as cardinals, generally:
+#   * "19" + two digits: n.CARDINAL applied to "19", then two_digit;
+#   * "20" + two digits whose first digit is non-zero: n.CARDINAL applied to
+#     "20", then two_digit restricted to that input;
+#   * "200" + one digit: the whole four-digit string goes through n.CARDINAL.
+# Year strings outside these three patterns are not accepted by this rule.
+# u.I[" "] presumably inserts the separating space (util.grm is not shown).
+year =
+    (("19" @ n.CARDINAL) u.I[" "] two_digit)
+  | (("20" @ n.CARDINAL) u.I[" "] ((D d) @ two_digit))
+  | (("200" d) @ n.CARDINAL)
+;
+
+# Remove these if they occur
+#
+# Ordering annotations that may follow the year field: mdy and dmy below pass
+# field_order_specs to u.D just before the final "|".
+# NOTE(review): no "|" separator appears inside field_order_specs -- confirm
+# this matches what the serializer emits around these annotations.
+
+field = (b.kAlpha | "_")+;
+preserve_order = "preserve_order:true";
+field_order = "field_order:" field;
+field_order_specs = (preserve_order | field_order)*;
+
+# Verbalization for MDY
+# Input shape:  date|month:<month>|day:<day>|year:<year>|
+# (optionally with field_order_specs before the final "|").
+# Output shape: "<month> the <day> <year>", where <day> and <year> are read
+# by the `day` and `year` rules above.
+# Assumes u.D deletes and u.I inserts its argument (util.grm is not shown).
+mdy =
+  u.D["date"]
+  u.D["|month:"]
+  month
+  u.I[" the "]
+  u.D["|day:"]
+  day
+  u.I[" "]
+  u.D["|year:"]
+  year
+  u.D[field_order_specs]?
+  u.D["|"]
+;
+
+# Verbalization for DMY
+# Input shape:  date|day:<day>|month:<month>|year:<year>|
+# (optionally with field_order_specs before the final "|").
+# Output shape: "the <day> of <month> <year>", where <day> and <year> are read
+# by the `day` and `year` rules above.
+# Assumes u.D deletes and u.I inserts its argument (util.grm is not shown).
+# Unlike mdy, the space before the year is inserted after "|year:" is
+# consumed; the resulting output string is the same either way.
+dmy =
+  u.D["date"]
+  u.I["the "]
+  u.D["|day:"]
+  day
+  u.I[" of "]
+  u.D["|month:"]
+  month
+  u.D["|year:"]
+  u.I[" "]
+  year
+  u.D[field_order_specs]?
+  u.D["|"]
+;
+
+# DATE is the only rule exported from this file: the union of the MDY and DMY
+# verbalizations, passed through Optimize.
+export DATE = Optimize[mdy | dmy];
diff --git a/documentation/grammars/en_toy/verbalize_serialization/measure.grm b/documentation/grammars/en_toy/verbalize_serialization/measure.grm
@@ -0,0 +1,39 @@
+import '../byte.grm' as b;
+import '../util.grm' as u;
+import 'numbers.grm' as n;
+
+# Except with exactly 1, the plural form is used, so we map to that form, and
+# then singularize below.
+# Only these three units are covered; each entry maps the singular unit name
+# (as it appears in the units field) to its plural.
+measures =
+    ("centimeter" : "centimeters")
+  | ("kilogram" : "kilograms")
+  | ("degree" : "degrees")
+;
+
+# quotation mark
+# NOTE(review): `q` is not referenced anywhere else in this file.
+q = u.q;
+
+# Used to allow for different numbers of spaces coming out of the serializer.
+# NOTE(review): `s` is not referenced anywhere else in this file, so none of
+# the rules below apply it.
+s = u.s;
+
+# Removes the markup (allowing for various spacing possibilities in the
+# serialization) and verbalizes the remainder.
+# NOTE(review): `s` is not used in this rule, so any spacing allowance would
+# have to come from u.D itself -- confirm in util.grm.
+# Input shape:  measure|integer_part:<n>|fractional_part:<digits>|units:<unit>|
+# with the fractional part optional.
+# Output shape: n.CARDINAL applied to <n>, optionally " point " followed by
+# n.DIGITS applied to <digits>, then a space and the plural unit name produced
+# by `measures`.
+# Assumes u.D deletes and u.I inserts its argument (util.grm is not shown).
+measure =
+  u.D["measure"]
+  u.D["|integer_part:"]
+  n.CARDINAL
+ (u.D["|fractional_part:"]
+  u.I[" point "]
+  n.DIGITS)?
+  u.I[" "]
+  u.D["|units:"]
+  measures
+  u.D["|"]
+;
+
+# Closure over b.kBytes, supplied to CDRewrite below as its alphabet argument.
+sigstar = b.kBytes*;
+
+# Uses the singular form after exactly "one".
+# Invert[measures] maps each plural unit back to its singular. The left
+# context "[BOS]one " limits the rewrite to a unit that directly follows
+# "one " at the very start of the string, so a unit after any other text
+# (including "one point ...") keeps the plural.
+singularize = CDRewrite[Invert[measures], "[BOS]one ", "", sigstar];
+
+# MEASURE is the only rule exported from this file: the `measure`
+# verbalization composed with `singularize`, passed through Optimize.
+export MEASURE = Optimize[measure @ singularize];