From c5bc07b25d04f6ada65f0654c96a41a9ea649ebd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Rasmusson?= <B.Rasmusson@computer.org>
Date: Sat, 15 Apr 2017 18:17:50 +0200
Subject: [PATCH 1/4] gherkin: (C) Use UTF-16 when wchar_t is of 2 bytes size.

On Windows, wchar_t is 2 bytes wide and uses UTF-16. This means that
for code points > 0xFFFF (when wchar_t is only 2 bytes wide), the code
point read from the UTF-8 source needs to be converted to a pair of
UTF-16 surrogates (two wchar_t wide characters).
---
 gherkin/c/src/Makefile             |  4 +-
 gherkin/c/src/compiler.c           |  5 +-
 gherkin/c/src/file_reader.c        | 34 ++++++++---
 gherkin/c/src/file_token_scanner.c | 38 ++++++++----
 gherkin/c/src/gherkin_line.c       |  4 +-
 gherkin/c/src/string_utilities.c   | 39 ++++++++++--
 gherkin/c/src/string_utilities.h   |  4 ++
 gherkin/c/src/token_queue.h        |  1 +
 gherkin/c/src/unicode_utilities.c  | 98 ++++++++++++++++++++++++++++++
 gherkin/c/src/unicode_utilities.h  | 31 ++++++++++
 gherkin/c/src/utf8_utilities.c     | 34 -----------
 gherkin/c/src/utf8_utilities.h     | 18 ------
 12 files changed, 225 insertions(+), 85 deletions(-)
 create mode 100644 gherkin/c/src/unicode_utilities.c
 create mode 100644 gherkin/c/src/unicode_utilities.h
 delete mode 100644 gherkin/c/src/utf8_utilities.c
 delete mode 100644 gherkin/c/src/utf8_utilities.h

diff --git a/gherkin/c/src/Makefile b/gherkin/c/src/Makefile
index a3b1e1baa47..0fa61f35102 100644
--- a/gherkin/c/src/Makefile
+++ b/gherkin/c/src/Makefile
@@ -42,8 +42,8 @@ UTILITIES_OBJS= \
 	../objs/file_utf8_source.o \
 	../objs/print_utilities.o \
 	../objs/string_utilities.o \
-	../objs/utf8_source.o \
-	../objs/utf8_utilities.o
+	../objs/unicode_utilities.o \
+	../objs/utf8_source.o
 -include $(UTILITIES_OBJS:.o=.d)
 
 PARSER_OBJS= \
diff --git a/gherkin/c/src/compiler.c b/gherkin/c/src/compiler.c
index dbd9330bc66..f05630e04f9 100644
--- a/gherkin/c/src/compiler.c
+++ b/gherkin/c/src/compiler.c
@@ -9,6 +9,7 @@
 #include "pickle_table.h"
 #include "pickle_tag.h"
 #include "pickle_string.h"
+#include "string_utilities.h"
 #include <stdlib.h>
 
 typedef struct Compiler {
@@ -103,7 +104,7 @@ int Compiler_compile(Compiler* compiler, const GherkinDocument* gherkin_document
                     }
                     int j;
                     for (j = 0; j < scenario_outline->steps->step_count; ++j) {
-                        int column_offset = scenario_outline->steps->steps[j].keyword ? wcslen(scenario_outline->steps->steps[j].keyword) : 0;
+                        int column_offset = scenario_outline->steps->steps[j].keyword ? StringUtilities_code_point_length(scenario_outline->steps->steps[j].keyword) : 0;
                         const PickleLocations* step_locations = PickleLocations_new_double(table_row->location.line, table_row->location.column, scenario_outline->steps->steps[j].location.line, scenario_outline->steps->steps[j].location.column + column_offset);
                         const PickleStep* step = expand_outline_step(&scenario_outline->steps->steps[j], example_table->table_header, table_row, step_locations);
                         PickleStep_transfer(&steps->steps[background_step_count + j], (PickleStep*)step);
@@ -225,7 +226,7 @@ static void copy_tags(PickleTag* destination_array, const Tags* source) {
 static void copy_steps(PickleStep* destination_array, const Steps* source) {
     int i;
     for (i = 0; i < source->step_count; ++i) {
-        int column_offset = source->steps[i].keyword ? wcslen(source->steps[i].keyword) : 0;
+        int column_offset = source->steps[i].keyword ? StringUtilities_code_point_length(source->steps[i].keyword) : 0;
         const PickleLocations* step_locations = PickleLocations_new_single(source->steps[i].location.line, source->steps[i].location.column + column_offset);
         const PickleArgument* argument = create_pickle_argument(source->steps[i].argument, 0, 0);
         const PickleStep* step = PickleStep_new(step_locations, source->steps[i].text, argument);
diff --git a/gherkin/c/src/file_reader.c b/gherkin/c/src/file_reader.c
index ee40f496be4..7143128b671 100644
--- a/gherkin/c/src/file_reader.c
+++ b/gherkin/c/src/file_reader.c
@@ -1,12 +1,14 @@
 #include "file_reader.h"
 #include "file_utf8_source.h"
-#include "utf8_utilities.h"
+#include "unicode_utilities.h"
 #include <stdlib.h>
 
 typedef struct FileReader {
     const char* file_name;
 } FileReader;
 
+static void extend_buffer_if_needed(wchar_t** buffer, int* buffer_size, int pos);
+
 FileReader* FileReader_new(const char* const file_name) {
     FileReader* file_reader = (FileReader*)malloc(sizeof(FileReader));
     file_reader->file_name = file_name;
@@ -17,19 +19,24 @@ const wchar_t* FileReader_read(FileReader* file_reader) {
     int buffer_size = 256;
     wchar_t* buffer = (wchar_t*)malloc(buffer_size * sizeof(wchar_t));
     int pos = 0;
-    wchar_t c;
-    FILE* file = fopen(file_reader->file_name, "r");
+    long code_point;
+    FILE* file = fopen(file_reader->file_name, "rb");
     Utf8Source* utf8_source = FileUtf8Source_new(file);
     do {
-        c = Utf8Utilities_read_wchar_from_utf8_source(utf8_source);
-        if (c != WEOF) {
-            buffer[pos++] = c;
-            if (pos >= buffer_size - 1) {
-                buffer_size *= 2;
-                buffer = (wchar_t*)realloc(buffer, buffer_size * sizeof(wchar_t));
+        code_point = UnicodeUtilities_read_code_point_from_utf8_source(utf8_source);
+        if (code_point != WEOF) {
+            if (code_point <= 0xFFFF || sizeof(wchar_t) > 2) {
+                buffer[pos++] = (wchar_t)code_point;
+                extend_buffer_if_needed(&buffer, &buffer_size, pos);
+            } else {
+                Utf16Surrogates surrogates = UnicodeUtilities_get_utf16_surrogates(code_point);
+                buffer[pos++] = surrogates.leading;
+                extend_buffer_if_needed(&buffer, &buffer_size, pos);
+                buffer[pos++] = surrogates.trailing;
+                extend_buffer_if_needed(&buffer, &buffer_size, pos);
             }
         }
-    } while (c != WEOF);
+    } while (code_point != WEOF);
     buffer[pos] = L'\0';
     Utf8Source_delete(utf8_source);
     fclose(file);
@@ -42,3 +49,10 @@ void FileReader_delete(FileReader* file_reader) {
     }
     free((void*)file_reader);
 }
+
+static void extend_buffer_if_needed(wchar_t** buffer, int* buffer_size, int pos) {
+    if (pos >= *buffer_size - 1) {
+        *buffer_size *= 2;
+        *buffer = (wchar_t*)realloc(*buffer, *buffer_size * sizeof(wchar_t));
+    }
+}
diff --git a/gherkin/c/src/file_token_scanner.c b/gherkin/c/src/file_token_scanner.c
index 98d2a1d841b..d0648c4d538 100644
--- a/gherkin/c/src/file_token_scanner.c
+++ b/gherkin/c/src/file_token_scanner.c
@@ -2,7 +2,7 @@
 #include "file_utf8_source.h"
 #include "gherkin_line.h"
 #include "string_utilities.h"
-#include "utf8_utilities.h"
+#include "unicode_utilities.h"
 #include <stdlib.h>
 
 typedef struct FileTokenScanner {
@@ -16,6 +16,8 @@ typedef struct FileTokenScanner {
 
 static Token* FileTokenScanner_read(TokenScanner* token_scanner);
 
+static void extend_buffer_if_needed(FileTokenScanner* token_scanner, int pos);
+
 static void FileTokenScanner_delete(TokenScanner* token_scanner);
 
 TokenScanner* FileTokenScanner_new(const char* const file_name) {
@@ -24,7 +26,7 @@ TokenScanner* FileTokenScanner_new(const char* const file_name) {
     token_scanner->token_scanner.delete = &FileTokenScanner_delete;
     token_scanner->line = 0;
     token_scanner->file = 0;
-    token_scanner->file = fopen(file_name, "r");
+    token_scanner->file = fopen(file_name, "rb");
     token_scanner->utf8_source = FileUtf8Source_new(token_scanner->file);
     token_scanner->buffer_size = 128;
     token_scanner->buffer = (wchar_t*)malloc(token_scanner->buffer_size * sizeof(wchar_t));
@@ -51,18 +53,23 @@ static Token* FileTokenScanner_read(TokenScanner* token_scanner) {
     if (feof(file_token_scanner->file))
         return Token_new(0, file_token_scanner->line);
     int pos = 0;
-    wchar_t c;
+    long code_point;
     do {
-        c = Utf8Utilities_read_wchar_from_utf8_source(file_token_scanner->utf8_source);
-        if (c != WEOF && c != L'\r' && c != L'\n') {
-            file_token_scanner->buffer[pos++] = c;
-            if (pos >= file_token_scanner->buffer_size - 1) {
-                file_token_scanner->buffer_size *= 2;
-                file_token_scanner->buffer = (wchar_t*)realloc(file_token_scanner->buffer, file_token_scanner->buffer_size * sizeof(wchar_t));
+        code_point = UnicodeUtilities_read_code_point_from_utf8_source(file_token_scanner->utf8_source);
+        if (code_point != WEOF && code_point != L'\r' && code_point != L'\n') {
+            if (code_point <= 0xFFFF || sizeof(wchar_t) > 2) {
+                file_token_scanner->buffer[pos++] = (wchar_t)code_point;
+                extend_buffer_if_needed(file_token_scanner, pos);
+            } else {
+                Utf16Surrogates surrogates = UnicodeUtilities_get_utf16_surrogates(code_point);
+                file_token_scanner->buffer[pos++] = surrogates.leading;
+                extend_buffer_if_needed(file_token_scanner, pos);
+                file_token_scanner->buffer[pos++] = surrogates.trailing;
+                extend_buffer_if_needed(file_token_scanner, pos);
             }
         }
-    } while (c != WEOF && c != L'\r' && c != L'\n');
-    if (c == L'\r') {
+    } while (code_point != WEOF && code_point != L'\r' && code_point != L'\n');
+    if (code_point == L'\r') {
         unsigned char next_char = fgetc(file_token_scanner->file);
         if (next_char != L'\n') {
             ungetc(next_char, file_token_scanner->file);
@@ -70,7 +77,7 @@ static Token* FileTokenScanner_read(TokenScanner* token_scanner) {
     }
     file_token_scanner->buffer[pos] = L'\0';
     const GherkinLine* line;
-    if (c != WEOF || pos != 0) {
+    if (code_point != WEOF || pos != 0) {
         wchar_t* text = StringUtilities_copy_string_part(file_token_scanner->buffer, pos);
         line = GherkinLine_new(text, file_token_scanner->line);
     }
@@ -78,3 +85,10 @@ static Token* FileTokenScanner_read(TokenScanner* token_scanner) {
         line = (GherkinLine*)0;
     return Token_new(line, file_token_scanner->line);
 }
+
+static void extend_buffer_if_needed(FileTokenScanner* file_token_scanner, int pos){
+    if (pos >= file_token_scanner->buffer_size - 1) {
+        file_token_scanner->buffer_size *= 2;
+        file_token_scanner->buffer = (wchar_t*)realloc(file_token_scanner->buffer, file_token_scanner->buffer_size * sizeof(wchar_t));
+    }
+}
diff --git a/gherkin/c/src/gherkin_line.c b/gherkin/c/src/gherkin_line.c
index 096cefad201..f35d1c2a273 100644
--- a/gherkin/c/src/gherkin_line.c
+++ b/gherkin/c/src/gherkin_line.c
@@ -215,7 +215,7 @@ static const wchar_t* populate_cell_data(Span* item, const wchar_t* start_pos, i
         ++current_pos;
     while (end_text > current_pos && *(end_text - 1) == L' ')
         --end_text;
-    item->column = start_indent + (current_pos - start_pos) + 1;
+    item->column = start_indent + StringUtilities_code_point_length_for_part(start_pos, current_pos - start_pos) + 1;
     int text_length = end_text - current_pos;
     wchar_t* text = StringUtilities_copy_string_part(current_pos, text_length);
     const wchar_t* from = text;
@@ -251,7 +251,7 @@ static const wchar_t* populate_tag_data(Span* item, const wchar_t* start_pos, in
     const wchar_t* end_text = end_pos;
     while (end_text > current_pos && *(end_text - 1) == L' ')
         --end_text;
-    item->column = start_indent + (current_pos - start_pos) + 1;
+    item->column = start_indent + StringUtilities_code_point_length_for_part(start_pos, current_pos - start_pos) + 1;
     int text_length = end_text - current_pos;
     if (text_length > 0) {
         item->text = StringUtilities_copy_string_part(current_pos, text_length);
diff --git a/gherkin/c/src/string_utilities.c b/gherkin/c/src/string_utilities.c
index 338b19de581..db6a4997042 100644
--- a/gherkin/c/src/string_utilities.c
+++ b/gherkin/c/src/string_utilities.c
@@ -1,5 +1,5 @@
 #include "string_utilities.h"
-#include "utf8_utilities.h"
+#include "unicode_utilities.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -32,19 +32,48 @@ wchar_t* StringUtilities_copy_to_wide_string(const char* string) {
     int length = strlen(string);
     wchar_t* copy = (wchar_t*)malloc((length + 1) * sizeof(wchar_t));
     Utf8Source* utf8_source = StringUtf8Source_new(string);
+    int to_index = 0;
     int i;
     for (i = 0; i < length; ++i) {
-        wchar_t c = Utf8Utilities_read_wchar_from_utf8_source(utf8_source);
-        if (c == WEOF) {
+        long code_point = UnicodeUtilities_read_code_point_from_utf8_source(utf8_source);
+        if (code_point == WEOF) {
             break;
         }
-        copy[i] = c;
+        if (code_point <= 0xFFFF || sizeof(wchar_t) > 2) {
+            copy[to_index++] = (wchar_t)code_point;
+        } else {
+            Utf16Surrogates surrogates = UnicodeUtilities_get_utf16_surrogates(code_point);
+            copy[to_index++] = surrogates.leading;
+            copy[to_index++] = surrogates.trailing;
+        }
     }
-    copy[i] = L'\0';
+    copy[to_index] = L'\0';
     Utf8Source_delete(utf8_source);
     return copy;
 }
 
+size_t StringUtilities_code_point_length(const wchar_t* string) {
+    if (sizeof(wchar_t) > 2) {
+        return wcslen(string);
+    } else {
+        return StringUtilities_code_point_length_for_part(string, wcslen(string));
+    }
+}
+
+size_t StringUtilities_code_point_length_for_part(const wchar_t* string, const int length) {
+    int code_points = 0;
+    int i;
+    for (i = 0; i < length; ++i) {
+        ++code_points;
+        if (UnicodeUtilities_is_utf16_surrogate(string[i])) {
+            ++i;
+        }
+
+    }
+    return code_points;
+}
+
+
 Utf8Source* StringUtf8Source_new(const char* string) {
     StringUtf8Source* string_utf8_source = (StringUtf8Source*)malloc(sizeof(StringUtf8Source));
     string_utf8_source->utf8_source.read = &StringUtf8Source_read;
diff --git a/gherkin/c/src/string_utilities.h b/gherkin/c/src/string_utilities.h
index 17e330dffd0..30472a5ea97 100644
--- a/gherkin/c/src/string_utilities.h
+++ b/gherkin/c/src/string_utilities.h
@@ -13,6 +13,10 @@ wchar_t* StringUtilities_copy_string_part(const wchar_t* string, const int lengt
 
 wchar_t* StringUtilities_copy_to_wide_string(const char* string);
 
+size_t StringUtilities_code_point_length(const wchar_t* string);
+
+size_t StringUtilities_code_point_length_for_part(const wchar_t* string, const int length);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gherkin/c/src/token_queue.h b/gherkin/c/src/token_queue.h
index eaf1513c5c1..68a0cba2cf0 100644
--- a/gherkin/c/src/token_queue.h
+++ b/gherkin/c/src/token_queue.h
@@ -1,6 +1,7 @@
 #ifndef GHERKIN_TOKEN_QUEUE_H_
 #define GHERKIN_TOKEN_QUEUE_H_
 
+#include <stdbool.h>
 #include "token.h"
 
 #ifdef __cplusplus
diff --git a/gherkin/c/src/unicode_utilities.c b/gherkin/c/src/unicode_utilities.c
new file mode 100644
index 00000000000..c83bbdd9ae4
--- /dev/null
+++ b/gherkin/c/src/unicode_utilities.c
@@ -0,0 +1,98 @@
+#include "unicode_utilities.h"
+
+long UnicodeUtilities_read_code_point_from_utf8_source(Utf8Source* utf8_source) {
+    unsigned char c = Utf8Source_read(utf8_source);
+    if (c < 0x80) {
+        return (long)c;
+    }
+    unsigned char c2 = Utf8Source_read(utf8_source);
+    long lower_part = (long)(c2 & 0x3F);
+    if ((c & 0xE0) == 0xC0) {
+        return (((long)(c & 0x1F)) << 6) | lower_part;
+    }
+    c2 = Utf8Source_read(utf8_source);
+    lower_part =  (lower_part << 6) | (long)(c2 & 0x3F);
+    if ((c & 0xF0) == 0xE0) {
+        return (((long)(c & 0x0F)) << 12) | lower_part;
+    }
+    c2 = Utf8Source_read(utf8_source);
+    lower_part =  (lower_part << 6) | (long)(c2 & 0x3F);
+    if ((c & 0xF8) == 0xF0) {
+        return (((long)(c & 0x07)) << 18) | lower_part;
+    }
+    c2 = Utf8Source_read(utf8_source);
+    lower_part =  (lower_part << 6) | (long)(c2 & 0x3F);
+    if ((c & 0xFC) == 0xF8) {
+        return (((long)(c & 0x03)) << 24) | lower_part;
+    }
+    c2 = Utf8Source_read(utf8_source);
+    lower_part =  (lower_part << 6) | (long)(c2 & 0x3F);
+    if ((c & 0xFE) == 0xFC) {
+        return (((long)(c & 0x01)) << 30) | lower_part;
+    }
+    return WEOF;
+}
+
+Utf16Surrogates UnicodeUtilities_get_utf16_surrogates(long code_point){
+    Utf16Surrogates surrogates;
+    long surrogates_base = code_point - 0x10000;
+    surrogates.leading = 0xD800 + (surrogates_base >> 10);
+    surrogates.trailing = 0xDC00 + (surrogates_base & 0x3FF);
+    return surrogates;
+}
+
+int UnicodeUtilities_print_wide_character_to_utf8_file(FILE* file, const wchar_t* text, int pos) {
+    long code_point;
+    if (!UnicodeUtilities_is_utf16_surrogate(text[pos]) || sizeof(wchar_t) > 2) {
+        code_point = (long)text[pos];
+    } else {
+        long leading_surrogate = (long)text[pos++];
+        long trailing_surrogate = (long)text[pos];
+        code_point = 0x10000 + ((leading_surrogate - 0xD800) << 10) + (trailing_surrogate - 0xDC00);
+    }
+    print_code_point_to_utf8_file(file, code_point);
+    return pos;
+}
+
+bool UnicodeUtilities_is_utf16_surrogate(const wchar_t wide_char) {
+    return wide_char >= 0xD800 && wide_char < 0xE000;
+}
+
+void print_code_point_to_utf8_file(FILE* file, long code_point) {
+    int trailing_bytes;
+    if (code_point < 0x80) {
+        fputc((char)code_point, file);
+        return;
+    } else if (code_point < 0x800) {
+        fputc((char)(0xC0 | ((code_point & 0x7C0) >> 6)), file);
+        trailing_bytes = 1;
+    } else if (code_point < 0x10000) {
+        fputc((char)(0xE0 | ((code_point & 0xF000) >> 12)), file);
+        trailing_bytes = 2;
+    } else if (code_point < 0x200000) {
+        fputc((char)(0xF0 | ((code_point & 0x1C0000) >> 18)), file);
+        trailing_bytes = 3;
+    } else if (code_point < 0x4000000) {
+        fputc((char)(0xF8 | ((code_point & 0x3000000) >> 24)), file);
+        trailing_bytes = 4;
+    } else {
+        fputc((char)(0xFC | ((code_point & 0x40000000) >> 30)), file);
+        trailing_bytes = 5;
+    }
+    switch (trailing_bytes) {
+    case 5:
+        fputc((char)(0x80 | ((code_point & 0x3F000000) >> 24)), file);
+        /* fall through */
+    case 4:
+        fputc((char)(0x80 | ((code_point & 0xFC0000) >> 18)), file);
+        /* fall through */
+    case 3:
+        fputc((char)(0x80 | ((code_point & 0x3F000) >> 12)), file);
+        /* fall through */
+    case 2:
+        fputc((char)(0x80 | ((code_point & 0xFC0) >> 6)), file);
+        /* fall through */
+    case 1:
+        fputc((char)(0x80 | (code_point & 0x3F)), file);
+    }
+}
diff --git a/gherkin/c/src/unicode_utilities.h b/gherkin/c/src/unicode_utilities.h
new file mode 100644
index 00000000000..006b69588cb
--- /dev/null
+++ b/gherkin/c/src/unicode_utilities.h
@@ -0,0 +1,31 @@
+#ifndef GHERKIN_UNICODE_UTILITIES_H_
+#define GHERKIN_UNICODE_UTILITIES_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <wchar.h>
+
+#include "utf8_source.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct Utf16Surrogates {
+    wchar_t leading;
+    wchar_t trailing;
+} Utf16Surrogates;
+
+long UnicodeUtilities_read_code_point_from_utf8_source(Utf8Source* utf8_source);
+
+Utf16Surrogates UnicodeUtilities_get_utf16_surrogates(long code_point);
+
+int UnicodeUtilities_print_wide_character_to_utf8_file(FILE* file, const wchar_t* text, int pos);
+
+bool UnicodeUtilities_is_utf16_surrogate(const wchar_t wide_char);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GHERKIN_UNICODE_UTILITIES_H_ */
diff --git a/gherkin/c/src/utf8_utilities.c b/gherkin/c/src/utf8_utilities.c
deleted file mode 100644
index 1fc1eee30db..00000000000
--- a/gherkin/c/src/utf8_utilities.c
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "utf8_utilities.h"
-
-wchar_t Utf8Utilities_read_wchar_from_utf8_source(Utf8Source* utf8_source) {
-    unsigned char c = Utf8Source_read(utf8_source);
-    if (c < 0x80) {
-        return (wchar_t)c;
-    }
-    unsigned char c2 = Utf8Source_read(utf8_source);
-    wchar_t lower_part = (wchar_t)(c2 & 0x3F);
-    if ((c & 0xE0) == 0xC0) {
-        return (((wchar_t)(c & 0x1F)) << 6) | lower_part;
-    }
-    c2 = Utf8Source_read(utf8_source);
-    lower_part =  (lower_part << 6) | (wchar_t)(c2 & 0x3F);
-    if ((c & 0xF0) == 0xE0) {
-        return (((wchar_t)(c & 0x0F)) << 12) | lower_part;
-    }
-    c2 = Utf8Source_read(utf8_source);
-    lower_part =  (lower_part << 6) | (wchar_t)(c2 & 0x3F);
-    if ((c & 0xF8) == 0xF0) {
-        return (((wchar_t)(c & 0x07)) << 18) | lower_part;
-    }
-    c2 = Utf8Source_read(utf8_source);
-    lower_part =  (lower_part << 6) | (wchar_t)(c2 & 0x3F);
-    if ((c & 0xFC) == 0xF8) {
-        return (((wchar_t)(c & 0x03)) << 24) | lower_part;
-    }
-    c2 = Utf8Source_read(utf8_source);
-    lower_part =  (lower_part << 6) | (wchar_t)(c2 & 0x3F);
-    if ((c & 0xFE) == 0xFC) {
-        return (((wchar_t)(c & 0x01)) << 30) | lower_part;
-    }
-    return WEOF;
-}
diff --git a/gherkin/c/src/utf8_utilities.h b/gherkin/c/src/utf8_utilities.h
deleted file mode 100644
index 5a2db1b4a4e..00000000000
--- a/gherkin/c/src/utf8_utilities.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef GHERKIN_UFT8_UTILITIES_H_
-#define GHERKIN_UFT8_UTILITIES_H_
-
-#include <wchar.h>
-
-#include "utf8_source.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-wchar_t Utf8Utilities_read_wchar_from_utf8_source(Utf8Source* utf8_source);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* GHERKIN_UFT8_UTILITIES_H_ */

From c1528c63536abc53c76844c4388bb037af2c564b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Rasmusson?= <B.Rasmusson@computer.org>
Date: Mon, 17 Apr 2017 17:00:07 +0200
Subject: [PATCH 2/4] gherkin: (C) Always print feature file data using UTF-8.

Windows does not use UTF-8 by default. To make sure that UTF-8 is used
in the output, do a manual conversion from Unicode/UTF-16 to a UTF-8
byte sequence for output. This seems safer than trying to add
Windows-specific code to set UTF-8 output on that platform.
---
 gherkin/c/src/ast_printer.c             | 41 ++++++++++++++++++-------
 gherkin/c/src/attachment_event.c        | 11 ++++---
 gherkin/c/src/gherkin_document_event.c  |  6 ++--
 gherkin/c/src/pickle_event.c            |  6 ++--
 gherkin/c/src/pickle_printer.c          |  9 ++++--
 gherkin/c/src/print_utilities.c         | 22 ++++++++-----
 gherkin/c/src/print_utilities.h         |  2 ++
 gherkin/c/src/source_event.c            |  5 +--
 gherkin/c/src/token_formatter_builder.c | 35 +++++++++++++--------
 gherkin/c/src/unicode_utilities.c       |  2 ++
 10 files changed, 96 insertions(+), 43 deletions(-)

diff --git a/gherkin/c/src/ast_printer.c b/gherkin/c/src/ast_printer.c
index e255af56bd7..e7bf99831a7 100644
--- a/gherkin/c/src/ast_printer.c
+++ b/gherkin/c/src/ast_printer.c
@@ -93,7 +93,9 @@ static void print_doc_string(FILE* file, const DocString* doc_string) {
     fprintf(file, "{\"type\":\"%ls\",", ast_item_type_to_string(doc_string->type));
     print_location(file, &doc_string->location);
     if (doc_string->content_type) {
-        fprintf(file, "\"contentType\":\"%ls\",", doc_string->content_type);
+        fprintf(file, "\"contentType\":\"");
+        PrintUtilities_print_json_string(file, doc_string->content_type);
+        fprintf(file, "\",");
     }
     fprintf(file, "\"content\":\"");
     if (doc_string->content) {
@@ -102,11 +104,23 @@ static void print_doc_string(FILE* file, const DocString* doc_string) {
     fprintf(file, "\"}");
 }
 
+static void print_keyword(FILE* file, const wchar_t* keyword) {
+    fprintf(file, "\"keyword\":\"");
+    PrintUtilities_print_json_string(file, keyword);
+    fprintf(file, "\",");
+}
+
+static void print_text(FILE* file, const wchar_t* text) {
+    fprintf(file, "\"text\":\"");
+    PrintUtilities_print_json_string(file, text);
+    fprintf(file, "\"");
+}
+
 static void print_step(FILE* file, const Step* step) {
     fprintf(file, "{\"type\":\"%ls\",", ast_item_type_to_string(step->type));
     print_location(file, &step->location);
-    fprintf(file, "\"keyword\":\"%ls\",", step->keyword);
-    fprintf(file, "\"text\":\"%ls\"", step->text);
+    print_keyword(file, step->keyword);
+    print_text(file, step->text);
     if (step->argument) {
         fprintf(file, ",\"argument\":");
         if (step->argument->type == Gherkin_DataTable) {
@@ -136,7 +150,7 @@ static void print_description(FILE* file, const wchar_t* description) {
 static void print_background(FILE* file, const Background* background) {
     fprintf(file, "{\"type\":\"%ls\",", ast_item_type_to_string(background->type));
     print_location(file, &background->location);
-    fprintf(file, "\"keyword\":\"%ls\",", background->keyword);
+    print_keyword(file, background->keyword);
     print_name(file, background->name);
     print_description(file, background->description);
     fprintf(file, "\"steps\":[");
@@ -153,7 +167,9 @@ static void print_background(FILE* file, const Background* background) {
 static void print_tag(FILE* file, const Tag* tag) {
     fprintf(file, "{\"type\":\"%ls\",", ast_item_type_to_string(tag->type));
     print_location(file, &tag->location);
-    fprintf(file, "\"name\":\"%ls\"}", tag->name);
+    fprintf(file, "\"name\":\"");
+    PrintUtilities_print_json_string(file, tag->name);
+    fprintf(file, "\"}");
 }
 
 static void print_scenario(FILE* file, const Scenario* scenario) {
@@ -168,7 +184,7 @@ static void print_scenario(FILE* file, const Scenario* scenario) {
     }
     fprintf(file, "],");
     print_location(file, &scenario->location);
-    fprintf(file, "\"keyword\":\"%ls\",", scenario->keyword);
+    print_keyword(file, scenario->keyword);
     print_name(file, scenario->name);
     print_description(file, scenario->description);
     fprintf(file, "\"steps\":[");
@@ -185,7 +201,7 @@ static void print_example_table(FILE* file, const ExampleTable* example_table) {
     fprintf(file, "{\"type\":\"%ls\",", ast_item_type_to_string(example_table->type));
     print_location(file, &example_table->location);
     print_description(file, example_table->description);
-    fprintf(file, "\"keyword\":\"%ls\",", example_table->keyword);
+    print_keyword(file, example_table->keyword);
     print_name(file, example_table->name);
     fprintf(file, "\"tags\":[");
     int i;
@@ -227,7 +243,7 @@ static void print_scenario_outline(FILE* file, const ScenarioOutline* scenario_o
     }
     fprintf(file, "],");
     print_location(file, &scenario_outline->location);
-    fprintf(file, "\"keyword\":\"%ls\",", scenario_outline->keyword);
+    print_keyword(file, scenario_outline->keyword);
     print_name(file, scenario_outline->name);
     print_description(file, scenario_outline->description);
     fprintf(file, "\"steps\":[");
@@ -251,7 +267,8 @@ static void print_scenario_outline(FILE* file, const ScenarioOutline* scenario_o
 static void print_comment(FILE* file, const Comment* comment) {
     fprintf(file, "{\"type\":\"%ls\",", ast_item_type_to_string(comment->type));
     print_location(file, &comment->location);
-    fprintf(file, "\"text\":\"%ls\"}", comment->text);
+    print_text(file, comment->text);
+    fprintf(file, "}");
 }
 
 void print_feature(FILE* file, const Feature* feature) {
@@ -267,8 +284,10 @@ void print_feature(FILE* file, const Feature* feature) {
     }
     fprintf(file, "],");
     print_location(file, &feature->location);
-    fprintf(file, "\"language\":\"%ls\",", feature->language);
-    fprintf(file, "\"keyword\":\"%ls\",", feature->keyword);
+    fprintf(file, "\"language\":\"");
+    PrintUtilities_print_json_string(file, feature->language);
+    fprintf(file, "\",");
+    print_keyword(file, feature->keyword);
     print_name(file, feature->name);
     print_description(file, feature->description);
     fprintf(file, "\"children\":[");
diff --git a/gherkin/c/src/attachment_event.c b/gherkin/c/src/attachment_event.c
index 2476dac0334..4eff8f91534 100644
--- a/gherkin/c/src/attachment_event.c
+++ b/gherkin/c/src/attachment_event.c
@@ -1,4 +1,5 @@
 #include "attachment_event.h"
+#include "print_utilities.h"
 #include "string_utilities.h"
 #include <string.h>
 #include <stdlib.h>
@@ -46,12 +47,14 @@ static void AttachmentEvent_print(const Event* event, FILE* file) {
     }
     const AttachmentEvent* attachment_event = (const AttachmentEvent*)event;
     fprintf(file, "{");
-    fprintf(file, "\"data\":\"%ls\",", attachment_event->data);
-    fprintf(file, "\"media\":{\"encoding\":\"utf-8\",\"type\":\"text/vnd.cucumber.stacktrace+plain\"},");
+    fprintf(file, "\"data\":\"");
+    PrintUtilities_print_json_string(file, attachment_event->data);
+    fprintf(file, "\",\"media\":{\"encoding\":\"utf-8\",\"type\":\"text/vnd.cucumber.stacktrace+plain\"},");
     fprintf(file, "\"source\":{\"start\":");
     fprintf(file, "{\"line\":%d,", attachment_event->location.line);
     fprintf(file, "\"column\":%d},", attachment_event->location.column);
-    fprintf(file, "\"uri\":\"%ls\"},", attachment_event->uri);
-    fprintf(file, "\"type\":\"attachment\"");
+    fprintf(file, "\"uri\":\"");
+    PrintUtilities_print_json_string(file, attachment_event->uri);
+    fprintf(file, "\"},\"type\":\"attachment\"");
     fprintf(file, "}\n");
 }
diff --git a/gherkin/c/src/gherkin_document_event.c b/gherkin/c/src/gherkin_document_event.c
index 3099c9946bc..6f39611332b 100644
--- a/gherkin/c/src/gherkin_document_event.c
+++ b/gherkin/c/src/gherkin_document_event.c
@@ -1,5 +1,6 @@
 #include "gherkin_document_event.h"
 #include "ast_printer.h"
+#include "print_utilities.h"
 #include "string_utilities.h"
 #include <string.h>
 #include <stdlib.h>
@@ -42,8 +43,9 @@ static void GherkinDocumentEvent_print(const Event* event, FILE* file) {
     const GherkinDocumentEvent* gherkin_document_event = (const GherkinDocumentEvent*)event;
     fprintf(file, "{");
     fprintf(file, "\"type\":\"gherkin-document\",");
-    fprintf(file, "\"uri\":\"%ls\",", gherkin_document_event->uri);
-    fprintf(file, "\"document\":");
+    fprintf(file, "\"uri\":\"");
+    PrintUtilities_print_json_string(file, gherkin_document_event->uri);
+    fprintf(file, "\",\"document\":");
     AstPrinter_print_gherkin_document(file, gherkin_document_event->gherkin_document);
     fprintf(file, "}\n");
 }
diff --git a/gherkin/c/src/pickle_event.c b/gherkin/c/src/pickle_event.c
index e24ea8d7f9c..bf21ceb13ec 100644
--- a/gherkin/c/src/pickle_event.c
+++ b/gherkin/c/src/pickle_event.c
@@ -1,5 +1,6 @@
 #include "pickle_event.h"
 #include "pickle_printer.h"
+#include "print_utilities.h"
 #include "string_utilities.h"
 #include <string.h>
 #include <stdlib.h>
@@ -43,8 +44,9 @@ static void PickleEvent_print(const Event* event, FILE* file) {
     if (pickle_event) {
         fprintf(file, "{");
         fprintf(file, "\"type\":\"pickle\",");
-        fprintf(file, "\"uri\":\"%ls\",", pickle_event->uri);
-        fprintf(file, "\"pickle\":");
+        fprintf(file, "\"uri\":\"");
+        PrintUtilities_print_json_string(file, pickle_event->uri);
+        fprintf(file, "\",\"pickle\":");
         PicklePrinter_print_pickle(file, pickle_event->pickle);
         fprintf(file, "}\n");
     }
diff --git a/gherkin/c/src/pickle_printer.c b/gherkin/c/src/pickle_printer.c
index 9cd1f43afe0..9d7e90fc5b9 100644
--- a/gherkin/c/src/pickle_printer.c
+++ b/gherkin/c/src/pickle_printer.c
@@ -70,7 +70,9 @@ static void print_pickle_string(FILE* file, const PickleString* pickle_string) {
 static void print_tag(FILE* file, const PickleTag* tag) {
     fprintf(file, "{\"location\":");
     print_location(file, &tag->location);
-    fprintf(file, ",\"name\":\"%ls\"}", tag->name);
+    fprintf(file, ",\"name\":\"");
+    PrintUtilities_print_json_string(file, tag->name);
+    fprintf(file, "\"}");
 }
 
 static void print_pickle_step(FILE* file, const PickleStep* step) {
@@ -86,8 +88,9 @@ static void print_pickle_step(FILE* file, const PickleStep* step) {
         }
     }
     fprintf(file, "],");
-    fprintf(file, "\"text\":\"%ls\"", step->text);
-    fprintf(file, "}");
+    fprintf(file, "\"text\":\"");
+    PrintUtilities_print_json_string(file, step->text);
+    fprintf(file, "\"}");
 }
 
 void PicklePrinter_print_pickle(FILE* file, const Pickle* pickle) {
diff --git a/gherkin/c/src/print_utilities.c b/gherkin/c/src/print_utilities.c
index 0b4f2181da0..18b6f84365b 100644
--- a/gherkin/c/src/print_utilities.c
+++ b/gherkin/c/src/print_utilities.c
@@ -1,22 +1,30 @@
 #include "print_utilities.h"
+#include "unicode_utilities.h"
 
 void PrintUtilities_print_json_string(FILE* file, const wchar_t* text) {
     int i;
     for (i = 0; i < wcslen(text); ++i) {
         if (text[i] == L'\\' || text[i] == L'"') {
-            fprintf(file, "%lc", (wint_t)L'\\');
-            fprintf(file, "%lc", (wint_t)text[i]);
+            fputc((char)'\\', file);
+            fputc((char)text[i], file);
         }
         else if (text[i] == L'\n') {
-            fprintf(file, "%lc", (wint_t)L'\\');
-            fprintf(file, "%lc", (wint_t)L'n');
+            fputc((char)'\\', file);
+            fputc((char)'n', file);
         }
         else if (text[i] == L'\r') {
-            fprintf(file, "%lc", (wint_t)L'\\');
-            fprintf(file, "%lc", (wint_t)L'r');
+            fputc((char)'\\', file);
+            fputc((char)'r', file);
         }
         else {
-            fprintf(file, "%lc", (wint_t)text[i]);
+            i = UnicodeUtilities_print_wide_character_to_utf8_file(file, text, i);
         }
     }
 }
+
+void PrintUtilities_print_wide_string(FILE* file, const wchar_t* text) {
+    int i;
+    for (i = 0; i < wcslen(text); ++i) {
+        i = UnicodeUtilities_print_wide_character_to_utf8_file(file, text, i);
+    }
+}
diff --git a/gherkin/c/src/print_utilities.h b/gherkin/c/src/print_utilities.h
index 91d25db6fd4..61bad1ba755 100644
--- a/gherkin/c/src/print_utilities.h
+++ b/gherkin/c/src/print_utilities.h
@@ -10,6 +10,8 @@ extern "C" {
 
 void PrintUtilities_print_json_string(FILE* file, const wchar_t* text);
 
+void PrintUtilities_print_wide_string(FILE* file, const wchar_t* text);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gherkin/c/src/source_event.c b/gherkin/c/src/source_event.c
index 8ef1cf54e33..6153db2da7c 100644
--- a/gherkin/c/src/source_event.c
+++ b/gherkin/c/src/source_event.c
@@ -43,8 +43,9 @@ static void SourceEvent_print(const Event* event, FILE* file) {
     fprintf(file, "{");
     fprintf(file, "\"type\":\"source\",");
     fprintf(file, "\"media\":{\"encoding\":\"utf-8\",\"type\":\"text/vnd.cucumber.gherkin+plain\"},");
-    fprintf(file, "\"uri\":\"%ls\",", source_event->uri);
-    fprintf(file, "\"data\":\"");
+    fprintf(file, "\"uri\":\"");
+    PrintUtilities_print_json_string(file, source_event->uri);
+    fprintf(file, "\",\"data\":\"");
     PrintUtilities_print_json_string(file, source_event->source);
     fprintf(file, "\"}\n");
 }
diff --git a/gherkin/c/src/token_formatter_builder.c b/gherkin/c/src/token_formatter_builder.c
index 7324054fce2..335e2cdc938 100644
--- a/gherkin/c/src/token_formatter_builder.c
+++ b/gherkin/c/src/token_formatter_builder.c
@@ -1,4 +1,5 @@
 #include "token_formatter_builder.h"
+#include "print_utilities.h"
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -40,31 +41,41 @@ void TokenFormatterBuilder_build(Builder* builder, Token* token) {
         fprintf(((TokenFormatterBuilder*)builder)->file, "%s\n", token_type_to_string(token->matched_type));
     else if (token->matched_type == Token_TableRow || token->matched_type == Token_TagLine) {
         fprintf(((TokenFormatterBuilder*)builder)->file,
-                "(%d:%d)%s:%ls/%ls/",
+                "(%d:%d)%s:",
                 token->location.line,
                 token->location.column,
-                token_type_to_string(token->matched_type),
-                token->matched_keyword ? token->matched_keyword : L"",
-                token->matched_text ? token->matched_text : L"");
+                token_type_to_string(token->matched_type));
+        PrintUtilities_print_wide_string(((TokenFormatterBuilder*)builder)->file,
+                                   token->matched_keyword ? token->matched_keyword : L"");
+        fprintf(((TokenFormatterBuilder*)builder)->file, "/");
+        PrintUtilities_print_wide_string(((TokenFormatterBuilder*)builder)->file,
+                                   token->matched_text ? token->matched_text : L"");
+        fprintf(((TokenFormatterBuilder*)builder)->file, "/");
         int i;
         for (i = 0; i < token->matched_items->count; ++i) {
             if (i != 0)
                 fprintf(((TokenFormatterBuilder*)builder)->file, ",");
             fprintf(((TokenFormatterBuilder*)builder)->file,
-                    "%d:%ls",
-                    token->matched_items->items[i].column,
-                    token->matched_items->items[i].text);
+                    "%d:",
+                    token->matched_items->items[i].column);
+            PrintUtilities_print_wide_string(((TokenFormatterBuilder*)builder)->file,
+                                       token->matched_items->items[i].text);
         }
         fprintf(((TokenFormatterBuilder*)builder)->file, "\n");
     }
-    else
+    else {
         fprintf(((TokenFormatterBuilder*)builder)->file,
-                "(%d:%d)%s:%ls/%ls/\n",
+                "(%d:%d)%s:",
                 token->location.line,
                 token->location.column,
-                token_type_to_string(token->matched_type),
-                token->matched_keyword ? token->matched_keyword : L"",
-                token->matched_text ? token->matched_text : L"");
+                token_type_to_string(token->matched_type));
+        PrintUtilities_print_wide_string(((TokenFormatterBuilder*)builder)->file,
+                                   token->matched_keyword ? token->matched_keyword : L"");
+        fprintf(((TokenFormatterBuilder*)builder)->file, "/");
+        PrintUtilities_print_wide_string(((TokenFormatterBuilder*)builder)->file,
+                                   token->matched_text ? token->matched_text : L"");
+        fprintf(((TokenFormatterBuilder*)builder)->file, "/\n");
+    }
     Token_delete(token);
 }
 
diff --git a/gherkin/c/src/unicode_utilities.c b/gherkin/c/src/unicode_utilities.c
index c83bbdd9ae4..8e3575b1ab4 100644
--- a/gherkin/c/src/unicode_utilities.c
+++ b/gherkin/c/src/unicode_utilities.c
@@ -1,5 +1,7 @@
 #include "unicode_utilities.h"
 
+static void print_code_point_to_utf8_file(FILE* file, long code_point);
+
 long UnicodeUtilities_read_code_point_from_utf8_source(Utf8Source* utf8_source) {
     unsigned char c = Utf8Source_read(utf8_source);
     if (c < 0x80) {

From 1183c9620bf9124dad424abda72824758b743e69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Rasmusson?= <B.Rasmusson@computer.org>
Date: Sun, 23 Apr 2017 15:27:38 +0200
Subject: [PATCH 3/4] gherkin: (C) Conform to C90 (mostly)

To support a wider range of compilers, remove the C90 breaches so that
the code also compiles with -ansi and/or -std=c90 (using gcc).
Another reason not to use swprintf is that on many Windows compilers
swprintf has a different signature than the one in the ISO standard.
When cleaning up commented-out code in gherkin_generate_tokens.c (to
remove the usage of // for comments), settle on using the
FileTokenScanner to get coverage of it in the acceptance tests.

The code does not compile with -std=c90 -pedantic (using gcc); the
conformance to C90 is not taken that far.
---
 gherkin/c/src/Makefile                  |   2 +-
 gherkin/c/src/error_list.c              | 151 ++++++++++++++++++------
 gherkin/c/src/gherkin_cli.c             |   2 -
 gherkin/c/src/gherkin_generate_tokens.c |  10 +-
 4 files changed, 116 insertions(+), 49 deletions(-)

diff --git a/gherkin/c/src/Makefile b/gherkin/c/src/Makefile
index 0fa61f35102..eb03b7f4bd5 100644
--- a/gherkin/c/src/Makefile
+++ b/gherkin/c/src/Makefile
@@ -21,7 +21,7 @@ endif
 GENERATE_DEPS_FLAGS=-MMD -MP -MF $(basename $@).d
 AR_FLAGS=cr
 LD_FLAGS=
-LD_LIBS=
+LD_LIBS=-lm
 RM_CMD=rm -rf
 MKDIR_CMD=mkdir -p
 
diff --git a/gherkin/c/src/error_list.c b/gherkin/c/src/error_list.c
index 92badb41f3d..2bd00634622 100644
--- a/gherkin/c/src/error_list.c
+++ b/gherkin/c/src/error_list.c
@@ -2,6 +2,7 @@
 #include "error.h"
 #include "item_queue.h"
 #include "token.h"
+#include <math.h>
 #include <stdlib.h>
 #include <setjmp.h>
 
@@ -12,6 +13,14 @@ typedef struct ErrorList {
     jmp_buf* local_env;
 } ErrorList;
 
+static int calculate_string_length_for_location(int line_width, int column_width);
+
+static int print_location_to_string(wchar_t* string, int pos, int line, int line_width, int column, int column_width);
+
+static int calculate_string_length_for_number(int number);
+
+static int print_number_to_string(wchar_t* string, int pos, int number, int number_width);
+
 ErrorList* ErrorList_new() {
     ErrorList* error_list = (ErrorList*)malloc(sizeof(ErrorList));
     error_list->errors = ItemQueue_new();
@@ -63,60 +72,87 @@ void ErrorList_add(ErrorList* error_list, const wchar_t* error_text, const Locat
 }
 
 void ErrorList_add_unexpected_eof_error(ErrorList* error_list, Token* received_token, const wchar_t* expected_tokens) {
-    const wchar_t* const message = L"unexpected end of file, expected: %ls";
-    const int location_text_width = 11; // enough space for "(xxx:yyy): " to fit
-    const int message_length = wcslen(message) - 3 + wcslen(expected_tokens);
-    wchar_t* text = (wchar_t*)malloc((location_text_width + message_length + 1) * sizeof(wchar_t*));
-    int actual_location_width = swprintf(text, location_text_width + message_length + 1, L"(%d:%d): ", received_token->location.line, received_token->location.column);
-    if (actual_location_width > location_text_width) {
-        text = (wchar_t*)realloc(text, (actual_location_width + message_length + 1) * sizeof(wchar_t*));
-    }
-    swprintf(text + actual_location_width, message_length + 1, message, expected_tokens);
+    const wchar_t* const message = L"unexpected end of file, expected: ";
+    const int message_length = wcslen(message);
+    const int line_width = calculate_string_length_for_number(received_token->location.line);
+    const int column_width = calculate_string_length_for_number(received_token->location.column);
+    const int total_length = calculate_string_length_for_location(line_width, column_width) + message_length + wcslen(expected_tokens);
+    wchar_t* text = (wchar_t*)malloc((total_length + 1) * sizeof(wchar_t*));
+    int pos = 0;
+    pos = print_location_to_string(text, pos, received_token->location.line, line_width, received_token->location.column, column_width);
+    wcscpy(text + pos, message);
+    pos += message_length;
+    wcscpy(text + pos, expected_tokens);
+    text[total_length] = L'\0';
     ErrorList_add(error_list, text, received_token->location);
 }
 
 void ErrorList_add_unexpected_token_error(ErrorList* error_list, Token* received_token, const wchar_t* expected_tokens) {
-    const wchar_t* const message = L"expected: %ls, got '%ls'";
-    const int location_text_width = 11; // enough space for "(xxx:yyy): " to fit
-    const int message_length = wcslen(message) - 6 + wcslen(expected_tokens) + wcslen(received_token->line->trimmed_line);
-    wchar_t* text = (wchar_t*)malloc((location_text_width + message_length + 1) * sizeof(wchar_t*));
+    /* Builds "(<line>:<column>): expected: <expected_tokens>, got '<trimmed line>'" and adds it to error_list. */
+    const wchar_t* const expected = L"expected: ";
+    const int expected_length = wcslen(expected);
+    const wchar_t* const got = L", got ";
+    const int got_length = wcslen(got);
+    const int expected_tokens_length = wcslen(expected_tokens);
+    const int received_tokens_length = wcslen(received_token->line->trimmed_line);
+    const int line_width = calculate_string_length_for_number(received_token->location.line);
     int column = received_token->location.column;
     if (column == 0) {
         column = received_token->line->indent + 1;
     }
-    int actual_location_width = swprintf(text, location_text_width + message_length + 1, L"(%d:%d): ", received_token->location.line, column);
-    if (actual_location_width > location_text_width) {
-        text = (wchar_t*)realloc(text, (actual_location_width + message_length + 1) * sizeof(wchar_t*));
-    }
-    swprintf(text + actual_location_width, message_length + 1, message, expected_tokens, received_token->line->trimmed_line);
+    const int column_width = calculate_string_length_for_number(column); /* measure the column that is printed, i.e. after the fallback above */
+    const int total_length = calculate_string_length_for_location(line_width, column_width) + expected_length + expected_tokens_length + got_length + received_tokens_length + 2; /* + 2 for the two quotes */
+    wchar_t* text = (wchar_t*)malloc((total_length + 1) * sizeof(wchar_t*));
+    int pos = print_location_to_string(text, 0, received_token->location.line, line_width, column, column_width);
+    wcscpy(text + pos, expected);
+    pos += expected_length;
+    wcscpy(text + pos, expected_tokens);
+    pos += expected_tokens_length;
+    wcscpy(text + pos, got);
+    pos += got_length;
+    text[pos++] = L'\'';
+    wcscpy(text + pos, received_token->line->trimmed_line);
+    pos += received_tokens_length;
+    text[pos++] = L'\''; /* overwrites the terminator written by wcscpy, so terminate again below */
+    text[total_length] = L'\0';
     Location location = {received_token->location.line, column};
     ErrorList_add(error_list, text, location);
 }
 
 void ErrorList_add_no_such_language_error(ErrorList* error_list, Location* location, const wchar_t* language) {
-    const wchar_t* const message = L"Language not supported: %ls";
-    const int location_text_width = 11; // enough space for "(xxx:yyy): " to fit
-    const int message_length = wcslen(message) - 3 + wcslen(language);
-    wchar_t* text = (wchar_t*)malloc((location_text_width + message_length + 1) * sizeof(wchar_t*));
-    int actual_location_width = swprintf(text, location_text_width + message_length + 1, L"(%d:%d): ", location->line, location->column);
-    if (actual_location_width > location_text_width) {
-        text = (wchar_t*)realloc(text, (actual_location_width + message_length + 1) * sizeof(wchar_t*));
+    const wchar_t* const message = L"Language not supported: ";
+    const int message_length = wcslen(message);
+    const int language_length = wcslen(language);
+    Location used_location = {-1, -1};
+    if (location) {
+        used_location.line = location->line;
+        used_location.column = location->column;
     }
-    swprintf(text + actual_location_width, message_length + 1, message, language);
-    ErrorList_add(error_list, text, *location);
+    const int line_width = calculate_string_length_for_number(used_location.line);
+    const int column_width = calculate_string_length_for_number(used_location.column);
+    const int total_length = calculate_string_length_for_location(line_width, column_width) + message_length + language_length;
+    wchar_t* text = (wchar_t*)malloc((total_length + 1) * sizeof(wchar_t*));
+    int pos = 0;
+    pos = print_location_to_string(text, pos, used_location.line, line_width, used_location.column, column_width);
+    wcscpy(text + pos, message);
+    pos += message_length;
+    wcscpy(text + pos, language);
+    text[total_length] = L'\0';
+    ErrorList_add(error_list, text, used_location);
     ErrorList_jump_to_local_rescue_env(error_list);
 }
 
 void ErrorList_add_inconsisten_cell_count_error(ErrorList* error_list, Location location) {
     const wchar_t* const message = L"inconsistent cell count within the table";
-    const int location_text_width = 11; // enough space for "(xxx:yyy): " to fit
     const int message_length = wcslen(message);
-    wchar_t* text = (wchar_t*)malloc((location_text_width + message_length + 1) * sizeof(wchar_t*));
-    int actual_location_width = swprintf(text, location_text_width + message_length + 1, L"(%d:%d): ", location.line, location.column);
-    if (actual_location_width > location_text_width) {
-        text = (wchar_t*)realloc(text, (actual_location_width + message_length + 1) * sizeof(wchar_t*));
-    }
-    wcscpy(text + actual_location_width, message);
+    const int line_width = calculate_string_length_for_number(location.line);
+    const int column_width = calculate_string_length_for_number(location.column);
+    const int total_length = calculate_string_length_for_location(line_width, column_width) + message_length;
+    wchar_t* text = (wchar_t*)malloc((total_length + 1) * sizeof(wchar_t*));
+    int pos = 0;
+    pos = print_location_to_string(text, pos, location.line, line_width, location.column, column_width);
+    wcscpy(text + pos, message);
+    text[total_length] = L'\0';
     ErrorList_add(error_list, text, location);
     ErrorList_jump_to_local_rescue_env(error_list);
 }
@@ -132,10 +168,16 @@ void ErrorList_internal_grammar_error(ErrorList* error_list) {
 }
 
 void ErrorList_add_invalid_operation_error(ErrorList* error_list, int state) {
-    const wchar_t* const message = L"Unknown state: %d";
-    const int message_length = wcslen(message) + 10; // some extra space for the state number
-    wchar_t* text = (wchar_t*)malloc((message_length + 1) * sizeof(wchar_t*));
-    swprintf(text, message_length + 1, message, state);
+    const wchar_t* const message = L"Unknown state: ";
+    const int message_length = wcslen(message);
+    const int state_width = calculate_string_length_for_number(state);
+    const int total_length = message_length + state_width;
+    wchar_t* text = (wchar_t*)malloc((total_length + 1) * sizeof(wchar_t*));
+    int pos = 0;
+    wcscpy(text + pos, message);
+    pos += message_length;
+    print_number_to_string(text, pos, state, state_width);
+    text[total_length] = L'\0';
     Location location = {-1, -1};
     ErrorList_add(error_list, text, location);
 }
@@ -150,3 +192,36 @@ Error* ErrorList_next_error(ErrorList* error_list) {
     }
     return ErrorList_remove(error_list);
 }
+
+int calculate_string_length_for_location(int line_width, int column_width) {
+    return line_width + column_width + 5; /* "(<line>:<column>): " has 5 fixed characters: '(', ':', ')', ':' and ' ' */
+}
+
+int print_location_to_string(wchar_t* string, int pos, int line, int line_width, int column, int column_width) { /* writes "(<line>:<column>): " into string, starting at index pos */
+    string[pos++] = L'(';
+    pos = print_number_to_string(string, pos, line, line_width);
+    string[pos++] = L':';
+    pos = print_number_to_string(string, pos, column, column_width);
+    string[pos++] = L')';
+    string[pos++] = L':';
+    string[pos++] = L' ';
+    return pos; /* index just past the last character written; no terminator is written here */
+
+}
+
+/* Returns the number of characters needed to print number in decimal, including a leading '-' for negatives. */
+int calculate_string_length_for_number(int number) {
+    int width = (number < 0) ? 2 : 1; /* integer digit counting: no log10() domain error for negatives, no rounding */
+    for (; number / 10 != 0; number /= 10) { ++width; } /* one more character per additional digit */
+    return width;
+}
+
+int print_number_to_string(wchar_t* string, int pos, int number, int number_width) { /* writes number_width characters at string[pos...], returns the index after them */
+    const int is_negative = number < 0; /* remembered up front, since number is consumed digit by digit below */
+    int i;
+    for (i = number_width - 1; i >= 0; --i) { /* fill right to left, least significant digit first */
+        string[pos + i] = (is_negative && i == 0) ? L'-' : (wchar_t)(L'0' + abs(number % 10));
+        number /= 10; /* dividing the number itself avoids a divisor that overflows for 10-digit values */
+    }
+    return pos + number_width;
+}
diff --git a/gherkin/c/src/gherkin_cli.c b/gherkin/c/src/gherkin_cli.c
index cd210cbce7b..53fa733b250 100644
--- a/gherkin/c/src/gherkin_cli.c
+++ b/gherkin/c/src/gherkin_cli.c
@@ -7,7 +7,6 @@
 
 #include "file_reader.h"
 #include "string_token_scanner.h"
-//#include "file_token_scanner.h"
 #include "token_matcher.h"
 #include "parser.h"
 #include "ast_builder.h"
@@ -75,7 +74,6 @@ int main(int argc, char** argv) {
             Event_print((const Event*)source_event, stdout);
         }
         TokenScanner* token_scanner = StringTokenScanner_new(source_event->source);
-        //TokenScanner* token_scanner = FileTokenScanner_new(argv[i]);
         result_code = Parser_parse(parser, token_matcher, token_scanner);
         Event_delete((const Event*)source_event);
         if (result_code == 0) {
diff --git a/gherkin/c/src/gherkin_generate_tokens.c b/gherkin/c/src/gherkin_generate_tokens.c
index 374165340ba..d528219aadd 100644
--- a/gherkin/c/src/gherkin_generate_tokens.c
+++ b/gherkin/c/src/gherkin_generate_tokens.c
@@ -1,8 +1,7 @@
 #include <locale.h>
 #include <stdlib.h>
 #include "file_reader.h"
-#include "string_token_scanner.h"
-//#include "file_token_scanner.h"
+#include "file_token_scanner.h"
 #include "token_matcher.h"
 #include "parser.h"
 #include "token_formatter_builder.h"
@@ -11,16 +10,11 @@ int main(int argc, char** argv) {
     setlocale(LC_ALL, "en_US.UTF-8");
     int i;
     for (i = 1; i < argc; ++i) {
-        FileReader* file_reader = FileReader_new(argv[i]);
-        const wchar_t* source = FileReader_read(file_reader);
-        FileReader_delete(file_reader);
-        TokenScanner* token_scanner = StringTokenScanner_new(source);
-        //TokenScanner* token_scanner = FileTokenScanner_new(argv[i]);
+        TokenScanner* token_scanner = FileTokenScanner_new(argv[i]);
         TokenMatcher* token_matcher = TokenMatcher_new(L"en");
         Builder* builder = TokenFormatterBuilder_new();
         Parser* parser = Parser_new(builder);
         Parser_parse(parser, token_matcher, token_scanner);
-        free((void*)source);
         Parser_delete(parser);
         TokenFormatterBuilder_delete(builder);
         TokenMatcher_delete(token_matcher);

From 6cdd13ed513bfd8f6745916018b6603643867730 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Rasmusson?= <B.Rasmusson@computer.org>
Date: Sun, 23 Apr 2017 19:41:22 +0200
Subject: [PATCH 4/4] gherkin: (C) Avoid using repeated typedefs (support gcc
 <v4.6)

Technically, C does not support repeated typedefs until C11; however,
modern compilers do allow them (gcc even with -std=c90). But since
gcc <v4.6 does not support them, remove their usage.
---
 gherkin/c/gherkin-c-parser.razor  |  5 ++---
 gherkin/c/include/builder.h       |  9 +++------
 gherkin/c/include/compiler.h      |  6 ++----
 gherkin/c/include/event.h         |  4 ++--
 gherkin/c/include/item.h          |  4 ++--
 gherkin/c/include/parser.h        | 11 +++--------
 gherkin/c/include/token_matcher.h | 10 ++++------
 gherkin/c/include/token_scanner.h |  4 ++--
 gherkin/c/src/Makefile            |  2 +-
 gherkin/c/src/ast_builder.c       |  1 -
 gherkin/c/src/compiler.c          |  4 ++--
 gherkin/c/src/error_list.c        |  4 ++--
 gherkin/c/src/file_reader.c       |  4 ++--
 gherkin/c/src/item_queue.h        |  4 ++--
 gherkin/c/src/parser.c            |  5 ++---
 gherkin/c/src/token_queue.c       |  2 --
 gherkin/c/src/utf8_source.h       |  4 ++--
 17 files changed, 33 insertions(+), 50 deletions(-)

diff --git a/gherkin/c/gherkin-c-parser.razor b/gherkin/c/gherkin-c-parser.razor
index e1fd11476a5..a683595626f 100644
--- a/gherkin/c/gherkin-c-parser.razor
+++ b/gherkin/c/gherkin-c-parser.razor
@@ -32,7 +32,6 @@
 #include "token_scanner.h"
 #include "token_matcher.h"
 #include "token_queue.h"
-#include "builder.h"
 #include "error_list.h"
 #include <stdlib.h>
 #include <setjmp.h>
@@ -46,11 +45,11 @@ typedef struct ParserContext {
     ErrorList* errors;
 } ParserContext;
 
-typedef struct @Model.ParserClassName {
+struct @Model.ParserClassName {
     ParserContext* parser_context;
     Builder* builder;
     ErrorList* errors;
-} @Model.ParserClassName;
+};
 
 static Token* read_token(ParserContext* context);
 
diff --git a/gherkin/c/include/builder.h b/gherkin/c/include/builder.h
index 85f26c9f1a7..7457ea650cb 100644
--- a/gherkin/c/include/builder.h
+++ b/gherkin/c/include/builder.h
@@ -1,11 +1,10 @@
 #ifndef GHERKIN_BUILDER_H_
 #define GHERKIN_BUILDER_H_
 
+#include "error_list.h"
 #include "rule_type.h"
 #include "token.h"
 
-typedef struct ErrorList ErrorList;
-
 typedef struct Builder Builder;
 
 typedef void (*builder_reset_function) (Builder*);
@@ -16,14 +15,12 @@ typedef void (*build_function) (Builder*, Token*);
 
 typedef void (*rule_function) (Builder*, RuleType);
 
-typedef void (*rule_function) (Builder*, RuleType);
-
-typedef struct Builder {
+struct Builder {
     builder_reset_function reset;
     builder_error_context_function set_error_context;
     build_function build;
     rule_function start_rule;
     rule_function end_rule;
-} Builder;
+};
 
 #endif /* GHERKIN_BUILDER_H_ */
diff --git a/gherkin/c/include/compiler.h b/gherkin/c/include/compiler.h
index ac55f6dfe91..b90fb32267a 100644
--- a/gherkin/c/include/compiler.h
+++ b/gherkin/c/include/compiler.h
@@ -3,6 +3,8 @@
 
 #include <stdbool.h>
 #include <wchar.h>
+#include "gherkin_document.h"
+#include "pickle.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -10,10 +12,6 @@ extern "C" {
 
 typedef struct Compiler Compiler;
 
-typedef struct GherkinDocument GherkinDocument;
-
-typedef struct Pickle Pickle;
-
 Compiler* Compiler_new();
 
 void Compiler_delete(Compiler* compiler);
diff --git a/gherkin/c/include/event.h b/gherkin/c/include/event.h
index 1dbea76e9f2..ef793e236bc 100644
--- a/gherkin/c/include/event.h
+++ b/gherkin/c/include/event.h
@@ -20,11 +20,11 @@ typedef enum EventType {
     Gherkin_PickleEvent
 } EventType;
 
-typedef struct Event {
+struct Event {
     event_delete_function event_delete;
     event_print_function event_print;
     EventType event_type;
-} Event;
+};
 
 void Event_delete(const Event* event);
 
diff --git a/gherkin/c/include/item.h b/gherkin/c/include/item.h
index 36aa984ad1f..9e19bfe5e69 100644
--- a/gherkin/c/include/item.h
+++ b/gherkin/c/include/item.h
@@ -5,8 +5,8 @@ typedef struct Item Item;
 
 typedef void (*item_delete_function) (Item*);
 
-typedef struct Item {
+struct Item {
     item_delete_function item_delete;
-} Item;
+};
 
 #endif /* GHERKIN_ITEM_H_ */
diff --git a/gherkin/c/include/parser.h b/gherkin/c/include/parser.h
index 7b455655501..26ee333990e 100644
--- a/gherkin/c/include/parser.h
+++ b/gherkin/c/include/parser.h
@@ -1,7 +1,10 @@
 #ifndef GHERKIN_PARSER_H_
 #define GHERKIN_PARSER_H_
 
+#include "builder.h"
 #include "error.h"
+#include "token_matcher.h"
+#include "token_scanner.h"
 #include <stdbool.h>
 #include <wchar.h>
 
@@ -11,14 +14,6 @@ extern "C" {
 
 typedef struct Parser Parser;
 
-typedef struct Builder Builder;
-
-typedef struct TokenMatcher TokenMatcher;
-
-typedef struct TokenScanner TokenScanner;
-
-typedef struct Feature Feature;
-
 Parser* Parser_new(Builder* builder);
 
 void Parser_delete(Parser* parser);
diff --git a/gherkin/c/include/token_matcher.h b/gherkin/c/include/token_matcher.h
index e91a9f7610e..ce8409e34f7 100644
--- a/gherkin/c/include/token_matcher.h
+++ b/gherkin/c/include/token_matcher.h
@@ -2,23 +2,21 @@
 #define GHERKIN_TOKEN_MATCHER_H_
 
 #include <stdbool.h>
+#include "dialect.h"
+#include "error_list.h"
 #include "token.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef struct Dialect Dialect;
-
-typedef struct ErrorList ErrorList;
-
 typedef struct TokenMatcher TokenMatcher;
 
 typedef void (*matcher_reset_function) (TokenMatcher*);
 
 typedef bool (*match_function) (TokenMatcher*, Token*);
 
-typedef struct TokenMatcher {
+struct TokenMatcher {
     const wchar_t* default_language;
     const wchar_t* language;
     const Dialect* dialect;
@@ -40,7 +38,7 @@ typedef struct TokenMatcher {
     match_function match_Language;
     match_function match_Other;
     match_function match_EOF;
-} TokenMatcher;
+};
 
 TokenMatcher* TokenMatcher_new(const wchar_t* default_language);
 
diff --git a/gherkin/c/include/token_scanner.h b/gherkin/c/include/token_scanner.h
index dc6e21e8a4c..8f1783bad5e 100644
--- a/gherkin/c/include/token_scanner.h
+++ b/gherkin/c/include/token_scanner.h
@@ -15,10 +15,10 @@ typedef Token* (*read_function) (TokenScanner*);
 
 typedef void (*delete_function) (TokenScanner*);
 
-typedef struct TokenScanner {
+struct TokenScanner {
     read_function read;
     delete_function delete;
-} TokenScanner;
+};
 
 void TokenScanner_delete(TokenScanner* token_scanner);
 
diff --git a/gherkin/c/src/Makefile b/gherkin/c/src/Makefile
index eb03b7f4bd5..3fccc6e98e9 100644
--- a/gherkin/c/src/Makefile
+++ b/gherkin/c/src/Makefile
@@ -1,5 +1,5 @@
 GCC_FLAGS=-c -Wall -Werror -g
-CLANG_FLAGS=-c -Wall -Wno-typedef-redefinition -Werror -g
+CLANG_FLAGS=-c -Wall -Werror -g
 
 ifeq ($(CC),i686-w64-mingw32-gcc)
 	CC=i686-w64-mingw32-gcc
diff --git a/gherkin/c/src/ast_builder.c b/gherkin/c/src/ast_builder.c
index baa8819a341..a12cf03c27a 100644
--- a/gherkin/c/src/ast_builder.c
+++ b/gherkin/c/src/ast_builder.c
@@ -6,7 +6,6 @@
 #include "scenario_outline.h"
 #include "data_table.h"
 #include "doc_string.h"
-#include "error_list.h"
 #include <stdio.h>
 #include <stdlib.h>
 
diff --git a/gherkin/c/src/compiler.c b/gherkin/c/src/compiler.c
index f05630e04f9..7c702781214 100644
--- a/gherkin/c/src/compiler.c
+++ b/gherkin/c/src/compiler.c
@@ -12,9 +12,9 @@
 #include "string_utilities.h"
 #include <stdlib.h>
 
-typedef struct Compiler {
+struct Compiler {
     ItemQueue* pickle_list;
-} Compiler;
+};
 
 typedef struct ReplacementItem {
     item_delete_function item_delete;
diff --git a/gherkin/c/src/error_list.c b/gherkin/c/src/error_list.c
index 2bd00634622..52438f9e305 100644
--- a/gherkin/c/src/error_list.c
+++ b/gherkin/c/src/error_list.c
@@ -6,12 +6,12 @@
 #include <stdlib.h>
 #include <setjmp.h>
 
-typedef struct ErrorList {
+struct ErrorList {
     ItemQueue* errors;
     QueueItem* current_error;
     jmp_buf* global_env;
     jmp_buf* local_env;
-} ErrorList;
+};
 
 static int calculate_string_length_for_location(int line_width, int column_width);
 
diff --git a/gherkin/c/src/file_reader.c b/gherkin/c/src/file_reader.c
index 7143128b671..401126d29f4 100644
--- a/gherkin/c/src/file_reader.c
+++ b/gherkin/c/src/file_reader.c
@@ -3,9 +3,9 @@
 #include "unicode_utilities.h"
 #include <stdlib.h>
 
-typedef struct FileReader {
+struct FileReader {
     const char* file_name;
-} FileReader;
+};
 
 static void extend_buffer_if_needed(wchar_t** buffer, int* buffer_size, int pos);
 
diff --git a/gherkin/c/src/item_queue.h b/gherkin/c/src/item_queue.h
index 8d665923cee..e8e3711c6ea 100644
--- a/gherkin/c/src/item_queue.h
+++ b/gherkin/c/src/item_queue.h
@@ -10,10 +10,10 @@ extern "C" {
 
 typedef struct QueueItem QueueItem;
 
-typedef struct QueueItem {
+struct QueueItem {
     Item* item;
     QueueItem* next;
-} QueueItem;
+};
 
 typedef struct ItemQueue {
     QueueItem* first;
diff --git a/gherkin/c/src/parser.c b/gherkin/c/src/parser.c
index c85c9c28d29..c92696066f5 100644
--- a/gherkin/c/src/parser.c
+++ b/gherkin/c/src/parser.c
@@ -4,7 +4,6 @@
 #include "token_scanner.h"
 #include "token_matcher.h"
 #include "token_queue.h"
-#include "builder.h"
 #include "error_list.h"
 #include <stdlib.h>
 #include <setjmp.h>
@@ -18,11 +17,11 @@ typedef struct ParserContext {
     ErrorList* errors;
 } ParserContext;
 
-typedef struct Parser {
+struct Parser {
     ParserContext* parser_context;
     Builder* builder;
     ErrorList* errors;
-} Parser;
+};
 
 static Token* read_token(ParserContext* context);
 
diff --git a/gherkin/c/src/token_queue.c b/gherkin/c/src/token_queue.c
index d564a15eaa6..2677f843bb8 100644
--- a/gherkin/c/src/token_queue.c
+++ b/gherkin/c/src/token_queue.c
@@ -2,8 +2,6 @@
 #include "item_queue.h"
 #include <stdlib.h>
 
-typedef struct QueueItem QueueItem;
-
 TokenQueue* TokenQueue_new() {
     return (TokenQueue*)ItemQueue_new();
 }
diff --git a/gherkin/c/src/utf8_source.h b/gherkin/c/src/utf8_source.h
index ae1f12aca9d..333871a3f10 100644
--- a/gherkin/c/src/utf8_source.h
+++ b/gherkin/c/src/utf8_source.h
@@ -15,10 +15,10 @@ typedef unsigned char (*utf8_source_read_function) (Utf8Source*);
 
 typedef void (*utf8_source_delete_function) (Utf8Source*);
 
-typedef struct Utf8Source {
+struct Utf8Source {
     utf8_source_read_function read;
     utf8_source_delete_function delete;
-} Utf8Source;
+};
 
 unsigned char Utf8Source_read(Utf8Source* utf8_source);