uparse.c

#ifdef FAST
#define PARM_NO_FLEX 1
#endif

// This is my latest attempt at writing a parser suitable for language
// translation as well as regular compiling, not to mention any sort
// of generic natural language processing such as a home voice assistant
// or a text adventure game.

// With each new effort I've applied what I learned in the previous
// iteration - this one however has very little that is new - it focuses
// primarily on simplifying the code and documenting it better - a
// realisation prompted by working on my Imp to C translator recently
// and having a great deal of trouble in remembering how things worked! -
// there being too many little edge cases due to things I added on
// piecemeal, which will be simpler and more generalised in this iteration.

// This is an 'Edinburgh style' parser - in that it uses the parsing
// algorithm that has been used in dozens of compilers written at
// Edinburgh - by Harry Whitfield, David Rees, Hamish Dewar, Peter
// Schofield, Peter Stephens, the Edinburgh Regional Computer Center,
// Edinburgh Portable Compilers Ltd., Peter Robertson, Ian Young,
// Lattice Logic Ltd. (3L), Rainer Thonnes, myself, and countless
// others.  And we inherited the algorithm and general approach to
// compiler writing from Tony Brooker, the author of the original
// Compiler-Compiler and inventor of the Atlas Autocode language which
// later evolved into the Imp language used at Edinburgh for 30 to 40
// years.

// NOTE: Tony Brooker's parser design was rediscovered in 2004 by someone
// who was ignorant of all the existing parsers in this style, and who
// renamed the parser style as a "PEG" (Parsing Expression Grammar) parser.
// Also the trick of making the parser into a memo function had been used
// many years earlier - around 1980 if I recall, when I applied it to a
// parser based on the SKIMP compiler (after having just had a lecture
// on memo fns - also invented at Edinburgh, by Donald Michie in the 60's),
// which we (my classmates and I) were using to write a text adventure system.)

// The algorithm used is roughly akin to that in the SKIMP description,
// starting about halfway down page 20 of:
// https://gtoal.com/history.dcs.ed.ac.uk/archive/languages/skimp/skimp_ii.html

// This algorithm is easy to code using tables, but the same navigation
// through the parse tree can be directed procedurally if preferred - see
// my old "tacc" parser, currently at https://gtoal.com/languages/algol60/ ,
// at least until it is replaced by this code, and its generated parser
// code at https://gtoal.com/languages/algol60/algol60.c

// The grammars for these parsers are in the BNF style.  We do not add many
// of the "bells & whistles" that are possible to extend BNF - the constructs
// can all be made using basic BNF and keeping the BNF syntax simple makes
// the mapping of grammar items to implementation code much easier.  The
// accompanying documentation will give examples of how to write various
// constructs in basic BNF.

// We use a program called "takeon" to convert the grammar file (language.g)
// into tables which are included by the parser (currently uparse.c).
// A simple grammar alone will act as a syntax checker when the parser
// is initially used.  However once you have debugged a grammar, you
// can add a language program to use the parse tree generated by the
// parser.  This program (language.c) can contain the grammar description
// within itself, as embedded comments.  A utility is supplied to extract
// the .g file from the .c file.  By embedding the grammar in this way
// we avoid the risk of separating the grammar from the file which uses
// the grammar - a problem which has indeed happened in the past - several
// times!

// Most Edinburgh compilers in fact generated code directly from the
// 'analysis record' which is the data structure that nowadays would
// be called a 'Concrete Syntax Tree'.  However modern compilers mostly
// prefer to work at the level of an 'Abstract Syntax Tree', which is
// a simplified tree that often has some items rearranged for convenience.

// To that end, we also supply a utility (regen) to generate a program
// which converts the CST to an AST in a 1:1 form - this can be used as
// a skeleton by the programmer to develop a more approriate AST suitable
// for their compiler.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdarg.h>

#include <wchar.h>
#include <locale.h>

//#include GRAMMAR // This is the grammar.h file specific to the language being compiled, generated from grammar.g
//                 // grammar.g is either written in raw format *or* extracted from the .c file specific to the compiler

#include "parser.h"

//#include "mnemosyne.h"  // An old memory leak detector. No longer recommended - valgrind does as well or better nowadays.
#include "regexp-lexer.h"

#ifndef FALSE
#define FALSE (0!=0)
#endif
#ifndef TRUE
#define TRUE (0==0)
#endif

static int debug_parser = FALSE;
static int debug_completion = FALSE;
static int debug_literals = FALSE;

// These upper bounds are only here to limit the extent of programming errors.
// They could be replaced by 'MAXINT' and the code would behave identically for working code.
// (except when compiling -DPARM_NO_FLEX which is not needed now the code has been speeded up)

DECLARE(Stringpool, wchar_t, 8000000);
#define _Stringpool(x) WRITE(x,Stringpool,wchar_t)
#define  Stringpool(x) READ(x,Stringpool,wchar_t)
int Stringpool_nextfree = 0;

DECLARE(source, wint_t, 600000); // accessed as a global by our regxp code for now.
#define _source(x) WRITE(x,source,wint_t)
#define  source(x) READ(x,source,wint_t)

DECLARE(CST,int,4000000); // concrete syntax tree
#define _CST(x) WRITE(x,CST,int)
#define  CST(x) READ(x,CST,int)

typedef struct source_descriptor {
  int start;
  int end;
} source_descriptor;

DECLARE(atom, source_descriptor, 600000);
#define _atom(x) WRITE(x,atom,source_descriptor)
#define  atom(x) READ(x,atom,source_descriptor)
static int last_used_descriptor = -1;

//################################################################################################## compile.c

int debug_ast = 0; // levels: 0 1 2 3

// This is algol.c, the 'compile()' procedure for Algol 60, loosely based on the ERCC's "algolps9" grammar file.

// The concrete syntax tree (CST, aka 'analysis record') is created by
// the parser which does not need to known anything specific about
// the language being compiled.

// The abstract syntax tree (AST) is created by the language-specific code
// from the CST - for a source code formatter it may be a 1:1 correspondence
// with the CST (in which case we would use the CST directly and not bother
// to create an AST from it), but for a proper compiler or even a language
// translator, the AST would usually be a simplified version of the CST,
// with unnecessary information removed and some items from the parse tree
// moved around for convenience.

// These data structures are stored in simple integer arrays.  They are
// effectively C structs, but without the complexity of declaring a variant
// record with over 200 different layouts to cover every phrase type.
// Structure members are determined by fixed offsets, which will often
// be implemented with literal constants though in the case of tuples
// with too many member fields to keep track of easily, with symbolic
// constants.

DECLARE(AST,int,8000000); // abstract syntax tree
#define _AST(x) WRITE(x,AST,int)
#define  AST(x) READ(x,AST,int)
int AST_nextfree = 0;

// TUPLE_RESULT_FIELDS represents the count of all the extra fields that can
// be added to every tuple (and which are filled in later) with things like
// the inferred type of an expression (bottom-up), or line numbers or whatever
// may turn out to be needed that wasn't thought of when the code was first
// designed.
// There are no extra fields in the CST, only in the AST.

#define TUPLE_RESULT_FIELDS 3
#define RESULT_FIELD_TYPEINFO 0
#define RESULT_FIELD_WHATEVER1 1
#define RESULT_FIELD_WHATEVER2 2

#define AST_idx_mask 0xfFFFFFF
#define AST_type_shift 28
#define AST_type_mask  15

#define AST_BIP     (1 << AST_type_shift)
#define AST_PHRASE  (2 << AST_type_shift)
#define AST_LITERAL (3 << AST_type_shift)


#define SubPhraseIdx(P,N)  AST(((P)&AST_idx_mask)+4+TUPLE_RESULT_FIELDS+N-1)
//#define SubPhrase(P,N)    (AST(((P)&AST_idx_mask)+4+TUPLE_RESULT_FIELDS+N-1)&AST_idx_mask)
#define SubPhrase(P,N)    (SubPhraseIdx(P,N)&AST_idx_mask)

void PrintLower(int Literal) {
  int i;
  int inclusive_start = atom(Literal).start;
  int exclusive_end = atom(Literal).end;
  for (i = inclusive_start; i < exclusive_end; i++) {
    wint_t c = source(i);
    if (isalpha(c) && isupper(c)) c = tolower(c);
    fprintf(stdout, "%lc", c);
  }
}

void PrintUpper(int Literal) {
  int i;
  int inclusive_start = atom(Literal).start;
  int exclusive_end = atom(Literal).end;
  for (i = inclusive_start; i < exclusive_end; i++) {
    wint_t c = source(i);
    if (isalpha(c) && islower(c)) c = toupper(c);
    fprintf(stdout, "%lc", c);
  }
}

#ifdef EXTRA_DEBUG
void XPrintAtom(int Literal, char *filename, int line) {
  int i;
  fprintf(stderr, "\"%s\", Line %d: Literal = %x\n", filename, line, Literal);
  for (i = atom(Literal).start; i < atom(Literal).end; i++) fprintf(stdout, "%lc", source(i));
}
#define PrintAtom(x) XPrintAtom(x, __FILE__, __LINE__)
#else
void PrintAtom(int Literal) {
  int i;
  for (i = atom(Literal).start; i < atom(Literal).end; i++) fprintf(stdout, "%lc", source(i));
}
#endif
#define S(x) ((wchar_t *)&Stringpool(x))
int wlit(int Literal) { // an int index into stringpool, not a wchar_t *, because the stringpool
                        // may be relocated underfoot.  To get a wchar_t from a stringpool index,
                        // use macro S(), but only in contexts where the stringpool cannot be relocated
                        // during the lifetime of the pointer.
  int i, Sp = Stringpool_nextfree;
  for (i = atom(Literal).start; i < atom(Literal).end; i++) _Stringpool(Stringpool_nextfree++) = source(i);
  _Stringpool(Stringpool_nextfree++) = 0;
  fprintf(stderr, "%ls", S(Sp));
  return Sp;
}

int mktuple(int op, int alt, int count, int T[]);

int reg(int C,char *s) {
  if (debug_ast >= 2) fprintf(stderr, "reg(%d /* %s */)\n", C, s);
  return AST_LITERAL | C;
}

int kw(int C, char *s) {
  if (debug_ast >= 2) fprintf(stderr, "kw(%d /* %s */)\n", C, s);
  return AST_LITERAL | C;
}

int ch(int C, char c) {
  if (debug_ast >= 2) fprintf(stderr, "ch(%d /* '%c'*/)\n", C, c);
  return AST_LITERAL | C;
}

int BIP(int C, int P) {
  if (debug_ast >= 2) fprintf(stderr, "BIP(%d)\n", C);
  return AST_BIP | C;
}


//################################################################################################## end of compile.c

int literal_descriptor(int inclusive_start, int exclusive_end) {
  int i;
  last_used_descriptor++;
  _atom(last_used_descriptor).start = inclusive_start;
  _atom(last_used_descriptor).end = exclusive_end;

  if (debug_literals) {
    fprintf(stderr, "atom[%x] = \"", last_used_descriptor);
    for (i = inclusive_start; i < exclusive_end; i++) {
      fprintf(stderr, "%lc", source(i));
    }
    fprintf(stderr, "\" @ %x\n", inclusive_start);
  }
  
  return last_used_descriptor;
}

// We keep track of the farthest distance parsed as a diagnostic aid to
// debug and/or determine the cause of parse failures.
static int farthest_read = -1;
wint_t examine(int offset) {

  if (debug_completion) {
    if (offset > farthest_read) {
      int i;
      for (i = farthest_read+1; i <= offset; i++) {
        fprintf(stderr, "%lc", source(i));
      }
      farthest_read = offset;
    }
  }
  
  return source(offset);
}

// was 10240 ... was getting a crash from a coment > 10K. Testing to see if this is where it was coming from.
// ... and apparently it was.  I guess I need to bte the bullet and find a way to make this flex.
// (btw there's another 10240 byte array in regexp-lexer.h:CHAR XSTRING[10240];  )
static wchar_t Matched_string[1024*32]; // Global because it is used in debug reports.
// Size is overkill, but converting to a flex array is a bit tricky for this one
// since it is being passed as a parameter rather than being used as a global.

// The global RR is a small hack to support pre-compiled regular expressions -
// RR points to an array allocated off the stack (but inside 'main' so it is always
// in scope). I suppose it should have been claimed through malloc for cleanliness.

// At some point I will remove the option to not pre-compile.
static regexp **RR = NULL;

int regex_match_r(const wchar_t *text, regexp *r, int *len) {
  // We have precompiled all regexps.
  // We could potentially cache the triple of <regexp, offset, result match>
  // in a memofn style, to make re-parsing after backtracking faster, but it
  // really is not needed,
  int i;
  i = regexec(r, 0); // i = 0 on exit for success.
  if (i) {
    regsub(r, L"\\0", Matched_string);  // DANGER WILL ROBINSON.  SIZE LIMIT CAN EASILY BE BUSTED.
    (*len) += wcslen(Matched_string);
    return TRUE;
  }
  return FALSE;
}

int *TP = NULL; // used for communication with regexp module.  Sorry, I know that's a bit hacky.

#define phrase_start(i) BOUNDS_CHECK(sequential_phrase_no_to_grammar_index,i,NUM_SIMPLE_PHRASES)
#define bip_map(i) BOUNDS_CHECK(bip_map,i,NUM_BIPS)
#define gram(x) BOUNDS_CHECK(gram,x,sizeof(gram)/sizeof(gram[0]))

#define IN_PARSER 1
#include GRAMMAR // This is the grammar.h file specific to the language being compiled, generated from grammar.g
                 // grammar.g is either written in raw format *or* extracted from the .c file specific to the compiler

int mktuple(int op, int alt, int count, int T[]) {
#define T(x) BOUNDS_CHECK(T,x,LARGEST_ALT)

  // T[] comes from a small array in build_ast that is only large enough
  // to hold the number of elements in the largest alt of a phrase.
  // We want to keep the amount of stack space claimed by
  // an instantiation of 'parse()' to a minimum, as we want to allow
  // whole-program parsing, not just line-at-a-time style.  So in
  // theory there may be as many calls to parse() as there are characters
  // in the file you are parsing, and that number would be multiplied
  // by the size of local stack data per call. (In practice, far fewer, but
  // since a typical large program may be 500,000 characters, that's still
  // a potentially huge stack frame if local data per call is not minimised)
  
  T[0] = -1; // unused for now
  
  int i, tuple = AST_nextfree;  // 'op' is the P_whatever used in the grammar
  _AST(AST_nextfree++) = 0;     // reserved field
  _AST(AST_nextfree++) = op;    // AST_op_offset
  _AST(AST_nextfree++) = alt;   // AST_alt_offset
  _AST(AST_nextfree++) = count; // AST_count_offset

  for (i = 0; i < TUPLE_RESULT_FIELDS; i++) {
    _AST(AST_nextfree++) = 0;   // source line where tuple was created etc
  }

  // Add tuple phrases to CST.  Include T[0]
  if (debug_ast) fprintf(stderr, "AST[%x] = mktuple(%ls, %d, %d, [ ", tuple&AST_idx_mask, PHRASE(op), alt, count);  // WARNING! PHRASE() MACRO NOT DEFINED YET????
  for (i = 1; i <= count; i++) {
    if (debug_ast) fprintf(stderr, "%x ", T(i));
    _AST(AST_nextfree++) = T(i);
  }
  if (debug_ast) fprintf(stderr, "]) {Type %d}\n", AST_PHRASE>>AST_type_shift);
  return AST_PHRASE | tuple;
#undef T
}

// Convert CST to AST.
#define build_ast(x) build_ast_inner(x, __FILE__, __LINE__)

int build_ast_inner(int P, char *file, int line) { // Parameter is index into CST (with type info); returns an index into an AST.
  int T[LARGEST_ALT];  // This *has* to be local stack data, not static.
                       // (otherwise two successive calls will corrupt
                       // the first call's results.  This is not ideal.)
  int phrases = 0;
  int phrase  = CST(P++);
  int alt     = CST(P++);
  int P_      = phrase&INDEX_MASK;
  int type    = PhraseType(phrase);

  if ((P&(~INDEX_MASK)) != 0) {
    fprintf(stderr, "build_ast(0x%x) was passed a parameter that was not a simple index into CST[]:\n", P&(~INDEX_MASK));
    fprintf(stderr, "  build_ast(%d -> %d,%d) in \"%s\", line %d\n", P, CST(P)&INDEX_MASK, P_, file, line);
    exit(EXIT_FAILURE);
  }
  if ((type != PhraseType(PHRASE_TYPE)) && (type != PhraseType(SEMANTIC_TYPE))) {
    fprintf(stderr, "build_ast(TYPE=0x%x) was passed a parameter that does not point to a PHRASE_TYPE or a SEMANTIC_TYPE\n", type);
    fprintf(stderr, "  build_ast(%d -> %d,%d) in \"%s\", line %d\n", P, CST(P)&INDEX_MASK, P_, file, line);
    exit(EXIT_FAILURE);
  }
  if (debug_ast >= 2) {
    if (type == PhraseType(PHRASE_TYPE)) {
      fprintf(stderr, "build_ast(%d /* %ls */)\n", P-2, phrasename[phrasenum(P_)]);
    } else if (type == PhraseType(SEMANTIC_TYPE)) {
      fprintf(stderr, "build_ast(%d /* %ls */)\n", P, semantic_phrasename[P_]);
    }
  }
  
  switch (P_) {
#include CST2AST // e.g. "algol60-ast.h"
// CST2AST is the code that converts the CST to an AST.  A default version (eg algol60-ast.c)
// is generated by program "regen".  The programmer will almost certainly want to modify this
// so that it creates a more appropriate AST for the specific application.
  }
  return -1;
}

// The module that is specific to the application is the one passed in as APPMODULE
// which is derived from (in this example) algol60-comp.c which is generated by gencomp.
// It should be renamed appropriately for the application, e.g. to algol60-indent.c, and
// edited to do whatever is required of the main application, whether that is a
// source-to-source translator, an indent program like 'soap', or a real compiler:

#ifdef APPMODULE
#include APPMODULE       // e.g. passed in by -DAPPMODULE="algol60-indent.c"
// APPMODULE goes hand-in-hand with the main procedure in it, which acts on the AST,
// which is passed is by -DAPPCOMMAND=reindent or whatever the compile() procedure is called.

#else

// If an application module is not supplied, we'll use a default module which simply
// re-outputs the source from the AST.  This is a more compact version of the code
// that would be created by gencomp, which expands all the specific grammar phrases
// explicitly.

void walk_ast(int P, int depth) {
  if (P <= 0) return;
  int i;
  // avoid runtime error of "left shift of 15 by 28 places":
  int AST_type = (int)((unsigned int)P&(((unsigned int)AST_type_mask)<<(unsigned int)AST_type_shift));
  int AST_index = P&AST_idx_mask;
  int op = AST(AST_index+1);
  int alt = AST(AST_index+2);
  int count = AST(AST_index+3);

  if (AST_type == AST_PHRASE) {

    switch (op) {

    default: // Use the default output code:
      for (i = 1; i <= count; i++) walk_ast(SubPhraseIdx(P,i), depth+1);
    }

  } else if ((AST_type == AST_LITERAL) || (AST_type == AST_BIP)) {
    PrintAtom(AST_index);
  }
}
#endif


// phrasenum & TERM_NAME are so useful they may be better in a library.

// Look up name of a phrase entry.
int phrasenum_inner(int PhraseStart, char *file, int line) {
  int i = 0, phrasesize = NUM_SIMPLE_PHRASES;
  for (;;) { if ((i >= phrasesize) || (phrase_start(i) == PhraseStart)) break; i++; }
  if (i == phrasesize) {
    fprintf(stderr,
            "DEBUG #A: Cannot find a phrase starting at index=%d (0x%x), from \"%s\", line %d\n",
            PhraseStart, PhraseStart, file, line);
    //exit(1);
    return -1;
  };
  return i;//+PHRASE_BASE;
}
int phrasenum(int PhraseStart) {
  return phrasenum_inner(PhraseStart, "unknown", 0);
}
#define phrasenum(x) phrasenum_inner(x, __FILE__, __LINE__)

// Look up name of a BIP entry.
int BIPnum(int BIPStart) {
  int i = 0, bipsize = sizeof(bip_map)/sizeof(bip_map[0]);
  for (;;) { if ((i >= bipsize) || (bip_map(i) == BIPStart)) break; i++; }
  if (i == bipsize) { fprintf(stderr, "DEBUG #B: Cannot find a phrase starting at index=%d\n", BIPStart); exit(1); };
  return i;
}

// Look up name of a phrase entry for use in diagnostics.
wchar_t *PHRASE_inner(int G_PhraseStart, char *file, int line) {
  int P_num = PHRASE_BASE+phrasenum_inner(G_PhraseStart&INDEX_MASK, file, line);
  if (P_num < PHRASE_BASE) {
    fprintf(stderr, "PHRASE called with bad parameters from \"%s\", line %d\n", file, line);
    return L"ERROR";
  } else {
    if (P_num >= NUM_BIPS+NUM_SIMPLE_PHRASES+NUM_SEMANTIC_PHRASES) {
      fprintf(stderr, "P_num is too high (%d >= %d = %d+%d+%d) at line %d\n",
              P_num, NUM_BIPS+NUM_SIMPLE_PHRASES+NUM_SEMANTIC_PHRASES,
              NUM_BIPS, NUM_SIMPLE_PHRASES, NUM_SEMANTIC_PHRASES, __LINE__+4);
    } if (P_num < 0) {
      fprintf(stderr, "P_num is less that 0 at line %d\n",__LINE__+2);
    }
    wchar_t *result = (wchar_t *)phrasename[P_num]; // <--- getting a warning that this is out of range (with assistant.g)
    return result;
  }
}
wchar_t *PHRASE(int PhraseStart) {
  return PHRASE_inner(PhraseStart, "unknown", 0);
}
#define PHRASE(x) PHRASE_inner(x, __FILE__, __LINE__)

// Look up name of a phrase entry for use in diagnostics.
wchar_t *SEMANTIC_PHRASE_inner(int G_PhraseStart, char *file, int line) {
  int P_num = SEMANTIC_BASE+G_PhraseStart; // phrasenum_inner(G_PhraseStart, file, line);
  if (P_num < 0) {
    fprintf(stderr, "SEMANTIC_PHRASE called with bad parameters from \"%s\", line %d\n", file, line);
    return L"ERROR";
  } else {
    //fprintf(stderr, "SEMANTIC_PHRASE at G=%d maps to P=%d\n", G_PhraseStart, P_num);
    wchar_t *result = (wchar_t *)phrasename[P_num];
    return result;
  }
}
wchar_t *SEMANTIC_PHRASE(int PhraseStart) {
  return SEMANTIC_PHRASE_inner(PhraseStart, "unknown", 0);
}
#define SEMANTIC_PHRASE(x) SEMANTIC_PHRASE_inner(x, __FILE__, __LINE__)


// Be careful not to evaluate parameters twice.
// Maybe a static inline would be better.
#define MAX(a,b) ({int A=a, B=b; (A>B ? A : B);})

// Look up description of a terminal for use in diagnostics.
#define TERM_NAME(P) TERM_NAME_inner(P, __LINE__)
wchar_t *TERM_NAME_inner(int P, int line) {
#define TMPSIZE 512 // Overkill.
  static wchar_t tmp[TMPSIZE+1];
  int type = PhraseType(P)<<GRAMMAR_TYPE_SHIFT;
  
  // THIS IS ALL MESSED UP RIGHT NOW.
  if (type == PHRASE_TYPE) {
    wchar_t *name = PHRASE(P&INDEX_MASK);
    P = phrasenum(P&INDEX_MASK);
    if ((P&INDEX_MASK)+PHRASE_BASE < NUM_BIPS+NUM_SIMPLE_PHRASES+NUM_SEMANTIC_PHRASES) {
      swprintf(tmp, TMPSIZE, L"<%s%s%ls>", P&NEGATED_PHRASE ? "!":"",
                                           P&GUARD_PHRASE   ? "?":"",
                                           name /*phrasename[(P&INDEX_MASK)+PHRASE_BASE]*/);
    } else {
      // called with a G_* instead of a P_* !
      // Need to convert by finding index of entry in sequential_phrase_no_to_grammar_index aka phrase_start()
      // that matches. 
      swprintf(tmp, TMPSIZE, L"<%s%s phrasename[%d] at line %d>", P&NEGATED_PHRASE ? "!":"",
                                           P&GUARD_PHRASE   ? "?":"",
                                           (P&INDEX_MASK)+PHRASE_BASE, line);
    }
  } else if (type == SEMANTIC_TYPE) {
    wchar_t *name = SEMANTIC_PHRASE(P&INDEX_MASK);
    if ((P&INDEX_MASK)+SEMANTIC_BASE < NUM_BIPS+NUM_SIMPLE_PHRASES+NUM_SEMANTIC_PHRASES) {
      swprintf(tmp, TMPSIZE, L"<%s%s%ls>", P&NEGATED_PHRASE ? "!":"",
                                           P&GUARD_PHRASE   ? "?":"",
                                           name /*phrasename[(P&INDEX_MASK)+SEMANTIC_BASE]*/);
    } else {
      // called with a G_* instead of a P_* !
      // Need to convert by finding index of entry in sequential_phrase_no_to_grammar_index aka phrase_start()
      // that matches. 
      swprintf(tmp, TMPSIZE, L"<%s%s phrasename[%d]>", P&NEGATED_PHRASE ? "!":"",
                                           P&GUARD_PHRASE   ? "?":"",
                                           (P&INDEX_MASK)+SEMANTIC_BASE);
    }
  } else if (type == BIP_TYPE) {
    if ((P&INDEX_MASK)+BIP_BASE < NUM_BIPS+NUM_SIMPLE_PHRASES+NUM_SEMANTIC_PHRASES) {
      swprintf(tmp, TMPSIZE, L"<%s%s%ls>", P&NEGATED_PHRASE ? "!":"",
                                           P&GUARD_PHRASE   ? "?":"",
                                           phrasename[(P&INDEX_MASK)+BIP_BASE]);
    } else {
      swprintf(tmp, TMPSIZE, L"<%s%s phrasename[%d]>", P&NEGATED_PHRASE ? "!":"",
                                           P&GUARD_PHRASE   ? "?":"",
                                           (P&INDEX_MASK)+BIP_BASE);
    }
  } else if (type == KEYWORD_TYPE) {
    swprintf(tmp, TMPSIZE, L"\"%ls\"", keyword[P&INDEX_MASK]);
  } else if (type == CHAR_TYPE) {
    swprintf(tmp, TMPSIZE, L"'%lc'", P&255);
  } else if (type == REGEXP_TYPE) {
    swprintf(tmp, TMPSIZE, L"«%ls»", regexps[P&INDEX_MASK]+1);
    
  } else {
    // one of the ones I haven't got around to yet.
    swprintf(tmp, TMPSIZE, L"{undecoded terminal %d %x}", PhraseType(P), P&INDEX_MASK);
  }
  return tmp;
}

#define TAB "    "
#define LTAB L"    "
#define indent(depth) do {int i; for (i = 0; i < depth; i++) fprintf(stderr, TAB); } while(0);


// Main recursive parser procedure.

static int CSTidx = 0; // index of the next CST entry to receive some
                       // data from the parse tree.

// 'TP' is "Text Pointer" - the index of the next character in the source file
// to be parsed. (Actually we no longer parse directly from the source file -
// to accommodate Unicode more easily, there is a pre-pass which reads the
// source file into a wint_t array.  This pre-pass can also be used, if
// necessary, to pre-filter sources in the style of 'line reconstruction'
// performed by compilers in the 60's.

static int BestTPOK = 0, BestTPFail = 0;

// Initially returned true/false but need to switch to returning CST index.

// Parse returns:

//   >= 2 for a rule
//   1 for a <!phrase> (successful, but no associated data)
//   0 for a parse fail

static int RECURSION_MAX_DEPTH = 0, runaway_recursion = 1, error = 0;

void show_pending_input(void) {
  int i, j;
  wint_t c;
  
  fprintf(stderr, "; # '");
  i = *TP; j = 0;
  for (;;) {
    c = source(i);
    if (c == 0) break;
    if (c == '\n') fprintf(stderr, "\\n"); else fprintf(stderr, "%lc", c);
    i += 1; j += 1;
    if (j == 10) break;
  }
  fprintf(stderr, "'%s\n", c == 0 ? " <EOF>" : " ...");
}

/* Should I make the result of parse() an unsigned int? - it would simplify some arithmetic shifts... */

// parse() would be a lot shorter and possibly easier to follow, if it weren't for the runtime
// tracing of the parse.  But I beg you, don't remove it.  It is *extremely* useful when debugging
// a new grammar.

int parse(int *XTP, int P, int depth) { // depth is only used for indenting the debugging
  if (depth > RECURSION_MAX_DEPTH) RECURSION_MAX_DEPTH = depth;
  // Highest observed MAX DEPTH was < 1000.

#ifdef DEBUG_RECURSIVE_PROBLEM
  switch (PhraseType(P)<<GRAMMAR_TYPE_SHIFT) {
  case PHRASE_TYPE: fprintf(stderr, "P<%ls>", PHRASE(P & INDEX_MASK)); break;
  case KEYWORD_TYPE: fprintf(stderr, "%ls", TERM_NAME(P)); break;
  case CHAR_TYPE: fprintf(stderr, "%ls", TERM_NAME(P)); break;
  case REGEXP_TYPE: fprintf(stderr, "%ls", TERM_NAME(P)); break;
  case BIP_TYPE:  fprintf(stderr, "BIP"); break;
  case SEMANTIC_TYPE: { int ProcNo = P&INDEX_MASK; fprintf(stderr, "%ls()", semantic_phrasename[ProcNo]); }; break;
  default: fprintf(stderr, "UNKNOWN PHRASE TYPE"); break;
  }
  //show_pending_input();
  fprintf(stderr, "\n"); 
#endif

  if (depth == 40000) { // 10000
    fprintf(stderr, "\n* ERROR: We appear to have runaway parse recursion - possibly an undetected grammar loop from left-recursion.\n");
    // FORCE GDB CRASH
    runaway_recursion/=error; // in gdb, see where the array index was out of range by typing:  up 2
  }
  int rule = UNASSIGNED; // used for results of sub-phrase and terminal parsing.
  int This_Phrase = CSTidx; // needed for backtracking after a failed alternative.
  TP=XTP; // Unfortunately we communicate with the regexp package through a global *TP
          // so this hack makes our text pointer globally visible.  Right now I can't
          // remember why the global is *TP and not just a simple 'TP'.
  if (*TP > BestTPOK) BestTPOK = *TP;
  int InitialTP = *TP;
  int type = PhraseType(P);
  //int negated = P & NEGATED_PHRASE;
  //int guard = P & GUARD_PHRASE;
  int whitespace = P & WHITESPACE_ALLOWED;
  int index = P & INDEX_MASK;
  int InitIndex = index;

  // Although it is only *preceding* white space that is skipped, trailing white space at
  // the end of the file is also skipped because it precedes the EOF token.
  if (whitespace) { while (examine(*TP) == ' ' || examine(*TP) == '\n' || examine(*TP) == '\t') (*TP)++; }
                    // (the second and third examine()s above could be replaced with source() safely.)
  if (*TP > BestTPFail) BestTPFail = *TP;
  
  // Each item in the grammar either describes the tree structure, or a terminal to be matched.
  // There are multiple types of terminal possible - this code has implemented a few of them but
  // a few others have been sketched in for future expansion.
  switch (type<<GRAMMAR_TYPE_SHIFT) {

  case PHRASE_TYPE:
    {
      // Recursively match a sub-phrase:
      // Matched terminals are written to the analysis record (Concrete Syntax Tree)
      // for use by the code associated with the grammar.  The CST very much reflects the
      // layout of the grammar tables except that it only holds the one alt that was successful.

      int i, j;
      int Alt, Alts, Phrase, Phrases;
      
      // THIS BLOCK IS FOR TRACE INFORMATION ONLY.
      if (debug_parser) {
        int ix = index;
        indent(depth); fprintf(stderr, "P<%ls> = ", PHRASE(ix));
        // BUG: It looks like the expansion is corrupt:
        // reported:       P<SS> = <init> <terminate> <MAIN-PROGRAM> <terminate>; # 'b̲e̲g̲i̲n̲' ...
        // actual grammar: P<SS> = <init> <optional-stropping-conversion> <SOURCE> <terminate>;

        Alts = gram[ix++]&INDEX_MASK;
        for (Alt = 0; Alt < Alts; Alt++) {
          Phrases = gram[ix++]&INDEX_MASK;
          for (Phrase = 0; Phrase < Phrases; Phrase++) {
            int Object = gram[ix];
            fprintf(stderr, "%ls", TERM_NAME(Object));
            ix += 1;
            if (Phrase+1 != Phrases) fprintf(stderr, " ");
          }
          if (Alt+1 != Alts) fprintf(stderr, ", ");
        }
        show_pending_input();
      }

      // The primary parser engine follows.
      
      // P's index part is a pointer to the start of a phrase
      Alts = gram[index++]&INDEX_MASK;
      int BacktrackLiteralPos = last_used_descriptor;
      int BacktrackTextPos = *TP;
      for (Alt = 0; Alt < Alts; Alt++) {
        // TO DO: rework this so that Matched is considered true if >= 0 and false if < 0. Then remove bugfix.
        int Matched = TRUE;

        last_used_descriptor = BacktrackLiteralPos;
        *TP = BacktrackTextPos;

        if (debug_parser) if (Alt) { indent(depth); fprintf(stderr, TAB "----------------------------------- Alternative #%d\n", Alt); }
        
        Phrases = gram[index++]&INDEX_MASK;

        CSTidx = This_Phrase;   // To backtrack the location where the parsed results are stored
                                // It is reset as we move on to each alternative after an alternative fails.
          
        // IF ALL TERMS IN THE LOOP BELOW ARE SUCCESSFUL, WE HAVE PARSED THE PHRASE AND SAVED THE ANALYSIS RECORD:

        _CST(CSTidx++) = P;     // Record which P<phrase> we are matching. (This could be done outside the alt loop, but the logic is clearer if we do it here)
        _CST(CSTidx++) = Alt;   // And record which alt matched.
        if (Phrases == 0) {
          if (debug_parser) { indent(depth); fprintf(stderr, TAB "NULL  Matched\n"); }          
        } else {
          int SubphraseIdx = CSTidx;     // This is where the result of each subphrase will be stored.
          CSTidx += Phrases;      // Skip ahead to after where the info for this phrase
                                  // would be stored.  This will cause recursively-parsed
                                  // sub-phrases to be stored *after* this phrase.
                                  // We just have to be careful when we save the data
                                  // for this phrase, to do so in the gap we just created,
                                  // i.e. at "SubphraseIdx" and upward.

          for (Phrase = 0; Phrase < Phrases; Phrase++) {
            if (Matched) {
              int Object = gram[index];
              int LastTP = *TP;

              Matched = ((_CST(SubphraseIdx) = rule = parse(TP, Object, depth+1)) != 0); // All phrases in an Alt have to parse for the Alt to succeed.

              if (Object & NEGATED_PHRASE) {
                Matched = !Matched; if (Matched) _CST(CSTidx) = 1; // mark a <!phrase> in the analysis record.
              }
              
              // Guard phrases <?phrase> are recorded in the analysis record like normal phrases - they just don't increment
              // the text pointer, so that the same text may be parsed again by the actual phrase which follows the guard.
              if (Matched && (Object & GUARD_PHRASE)) *TP = LastTP;

              SubphraseIdx += 1;

            } else {
              // Recursive parse failed, or literal comparison failed, so
              // skip subsequent phrases in this Alt, updating pointer to next Alt.
              if (debug_parser) { indent(depth); fprintf(stderr, TAB "P%ls  Skipped.\n", TERM_NAME(gram[index])); }
            }
            index++; // remember, index points to the next grammar item.  Maybe I should rename it.
          }
        }

        if (Matched) {
          if (debug_parser) { indent(depth); fprintf(stderr, TAB "P<%ls> FOUND", PHRASE(InitIndex)); show_pending_input(); }
          return This_Phrase; // with TP and atom index (last_used_descriptor) updated
        } else {
          if (debug_parser) { indent(depth); fprintf(stderr, TAB "Alt %d of P<%ls> NOT FOUND", Alt, PHRASE(InitIndex)); show_pending_input(); }
        }
      }

      // All failures must return via here:
      *TP = BacktrackTextPos;
      last_used_descriptor = BacktrackLiteralPos;

      if (debug_parser) { indent(depth); fprintf(stderr, "P<%ls> NOT FOUND", PHRASE(InitIndex)); show_pending_input(); }
      
      return 0; // with TP and atom index (last_used_descriptor) backtracked
    }
    
  case KEYWORD_TYPE:
    {
      int i;
      int len = wcslen(keyword[index]);
      
      if (debug_parser) { indent(depth); fprintf(stderr, "%ls", TERM_NAME(P)); }
      
      if (wcsncmp((const wchar_t *)&source(*TP), keyword[index], len) == 0) {
        
        if (debug_parser) { wint_t TMP=source(len+*TP);source(len+*TP)=0;fprintf(stderr, "  Matched \"%ls\"\n", (const wchar_t *)&source(*TP));source(len+*TP)=TMP; }
        
        *TP += len; (void)examine(*TP);
        return literal_descriptor(InitialTP/* -len+*TP */,*TP);
      } else {
        
        if (debug_parser) fprintf(stderr, TAB "No match\n");
        
        return FALSE;
      }
    }
    
  case CHAR_TYPE:
    {
      if (debug_parser) { indent(depth); fprintf(stderr, "%ls", TERM_NAME(P)); }
      
      if (examine(*TP) == (P&255)) {
        
        if (debug_parser) fprintf(stderr, TAB "Matched '%lc'\n", source(*TP));
        
        *TP += 1;  // by one source() element
        return literal_descriptor(InitialTP/* -1+*TP */, *TP);
      } else {
        
        if (debug_parser) fprintf(stderr, TAB "No match\n");
        
        return 0;
      }
    }
    
  case REGEXP_TYPE:
    {
      int len = 0;
      
      if (debug_parser) { indent(depth); fprintf(stderr, "%ls", TERM_NAME(P)); }
      
      if (regex_match_r((const wchar_t *)&source(*TP), RR[index], &len)) {
        *TP += len;
        
        if (debug_parser) fprintf(stderr, TAB "Matched \"%ls\"\n", Matched_string);
        
        return literal_descriptor(InitialTP/* -len+*TP */, *TP);
      } else {
        
        if (debug_parser) fprintf(stderr, TAB "No match\n");
        
        if ((*TP)+len > BestTPFail) BestTPFail = (*TP)+len;
        return 0;
      }
    }

    
  case BIP_TYPE:
    {
      int i;
      int Bip = P & INDEX_MASK;
      
      if (debug_parser) indent(depth);
      
      // These have to be coordinated with the grammar files.
      // We'll supply a set of BIPs that can be called from any grammar.
      switch (Bip) {
        // Any white space will have been skipped on entry to parse()
      case 0: { // EOF
        if (source(*TP) == 0) {
          
          if (debug_parser) fprintf(stderr, "BIP: EOF (Matched)\n");
          
          // This *should* have handled initial whitespace before EOF. But a \n is being left unread.
          return literal_descriptor(InitialTP/* *TP */, *TP);
        } else {
          
          if (debug_parser) fprintf(stderr, "BIP: EOF (No match)\n");
          
          return 0;
        }
      }
      case 1: {// ch
        // need to add ch that it matched.
        if (examine(*TP) == 0) return 0; // EOF!
        if (examine(*TP) != 0) {
          
          if (debug_parser) fprintf(stderr, "BIP: ch (Matched)\n");
          
          (*TP)++;
          return literal_descriptor(InitialTP/* -1+*TP */, *TP);
        } else {
          
          if (debug_parser) fprintf(stderr, "BIP: ch (No match)\n");
          
          return 0;
        }
      }
      case 2: {// nl
        if (examine(*TP) == '\n') {
          
          if (debug_parser) fprintf(stderr, "BIP: nl (Matched)\n");
          
          (*TP)++;
          return literal_descriptor(InitialTP/* -1+*TP */, *TP);
        } else {
          
          if (debug_parser) fprintf(stderr, "BIP: nl (No match)\n");
          
          return 0;
        }
      }
      default:
        if (debug_parser) fprintf(stderr, "BIP_TYPE (unknown BIP #%d)\n", Bip);
        return 0;
      }
    }

  case SEMANTIC_TYPE:
    {
      int ProcNo = P&INDEX_MASK;

      if (debug_parser) { indent(depth); fprintf(stderr, "C<%ls> = procno[%d]() = %ls()", SEMANTIC_PHRASE(index), ProcNo, semantic_phrasename[ProcNo]); show_pending_input(); }
      
      // Call the parse-time C<code> here to perform semantic checks during parse-time, rather than just simple syntax checking.
      //if (debug_parser) { indent(depth); fprintf(stderr, "Calling parsetime[ProcNo=%d]() aka parse_%ls()\n", ProcNo, semantic_phrasename[ProcNo]); }
      
      if (parsetime[ProcNo]()) {
        
        if (debug_parser) { indent(depth); fprintf(stderr, "%ls() = TRUE; // Matched.\n", semantic_phrasename[ProcNo]); }
        
        return literal_descriptor(InitialTP, *TP); // Most likely empty, but if semantic code moved (*TP) then return the desired text.
      } else {
        
        if (debug_parser) { indent(depth); fprintf(stderr, "%ls() = FALSE; // No match\n", semantic_phrasename[ProcNo]); }
        
        return 0;
      }
      //fprintf(stderr, "     SEMANTIC_TYPE - NOT IMPLEMENTED\n"); break;
    }
    
  // Handled within PHRASE_TYPE:
  case OPTION_TYPE:
    fprintf(stderr, "     OPTION_TYPE - Failure\n"); break;
  case COUNT_OF_PHRASES:
    fprintf(stderr, "     COUNT_OF_PHRASES - Failure\n"); break;
  case COUNT_OF_ALTS:
    fprintf(stderr, "     COUNT_OF_ALTS - Failure\n"); break;
  case ALT_NUMBER:
    fprintf(stderr, "     ALT_NUMBER - Failure\n"); break;

  // reserved for expansion to add new terminal types
  case UTF32CHAR_TYPE:
    fprintf(stderr, "     UTF32CHAR_TYPE - Failure\n"); break;
  case STRING_TYPE:   // <------- replace CHAR_TYPE handing with this one. Unlike keywords, no stropping support and no spaces within tokens. TO DO.
    fprintf(stderr, "     STRING_TYPE - Failure\n"); break;
  case UTF32STRING_TYPE:
    fprintf(stderr, "     UTF32STRING_TYPE - Failure\n"); break;

  default:
    fprintf(stderr, "     (%02x << GRAMMAR_TYPE_SHIFT) - Failure\n", type); break;
  }
  return 0;
}

extern int regex_main(int argc, char **argv);

#ifndef PARSER_MAIN
#define PARSER_MAIN "uparse-main.c"
#endif

#include PARSER_MAIN