From e8c2c8eb60ce88855ad139f78042f11f2aae07a2 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Fri, 26 Aug 2022 10:24:55 -0400 Subject: [PATCH 1/4] Fix parsing bigquery control structures --- src/parser.ts | 22 ++++++++----- src/tokenizer.ts | 5 +++ test/parser/bigquery.spec.ts | 62 ++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 8 deletions(-) create mode 100644 test/parser/bigquery.spec.ts diff --git a/src/parser.ts b/src/parser.ts index 1b42f93..77b10ca 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -57,7 +57,14 @@ export const EXECUTION_TYPES: Record = { ANON_BLOCK: 'ANON_BLOCK', }; -const statementsWithEnds = ['CREATE_TRIGGER', 'CREATE_FUNCTION', 'CREATE_PROCEDURE', 'ANON_BLOCK']; +const statementsWithEnds = [ + 'CREATE_TRIGGER', + 'CREATE_FUNCTION', + 'CREATE_PROCEDURE', + 'ANON_BLOCK', + 'UNKNOWN', +]; + const blockOpeners: Record = { generic: ['BEGIN', 'CASE'], psql: ['BEGIN', 'CASE', 'LOOP', 'IF'], @@ -65,7 +72,7 @@ const blockOpeners: Record = { mssql: ['BEGIN', 'CASE'], sqlite: ['BEGIN', 'CASE'], oracle: ['DECLARE', 'BEGIN', 'CASE'], - bigquery: ['BEGIN', 'CASE'], + bigquery: ['BEGIN', 'CASE', 'IF', 'LOOP', 'REPEAT', 'WHILE', 'FOR'], }; interface ParseOptions { @@ -581,7 +588,6 @@ function stateMachineStatementParser( ): StatementParser { let currentStepIndex = 0; let prevToken: Token | undefined; - let prevPrevToken: Token | undefined; let prevNonWhitespaceToken: Token | undefined; let lastBlockOpener: Token | undefined; @@ -606,7 +612,6 @@ function stateMachineStatementParser( }; const setPrevToken = (token: Token) => { - prevPrevToken = prevToken; prevToken = token; if (token.type !== 'whitespace') { prevNonWhitespaceToken = token; @@ -627,7 +632,8 @@ function stateMachineStatementParser( if ( statement.type && token.type === 'semicolon' && - (!statementsWithEnds.includes(statement.type) || (openBlocks === 0 && statement.canEnd)) + (!statementsWithEnds.includes(statement.type) || + (openBlocks === 0 && (statement.type === 'UNKNOWN' || statement.canEnd))) ) { statement.endStatement = ';'; return; @@ -653,7 +659,7 @@ function stateMachineStatementParser( prevPrevToken?.value.toUpperCase() !== 'END' ) { if ( - ['oracle', 'bigquery'].includes(dialect) && + dialect === 'oracle' && lastBlockOpener?.value === 'DECLARE' && token.value.toUpperCase() === 'BEGIN' ) { @@ -667,8 +673,7 @@ function stateMachineStatementParser( setPrevToken(token); if (statement.type === 'ANON_BLOCK' && !anonBlockStarted) { anonBlockStarted = true; - // don't return - } else { + } else if (statement.type) { return; } } @@ -683,6 +688,7 @@ function stateMachineStatementParser( if (statement.type && statement.start >= 0) { // statement has already been identified // just wait until end of the statement + setPrevToken(token); return; } diff --git a/src/tokenizer.ts b/src/tokenizer.ts index 4f17a9c..cf5fbff 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -28,6 +28,11 @@ const KEYWORDS = [ 'BEGIN', 'DECLARE', 'CASE', + 'LOOP', + 'IF', + 'REPEAT', + 'WHILE', + 'FOR', 'PROCEDURE', ]; diff --git a/test/parser/bigquery.spec.ts b/test/parser/bigquery.spec.ts new file mode 100644 index 0000000..f3305f5 --- /dev/null +++ b/test/parser/bigquery.spec.ts @@ -0,0 +1,62 @@ +import { parse } from '../../src/parser'; +import { expect } from 'chai'; + +describe('Parser for bigquery', () => { + // all testcases are taken straight from bigquery docs on procedural language + // see https://cloud.google.com/bigquery/docs/reference/standard-sql/procedural-language + describe('control structures', () => { + [ + `CASE + WHEN + EXISTS(SELECT 1 FROM schema.products_a WHERE product_id = target_product_id) + THEN SELECT 'found product in products_a table'; + WHEN + EXISTS(SELECT 1 FROM schema.products_b WHERE product_id = target_product_id) + THEN SELECT 'found product in products_b table'; + ELSE + SELECT 'did not find product'; + END CASE;`, + `IF EXISTS(SELECT 1 FROM schema.products + WHERE product_id = target_product_id) THEN + SELECT CONCAT('found product ', CAST(target_product_id AS STRING)); + ELSEIF EXISTS(SELECT 1 FROM schema.more_products + WHERE product_id = target_product_id) THEN + SELECT CONCAT('found product from more_products table', + CAST(target_product_id AS STRING)); + ELSE + SELECT CONCAT('did not find product ', CAST(target_product_id AS STRING)); + END IF;`, + `LOOP + SET x = x + 1; + IF x >= 10 THEN + LEAVE; + END IF; + END LOOP;`, + `REPEAT + SET x = x + 1; + SELECT x; + UNTIL x >= 3 + END REPEAT;`, + `WHILE x < 0 DO + SET x = x + 1; + SELECT x; + END WHILE;`, + `FOR record IN + (SELECT word, word_count + FROM bigquery-public-data.samples.shakespeare + LIMIT 5) + DO + SELECT record.word, record.word_count; + END FOR;`, + ].forEach((sql) => { + it(`parses ${sql.substring( + 0, + Math.min(sql.indexOf(' '), sql.indexOf('\n')), + )} structure`, () => { + const result = parse(sql, false, 'bigquery'); + expect(result.body.length).to.eql(1); + expect(sql.substring(result.body[0].start, result.body[0].end + 1)).to.eql(sql); + }); + }); + }); +}); From 08dbd3402d8aeefb9a4578d4d43124a684529e45 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Fri, 26 Aug 2022 10:35:19 -0400 Subject: [PATCH 2/4] improve bq control test --- test/parser/bigquery.spec.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/parser/bigquery.spec.ts b/test/parser/bigquery.spec.ts index f3305f5..0d4ea77 100644 --- a/test/parser/bigquery.spec.ts +++ b/test/parser/bigquery.spec.ts @@ -53,9 +53,11 @@ describe('Parser for bigquery', () => { 0, Math.min(sql.indexOf(' '), sql.indexOf('\n')), )} structure`, () => { - const result = parse(sql, false, 'bigquery'); - expect(result.body.length).to.eql(1); + const result = parse(`${sql}\nSELECT 1;`, false, 'bigquery'); + expect(result.body.length).to.eql(2); expect(sql.substring(result.body[0].start, result.body[0].end + 1)).to.eql(sql); + expect(result.body[0].type).to.eql('UNKNOWN'); + expect(result.body[1].type).to.eql('SELECT'); }); }); }); From 167a52c4254ccba7c1144a79058bf578a706d8d1 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Fri, 26 Aug 2022 10:44:08 -0400 Subject: [PATCH 3/4] parse standalone BEGIN as ANON_BLOCK --- src/parser.ts | 10 +++++++--- test/parser/bigquery.spec.ts | 7 +++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/parser.ts b/src/parser.ts index 1a45825..98d47a2 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -273,12 +273,16 @@ function createStatementParserByToken(token: Token, options: ParseOptions): Stat return createDeleteStatementParser(options); case 'TRUNCATE': return createTruncateStatementParser(options); - case 'DECLARE': case 'BEGIN': + if (['bigquery', 'oracle'].includes(options.dialect)) { + return createBlockStatementParser(options); + } + break; + case 'DECLARE': if (options.dialect === 'oracle') { return createBlockStatementParser(options); } - // eslint-disable-next-line no-fallthrough + break; default: break; } @@ -324,7 +328,7 @@ function createBlockStatementParser(options: ParseOptions) { preCanGoToNext: () => false, validation: { acceptTokens: [ - { type: 'keyword', value: 'DECLARE' }, + ...(options.dialect === 'oracle' ? [{ type: 'keyword', value: 'DECLARE' }] : []), { type: 'keyword', value: 'BEGIN' }, ], }, diff --git a/test/parser/bigquery.spec.ts b/test/parser/bigquery.spec.ts index 0d4ea77..7657429 100644 --- a/test/parser/bigquery.spec.ts +++ b/test/parser/bigquery.spec.ts @@ -61,4 +61,11 @@ describe('Parser for bigquery', () => { }); }); }); + + it('parses BEGIN statement as ANON_BLOCK', () => { + const result = parse(`BEGIN SELECT 1; END; SELECT 1;`, false, 'bigquery'); + expect(result.body.length).to.eql(2); + expect(result.body[0].type).to.eql('ANON_BLOCK'); + expect(result.body[1].type).to.eql('SELECT'); + }); }); From 893ebfb53335cba5ded669adbbdbb286488759b5 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Fri, 26 Aug 2022 10:48:47 -0400 Subject: [PATCH 4/4] do not detect transaction as anon_block --- src/parser.ts | 10 +++++++--- test/parser/bigquery.spec.ts | 8 ++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/parser.ts b/src/parser.ts index 98d47a2..db5acfc 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -201,7 +201,7 @@ export function parse(input: string, isStrict = true, dialect: Dialect = 'generi continue; } - statementParser = createStatementParserByToken(token, { isStrict, dialect }); + statementParser = createStatementParserByToken(token, nextToken, { isStrict, dialect }); if (cteState.isCte) { statementParser.getStatement().start = cteState.state.start; cteState.isCte = false; @@ -254,7 +254,11 @@ function initState({ input, prevState }: { input?: string; prevState?: State }): }; } -function createStatementParserByToken(token: Token, options: ParseOptions): StatementParser { +function createStatementParserByToken( + token: Token, + nextToken: Token, + options: ParseOptions, +): StatementParser { if (token.type === 'keyword') { switch (token.value.toUpperCase()) { case 'SELECT': @@ -274,7 +278,7 @@ function createStatementParserByToken(token: Token, options: ParseOptions): Stat case 'TRUNCATE': return createTruncateStatementParser(options); case 'BEGIN': - if (['bigquery', 'oracle'].includes(options.dialect)) { + if (['bigquery', 'oracle'].includes(options.dialect) && nextToken.value !== 'TRANSACTION') { return createBlockStatementParser(options); } break; diff --git a/test/parser/bigquery.spec.ts b/test/parser/bigquery.spec.ts index 7657429..d1af5a5 100644 --- a/test/parser/bigquery.spec.ts +++ b/test/parser/bigquery.spec.ts @@ -68,4 +68,12 @@ describe('Parser for bigquery', () => { expect(result.body[0].type).to.eql('ANON_BLOCK'); expect(result.body[1].type).to.eql('SELECT'); }); + + it('parses BEGIN TRANSACTION as UNKNOWN', () => { + const result = parse(`BEGIN TRANSACTION; SELECT 1; COMMIT;`, false, 'bigquery'); + expect(result.body.length).to.eql(3); + expect(result.body[0].type).to.eql('UNKNOWN'); + expect(result.body[1].type).to.eql('SELECT'); + expect(result.body[2].type).to.eql('UNKNOWN'); + }); });