diff --git a/meerkat-browser/package.json b/meerkat-browser/package.json index 0eb9f783..24bf7dec 100644 --- a/meerkat-browser/package.json +++ b/meerkat-browser/package.json @@ -1,6 +1,6 @@ { "name": "@devrev/meerkat-browser", - "version": "0.0.104", + "version": "0.0.105", "dependencies": { "tslib": "^2.3.0", "@devrev/meerkat-core": "*", diff --git a/meerkat-core/package.json b/meerkat-core/package.json index 743afa17..74ef3cda 100644 --- a/meerkat-core/package.json +++ b/meerkat-core/package.json @@ -1,6 +1,6 @@ { "name": "@devrev/meerkat-core", - "version": "0.0.104", + "version": "0.0.105", "dependencies": { "tslib": "^2.3.0" }, diff --git a/meerkat-core/src/cube-filter-transformer/in/in.spec.ts b/meerkat-core/src/cube-filter-transformer/in/in.spec.ts index 6ad59a49..207b33e8 100644 --- a/meerkat-core/src/cube-filter-transformer/in/in.spec.ts +++ b/meerkat-core/src/cube-filter-transformer/in/in.spec.ts @@ -4,7 +4,7 @@ import { inTransform } from './in'; describe('In transforms Tests', () => { it('Should throw error if values are undefined', () => { expect(() => - inTransform({ + inTransform({ member: 'country', operator: 'contains', memberInfo: { @@ -16,53 +16,61 @@ describe('In transforms Tests', () => { ).toThrow(); }); - it('Should return the correct value for string member', () => { - const expectedOutput = { - "alias": "", - "children": [ - { - "alias": "", - "class": "COLUMN_REF", - "column_names": [ - "country", - ], - "type": "COLUMN_REF", - }, - { - "alias": "", - "class": "CONSTANT", - "type": "VALUE_CONSTANT", - "value": { - "is_null": false, - "type": { - "id": "VARCHAR", - "type_info": null, - }, - "value": "US", - }, - }, - ], - "class": "OPERATOR", - "type": "COMPARE_IN", - }; - expect( - inTransform({ - member: 'country', - operator: 'contains', - values: ['US'], - memberInfo: { - name: 'country', - sql: 'table.country', - type: 'string', - }, - }) - ).toEqual(expectedOutput); + it('Should return optimized string_split approach for string 
type', () => { + const result = inTransform({ + member: 'country', + operator: 'in', + values: ['US', 'Canada', 'Mexico'], + memberInfo: { + name: 'country', + sql: 'table.country', + type: 'string', + }, + }); + + // Check it returns a subquery structure with string_split + expect(result).toHaveProperty('class', 'SUBQUERY'); + expect(result).toHaveProperty('type', 'SUBQUERY'); + expect(result).toHaveProperty('subquery_type', 'ANY'); + + // Verify it's using string_split + const selectList = (result as any).subquery.node.select_list[0]; + expect(selectList.function_name).toBe('unnest'); + expect(selectList.children[0].function_name).toBe('string_split'); + + // Verify no CAST for strings + expect(selectList.type).toBe('FUNCTION'); }); - it('Should return the correct value for string_array member', () => { + it('Should return optimized string_split approach with CAST for number type', () => { + const result = inTransform({ + member: 'order_id', + operator: 'in', + values: [1, 2, 3], + memberInfo: { + name: 'order_id', + sql: 'table.order_id', + type: 'number', + }, + }); + + // Check it returns a subquery structure + expect(result).toHaveProperty('class', 'SUBQUERY'); + expect(result).toHaveProperty('type', 'SUBQUERY'); + expect(result).toHaveProperty('subquery_type', 'ANY'); + + // Verify it's using string_split with CAST + const selectList = (result as any).subquery.node.select_list[0]; + expect(selectList.type).toBe('OPERATOR_CAST'); + expect(selectList.cast_type.id).toBe('DOUBLE'); + expect(selectList.child.function_name).toBe('unnest'); + expect(selectList.child.children[0].function_name).toBe('string_split'); + }); + + it('Should return standard ARRAY_CONSTRUCTOR for string_array type', () => { const output = inTransform({ member: 'country', - operator: 'contains', + operator: 'in', values: ['US', 'Germany', 'Israel'], memberInfo: { name: 'country', @@ -70,76 +78,169 @@ describe('In transforms Tests', () => { type: 'string_array', }, }) as 
ConjunctionExpression; - expect(output).toEqual( { - "alias": "", - "catalog": "", - "children": [ - { - "alias": "", - "class": "COLUMN_REF", - "column_names": [ - "country", - ], - "type": "COLUMN_REF", + + // For array types, should use && operator with ARRAY_CONSTRUCTOR + expect(output.function_name).toBe('&&'); + expect(output.children[1].type).toBe('ARRAY_CONSTRUCTOR'); + expect(output.children[1].children.length).toBe(3); + }); + + it('Should return standard COMPARE_IN for other types (default case)', () => { + const output = inTransform({ + member: 'some_field', + operator: 'in', + values: ['val1', 'val2'], + memberInfo: { + name: 'some_field', + sql: 'table.some_field', + type: 'time' as any, // Unknown type to trigger default case + }, + }); + + // Default case should use COMPARE_IN + expect(output).toHaveProperty('type', 'COMPARE_IN'); + expect(output).toHaveProperty('class', 'OPERATOR'); + expect((output as any).children.length).toBe(3); // column + 2 values + }); + + it('Should handle large value lists efficiently with string_split', () => { + const largeValueList = Array.from({ length: 1000 }, (_, i) => `value${i}`); + const result = inTransform({ + member: 'country', + operator: 'in', + values: largeValueList, + memberInfo: { + name: 'country', + sql: 'table.country', + type: 'string', + }, + }); + + // Should still use subquery approach + expect(result).toHaveProperty('class', 'SUBQUERY'); + + // Verify only 2 VALUE_CONSTANT nodes (joined string + delimiter) + const selectList = (result as any).subquery.node.select_list[0]; + const stringSplitChildren = selectList.children[0].children; + expect(stringSplitChildren.length).toBe(2); + expect(stringSplitChildren[0].value.value).toContain('§‡¶'); // Contains delimiter + }); + + it('Should use delimiter to join values', () => { + const result = inTransform({ + member: 'country', + operator: 'in', + values: ['US', 'Canada'], + memberInfo: { + name: 'country', + sql: 'table.country', + type: 'string', + }, 
+ }); + + const selectList = (result as any).subquery.node.select_list[0]; + const joinedValue = selectList.children[0].children[0].value.value; + const delimiter = selectList.children[0].children[1].value.value; + + expect(delimiter).toBe('§‡¶'); + expect(joinedValue).toBe('US§‡¶Canada'); + }); + + it('Should handle the original test case structure for reference', () => { + const output = inTransform({ + member: 'country', + operator: 'in', + values: ['US', 'Germany', 'Israel'], + memberInfo: { + name: 'country', + sql: 'table.country', + type: 'string_array', + }, + }) as ConjunctionExpression; + expect(output).toEqual({ + alias: '', + catalog: '', + children: [ + { + alias: '', + class: 'COLUMN_REF', + column_names: ['country'], + type: 'COLUMN_REF', }, + { + alias: '', + children: [ { - "alias": "", - "children": [ - { - "alias": "", - "class": "CONSTANT", - "type": "VALUE_CONSTANT", - "value": { - "is_null": false, - "type": { - "id": "VARCHAR", - "type_info": null, - }, - "value": "US", + alias: '', + class: 'CONSTANT', + type: 'VALUE_CONSTANT', + value: { + is_null: false, + type: { + id: 'VARCHAR', + type_info: null, }, + value: 'US', + }, }, { - "alias": "", - "class": "CONSTANT", - "type": "VALUE_CONSTANT", - "value": { - "is_null": false, - "type": { - "id": "VARCHAR", - "type_info": null, - }, - "value": "Germany", + alias: '', + class: 'CONSTANT', + type: 'VALUE_CONSTANT', + value: { + is_null: false, + type: { + id: 'VARCHAR', + type_info: null, }, + value: 'Germany', + }, }, { - "alias": "", - "class": "CONSTANT", - "type": "VALUE_CONSTANT", - "value": { - "is_null": false, - "type": { - "id": "VARCHAR", - "type_info": null, - }, - "value": "Israel", + alias: '', + class: 'CONSTANT', + type: 'VALUE_CONSTANT', + value: { + is_null: false, + type: { + id: 'VARCHAR', + type_info: null, }, - }], - "class": "OPERATOR", - "type": "ARRAY_CONSTRUCTOR", - }, - ], - "class": "FUNCTION", - "distinct": false, - "export_state": false, - "filter": null, - 
"function_name": "&&", - "is_operator": true, - "order_bys": { - "orders": [], - "type": "ORDER_MODIFIER", + value: 'Israel', + }, + }, + ], + class: 'OPERATOR', + type: 'ARRAY_CONSTRUCTOR', }, - "schema": "", - "type": "FUNCTION", + ], + class: 'FUNCTION', + distinct: false, + export_state: false, + filter: null, + function_name: '&&', + is_operator: true, + order_bys: { + orders: [], + type: 'ORDER_MODIFIER', + }, + schema: '', + type: 'FUNCTION', }); }); + + it('Should throw error if values array is empty', () => { + expect(() => + inTransform({ + member: 'country', + operator: 'in', + values: [], + memberInfo: { + name: 'country', + sql: 'table.country', + type: 'string', + }, + }) + ).toThrow('In filter must have at least one value'); + }); }); diff --git a/meerkat-core/src/cube-filter-transformer/in/in.ts b/meerkat-core/src/cube-filter-transformer/in/in.ts index 41d3e87d..f075a88e 100644 --- a/meerkat-core/src/cube-filter-transformer/in/in.ts +++ b/meerkat-core/src/cube-filter-transformer/in/in.ts @@ -1,5 +1,15 @@ -import { COLUMN_NAME_DELIMITER } from '../../member-formatters/constants'; +import { + COLUMN_NAME_DELIMITER, + STRING_ARRAY_DELIMITER, +} from '../../member-formatters/constants'; import { Dimension, Measure } from '../../types/cube-types/table'; +import { + AggregateHandling, + QueryNodeType, + ResultModifierType, + SubqueryType, + TableReferenceType, +} from '../../types/duckdb-serialization-types'; import { ExpressionClass, ExpressionType, @@ -12,23 +22,24 @@ const inDuckDbCondition = ( values: string[], memberInfo: Measure | Dimension ) => { - const sqlTreeValues = values.map((value) => { - return { - class: ExpressionClass.CONSTANT, - type: ExpressionType.VALUE_CONSTANT, - alias: '', - value: valueBuilder(value, memberInfo), - }; - }); const columnRef = { - class: 'COLUMN_REF', - type: 'COLUMN_REF', + class: ExpressionClass.COLUMN_REF, + type: ExpressionType.COLUMN_REF, alias: '', column_names: columnName.split(COLUMN_NAME_DELIMITER), }; + 
switch (memberInfo.type) { case 'number_array': case 'string_array': { + const sqlTreeValues = values.map((value) => { + return { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: valueBuilder(value, memberInfo), + }; + }); return { class: ExpressionClass.FUNCTION, type: ExpressionType.FUNCTION, @@ -55,7 +66,179 @@ const inDuckDbCondition = ( catalog: '', }; } + case 'string': + case 'number': { + /** + * Doing the string split optimization here because as the number of nodes in the AST increases, + * the time taken to parse the AST increases, thereby increasing the time to generate the SQL. + */ + const joinedValues = values.join(STRING_ARRAY_DELIMITER); + + return { + class: ExpressionClass.SUBQUERY, + type: ExpressionType.SUBQUERY, + alias: '', + subquery_type: SubqueryType.ANY, + subquery: { + node: { + type: QueryNodeType.SELECT_NODE, + modifiers: [], + cte_map: { map: [] }, + select_list: [ + // For numeric types, we need to CAST the string result to the appropriate type + memberInfo.type === 'number' + ?
{ + class: ExpressionClass.CAST, + type: ExpressionType.OPERATOR_CAST, + alias: '', + child: { + class: ExpressionClass.FUNCTION, + type: ExpressionType.FUNCTION, + alias: '', + function_name: 'unnest', + schema: '', + children: [ + { + class: ExpressionClass.FUNCTION, + type: ExpressionType.FUNCTION, + alias: '', + function_name: 'string_split', + schema: '', + children: [ + { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: { + type: { id: 'VARCHAR', type_info: null }, + is_null: false, + value: joinedValues, + }, + }, + { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: { + type: { id: 'VARCHAR', type_info: null }, + is_null: false, + value: STRING_ARRAY_DELIMITER, + }, + }, + ], + filter: null, + order_bys: { + type: ResultModifierType.ORDER_MODIFIER, + orders: [], + }, + distinct: false, + is_operator: false, + export_state: false, + catalog: '', + }, + ], + filter: null, + order_bys: { + type: ResultModifierType.ORDER_MODIFIER, + orders: [], + }, + distinct: false, + is_operator: false, + export_state: false, + catalog: '', + }, + cast_type: { + id: 'DOUBLE', + type_info: null, + }, + try_cast: false, + } + : { + class: ExpressionClass.FUNCTION, + type: ExpressionType.FUNCTION, + alias: '', + function_name: 'unnest', + schema: '', + children: [ + { + class: ExpressionClass.FUNCTION, + type: ExpressionType.FUNCTION, + alias: '', + function_name: 'string_split', + schema: '', + children: [ + { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: { + type: { id: 'VARCHAR', type_info: null }, + is_null: false, + value: joinedValues, + }, + }, + { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: { + type: { id: 'VARCHAR', type_info: null }, + is_null: false, + value: STRING_ARRAY_DELIMITER, + }, + }, + ], + filter: null, + order_bys: { + type: ResultModifierType.ORDER_MODIFIER, + 
orders: [], + }, + distinct: false, + is_operator: false, + export_state: false, + catalog: '', + }, + ], + filter: null, + order_bys: { + type: ResultModifierType.ORDER_MODIFIER, + orders: [], + }, + distinct: false, + is_operator: false, + export_state: false, + catalog: '', + }, + ], + from_table: { + type: TableReferenceType.EMPTY, + alias: '', + sample: null, + }, + where_clause: null, + group_expressions: [], + group_sets: [], + aggregate_handling: AggregateHandling.STANDARD_HANDLING, + having: null, + sample: null, + qualify: null, + }, + }, + child: columnRef, + comparison_type: ExpressionType.COMPARE_EQUAL, + }; + } default: { + // For other types, use the standard COMPARE_IN approach + const sqlTreeValues = values.map((value) => { + return { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: valueBuilder(value, memberInfo), + }; + }); + return { class: ExpressionClass.OPERATOR, type: ExpressionType.COMPARE_IN, @@ -68,7 +251,7 @@ const inDuckDbCondition = ( export const inTransform: CubeToParseExpressionTransform = (query) => { const { member, values, memberInfo } = query; - if (!values) { + if (!values || values.length === 0) { throw new Error('In filter must have at least one value'); } return inDuckDbCondition(member, values, memberInfo); diff --git a/meerkat-core/src/cube-filter-transformer/not-in/not-in.spec.ts b/meerkat-core/src/cube-filter-transformer/not-in/not-in.spec.ts index 3907aa8c..1641d668 100644 --- a/meerkat-core/src/cube-filter-transformer/not-in/not-in.spec.ts +++ b/meerkat-core/src/cube-filter-transformer/not-in/not-in.spec.ts @@ -4,7 +4,7 @@ import { notInTransform } from './not-in'; describe('Not In transforms Tests', () => { it('Should throw error if values are undefined', () => { expect(() => - notInTransform({ + notInTransform({ member: 'country', operator: 'contains', memberInfo: { @@ -16,45 +16,120 @@ describe('Not In transforms Tests', () => { ).toThrow(); }); - it('Should return the 
correct value for string member', () => { - const expectedOutput = { - "alias": "", - "children": [{ - "alias": "", - "class": "COLUMN_REF", - "column_names": [ - "country", - ], - "type": "COLUMN_REF", - }, - { - "alias": "", - "class": "CONSTANT", - "type": "VALUE_CONSTANT", - "value": { - "is_null": false, - "type": { - "id": "VARCHAR", - "type_info": null, - }, - "value": "US", - }, - }], - "class": "OPERATOR", - "type": "COMPARE_NOT_IN", - } - expect( - notInTransform({ - member: 'country', - operator: 'contains', - values: ['US'], - memberInfo: { - name: 'country', - sql: 'table.country', - type: 'string', - }, - }) - ).toEqual(expectedOutput); + it('Should return optimized string_split approach for string type', () => { + const result = notInTransform({ + member: 'country', + operator: 'notIn', + values: ['US', 'Canada', 'Mexico'], + memberInfo: { + name: 'country', + sql: 'table.country', + type: 'string', + }, + }); + + // Check it returns OPERATOR_NOT wrapping a subquery + expect(result).toHaveProperty('class', 'OPERATOR'); + expect(result).toHaveProperty('type', 'OPERATOR_NOT'); + expect(result.children[0]).toHaveProperty('class', 'SUBQUERY'); + + // Verify it's using string_split + const subquery = (result as any).children[0]; + const selectList = subquery.subquery.node.select_list[0]; + expect(selectList.function_name).toBe('unnest'); + expect(selectList.children[0].function_name).toBe('string_split'); + + // Verify no CAST for strings + expect(selectList.type).toBe('FUNCTION'); + }); + + it('Should return optimized string_split approach with CAST for number type', () => { + const result = notInTransform({ + member: 'order_id', + operator: 'notIn', + values: [1, 2, 3], + memberInfo: { + name: 'order_id', + sql: 'table.order_id', + type: 'number', + }, + }); + + // Check it returns OPERATOR_NOT wrapping a subquery + expect(result).toHaveProperty('class', 'OPERATOR'); + expect(result).toHaveProperty('type', 'OPERATOR_NOT'); + + // Verify it's using 
string_split with CAST + const subquery = (result as any).children[0]; + const selectList = subquery.subquery.node.select_list[0]; + expect(selectList.type).toBe('OPERATOR_CAST'); + expect(selectList.cast_type.id).toBe('DOUBLE'); + expect(selectList.child.function_name).toBe('unnest'); + expect(selectList.child.children[0].function_name).toBe('string_split'); + }); + + it('Should return standard COMPARE_NOT_IN for other types (default case)', () => { + const output = notInTransform({ + member: 'some_field', + operator: 'notIn', + values: ['val1', 'val2'], + memberInfo: { + name: 'some_field', + sql: 'table.some_field', + type: 'time' as any, // Unknown type to trigger default case + }, + }); + + // Default case should use COMPARE_NOT_IN + expect(output).toHaveProperty('type', 'COMPARE_NOT_IN'); + expect(output).toHaveProperty('class', 'OPERATOR'); + expect((output as any).children.length).toBe(3); // column + 2 values + }); + + it('Should handle large value lists efficiently with string_split', () => { + const largeValueList = Array.from({ length: 1000 }, (_, i) => `value${i}`); + const result = notInTransform({ + member: 'country', + operator: 'notIn', + values: largeValueList, + memberInfo: { + name: 'country', + sql: 'table.country', + type: 'string', + }, + }); + + // Should use OPERATOR_NOT wrapping subquery + expect(result).toHaveProperty('type', 'OPERATOR_NOT'); + expect(result.children[0]).toHaveProperty('class', 'SUBQUERY'); + + // Verify only 2 VALUE_CONSTANT nodes (joined string + delimiter) + const subquery = (result as any).children[0]; + const selectList = subquery.subquery.node.select_list[0]; + const stringSplitChildren = selectList.children[0].children; + expect(stringSplitChildren.length).toBe(2); + expect(stringSplitChildren[0].value.value).toContain('§‡¶'); + }); + + it('Should use delimiter to join values', () => { + const result = notInTransform({ + member: 'country', + operator: 'notIn', + values: ['US', 'Canada'], + memberInfo: { + name: 
'country', + sql: 'table.country', + type: 'string', + }, + }); + + const subquery = (result as any).children[0]; + const selectList = subquery.subquery.node.select_list[0]; + const joinedValue = selectList.children[0].children[0].value.value; + const delimiter = selectList.children[0].children[1].value.value; + + expect(delimiter).toBe('§‡¶'); + expect(joinedValue).toBe('US§‡¶Canada'); }); it('Should return the correct value for string_array member', () => { @@ -69,83 +144,96 @@ describe('Not In transforms Tests', () => { }, }) as ConjunctionExpression; expect(output).toEqual({ - "alias": "", - "children": [ - { - "alias": "", - "catalog": "", - "children": [ - { - "alias": "", - "class": "COLUMN_REF", - "column_names": [ - "country", - ], - "type": "COLUMN_REF", - }, - { - "alias": "", - "children": [ - { - "alias": "", - "class": "CONSTANT", - "type": "VALUE_CONSTANT", - "value": { - "is_null": false, - "type": { - "id": "VARCHAR", - "type_info": null, - }, - "value": "US", + alias: '', + children: [ + { + alias: '', + catalog: '', + children: [ + { + alias: '', + class: 'COLUMN_REF', + column_names: ['country'], + type: 'COLUMN_REF', + }, + { + alias: '', + children: [ + { + alias: '', + class: 'CONSTANT', + type: 'VALUE_CONSTANT', + value: { + is_null: false, + type: { + id: 'VARCHAR', + type_info: null, }, + value: 'US', }, - { - "alias": "", - "class": "CONSTANT", - "type": "VALUE_CONSTANT", - "value": { - "is_null": false, - "type": { - "id": "VARCHAR", - "type_info": null, - }, - "value": "Germany", + }, + { + alias: '', + class: 'CONSTANT', + type: 'VALUE_CONSTANT', + value: { + is_null: false, + type: { + id: 'VARCHAR', + type_info: null, }, + value: 'Germany', }, - { - "alias": "", - "class": "CONSTANT", - "type": "VALUE_CONSTANT", - "value": { - "is_null": false, - "type": { - "id": "VARCHAR", - "type_info": null, - }, - "value": "Israel", + }, + { + alias: '', + class: 'CONSTANT', + type: 'VALUE_CONSTANT', + value: { + is_null: false, + type: { + id: 
'VARCHAR', + type_info: null, }, + value: 'Israel', }, - ], - "class": "OPERATOR", - "type": "ARRAY_CONSTRUCTOR", - }, - ], - "class": "FUNCTION", - "distinct": false, - "export_state": false, - "filter": null, - "function_name": "&&", - "is_operator": true, - "order_bys": { - "orders": [], - "type": "ORDER_MODIFIER", + }, + ], + class: 'OPERATOR', + type: 'ARRAY_CONSTRUCTOR', }, - "schema": "", - "type": "FUNCTION", + ], + class: 'FUNCTION', + distinct: false, + export_state: false, + filter: null, + function_name: '&&', + is_operator: true, + order_bys: { + orders: [], + type: 'ORDER_MODIFIER', }, - ], - "class": "OPERATOR", - "type": "OPERATOR_NOT", + schema: '', + type: 'FUNCTION', + }, + ], + class: 'OPERATOR', + type: 'OPERATOR_NOT', }); }); + + it('Should throw error if values array is empty', () => { + expect(() => + notInTransform({ + member: 'country', + operator: 'notIn', + values: [], + memberInfo: { + name: 'country', + sql: 'table.country', + type: 'string', + }, + }) + ).toThrow('Not in filter must have at least one value'); + }); }); diff --git a/meerkat-core/src/cube-filter-transformer/not-in/not-in.ts b/meerkat-core/src/cube-filter-transformer/not-in/not-in.ts index b5eb63a1..741dab2e 100644 --- a/meerkat-core/src/cube-filter-transformer/not-in/not-in.ts +++ b/meerkat-core/src/cube-filter-transformer/not-in/not-in.ts @@ -1,7 +1,17 @@ import { Dimension, Measure } from '../../types/cube-types/table'; import { CubeToParseExpressionTransform } from '../factory'; -import { COLUMN_NAME_DELIMITER } from '../../member-formatters/constants'; +import { + COLUMN_NAME_DELIMITER, + STRING_ARRAY_DELIMITER, +} from '../../member-formatters/constants'; +import { + AggregateHandling, + QueryNodeType, + ResultModifierType, + SubqueryType, + TableReferenceType, +} from '../../types/duckdb-serialization-types'; import { ExpressionClass, ExpressionType, @@ -13,23 +23,24 @@ const notInDuckDbCondition = ( values: string[], memberInfo: Measure | Dimension ) => { - const 
sqlTreeValues = values.map((value) => { - return { - class: ExpressionClass.CONSTANT, - type: ExpressionType.VALUE_CONSTANT, - alias: '', - value: valueBuilder(value, memberInfo), - }; - }); const columnRef = { - class: 'COLUMN_REF', - type: 'COLUMN_REF', + class: ExpressionClass.COLUMN_REF, + type: ExpressionType.COLUMN_REF, alias: '', column_names: columnName.split(COLUMN_NAME_DELIMITER), }; + switch (memberInfo.type) { case 'number_array': case 'string_array': { + const sqlTreeValues = values.map((value) => { + return { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: valueBuilder(value, memberInfo), + }; + }); return { class: ExpressionClass.OPERATOR, type: ExpressionType.OPERATOR_NOT, @@ -52,7 +63,7 @@ const notInDuckDbCondition = ( ], filter: null, order_bys: { - type: 'ORDER_MODIFIER', + type: ResultModifierType.ORDER_MODIFIER, orders: [], }, distinct: false, @@ -63,7 +74,186 @@ const notInDuckDbCondition = ( ], }; } + case 'string': + case 'number': { + /** + * Doing the string split optimization here because as the number of nodes in the AST increases, + * the time taken to parse the AST increases, thereby increasing the time to generate the SQL. + */ + const joinedValues = values.join(STRING_ARRAY_DELIMITER); + + return { + class: ExpressionClass.OPERATOR, + type: ExpressionType.OPERATOR_NOT, + alias: '', + children: [ + { + class: ExpressionClass.SUBQUERY, + type: ExpressionType.SUBQUERY, + alias: '', + subquery_type: SubqueryType.ANY, + subquery: { + node: { + type: QueryNodeType.SELECT_NODE, + modifiers: [], + cte_map: { map: [] }, + select_list: [ + // For numeric types, we need to CAST the string result to the appropriate type + memberInfo.type === 'number' + ?
{ + class: ExpressionClass.CAST, + type: ExpressionType.OPERATOR_CAST, + alias: '', + child: { + class: ExpressionClass.FUNCTION, + type: ExpressionType.FUNCTION, + alias: '', + function_name: 'unnest', + schema: '', + children: [ + { + class: ExpressionClass.FUNCTION, + type: ExpressionType.FUNCTION, + alias: '', + function_name: 'string_split', + schema: '', + children: [ + { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: { + type: { id: 'VARCHAR', type_info: null }, + is_null: false, + value: joinedValues, + }, + }, + { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: { + type: { id: 'VARCHAR', type_info: null }, + is_null: false, + value: STRING_ARRAY_DELIMITER, + }, + }, + ], + filter: null, + order_bys: { + type: ResultModifierType.ORDER_MODIFIER, + orders: [], + }, + distinct: false, + is_operator: false, + export_state: false, + catalog: '', + }, + ], + filter: null, + order_bys: { + type: ResultModifierType.ORDER_MODIFIER, + orders: [], + }, + distinct: false, + is_operator: false, + export_state: false, + catalog: '', + }, + cast_type: { + id: 'DOUBLE', + type_info: null, + }, + try_cast: false, + } + : { + class: ExpressionClass.FUNCTION, + type: ExpressionType.FUNCTION, + alias: '', + function_name: 'unnest', + schema: '', + children: [ + { + class: ExpressionClass.FUNCTION, + type: ExpressionType.FUNCTION, + alias: '', + function_name: 'string_split', + schema: '', + children: [ + { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: { + type: { id: 'VARCHAR', type_info: null }, + is_null: false, + value: joinedValues, + }, + }, + { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: { + type: { id: 'VARCHAR', type_info: null }, + is_null: false, + value: STRING_ARRAY_DELIMITER, + }, + }, + ], + filter: null, + order_bys: { + type: ResultModifierType.ORDER_MODIFIER, + 
orders: [], + }, + distinct: false, + is_operator: false, + export_state: false, + catalog: '', + }, + ], + filter: null, + order_bys: { + type: ResultModifierType.ORDER_MODIFIER, + orders: [], + }, + distinct: false, + is_operator: false, + export_state: false, + catalog: '', + }, + ], + from_table: { + type: TableReferenceType.EMPTY, + alias: '', + sample: null, + }, + where_clause: null, + group_expressions: [], + group_sets: [], + aggregate_handling: AggregateHandling.STANDARD_HANDLING, + having: null, + sample: null, + qualify: null, + }, + }, + child: columnRef, + comparison_type: ExpressionType.COMPARE_EQUAL, + }, + ], + }; + } default: { + // For other types, use the standard COMPARE_NOT_IN approach + const sqlTreeValues = values.map((value) => { + return { + class: ExpressionClass.CONSTANT, + type: ExpressionType.VALUE_CONSTANT, + alias: '', + value: valueBuilder(value, memberInfo), + }; + }); + return { class: ExpressionClass.OPERATOR, type: ExpressionType.COMPARE_NOT_IN, @@ -76,7 +266,7 @@ const notInDuckDbCondition = ( export const notInTransform: CubeToParseExpressionTransform = (query) => { const { member, values, memberInfo } = query; - if (!values) { + if (!values || values.length === 0) { throw new Error('Not in filter must have at least one value'); } diff --git a/meerkat-core/src/member-formatters/constants.ts b/meerkat-core/src/member-formatters/constants.ts index 60413962..cfed3d46 100644 --- a/meerkat-core/src/member-formatters/constants.ts +++ b/meerkat-core/src/member-formatters/constants.ts @@ -1,2 +1,6 @@ export const COLUMN_NAME_DELIMITER = '.'; export const MEERKAT_OUTPUT_DELIMITER = '__'; + +// Multi-character delimiter using three different uncommon characters +// to minimize the chance of collision with real data +export const STRING_ARRAY_DELIMITER = '§‡¶'; diff --git a/meerkat-node/package.json b/meerkat-node/package.json index f5a7a1ec..f56f91ee 100644 --- a/meerkat-node/package.json +++ b/meerkat-node/package.json @@ -1,6 +1,6 @@ 
{ "name": "@devrev/meerkat-node", - "version": "0.0.104", + "version": "0.0.105", "dependencies": { "@swc/helpers": "~0.5.0", "@devrev/meerkat-core": "*", diff --git a/meerkat-node/src/__tests__/test-data.ts b/meerkat-node/src/__tests__/test-data.ts index 09b57b08..b483b5fa 100644 --- a/meerkat-node/src/__tests__/test-data.ts +++ b/meerkat-node/src/__tests__/test-data.ts @@ -986,7 +986,9 @@ export const TEST_DATA = [ [ { testName: 'In', - expectedSQL: `SELECT orders.* FROM (SELECT customer_id AS orders__customer_id, vendors AS orders__vendors, * FROM (select * from orders) AS orders) AS orders WHERE ((orders__customer_id IN ('1', '2')) AND (orders__vendors && (ARRAY['myntra', 'amazon'])))`, + // customer_id is string type -> uses optimized string_split + // vendors is string_array type -> uses ARRAY overlap && + expectedSQL: `SELECT orders.* FROM (SELECT customer_id AS orders__customer_id, vendors AS orders__vendors, * FROM (select * from orders) AS orders) AS orders WHERE ((orders__customer_id = ANY(SELECT unnest(string_split('1§‡¶2', '§‡¶')))) AND (orders__vendors && (ARRAY['myntra', 'amazon'])))`, cubeInput: { measures: ['*'], filters: [ @@ -1032,6 +1034,51 @@ export const TEST_DATA = [ }, ], }, + { + testName: 'In with numeric type (optimized with CAST)', + // order_id is number type -> uses optimized string_split with CAST to DOUBLE + expectedSQL: `SELECT orders.* FROM (SELECT order_id AS orders__order_id, * FROM (select * from orders) AS orders) AS orders WHERE (orders__order_id = ANY(SELECT CAST(unnest(string_split('1§‡¶2§‡¶3', '§‡¶')) AS DOUBLE)))`, + cubeInput: { + measures: ['*'], + filters: [ + { + member: 'orders.order_id', + operator: 'in', + values: [1, 2, 3], + }, + ], + dimensions: [], + }, + expectedOutput: [ + { + order_id: 1, + customer_id: '1', + product_id: '1', + order_date: '2022-01-01', + order_amount: 50.0, + orders__order_id: 1, + vendors: ['myntra', 'amazon', 'flipkart'], + }, + { + order_id: 2, + customer_id: '1', + product_id: '2', + 
order_date: '2022-01-02', + order_amount: 80.0, + orders__order_id: 2, + vendors: ['myntra'], + }, + { + order_id: 3, + customer_id: '2', + product_id: '3', + order_date: '2022-02-01', + order_amount: 25.0, + orders__order_id: 3, + vendors: [], + }, + ], + }, { testName: 'In with single quotes', expectedSQL: `SELECT orders.* FROM (SELECT vendors AS orders__vendors, * FROM (select * from orders) AS orders) AS orders WHERE ((orders__vendors && (ARRAY['swiggy''s'])))`, @@ -1063,11 +1110,80 @@ export const TEST_DATA = [ }, ], }, + { + testName: + 'Multiple In filters combined (customer_id, product_id, order_id)', + // Tests all three optimized filters working together + // customer_id (string), product_id (string), order_id (number with CAST) + expectedSQL: `SELECT orders.* FROM (SELECT customer_id AS orders__customer_id, product_id AS orders__product_id, order_id AS orders__order_id, * FROM (select * from orders) AS orders) AS orders WHERE ((orders__customer_id = ANY(SELECT unnest(string_split('1§‡¶2', '§‡¶')))) AND (orders__product_id = ANY(SELECT unnest(string_split('1§‡¶2', '§‡¶')))) AND (orders__order_id = ANY(SELECT CAST(unnest(string_split('1§‡¶2§‡¶3§‡¶4', '§‡¶')) AS DOUBLE))))`, + cubeInput: { + measures: ['*'], + filters: [ + { + and: [ + { + member: 'orders.customer_id', + operator: 'in', + values: ['1', '2'], + }, + { + member: 'orders.product_id', + operator: 'in', + values: ['1', '2'], + }, + { + member: 'orders.order_id', + operator: 'in', + values: [1, 2, 3, 4], + }, + ], + }, + ], + dimensions: [], + }, + expectedOutput: [ + { + order_id: 4, + customer_id: '2', + product_id: '1', + order_date: '2022-03-01', + order_amount: 75.0, + orders__customer_id: '2', + orders__product_id: '1', + orders__order_id: 4, + vendors: ['flipkart'], + }, + { + order_id: 2, + customer_id: '1', + product_id: '2', + order_date: '2022-01-02', + order_amount: 80.0, + orders__customer_id: '1', + orders__product_id: '2', + orders__order_id: 2, + vendors: ['myntra'], + }, + { + 
order_id: 1, + customer_id: '1', + product_id: '1', + order_date: '2022-01-01', + order_amount: 50.0, + orders__customer_id: '1', + orders__product_id: '1', + orders__order_id: 1, + vendors: ['myntra', 'amazon', 'flipkart'], + }, + ], + }, ], [ { testName: 'Not In', - expectedSQL: `SELECT orders.* FROM (SELECT customer_id AS orders__customer_id, vendors AS orders__vendors, * FROM (select * from orders) AS orders) AS orders WHERE ((orders__customer_id NOT IN ('1', '2')) AND (NOT (orders__vendors && (ARRAY['myntra', 'flipkart']))))`, + // customer_id is string type -> uses optimized string_split with NOT + // vendors is string_array type -> uses NOT with ARRAY overlap && + expectedSQL: `SELECT orders.* FROM (SELECT customer_id AS orders__customer_id, vendors AS orders__vendors, * FROM (select * from orders) AS orders) AS orders WHERE ((NOT (orders__customer_id = ANY(SELECT unnest(string_split('1§‡¶2', '§‡¶'))))) AND (NOT (orders__vendors && (ARRAY['myntra', 'flipkart']))))`, cubeInput: { measures: ['*'], filters: [ @@ -1135,5 +1251,202 @@ export const TEST_DATA = [ }, ], }, + { + testName: 'Not In with numeric type (optimized with CAST)', + // order_id is number type -> uses optimized string_split with CAST and NOT + expectedSQL: `SELECT orders.* FROM (SELECT order_id AS orders__order_id, * FROM (select * from orders) AS orders) AS orders WHERE (NOT (orders__order_id = ANY(SELECT CAST(unnest(string_split('1§‡¶2', '§‡¶')) AS DOUBLE))))`, + cubeInput: { + measures: ['*'], + filters: [ + { + member: 'orders.order_id', + operator: 'notIn', + values: [1, 2], + }, + ], + dimensions: [], + }, + expectedOutput: [ + { + order_id: 3, + customer_id: '2', + product_id: '3', + order_date: '2022-02-01', + order_amount: 25.0, + orders__order_id: 3, + vendors: [], + }, + { + order_id: 4, + customer_id: '2', + product_id: '1', + order_date: '2022-03-01', + order_amount: 75.0, + orders__order_id: 4, + vendors: ['flipkart'], + }, + { + order_id: 5, + customer_id: '3', + product_id: 
'1', + order_date: '2022-03-02', + order_amount: 100.0, + orders__order_id: 5, + vendors: ['myntra', 'amazon', 'flipkart'], + }, + { + order_id: 6, + customer_id: '4', + product_id: '2', + order_date: '2022-04-01', + order_amount: 45.0, + orders__order_id: 6, + vendors: [], + }, + { + order_id: 7, + customer_id: '4', + product_id: '3', + order_date: '2022-05-01', + order_amount: 90.0, + orders__order_id: 7, + vendors: ['myntra', 'flipkart'], + }, + { + order_id: 8, + customer_id: '5', + product_id: '1', + order_date: '2022-05-02', + order_amount: 65.0, + orders__order_id: 8, + vendors: ['amazon', 'flipkart'], + }, + { + order_id: 9, + customer_id: '5', + product_id: '2', + order_date: '2022-05-05', + order_amount: 85.0, + orders__order_id: 9, + vendors: [], + }, + { + order_id: 10, + customer_id: '6', + product_id: '3', + order_date: '2022-06-01', + order_amount: 120.0, + orders__order_id: 10, + vendors: ['myntra', 'amazon'], + }, + { + order_id: 11, + customer_id: '6aa6', + product_id: '3', + order_date: '2024-06-01', + order_amount: 0.0, + orders__order_id: 11, + vendors: ['amazon'], + }, + { + order_id: 12, + customer_id: null, + product_id: '3', + order_date: '2024-07-01T00:00:00.000Z', + order_amount: 100.0, + orders__order_id: 12, + orders__order_date: undefined, + vendors: ['flipkart'], + }, + { + order_id: 13, + customer_id: '7', + product_id: '6', + order_date: '2024-08-01T00:00:00.000Z', + order_amount: 100.0, + orders__order_id: 13, + orders__order_date: undefined, + vendors: ["swiggy's"], + }, + ], + }, + { + testName: + 'Multiple NotIn filters combined (customer_id, product_id, order_id)', + // Tests all three optimized NOT IN filters working together + expectedSQL: `SELECT orders.* FROM (SELECT customer_id AS orders__customer_id, product_id AS orders__product_id, order_id AS orders__order_id, * FROM (select * from orders) AS orders) AS orders WHERE ((NOT (orders__customer_id = ANY(SELECT unnest(string_split('1§‡¶2', '§‡¶'))))) AND (NOT 
(orders__product_id = ANY(SELECT unnest(string_split('1§‡¶2', '§‡¶'))))) AND (NOT (orders__order_id = ANY(SELECT CAST(unnest(string_split('1§‡¶2', '§‡¶')) AS DOUBLE)))))`, + cubeInput: { + measures: ['*'], + filters: [ + { + and: [ + { + member: 'orders.customer_id', + operator: 'notIn', + values: ['1', '2'], + }, + { + member: 'orders.product_id', + operator: 'notIn', + values: ['1', '2'], + }, + { + member: 'orders.order_id', + operator: 'notIn', + values: [1, 2], + }, + ], + }, + ], + dimensions: [], + }, + expectedOutput: [ + { + order_id: 7, + customer_id: '4', + product_id: '3', + order_date: '2022-05-01', + order_amount: 90.0, + orders__customer_id: '4', + orders__product_id: '3', + orders__order_id: 7, + vendors: ['myntra', 'flipkart'], + }, + { + order_id: 10, + customer_id: '6', + product_id: '3', + order_date: '2022-06-01', + order_amount: 120.0, + orders__customer_id: '6', + orders__product_id: '3', + orders__order_id: 10, + vendors: ['myntra', 'amazon'], + }, + { + order_id: 11, + customer_id: '6aa6', + product_id: '3', + order_date: '2024-06-01', + order_amount: 0.0, + orders__customer_id: '6aa6', + orders__product_id: '3', + orders__order_id: 11, + vendors: ['amazon'], + }, + { + order_id: 13, + customer_id: '7', + product_id: '6', + order_date: '2024-08-01T00:00:00.000Z', + order_amount: 100.0, + orders__customer_id: '7', + orders__product_id: '6', + orders__order_id: 13, + orders__order_date: undefined, + vendors: ["swiggy's"], + }, + ], + }, ], ];