Skip to content

Commit

Permalink
HIVE-26713: StringExpr ArrayIndexOutOfBoundsException with LIKE '%xxx…
Browse files Browse the repository at this point in the history
…%' (apache#4999)(Ryu Kobayashi, reviewed by Attila Turoczy, Butao Zhang)
  • Loading branch information
ryukobayashi authored and dengzhhu653 committed Mar 7, 2024
1 parent 58daa05 commit 2469476
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 3 deletions.
1 change: 1 addition & 0 deletions data/files/control_characters.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
abcde�fghi
13 changes: 13 additions & 0 deletions ql/src/test/queries/clientpositive/like_control_characters.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Regression test for HIVE-26713: a LIKE pattern with a leading and trailing %
-- evaluated over a string column whose data contains a non-text byte.
set hive.mapred.mode=nonstrict;
set hive.explain.user=false;
-- Vectorized execution is switched on explicitly for this test
-- (the expected plan reports "Execution mode: vectorized").
set hive.vectorized.execution.enabled=true;

create temporary table foo (col string);

-- SORT_QUERY_RESULTS

-- Load the fixture row from data/files/control_characters.txt
LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo;

-- Record the plan first, then run the same query and check that the row is returned
explain select col, count(*) from foo where col like '%fg%' group by col;
select col, count(*) from foo where col like '%fg%' group by col;

Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
PREHOOK: query: create temporary table foo (col string)
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@foo
POSTHOOK: query: create temporary table foo (col string)
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@foo
PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo
PREHOOK: type: LOAD
#### A masked pattern was here ####
PREHOOK: Output: default@foo
POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo
POSTHOOK: type: LOAD
#### A masked pattern was here ####
POSTHOOK: Output: default@foo
PREHOOK: query: explain select col, count(*) from foo where col like '%fg%' group by col
PREHOOK: type: QUERY
PREHOOK: Input: default@foo
#### A masked pattern was here ####
POSTHOOK: query: explain select col, count(*) from foo where col like '%fg%' group by col
POSTHOOK: type: QUERY
POSTHOOK: Input: default@foo
#### A masked pattern was here ####
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1

STAGE PLANS:
Stage: Stage-1
Tez
#### A masked pattern was here ####
Edges:
Reducer 2 <- Map 1 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
Map Operator Tree:
TableScan
alias: foo
filterExpr: (col like '%fg%') (type: boolean)
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: (col like '%fg%') (type: boolean)
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: count()
keys: col (type: string)
minReductionHashAggr: 0.99
mode: hash
outputColumnNames: _col0, _col1
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
value expressions: _col1 (type: bigint)
Execution mode: vectorized, llap
LLAP IO: all inputs
Reducer 2
Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
aggregations: count(VALUE._col0)
keys: KEY._col0 (type: string)
mode: mergepartial
outputColumnNames: _col0, _col1
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
ListSink

PREHOOK: query: select col, count(*) from foo where col like '%fg%' group by col
PREHOOK: type: QUERY
PREHOOK: Input: default@foo
#### A masked pattern was here ####
POSTHOOK: query: select col, count(*) from foo where col like '%fg%' group by col
POSTHOOK: type: QUERY
POSTHOOK: Input: default@foo
#### A masked pattern was here ####
abcde�fghi 1
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,15 @@ public int find(byte[] input, int start, int len) {
}
s_tmp--;
}
next += shift[input[next] & MAX_BYTE];

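// A byte whose masked value reaches MAX_BYTE (for example 0xFF) has no usable
// entry in the shift table, so looking it up would run past the end of the
// array. Advance by a single position for such bytes instead.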
int shiftIndex = input[next] & MAX_BYTE;
if (shiftIndex >= MAX_BYTE) {
next++;
} else {
next += shift[shiftIndex];
}
}
return -1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@

import org.junit.Test;

import java.nio.charset.StandardCharsets;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import static org.junit.Assert.*;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

public class TestStringExpr {
@Test
Expand All @@ -49,6 +51,24 @@ public void test() throws Exception {
assertEquals("Testing match at end of string", 24, find(pattern, input4));
}

/**
 * Checks that a compiled finder still locates its pattern when the searched
 * bytes contain a 0xFF byte (byte value -1) in front of the match.
 */
@Test
public void testControlCharacters() throws Exception {
  StringExpr.Finder finder = compile("pattern");
  assertNotNull(finder);

  final byte[] prefix = "abcedf".getBytes(StandardCharsets.UTF_8);
  final byte[] needle = "pattern".getBytes(StandardCharsets.UTF_8);

  // Haystack layout: six prefix bytes, one 0xFF byte, then the pattern itself.
  final byte[] haystack = new byte[prefix.length + 1 + needle.length];
  System.arraycopy(prefix, 0, haystack, 0, prefix.length);
  haystack[prefix.length] = (byte) -1;
  System.arraycopy(needle, 0, haystack, prefix.length + 1, needle.length);

  // The pattern starts right after the prefix and the single extra byte.
  int matchOffset = finder.find(haystack, 0, haystack.length);
  assertEquals("Testing valid match", 7, matchOffset);
}

/**
 * Encodes the given pattern as UTF-8 and passes the bytes to
 * {@code StringExpr.compile}.
 *
 * @param pattern the search pattern text
 * @return the finder produced by {@code StringExpr.compile}
 */
private StringExpr.Finder compile(String pattern) {
  final byte[] utf8Pattern = pattern.getBytes(StandardCharsets.UTF_8);
  return StringExpr.compile(utf8Pattern);
}
Expand Down

0 comments on commit 2469476

Please sign in to comment.