diff --git a/doc/modules/cassandra/pages/cql/functions.adoc b/doc/modules/cassandra/pages/cql/functions.adoc index f126ddaf0ae4..d50b5cf8f974 100644 --- a/doc/modules/cassandra/pages/cql/functions.adoc +++ b/doc/modules/cassandra/pages/cql/functions.adoc @@ -238,6 +238,36 @@ For every xref:cql/types.adoc#native-types[type] supported by CQL, the function Conversely, the function `blobAsType` takes a 64-bit `blob` argument and converts it to a `bigint` value. For example, `bigintAsBlob(3)` returns `0x0000000000000003` and `blobAsBigint(0x0000000000000003)` returns `3`. +[[index-functions]] +===== Index functions + +====== `sai_analyze` + +The `sai_analyze` function returns the tokens that a SAI index will generate for a certain text value. The arguments +are that text value and the JSON configuration of the SAI analyzer. This JSON configuration is the same as the one used +to create the SAI index. For example, this function call: + +[source,cql] +---- +sai_analyze('johnny apples seedlings', + '{ + "tokenizer": {"name": "whitespace"} + }') +---- +Will return `['johnny', 'apples', 'seedlings']` + +This other function call: +[source,cql] +---- +sai_analyze('johnny apples seedlings', + '{ + "tokenizer": {"name": "whitespace"}, + "filters": [{"name": "porterstem"}] + }') +---- +Will return `['johnni', 'appl', 'seedl']` + + [[vector-functions]] ===== Vector functions diff --git a/src/java/org/apache/cassandra/cql3/functions/IndexFcts.java b/src/java/org/apache/cassandra/cql3/functions/IndexFcts.java new file mode 100644 index 000000000000..d8b5a9b29c64 --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/functions/IndexFcts.java @@ -0,0 +1,96 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.cql3.functions; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import com.google.common.base.Charsets; + +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.index.sai.analyzer.JSONAnalyzerParser; +import org.apache.cassandra.index.sai.analyzer.LuceneAnalyzer; +import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.lucene.analysis.Analyzer; + +public abstract class IndexFcts +{ + public static void addFunctionsTo(NativeFunctions functions) + { + functions.add(new SAIAnalyzeFunction()); + } + + /** + * CQL native function to get the tokens produced for given text value and the analyzer defined by the given JSON options. 
+ */ + private static class SAIAnalyzeFunction extends NativeScalarFunction + { + private static final String NAME = "sai_analyze"; + private static final ListType returnType = ListType.getInstance(UTF8Type.instance, false); + + private SAIAnalyzeFunction() + { + super(NAME, returnType, UTF8Type.instance, UTF8Type.instance); + } + + @Override + public ByteBuffer execute(ProtocolVersion protocolVersion, List parameters) throws InvalidRequestException + { + if (parameters.get(0) == null) + return null; + String text = UTF8Type.instance.compose(parameters.get(0)); + + if (parameters.get(1) == null) + throw new InvalidRequestException("Function " + name + " requires a non-null json_analyzer parameter (2nd argument)"); + String json = UTF8Type.instance.compose(parameters.get(1)); + + LuceneAnalyzer luceneAnalyzer = null; + List tokens = new ArrayList<>(); + try (Analyzer analyzer = JSONAnalyzerParser.parse(json)) + { + luceneAnalyzer = new LuceneAnalyzer(UTF8Type.instance, analyzer, new HashMap<>()); + + ByteBuffer toAnalyze = ByteBuffer.wrap(text.getBytes(Charsets.UTF_8)); + luceneAnalyzer.reset(toAnalyze); + ByteBuffer analyzed; + + while (luceneAnalyzer.hasNext()) + { + analyzed = luceneAnalyzer.next(); + tokens.add(ByteBufferUtil.string(analyzed, Charsets.UTF_8)); + } + } + catch (Exception ex) + { + throw new InvalidRequestException("Function " + name + " unable to analyze text=" + text + " json_analyzer=" + json, ex); + } + finally + { + if (luceneAnalyzer != null) + { + luceneAnalyzer.end(); + } + } + + return returnType.decompose(tokens); + } + } +} diff --git a/src/java/org/apache/cassandra/cql3/functions/NativeFunctions.java b/src/java/org/apache/cassandra/cql3/functions/NativeFunctions.java index f577a22ad134..9da19f842558 100644 --- a/src/java/org/apache/cassandra/cql3/functions/NativeFunctions.java +++ b/src/java/org/apache/cassandra/cql3/functions/NativeFunctions.java @@ -42,6 +42,7 @@ public class NativeFunctions AggregateFcts.addFunctionsTo(this); 
BytesConversionFcts.addFunctionsTo(this); VectorFcts.addFunctionsTo(this); + IndexFcts.addFunctionsTo(this); } }; diff --git a/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java b/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java index f1e3be18e576..f99cf4fd44d5 100644 --- a/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java +++ b/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java @@ -22,7 +22,6 @@ import com.google.common.collect.ImmutableList; import org.apache.cassandra.config.CassandraRelevantProperties; -import org.apache.cassandra.index.sai.virtual.AnalyzerView; import org.apache.cassandra.index.sai.virtual.IndexesSystemView; import org.apache.cassandra.index.sai.virtual.SSTablesSystemView; import org.apache.cassandra.index.sai.virtual.SegmentsSystemView; @@ -55,7 +54,6 @@ private static Collection buildTables() .add(new InternodeInboundTable(VIRTUAL_VIEWS)) .add(new SSTablesSystemView(VIRTUAL_VIEWS)) .add(new SegmentsSystemView(VIRTUAL_VIEWS)) - .add(new AnalyzerView(VIRTUAL_VIEWS)) .addAll(TableMetricTables.getAll(VIRTUAL_VIEWS)); if (CassandraRelevantProperties.SYSTEM_VIEWS_INCLUDE_ALL.getBoolean() || CassandraRelevantProperties.SYSTEM_VIEWS_INCLUDE_LOCAL_AND_PEERS.getBoolean()) diff --git a/src/java/org/apache/cassandra/index/sai/virtual/AnalyzerView.java b/src/java/org/apache/cassandra/index/sai/virtual/AnalyzerView.java deleted file mode 100644 index 274c3b6feb0b..000000000000 --- a/src/java/org/apache/cassandra/index/sai/virtual/AnalyzerView.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.index.sai.virtual; - -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - -import com.google.common.base.Charsets; - -import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.marshal.CompositeType; -import org.apache.cassandra.db.marshal.UTF8Type; -import org.apache.cassandra.db.virtual.AbstractVirtualTable; -import org.apache.cassandra.db.virtual.SimpleDataSet; -import org.apache.cassandra.dht.LocalPartitioner; -import org.apache.cassandra.exceptions.InvalidRequestException; -import org.apache.cassandra.index.sai.analyzer.JSONAnalyzerParser; -import org.apache.cassandra.index.sai.analyzer.LuceneAnalyzer; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.lucene.analysis.Analyzer; - -public class AnalyzerView extends AbstractVirtualTable -{ - public AnalyzerView(String keyspace) - { - super(TableMetadata.builder(keyspace, "analyzer") - .kind(TableMetadata.Kind.VIRTUAL) - .partitioner(new LocalPartitioner(CompositeType.getInstance(UTF8Type.instance, UTF8Type.instance))) - .addPartitionKeyColumn("text", UTF8Type.instance) - .addPartitionKeyColumn("json_analyzer", UTF8Type.instance) - .addRegularColumn("tokens", UTF8Type.instance) - .build()); - } - - @Override - public DataSet data() - { - return new SimpleDataSet(metadata()); - } - - @Override - public DataSet data(DecoratedKey partitionKey) - { - LuceneAnalyzer luceneAnalyzer = null; - String text = null; - String 
optionsString = null; - try - { - ByteBuffer[] array = ((CompositeType) metadata().partitionKeyType).split(partitionKey.getKey()); - text = UTF8Type.instance.compose(array[0]); - optionsString = UTF8Type.instance.compose(array[1]); - - Analyzer analyzer = JSONAnalyzerParser.parse(optionsString); - luceneAnalyzer = new LuceneAnalyzer(UTF8Type.instance, analyzer, new HashMap<>()); - - ByteBuffer toAnalyze = ByteBuffer.wrap(text.getBytes(Charsets.UTF_8)); - luceneAnalyzer.reset(toAnalyze); - ByteBuffer analyzed = null; - - List list = new ArrayList<>(); - - while (luceneAnalyzer.hasNext()) - { - analyzed = luceneAnalyzer.next(); - - list.add(ByteBufferUtil.string(analyzed, Charsets.UTF_8)); - } - - SimpleDataSet result = new SimpleDataSet(metadata()); - result.row(text, optionsString).column("tokens", list.toString()); - return result; - } - catch (Exception ex) - { - throw new InvalidRequestException("Unable to analyze text="+text+" json_analyzer="+optionsString, ex); - } - finally - { - if (luceneAnalyzer != null) - { - luceneAnalyzer.end(); - } - } - } -} diff --git a/test/unit/org/apache/cassandra/cql3/functions/IndexFctsTest.java b/test/unit/org/apache/cassandra/cql3/functions/IndexFctsTest.java new file mode 100644 index 000000000000..c5b6b6dd7000 --- /dev/null +++ b/test/unit/org/apache/cassandra/cql3/functions/IndexFctsTest.java @@ -0,0 +1,49 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.cql3.functions; + +import org.junit.Test; + +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.index.sai.SAITester; + +public class IndexFctsTest extends SAITester +{ + @Test + public void testAnalyzeFunction() throws Throwable + { + createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)"); + execute("INSERT INTO %s (k, v) VALUES (1, 'johnny apples seedlings')"); + execute("INSERT INTO %s (k, v) VALUES (2, null)"); + + assertRows(execute("SELECT k, sai_analyze(v, ?) FROM %s", + "{\n" + + "\t\"tokenizer\":{\"name\":\"whitespace\"},\n" + + "\t\"tokenizer\":{\"name\":\"whitespace\"},\n" + + "\t\"filters\":[{\"name\":\"porterstem\"}]\n" + + '}'), + row(1, list("johnni", "appl", "seedl")), + row(2, null)); + + assertInvalidThrowMessage("Function system.sai_analyze requires a non-null json_analyzer parameter (2nd argument)", + InvalidRequestException.class, + "SELECT sai_analyze(v, null) FROM %s"); + + assertInvalidThrowMessage("Function system.sai_analyze unable to analyze text=abc json_analyzer=def", + InvalidRequestException.class, + "SELECT sai_analyze('abc', 'def') FROM %s"); + } +} diff --git a/test/unit/org/apache/cassandra/index/sai/virtual/AnalyzerViewTest.java b/test/unit/org/apache/cassandra/index/sai/virtual/AnalyzerViewTest.java deleted file mode 100644 index a28d8fb23253..000000000000 --- a/test/unit/org/apache/cassandra/index/sai/virtual/AnalyzerViewTest.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sai.virtual; - -import com.google.common.collect.ImmutableList; -import org.junit.BeforeClass; -import org.junit.Test; - -import org.apache.cassandra.cql3.CQLTester; -import org.apache.cassandra.cql3.UntypedResultSet; -import org.apache.cassandra.db.virtual.VirtualKeyspace; -import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry; -import org.apache.cassandra.index.sai.SAITester; -import org.apache.cassandra.schema.SchemaConstants; - -import static org.junit.Assert.assertEquals; - -public class AnalyzerViewTest extends SAITester -{ - @BeforeClass - public static void setup() throws Exception - { - VirtualKeyspaceRegistry.instance.register(new VirtualKeyspace(SchemaConstants.VIRTUAL_VIEWS, ImmutableList.of(new AnalyzerView(SchemaConstants.VIRTUAL_VIEWS)))); - - CQLTester.setUpClass(); - } - - @Test - public void test() throws Throwable - { - UntypedResultSet results = execute("SELECT * FROM system_views.analyzer WHERE text = 'johnny apples seedlings'"+ - " AND json_analyzer = '{\n" + - "\t\"tokenizer\":{\"name\":\"whitespace\"},\n" + - "\t\"filters\":[{\"name\":\"porterstem\"}]\n" + - "}' ALLOW FILTERING"); - UntypedResultSet.Row row = results.one(); - String tokenized = row.getString("tokens"); - - assertEquals("[johnni, appl, seedl]", tokenized); - } -}