Skip to content

Commit

Permalink
Add a sample query expander QueryExpanderTSV
Browse files Browse the repository at this point in the history
It reads synonyms from a TSV-format file. The synonyms file should have
the following contents:

  key[TAB]synonym1[TAB]synonym2[TAB]...

For example:

  rroonga[TAB]rroonga[TAB]Ruby groonga

With the above synonyms, --query rroonga is expanded to
--query "((rroonga) OR (Ruby groonga))".

TODO:
  * Support magic comment at the head.
  * Install the default synonyms file as /etc/groonga/synonyms.tsv.
  • Loading branch information
kou committed Oct 11, 2012
1 parent b1fe08c commit d1b00b8
Show file tree
Hide file tree
Showing 11 changed files with 378 additions and 4 deletions.
14 changes: 14 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ AC_CONFIG_FILES([
plugins/tokenizers/Makefile
plugins/suggest/Makefile
plugins/table/Makefile
plugins/query_expanders/Makefile
examples/Makefile
examples/dictionary/Makefile
examples/dictionary/edict/Makefile
Expand Down Expand Up @@ -1159,6 +1160,9 @@ AC_SUBST(expanded_pluginsdir)
tokenizers_pluginsdir="\${pluginsdir}/tokenizers"
AC_SUBST(tokenizers_pluginsdir)

# Install directory for query expander plugins. The "$" is escaped so that
# the reference to pluginsdir is kept literally in the substituted value
# instead of being expanded by the shell while configure runs.
query_expanders_pluginsdir="\${pluginsdir}/query_expanders"
AC_SUBST(query_expanders_pluginsdir)

suggest_pluginsdir="\${pluginsdir}/suggest"
AC_SUBST(suggest_pluginsdir)

Expand All @@ -1175,6 +1179,16 @@ if test -z "$suffix"; then
fi
AC_DEFINE_UNQUOTED(GRN_PLUGIN_SUFFIX, ["$suffix"], "plugin suffix")

# for query expanders
#
# GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE is the synonyms file path
# relative to the install base directory (used by the WIN32 code path of
# the TSV query expander).
GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE="synonyms.tsv"
AC_DEFINE_UNQUOTED(GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE,
                   ["$GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE"],
                   "The relative synonyms file for TSV query expander")
# GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE is the absolute default path.
# pkgdatadir is normally defined only by Automake inside the generated
# Makefiles, so it is empty while configure runs and the path would become
# "/synonyms.tsv". Expand the equivalent "datadir/PACKAGE" by hand instead.
# datadir refers to datarootdir, which refers to prefix, hence two rounds
# of eval. prefix and exec_prefix are still "NONE" at this point when the
# user did not pass them, so the defaults are applied temporarily.
# NOTE(review): assumes PACKAGE is set by AM_INIT_AUTOMAKE before this
# point - confirm.
grn_saved_prefix="$prefix"
grn_saved_exec_prefix="$exec_prefix"
test "x$prefix" = "xNONE" && prefix="$ac_default_prefix"
test "x$exec_prefix" = "xNONE" && exec_prefix="$prefix"
grn_expanded_datadir=`eval echo "$datadir"`
grn_expanded_datadir=`eval echo "$grn_expanded_datadir"`
prefix="$grn_saved_prefix"
exec_prefix="$grn_saved_exec_prefix"
GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE="${grn_expanded_datadir}/${PACKAGE}/${GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE}"
AC_DEFINE_UNQUOTED(GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE,
                   ["$GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE"],
                   "The default synonyms file for TSV query expander")

# for examples
examplesdir="\$(pkgdatadir)/examples"
AC_SUBST(examplesdir)
Expand Down
1 change: 1 addition & 0 deletions plugins/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@
add_subdirectory(suggest)
add_subdirectory(tokenizers)
add_subdirectory(table)
add_subdirectory(query_expanders)
9 changes: 5 additions & 4 deletions plugins/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
SUBDIRS = \
tokenizers \
suggest \
table
SUBDIRS = \
tokenizers \
suggest \
table \
query_expanders

EXTRA_DIST = \
CMakeLists.txt
27 changes: 27 additions & 0 deletions plugins/query_expanders/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright(C) 2012 Brazil
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 2.1 as published by the Free Software Foundation.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# Build and install the TSV query expander plugin.

# tsv.c includes groonga's internal headers (<str.h>, <db.h>, <util.h>);
# presumably they live in lib/ of the source tree - confirm.
include_directories(
  "${CMAKE_SOURCE_DIR}/lib"
)

# Plugins of this kind are installed below the common plugins directory.
set(QUERY_EXPANDERS_DIR "${GRN_PLUGINS_DIR}/query_expanders")

# read_file_list() is a project helper defined elsewhere; it is given the
# Automake source list so both build systems share one list of sources.
# Paths are quoted so that they are always passed as a single argument.
read_file_list("${CMAKE_CURRENT_SOURCE_DIR}/tsv_sources.am" TSV_SOURCES)

# MODULE: a plugin that is loaded at run time, not linked by other targets.
add_library(tsv_query_expander MODULE ${TSV_SOURCES})

# Name the produced file "tsv" plus the platform module suffix, without
# the usual "lib" prefix.
set_target_properties(tsv_query_expander PROPERTIES
  PREFIX ""
  OUTPUT_NAME "tsv")
target_link_libraries(tsv_query_expander libgroonga)
install(TARGETS tsv_query_expander DESTINATION "${QUERY_EXPANDERS_DIR}")
20 changes: 20 additions & 0 deletions plugins/query_expanders/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# CMakeLists.txt is not built by Automake; list it so it is distributed.
EXTRA_DIST = \
	CMakeLists.txt

# Header search paths. AM_CPPFLAGS is used instead of INCLUDES, which
# Automake documents as a deprecated older name for the same thing.
AM_CPPFLAGS = \
	-I$(top_builddir) \
	-I$(top_srcdir)/include \
	-I$(top_srcdir)/lib

# Libtool flags for a loadable plugin: build a module, do not add version
# numbers to the file name, and do not allow undefined symbols.
AM_LDFLAGS = \
	-avoid-version \
	-module \
	-no-undefined

# Link the plugin against the in-tree libgroonga.
LIBS = \
	$(top_builddir)/lib/libgroonga.la

# Installed into $(query_expanders_pluginsdir), which configure substitutes.
query_expanders_plugins_LTLIBRARIES =
query_expanders_plugins_LTLIBRARIES += tsv.la

# Defines tsv_la_SOURCES; kept in a separate file that the CMake build
# also reads.
include tsv_sources.am
230 changes: 230 additions & 0 deletions plugins/query_expanders/tsv.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
/* -*- c-basic-offset: 2 -*- */
/* Copyright(C) 2012 Brazil
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 2.1 as published by the Free Software Foundation.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include <groonga/plugin.h>

/* groonga's internal headers: They should be removed. */
/* for grn_text_fgets() */
#include <str.h>
/* for GRN_PROC_ALLOC() */
#include <db.h>
/* for grn_win32_base_dir() */
#include <util.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Size in bytes of every value slot in the synonyms hash: it is passed to
   grn_hash_create() as the value size in GRN_PLUGIN_INIT(), and expanded
   queries are limited to MAX_SYNONYM_BYTES - 1 bytes plus a terminating
   NUL when they are stored. */
#define MAX_SYNONYM_BYTES 4096

/* Synonyms table of the plugin. Keys are the terms read from the synonyms
   file and values are the NUL-terminated expanded queries. It is created
   in GRN_PLUGIN_INIT() and closed in GRN_PLUGIN_FIN(). */
static grn_hash *synonyms = NULL;

#ifdef WIN32
/* Path of the system synonyms file, built on the first call of
   get_system_synonyms_file() and reused afterwards. */
static char *win32_synonyms_file = NULL;

/*
 * Returns the path of the system synonyms file.
 *
 * On Windows the path is "<base dir>/<relative synonyms file>", where the
 * base directory comes from grn_win32_base_dir() and the relative part is
 * GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE (the macro name defined by
 * configure). The returned string must not be freed by the caller.
 *
 * If memory for the path can't be allocated, the relative path itself is
 * returned so that callers always get a usable, non-NULL string.
 */
const char *
get_system_synonyms_file(void)
{
  if (!win32_synonyms_file) {
    const char *base_dir;
    const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE;
    char *synonyms_file;
    size_t base_dir_length;

    base_dir = grn_win32_base_dir();
    base_dir_length = strlen(base_dir);
    /* base dir + "/" + relative path + terminating NUL */
    synonyms_file =
      malloc(base_dir_length + strlen("/") + strlen(relative_path) + 1);
    if (!synonyms_file) {
      /* Don't cache the fallback: a later call may succeed. */
      return relative_path;
    }
    strcpy(synonyms_file, base_dir);
    strcat(synonyms_file, "/");
    strcat(synonyms_file, relative_path);
    win32_synonyms_file = synonyms_file;
  }
  return win32_synonyms_file;
}

#else /* WIN32 */
/*
 * Returns the path of the system synonyms file, which is fixed at build
 * time by GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE on non-Windows platforms.
 */
const char *
get_system_synonyms_file(void)
{
  return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE;
}
#endif /* WIN32 */

/* Tells whether `character` is '#', the character that marks a line of the
   synonyms file as a comment when it is the first character of the line. */
static inline grn_bool
is_comment_mark(char character)
{
  const char comment_mark = '#';

  return comment_mark == character;
}

/*
 * Parses one line of the synonyms file and registers it in `synonyms`.
 *
 * The expected format is "key<TAB>synonym1<TAB>synonym2...". The stored
 * value is the query "((synonym1) OR (synonym2) ...)".
 *
 *   line, line_length: the line to parse. A trailing "\n" or "\r\n" is
 *                      ignored so that it doesn't become part of the
 *                      last synonym.
 *   key, value:        scratch buffers; the caller is expected to pass
 *                      them empty. They hold the parsed key and the built
 *                      query on return.
 *
 * Nothing is registered for empty lines, for lines whose first character
 * is the comment mark and for lines that have no synonym after the key.
 * Queries longer than MAX_SYNONYM_BYTES - 1 bytes are cut at that size.
 * NOTE(review): a cut query may end in the middle of a synonym and lose
 * its closing parentheses - confirm whether such lines should be rejected
 * instead.
 */
static void
parse_synonyms_file_line(grn_ctx *ctx, const char *line, int line_length,
                         grn_obj *key, grn_obj *value)
{
  size_t length;
  size_t i = 0;

  /* Never look at line[0] of an empty line. */
  if (!line || line_length <= 0) {
    return;
  }
  length = (size_t)line_length;

  if (is_comment_mark(line[0])) {
    return;
  }

  /* Drop the line terminator if the reader left it at the end. */
  if (line[length - 1] == '\n') {
    length--;
    if (length > 0 && line[length - 1] == '\r') {
      length--;
    }
  }

  /* Everything up to the first tab is the key. */
  while (i < length) {
    char character = line[i];
    i++;
    if (character == '\t') {
      break;
    }
    GRN_TEXT_PUTC(ctx, key, character);
  }

  /* No synonym follows the key: nothing to register. */
  if (i == length) {
    return;
  }

  /* Each following tab separates two synonyms. */
  GRN_TEXT_PUTS(ctx, value, "((");
  while (i < length) {
    char character = line[i];
    i++;
    if (character == '\t') {
      GRN_TEXT_PUTS(ctx, value, ") OR (");
    } else {
      GRN_TEXT_PUTC(ctx, value, character);
    }
  }
  GRN_TEXT_PUTS(ctx, value, "))");

  {
    grn_id id;
    void *value_location = NULL;

    id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key),
                      &value_location, NULL);
    if (id == GRN_ID_NIL || !value_location) {
      GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
                     "[plugin][query-expander][tsv] "
                     "failed to register key: <%.*s>",
                     (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key));
      return;
    }

    /* Only shrink queries that are too long. Shorter ones must keep their
       real length so that the NUL lands right after the text and only
       bytes that were actually written are copied. */
    if (GRN_TEXT_LEN(value) > MAX_SYNONYM_BYTES - 1) {
      grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1);
    }
    GRN_TEXT_PUTC(ctx, value, '\0');
    memcpy(value_location, GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value));
  }
}

/*
 * Loads the synonyms file into `synonyms`.
 *
 * The file named by the GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE environment
 * variable is used when the variable is set, the system synonyms file
 * otherwise. A warning is logged and nothing is loaded when the file
 * can't be opened. Each line is handed to parse_synonyms_file_line().
 */
static void
load_synonyms(grn_ctx *ctx)
{
  const char *synonyms_path = getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE");
  FILE *input;
  grn_obj line_buffer, key_buffer, value_buffer;

  if (!synonyms_path) {
    synonyms_path = get_system_synonyms_file();
  }

  input = fopen(synonyms_path, "r");
  if (!input) {
    GRN_LOG(ctx, GRN_LOG_WARNING,
            "[plugin][query-expander][tsv] "
            "synonyms file doesn't exist: <%s>",
            synonyms_path);
    return;
  }

  GRN_TEXT_INIT(&line_buffer, 0);
  GRN_TEXT_INIT(&key_buffer, 0);
  GRN_TEXT_INIT(&value_buffer, 0);
  grn_bulk_reserve(ctx, &value_buffer, MAX_SYNONYM_BYTES);

  for (;;) {
    if (grn_text_fgets(ctx, &line_buffer, input) != GRN_SUCCESS) {
      break;
    }
    /* The key and value buffers are reused for every line. */
    GRN_BULK_REWIND(&key_buffer);
    GRN_BULK_REWIND(&value_buffer);
    parse_synonyms_file_line(ctx,
                             GRN_TEXT_VALUE(&line_buffer),
                             GRN_TEXT_LEN(&line_buffer),
                             &key_buffer, &value_buffer);
    /* Start the next read from an empty line buffer. */
    GRN_BULK_REWIND(&line_buffer);
  }

  GRN_OBJ_FIN(ctx, &line_buffer);
  GRN_OBJ_FIN(ctx, &key_buffer);
  GRN_OBJ_FIN(ctx, &value_buffer);

  fclose(input);
}

/*
 * Implementation of the QueryExpanderTSV function.
 *
 *   args[0]: the term to expand.
 *   args[1]: text buffer that the expanded query is appended to.
 *
 * Returns an Int32 object allocated with GRN_PROC_ALLOC() (or NULL when
 * that allocation fails) that holds:
 *   GRN_SUCCESS          - the term was found and args[1] was extended,
 *   GRN_END_OF_DATA      - the term has no registered synonyms,
 *   GRN_INVALID_ARGUMENT - fewer than two arguments were given.
 */
static grn_obj *
func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args,
                        grn_user_data *user_data)
{
  grn_rc rc = GRN_END_OF_DATA;
  grn_obj *rc_object;

  if (nargs < 2 || !args || !args[0] || !args[1]) {
    /* The function may be called with any number of arguments: don't
       read arguments that weren't passed. */
    rc = GRN_INVALID_ARGUMENT;
  } else if (synonyms) {
    grn_obj *term = args[0];
    grn_obj *expanded_term = args[1];
    void *value = NULL;
    grn_id id;

    id = grn_hash_get(ctx, synonyms,
                      GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term),
                      &value);
    if (id != GRN_ID_NIL && value) {
      /* Values are stored as NUL-terminated strings. */
      const char *query = value;
      GRN_TEXT_PUTS(ctx, expanded_term, query);
      rc = GRN_SUCCESS;
    }
  }

  rc_object = GRN_PROC_ALLOC(GRN_DB_INT32, 0);
  if (rc_object) {
    GRN_INT32_SET(ctx, rc_object, rc);
  }

  return rc_object;
}

/*
 * Plugin initialization: creates the synonyms hash and fills it from the
 * synonyms file, unless the hash already exists. Returns ctx->rc.
 */
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  if (synonyms) {
    /* Already initialized: keep the loaded synonyms. */
    return ctx->rc;
  }

  synonyms = grn_hash_create(ctx, NULL,
                             GRN_TABLE_MAX_KEY_SIZE,
                             MAX_SYNONYM_BYTES,
                             GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE);
  if (synonyms) {
    load_synonyms(ctx);
  }
  return ctx->rc;
}

/*
 * Plugin registration: registers func_query_expander_tsv() as the
 * function named "QueryExpanderTSV". Always returns GRN_SUCCESS.
 */
grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  static const char proc_name[] = "QueryExpanderTSV";

  /* sizeof() counts the terminating NUL, the name length doesn't. */
  grn_proc_create(ctx, proc_name, sizeof(proc_name) - 1,
                  GRN_PROC_FUNCTION,
                  func_query_expander_tsv, NULL, NULL,
                  0, NULL);
  return GRN_SUCCESS;
}

/*
 * Plugin finalization: releases what the plugin allocated, that is the
 * synonyms hash and, on Windows, the cached synonyms file path. Pointers
 * are reset so that a later GRN_PLUGIN_INIT() starts from scratch.
 * Always returns GRN_SUCCESS.
 */
grn_rc
GRN_PLUGIN_FIN(grn_ctx *ctx)
{
  if (synonyms) {
    grn_hash_close(ctx, synonyms);
    synonyms = NULL;
  }
#ifdef WIN32
  /* Allocated with malloc() by get_system_synonyms_file(). */
  if (win32_synonyms_file) {
    free(win32_synonyms_file);
    win32_synonyms_file = NULL;
  }
#endif /* WIN32 */
  return GRN_SUCCESS;
}
2 changes: 2 additions & 0 deletions plugins/query_expanders/tsv_sources.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
tsv_la_SOURCES = \
tsv.c
2 changes: 2 additions & 0 deletions test/command/fixture/query_expander/tsv/synonyms.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
rroonga rroonga Ruby groonga
3 changes: 3 additions & 0 deletions test/command/run-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ export GROONGA_SUGGEST_CREATE_DATASET
GRN_PLUGINS_DIR="$top_dir/plugins"
export GRN_PLUGINS_DIR

# Make the TSV query expander plugin load the synonyms of the test fixture:
# the plugin reads this environment variable first and only falls back to
# the system synonyms file when it is not set.
GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE="$top_dir/test/command/fixture/query_expander/tsv/synonyms.tsv"
export GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE

case `uname` in
Darwin)
DYLD_LIBRARY_PATH="$top_dir/lib/.libs:$DYLD_LIBRARY_PATH"
Expand Down
Loading

0 comments on commit d1b00b8

Please sign in to comment.