Navigation Menu

Skip to content

Commit

Permalink
Add html_untag() function
Browse files Browse the repository at this point in the history
It strips HTML tags from the given HTML and outputs only the text.

TODO:
- Support attributes
- Support nested elements
  • Loading branch information
kou committed Jun 26, 2013
1 parent 88edbe4 commit 9d75c4d
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 0 deletions.
47 changes: 47 additions & 0 deletions lib/proc.c
Expand Up @@ -4011,6 +4011,50 @@ selector_sub_filter(grn_ctx *ctx, grn_obj *table, grn_obj *index,
return run_sub_filter(ctx, table, nargs - 1, args + 1, res, op);
}

/*
 * html_untag(html): copy the argument's bytes into a newly allocated
 * object, leaving out everything between a '<' and the next '>'.
 *
 * The scan is a two-state byte filter:
 *   - '<' switches to "inside a tag" and is not copied;
 *   - '>' switches back to "outside a tag" and is not copied (this also
 *     applies to a '>' that appears without a preceding '<');
 *   - any other byte is copied only while outside a tag.
 *
 * Exactly one argument is required; otherwise GRN_INVALID_ARGUMENT is
 * reported through ERR() and NULL is returned.  NULL is also returned
 * when GRN_PROC_ALLOC() yields no object.  The result object is
 * allocated with the same domain as the argument.
 */
static grn_obj *
func_html_untag(grn_ctx *ctx, int nargs, grn_obj **args,
                grn_user_data *user_data)
{
  grn_obj *source;
  grn_obj *stripped;
  const char *bytes;
  int n_bytes;
  int pos;
  grn_bool inside_tag;

  if (nargs != 1) {
    ERR(GRN_INVALID_ARGUMENT, "HTML is missing");
    return NULL;
  }

  /* TODO: type check -- the argument is read as text without validation. */
  source = args[0];

  stripped = GRN_PROC_ALLOC(source->header.domain, 0);
  if (!stripped) {
    return NULL;
  }

  bytes = GRN_TEXT_VALUE(source);
  n_bytes = GRN_TEXT_LEN(source);
  inside_tag = GRN_FALSE;

  for (pos = 0; pos < n_bytes; pos++) {
    const char current = bytes[pos];

    if (current == '<') {
      /* Tag opener: start skipping, and drop the delimiter itself. */
      inside_tag = GRN_TRUE;
    } else if (current == '>') {
      /* Tag closer: resume copying from the next byte. */
      inside_tag = GRN_FALSE;
    } else if (!inside_tag) {
      GRN_TEXT_PUTC(ctx, stripped, current);
    }
  }

  return stripped;
}

#define DEF_VAR(v,name_str) do {\
(v).name = (name_str);\
(v).name_size = GRN_STRLEN(name_str);\
Expand Down Expand Up @@ -4217,4 +4261,7 @@ grn_db_init_builtin_query(grn_ctx *ctx)
func_sub_filter, NULL, NULL, 0, NULL);
grn_proc_set_selector(ctx, selector_proc, selector_sub_filter);
}

grn_proc_create(ctx, "html_untag", -1, GRN_PROC_FUNCTION,
func_html_untag, NULL, NULL, 0, NULL);
}
11 changes: 11 additions & 0 deletions test/command/suite/select/function/html_untag/simple.expected
@@ -0,0 +1,11 @@
table_create Entries TABLE_NO_KEY
[[0,0.0,0.0],true]
column_create Entries content COLUMN_SCALAR Text
[[0,0.0,0.0],true]
load --table Entries
[
{"content": "I <em>am</em> a boy."}
]
[[0,0.0,0.0],1]
select Entries --output_columns "html_untag(content)" --command_version 2
[[0,0.0,0.0],[[[1],[["html_untag","null"]],["I am a boy."]]]]
11 changes: 11 additions & 0 deletions test/command/suite/select/function/html_untag/simple.test
@@ -0,0 +1,11 @@
table_create Entries TABLE_NO_KEY
column_create Entries content COLUMN_SCALAR Text

load --table Entries
[
{"content": "I <em>am</em> a boy."}
]

select Entries \
--output_columns "html_untag(content)" \
--command_version 2

0 comments on commit 9d75c4d

Please sign in to comment.