Permalink
Browse files

Add option to force input to be read as UTF-8

  • Loading branch information...
1 parent 4228b7a commit 4d4ec4fd95763b8cdf20898366b224a8cbc6b8a7 @fizx committed Dec 24, 2009
Showing with 16 additions and 8 deletions.
  1. +1 −0 Makefile.am
  2. +1 −0 Makefile.in
  3. +6 −4 parsley.c
  4. +2 −1 parsley.h
  5. +5 −1 parsley_main.c
  6. +1 −1 util.c
  7. +0 −1 xml2json.c
View
@@ -30,6 +30,7 @@ bench:
check-am:
@echo "ambiguous..."; ./parsley test/ambiguous.let test/ambiguous.html 2>&1 | diff test/ambiguous.json - && echo " success."
+ @echo "unicode..."; ./parsley test/unicode.let test/unicode.html 2>&1 | diff test/unicode.json - && echo " success."
@echo "contains..."; ./parsley test/contains.let test/contains.html 2>&1 | diff test/contains.json - && echo " success."
@echo "math_ambiguity..."; ./parsley test/math_ambiguity.let test/math_ambiguity.html 2>&1 | diff test/math_ambiguity.json - && echo " success."
@echo "content..."; ./parsley test/content.let test/content.html 2>&1 | diff test/content.json - && echo " success."
View
@@ -804,6 +804,7 @@ bench:
check-am:
@echo "ambiguous..."; ./parsley test/ambiguous.let test/ambiguous.html 2>&1 | diff test/ambiguous.json - && echo " success."
+ @echo "unicode..."; ./parsley test/unicode.let test/unicode.html 2>&1 | diff test/unicode.json - && echo " success."
@echo "contains..."; ./parsley test/contains.let test/contains.html 2>&1 | diff test/contains.json - && echo " success."
@echo "math_ambiguity..."; ./parsley test/math_ambiguity.let test/math_ambiguity.html 2>&1 | diff test/math_ambiguity.json - && echo " success."
@echo "content..."; ./parsley test/content.let test/content.html 2>&1 | diff test/content.json - && echo " success."
View
@@ -95,17 +95,18 @@ static parsedParsleyPtr parse_error(char* format, ...) {
parsedParsleyPtr parsley_parse_file(parsleyPtr parsley, char* file, int flags) {
xmlSetGenericErrorFunc(NULL , parsleyXsltError);
bool html = flags & PARSLEY_OPTIONS_HTML;
+ char * encoding = flags & PARSLEY_OPTIONS_FORCE_UTF8 ? "UTF-8" : NULL;
if(html) {
htmlParserCtxtPtr htmlCtxt = htmlNewParserCtxt();
- htmlDocPtr html = htmlCtxtReadFile(htmlCtxt, file, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
+ htmlDocPtr html = htmlCtxtReadFile(htmlCtxt, file, encoding, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
htmlFreeParserCtxt(htmlCtxt);
if(html == NULL) return parse_error("Couldn't parse file: %s\n", file);
parsedParsleyPtr out = parsley_parse_doc(parsley, html, flags);
xmlFreeDoc(html);
return out;
} else {
xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
- xmlDocPtr xml = xmlCtxtReadFile(ctxt, file, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
+ xmlDocPtr xml = xmlCtxtReadFile(ctxt, file, encoding, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
xmlFreeParserCtxt(ctxt);
if(xml == NULL) return parse_error("Couldn't parse file: %s\n", file);
parsedParsleyPtr out = parsley_parse_doc(parsley, xml, flags);
@@ -117,17 +118,18 @@ parsedParsleyPtr parsley_parse_file(parsleyPtr parsley, char* file, int flags) {
parsedParsleyPtr parsley_parse_string(parsleyPtr parsley, char* string, size_t size, char* base_uri, int flags) {
xmlSetGenericErrorFunc(NULL , parsleyXsltError);
bool html = flags & PARSLEY_OPTIONS_HTML;
+ char * encoding = flags & PARSLEY_OPTIONS_FORCE_UTF8 ? "UTF-8" : NULL;
if(base_uri == NULL) base_uri = "http://parselets.com/in-memory-string";
if(html) {
htmlParserCtxtPtr htmlCtxt = htmlNewParserCtxt();
- htmlDocPtr html = htmlCtxtReadMemory(htmlCtxt, string, size, base_uri, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
+ htmlDocPtr html = htmlCtxtReadMemory(htmlCtxt, string, size, base_uri, encoding, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
if(html == NULL) return parse_error("Couldn't parse string");
parsedParsleyPtr out = parsley_parse_doc(parsley, html, flags);
xmlFreeDoc(html);
return out;
} else {
xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
- xmlDocPtr xml = xmlCtxtReadMemory(ctxt, string, size, base_uri, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
+ xmlDocPtr xml = xmlCtxtReadMemory(ctxt, string, size, base_uri, encoding, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
if(xml == NULL) return parse_error("Couldn't parse string");
parsedParsleyPtr out = parsley_parse_doc(parsley, xml, flags);
xmlFreeDoc(xml);
View
@@ -64,7 +64,8 @@ enum {
PARSLEY_OPTIONS_ALLOW_NET = 4,
PARSLEY_OPTIONS_ALLOW_LOCAL = 8,
PARSLEY_OPTIONS_COLLATE = 16,
- PARSLEY_OPTIONS_SGWRAP = 32
+ PARSLEY_OPTIONS_SGWRAP = 32,
+ PARSLEY_OPTIONS_FORCE_UTF8 = 64
};
typedef parsley_context * contextPtr;
View
@@ -46,6 +46,7 @@ static struct argp_option options[] = {
{"no-collate", 'N', 0, 0, "Don't collate array entries" },
{"sg-wrap", 's', 0, 0, "Wrap text nodes for SelectorGadget compatibility" },
{"user-agent", 'U', "USER_AGENT", 0, "Value of HTTP User-Agent header" },
+ {"utf8", 'u', 0, 0, "Force input to be read as UTF-8" },
{"no-net", 'z', 0, 0, "Disable ftp and http access for parselets" },
{"no-filesystem", 'Z', 0, 0, "Disable filesystem access for parselets" },
{ 0 }
@@ -62,6 +63,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state)
case 'x':
arguments->flags &= ~PARSLEY_OPTIONS_HTML;
break;
+ case 'u':
+ arguments->flags |= PARSLEY_OPTIONS_FORCE_UTF8;
+ break;
case 'U':
parsley_set_user_agent(arg);
case 'n':
@@ -121,7 +125,7 @@ int main (int argc, char **argv) {
struct list_elem *elemptr = &elem;
elem.has_next = 0;
arguments.output_xml = 0;
- arguments.flags = ~0 & ~PARSLEY_OPTIONS_SGWRAP;
+ arguments.flags = ~0 & ~PARSLEY_OPTIONS_SGWRAP & ~PARSLEY_OPTIONS_FORCE_UTF8;
arguments.include_files = elemptr;
arguments.output_file = "-";
argp_parse (&argp, argc, argv, 0, 0, &arguments);
View
@@ -69,7 +69,7 @@ _parsley_set_user_agent(char * agent) {
static void *
xmlUserAgentIOHTTPOpen(const char * file_name) {
- return(xmlNanoHTTPMethod(file_name, NULL, NULL, NULL, parsley_user_agent_header, 0));
+ return (void *)(xmlNanoHTTPMethod(file_name, NULL, NULL, NULL, parsley_user_agent_header, 0));
}
void
View
@@ -31,7 +31,6 @@ static struct json_object * _xml2json(xmlNodePtr xml) {
}
break;
case XML_TEXT_NODE:
- // json_object_put(json);
json = json_object_new_string(xml->content);
break;
}

0 comments on commit 4d4ec4f

Please sign in to comment.