diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 60f0bc19..3de8f943 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -47,6 +47,11 @@ jobs: sudo apt-get update sudo apt-get install libcurl4-openssl-dev + - name: install libjson-c + run: | + sudo apt-get update + sudo apt-get install libjson-c-dev + - name: build run: make diff --git a/.github/workflows/makefile.yml b/.github/workflows/makefile.yml index 4b874364..d4677012 100644 --- a/.github/workflows/makefile.yml +++ b/.github/workflows/makefile.yml @@ -9,6 +9,9 @@ on: jobs: ubuntu: runs-on: ubuntu-latest + + env: + TRURL_JSON_IN: true strategy: matrix: @@ -30,7 +33,7 @@ jobs: - name: install libcurl run: | sudo apt-get update - sudo apt-get install libcurl4-openssl-dev ${{ matrix.build.install_packages }} + sudo apt-get install libcurl4-openssl-dev libjson-c-dev ${{ matrix.build.install_packages }} - name: code style check run: make checksrc @@ -47,13 +50,16 @@ jobs: cygwin: runs-on: windows-latest + env: + TRURL_JSON_IN: true + steps: - uses: actions/checkout@v4 - name: install cygwin uses: cygwin/cygwin-install-action@master with: - packages: curl, libcurl-devel, libcurl4, make, gcc-core, python39 + packages: curl, libcurl-devel, libcurl4, make, gcc-core, python39, libjson-c-devel - name: make run: make @@ -67,9 +73,21 @@ jobs: macos: runs-on: macos-latest + env: + TRURL_JSON_IN: true + steps: - uses: actions/checkout@v4 + - name: Install Homebrew + id: install-homebrew + uses: Homebrew/actions/setup-homebrew@master + + - name: Install json-c + run: | + brew install json-c + echo "JSON_C_PREFIX=$(brew --prefix)" >> "$GITHUB_ENV" + - name: make run: make diff --git a/Makefile b/Makefile index 4038456c..1befaa02 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,14 @@ CFLAGS += -Wconversion -Wmissing-prototypes -Wwrite-strings -Wsign-compare -Wno- ifndef NDEBUG CFLAGS += -g endif +ifdef TRURL_JSON_IN +CFLAGS += -DTRURL_JSON_IN -Wno-gnu +LDLIBS += 
-ljson-c +ifneq ($(strip $(JSON_C_PREFIX)),) +CFLAGS += -I$(JSON_C_PREFIX)/include +LDLIBS += -L$(JSON_C_PREFIX)/lib +endif +endif MANUAL = trurl.1 PREFIX ?= /usr/local diff --git a/README.md b/README.md index 5e3a1915..47e247fd 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,18 @@ trurl also uses `CURLUPART_ZONEID` added in libcurl 7.81.0 and It would certainly be possible to make trurl work with older libcurl versions if someone wanted to. +## Building with JSON input +To allow trurl to accept JSON input you must install libjson-c. JSON input is disabled by default, so it +must be enabled at compile time. +1. Install json-c + - On Ubuntu and Debian Linux it is called `libjson-c-dev`. + - On Cygwin it is called `libjson-c-devel`. + - On Homebrew it is called `json-c`. +2. Enable JSON input at compile time + - Ensure `TRURL_JSON_IN` is set in your environment. + - In your environment set `JSON_C_PREFIX` to the install path of json-c to include and link json-c from unexpected locations. This step is only required if json-c is installed somewhere gcc can't find it. 
+ + ### Older libcurls trurl builds with libcurl older than 7.81.0 but will then not work as diff --git a/testfiles/test0003.txt b/testfiles/test0003.txt new file mode 100644 index 00000000..7b8b712a --- /dev/null +++ b/testfiles/test0003.txt @@ -0,0 +1,66 @@ +[ + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }}, + { + "parts": { + "host": "example.com" + }} +] diff --git a/testfiles/test0004.txt b/testfiles/test0004.txt new file mode 100644 index 00000000..108fb9f0 --- /dev/null +++ b/testfiles/test0004.txt @@ -0,0 +1,31 @@ +[ + { + "parts": { + "scheme": "ftp", + "user": "scream", + "host": "url.com", + "path": "/", + "query": "ignoredquery", + "fragment": "a-fragment" + }, + "params": [ + { + "key": "query", + "value": "pair" + }, + { + "key": "singlequery", + } + ] + }, + { + "parts": { + "scheme": "http", + "host": "example.org", + "path": "/" + } + }, + { + "url": "example.com" + } +] diff --git a/tests.json b/tests.json index 844f9085..1e53ff23 100644 --- a/tests.json +++ b/tests.json @@ -2499,6 +2499,20 @@ "returncode": 0 } }, + { + "required": ["json-input"], + "input": { + "arguments": [ + "--json-file", + "testfiles/test0003.txt" + ] + }, + "expected": { + "returncode": 0, + "stderr": "", + "stdout": 
"http://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\nhttp://example.com/\n" + } + }, { "input": { "arguments": [ @@ -2544,5 +2558,49 @@ "stderr": "", "returncode": 0 } + }, + { + "required": ["json-input"], + "input": { + "arguments": [ + "-j", + "testfiles/test0001.txt" + ] + }, + "expected": { + "returncode": 13 , + "stderr": "trurl error: Cannot parse JSON, expected an array of objects.\ntrurl error: Try trurl -h for help\n", + "stdout": "" + } + }, + { + "required": ["json-input"], + "input": { + "arguments": [ + "-j", + "testfiles/test0004.txt" + ] + }, + "expected": { + "returncode": 0, + "stderr": "trurl note: ignoring 'query', provide a separate 'params' array.\ntrurl note: Required key \"parts\" not found in json object.\n", + "stdout": "ftp://scream@url.com/?query=pair&singlequery#a-fragment\nhttp://example.org/\n" + } + }, + { + "required": ["json-input"], + "input": { + "arguments": [ + "-j", + "testfiles/test0004.txt", + "--set", + "user=person" + ] + }, + "expected": { + "returncode": 0, + "stderr": "trurl note: ignoring 'query', provide a separate 'params' array.\ntrurl note: Required key \"parts\" not found in json object.\n", + "stdout": "ftp://person@url.com/?query=pair&singlequery#a-fragment\nhttp://person@example.org/\n" + } } ] diff --git a/trurl.1 b/trurl.1 index 2faad488..6bd07f2f 100644 --- a/trurl.1 +++ b/trurl.1 @@ -169,6 +169,16 @@ Set the component to multiple values and output the result once for each iteration. Several combined iterations are allowed to generate combinations, but only one \fI--iterate\fP option per component. The listed items to iterate over should be separated by single spaces. 
+.IP "-j, --json-file [file]" +Creates URLs based on a file which holds a JSON array. The --json-file option expects +an array of objects, each object containing a "parts" object and optionally a "params" +array. The keys of the "parts" object are equivalent to the options for --get or --set. +The only required key in the parts object is "host". The get/set option "query" is not +a valid key for the parts object. --json-file requires a separate "params" array where +each element of the array specifies a "key" and a "value". + +Like --url-file, [file] may either be a path to a file, or if the file name is '-' then +--json-file will read from standard input. .IP "--json" Outputs all set components of the URLs as JSON objects. All components of the URL that have data get populated in the parts object using their component @@ -457,6 +467,33 @@ A problem with --get A problem with --iterate .IP 12 A problem with --replace or --force-replace +.f1 +.IP "use JSON input for trurl" +.nf +$ echo ' + [ + { + "parts": { + "host": "example.com", + "path": "/a/path/to/index" + }, + "params": [ + { + "key": "foo", + "value": "bar" + } + ] + }, + { + "parts": { + "host": "curl.se", + "scheme": "ftp" + } + } + ]' | trurl -j - +http://example.com/a/path/to/index?foo=bar +ftp://curl.se/ +.fi .SH WWW https://curl.se/trurl .SH "SEE ALSO" diff --git a/trurl.c b/trurl.c index 38a49ed0..029f2d44 100644 --- a/trurl.c +++ b/trurl.c @@ -29,6 +29,10 @@ #include #include #include +#ifdef TRURL_JSON_IN +#include +#endif + #if defined(_MSC_VER) && (_MSC_VER < 1800) typedef enum { @@ -140,6 +144,7 @@ static const struct var variables[] = { #define ERROR_GET 10 /* bad --get syntax */ #define ERROR_ITER 11 /* bad --iterate syntax */ #define ERROR_REPL 12 /* a --replace problem */ +#define ERROR_JSON 13 /* a json string could not be parsed */ #ifndef SUPPORTS_URL_STRERROR /* provide a fake local mockup */ @@ -186,6 +191,7 @@ static void help(void) " -h, --help - this help\n" " --iterate 
[component]=[list] - create multiple URL outputs\n" " --json - output URL as JSON\n" + " -j, --json-file [file/-] - json input from file or stdin\n" " --keep-port - keep known default ports\n" " --no-guess-scheme - require scheme in URLs\n" " --punycode - encode hostnames in punycode\n" @@ -256,6 +262,9 @@ static void show_version(void) if(supports_puny) fprintf(stdout, " punycode2idn"); #endif +#ifdef TRURL_JSON_IN + fprintf(stdout, " json-input"); +#endif fprintf(stdout, "\n"); exit(0); @@ -296,6 +305,7 @@ struct option { bool end_of_options; bool quiet_warnings; bool force_replace; + bool json_in; /* -- stats -- */ unsigned int urls; @@ -639,6 +649,16 @@ static int getarg(struct option *o, o->force_replace = true; *usedarg = gap; } + else if(checkoptarg(o, "--json-file", flag, arg) || + checkoptarg(o, "-j", flag, arg)) { +#ifdef TRURL_JSON_IN + urlfile(o, arg); + *usedarg = gap; + o->json_in = true; +#else + trurl_warnf(o, "not built with support for JSON input."); +#endif + } else return 1; /* unrecognized option */ return 0; @@ -1620,6 +1640,201 @@ static void singleurl(struct option *o, curl_url_cleanup(uh); } +#ifdef TRURL_JSON_IN +#define JSON_INIT_SIZE 1024 +static void single_url_from_json(json_object *wholeurl, struct option *o) +{ + CURLU *uh = curl_url(); + /* Builds the URL described by one JSON object and passes it to singleurl. + * The query is made here 
from the "params" array, "key=value" or "key" + * joined by '&', instead of using a function like appendquery or + * addpqairs because those do it for all urls, not only the current url. */ + char *this_query = NULL; + size_t this_q_size = 0; + json_object *params = NULL; + bool scheme_set = false; + if(json_object_object_get_ex(wholeurl, "params", &params)) { + size_t params_length = json_object_array_length(params); + for(size_t j = 0; j < params_length; j++) { + json_object *param = json_object_array_get_idx(params, (int)j); + json_object *param_k_obj = NULL; + json_object *param_v_obj = NULL; + const char *param_k = NULL; + const char *param_v = NULL; + if(json_object_object_get_ex(param, "key", &param_k_obj)) + param_k = json_object_get_string(param_k_obj); + if(json_object_object_get_ex(param, "value", &param_v_obj)) + param_v = json_object_get_string(param_v_obj); + size_t value_length = param_v ? strlen(param_v) : 0; + size_t key_length = param_k ? strlen(param_k) : 0; + int set = value_length > 0 ? 1:0; + size_t qpair_len = key_length + value_length + set + 1; + char *qpair = calloc(qpair_len, sizeof(char)); + if(!qpair) + errorf(o, ERROR_MEM, "Out of memory"); + memcpy(qpair, param_k ? param_k : "", key_length); /* may be absent */ + if(value_length) { + qpair[key_length] = '='; + memcpy(qpair + key_length + 1, param_v, value_length); + } + this_query = realloc(this_query, this_q_size + qpair_len); + memcpy(this_query + this_q_size, qpair, qpair_len); + this_q_size += qpair_len; + this_query[this_q_size - 1] = '&'; + free(qpair); + } + } + if(this_q_size) { + this_query[this_q_size - 1] = '\0'; + const char *qss = "query:="; /* do not encode the url */ + char *query_set_str = malloc(sizeof(char) * (this_q_size + strlen(qss))); + memcpy(query_set_str, qss, strlen(qss)); + memcpy(query_set_str + strlen(qss), this_query, this_q_size); + setone(uh, query_set_str, o); + free(query_set_str); + free(this_query); + } + /* Get all other parts of the url info. 
*/ + json_object *parts = NULL; + json_object_object_get_ex(wholeurl, "parts", &parts); + if(!parts) { + trurl_warnf(o, "Required key \"parts\" not found in json object."); + curl_url_cleanup(uh); + return; + } + json_object_object_foreach(parts, key, field) { + if(!strcmp(key, "query")) { + trurl_warnf(o, "ignoring 'query', provide a separate 'params' array."); + continue; + } + /* Scheme is required to be set, so we need to ensure it is set */ + if(scheme_set != true && !strcmp(key, "scheme")) + scheme_set = true; + const char *val_str = field ? json_object_get_string(field) : ""; + size_t key_len = strlen(key); + size_t val_len = strlen(val_str); + /* a JSON null field is NULL, use "" then. +2 for '=' and terminator */ + char *set_str = malloc(val_len + key_len + 2); + memset(set_str, 0, val_len + key_len + 2); + memcpy(set_str, key, key_len); + memcpy(set_str + key_len + 1, val_str, val_len); + set_str[key_len] = '='; + setone(uh, set_str, o); + free(set_str); + } + if(!scheme_set) { + setone(uh, "scheme=http", o); + } + struct iterinfo iinfo; + memset(&iinfo, 0, sizeof(iinfo)); + iinfo.uh = uh; + singleurl(o, NULL, &iinfo, o->iter_list); + curl_url_cleanup(uh); +} +/* file is the stream which holds the json string. */ +/* Expects the file to contain a json array of objects. it will + * return urls as it finds them, treating the file as a stream. + * from_json only allocates as much space as required to parse + * the longest object in the array. 
*/ +static void from_json(FILE *file, struct option *o) +{ + char reading_buff[JSON_INIT_SIZE]; + size_t last_write = 0; + size_t json_buf_size = JSON_INIT_SIZE; + char *json_string = calloc(json_buf_size, sizeof(char)); + memset(reading_buff, 0, sizeof(char) * JSON_INIT_SIZE); + if(!json_string) { + free(json_string); + errorf(o, ERROR_MEM, "out of memory while reading JSON string."); + } + size_t i = 0; + int num_brackets = 0, prev_num_brackets = 0; + bool in_json_string = false; + bool in_array = false; + char current, previous = '\0'; + int reading = 0; + /* read the file one character at a time, cutting out each top level + * object of the array for json-c to parse. 'previous' is reset after an + * escaped backslash so that the quote in "a\\" still ends the string */ + while((reading = getc(file)) != EOF) { + current = (char)reading; + if((current == ' ' || current == '\t'|| current == '\r' + || current == '\n') && !in_json_string && !num_brackets) + continue; + /* detect top level json array */ + if(current == '[' && !in_json_string && !num_brackets) { + in_array = true; + } + /* trurl expects an array of objects - if we aren't in an object and we see + * non array characters then we error out. 
*/ + if(!num_brackets && !in_json_string && !in_array && current != ',' + && current != ']') { + free(json_string); + errorf(o, ERROR_JSON, + "Cannot parse JSON, expected an array of objects."); + } + if(current == '{' && !in_json_string) { + num_brackets++; + } + if(current == '"' && in_array && num_brackets) { + if(in_json_string && previous != '\\') + in_json_string = false; + else in_json_string = true; + } + /* only want to add to reading buff if we're in an object */ + if(num_brackets) { + reading_buff[i++] = current; + } + if(current == '}' && !in_json_string) { + num_brackets--; + } + /* when we've filled up the working buffer, copy it to heap */ + if(i == JSON_INIT_SIZE) { + /* we are reusing the json_string buffer for every url, so we + * only need to allocate more memory if the current url json + * string takes up more memory than any of the previous urls. */ + if(last_write + JSON_INIT_SIZE >= json_buf_size) { + json_buf_size += JSON_INIT_SIZE; + json_string = realloc(json_string, json_buf_size); + if(!json_string) { + errorf(o, ERROR_MEM, + "out of memory while reading JSON string."); + } + } + memcpy(json_string + last_write, reading_buff, JSON_INIT_SIZE); + last_write += JSON_INIT_SIZE; + memset(reading_buff, 0, sizeof(char) * JSON_INIT_SIZE); + i = 0; + } + /* when the number of bracket pairs has gone back down to + * zero, we know we have read a whole json object, this string + * is passed to json-c to parse it, then that parsed object is + * handed to single_url_from_json to print out a url */ + if(!num_brackets && prev_num_brackets == 1) { + /* anything that hasn't been copied over to json_string must + * be copied now */ + memcpy(json_string + last_write, reading_buff, JSON_INIT_SIZE); + json_object *jobj = json_tokener_parse(json_string); + if(!jobj) { + free(json_string); + errorf(o, ERROR_JSON, "Cannot parse JSON"); + } + single_url_from_json(jobj, o); + json_object_put(jobj); + memset(json_string, 0, json_buf_size); + 
memset(reading_buff, 0, sizeof(char) * JSON_INIT_SIZE); + i = 0; + last_write = 0; + } + previous = (previous == '\\' && current == '\\') ? '\0' : current; + prev_num_brackets = num_brackets; + } + free(json_string); +} +#endif + + + int main(int argc, const char **argv) { int exit_status = 0; @@ -1660,7 +1875,12 @@ int main(int argc, const char **argv) /* this is a file to read URLs from */ char buffer[4096]; /* arbitrary max */ bool end_of_file = false; - while(!end_of_file && fgets(buffer, sizeof(buffer), o.url)) { + if(o.json_in) { +#ifdef TRURL_JSON_IN + from_json(o.url, &o); +#endif + } + else while(!end_of_file && fgets(buffer, sizeof(buffer), o.url)) { char *eol = strchr(buffer, '\n'); if(eol && (eol > buffer)) { if(eol[-1] == '\r') @@ -1695,7 +1915,6 @@ int main(int argc, const char **argv) while((eol > buffer) && ((eol[-1] == ' ') || eol[-1] == '\t')) eol--; - if(eol > buffer) { /* if there is actual content left to deal with */ struct iterinfo iinfo; diff --git a/winbuild/vcpkg.json b/winbuild/vcpkg.json index 6ec27d75..62ca6266 100644 --- a/winbuild/vcpkg.json +++ b/winbuild/vcpkg.json @@ -2,6 +2,7 @@ "name": "trurl", "version": "0.x", "dependencies": [ - "curl" + "curl", + "json-c" ] }