From a1ba91bd9e02604a6ebcae0d2afb0b8310cb38b8 Mon Sep 17 00:00:00 2001 From: Colomban Wendling Date: Mon, 4 Dec 2017 21:03:42 -0800 Subject: [PATCH 1/7] Avoid weird length value Don't include the extra NUL in the length itself as it's actually not part of the data. Instead simply add it when necessary. --- src/document.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/document.c b/src/document.c index 12edff277e..2f084920c5 100644 --- a/src/document.c +++ b/src/document.c @@ -1876,7 +1876,7 @@ static gsize save_convert_to_encoding(GeanyDocument *doc, gchar **data, gsize *l g_return_val_if_fail(len != NULL, FALSE); /* try to convert it from UTF-8 to original encoding */ - conv_file_contents = g_convert(*data, *len - 1, doc->encoding, "UTF-8", + conv_file_contents = g_convert(*data, *len, doc->encoding, "UTF-8", &bytes_read, &conv_len, &conv_error); if (conv_error != NULL) @@ -1892,7 +1892,7 @@ _("An error occurred while converting the file from UTF-8 in \"%s\". 
The file re gint context_len; gunichar unic; /* don't read over the doc length */ - gint max_len = MIN((gint)bytes_read + 6, (gint)*len - 1); + gint max_len = MIN((gint)bytes_read + 6, (gint)*len); gchar context[7]; /* read 6 bytes from Sci + '\0' */ sci_get_text_range(doc->editor->sci, bytes_read, max_len, context); @@ -2147,22 +2147,22 @@ gboolean document_save_file(GeanyDocument *doc, gboolean force) /* notify plugins which may wish to modify the document before it's saved */ g_signal_emit_by_name(geany_object, "document-before-save", doc); - len = sci_get_length(doc->editor->sci) + 1; + len = sci_get_length(doc->editor->sci); if (doc->has_bom && encodings_is_unicode_charset(doc->encoding)) { /* always write a UTF-8 BOM because in this moment the text itself is still in UTF-8 * encoding, it will be converted to doc->encoding below and this conversion * also changes the BOM */ - data = (gchar*) g_malloc(len + 3); /* 3 chars for BOM */ + data = (gchar*) g_malloc(len + 3 + 1); /* 3 chars for BOM */ data[0] = (gchar) 0xef; data[1] = (gchar) 0xbb; data[2] = (gchar) 0xbf; - sci_get_text(doc->editor->sci, len, data + 3); + sci_get_text(doc->editor->sci, len + 1, data + 3); len += 3; } else { - data = (gchar*) g_malloc(len); - sci_get_text(doc->editor->sci, len, data); + data = (gchar*) g_malloc(len + 1); + sci_get_text(doc->editor->sci, len + 1, data); } /* save in original encoding, skip when it is already UTF-8 or has the encoding "None" */ From a70456c5627f95b63e5b98cd1f50c5c97ec26e5b Mon Sep 17 00:00:00 2001 From: Colomban Wendling Date: Mon, 4 Dec 2017 21:35:40 -0800 Subject: [PATCH 2/7] Add sci_set_text_with_length() Like sci_set_text() but allows for embedded NUL bytes. 
--- src/sciwrappers.c | 10 ++++++++++ src/sciwrappers.h | 2 ++ 2 files changed, 12 insertions(+) diff --git a/src/sciwrappers.c b/src/sciwrappers.c index eb9a8e8eb0..8e51b2614e 100644 --- a/src/sciwrappers.c +++ b/src/sciwrappers.c @@ -241,6 +241,16 @@ void sci_set_text(ScintillaObject *sci, const gchar *text) } +/* Sets all text, allowing for embedded NUL bytes */ +void sci_set_text_with_length(ScintillaObject *sci, const gchar *text, gsize len) +{ + sci_start_undo_action(sci); + sci_clear_all(sci); + SSM(sci, SCI_ADDTEXT, len, (sptr_t) text); + sci_end_undo_action(sci); +} + + gboolean sci_can_undo(ScintillaObject *sci) { return SSM(sci, SCI_CANUNDO, 0, 0) != FALSE; diff --git a/src/sciwrappers.h b/src/sciwrappers.h index c7f61abca1..5898007a4d 100644 --- a/src/sciwrappers.h +++ b/src/sciwrappers.h @@ -111,6 +111,8 @@ void sci_get_text_range (ScintillaObject *sci, gint start, gint end, gchar #ifdef GEANY_PRIVATE +void sci_set_text_with_length (ScintillaObject *sci, const gchar *text, gsize len); + gchar* sci_get_string (ScintillaObject *sci, guint msg, gulong wParam); void sci_set_line_numbers (ScintillaObject *sci, gboolean set); From c4606a139a029d56aa0a715cab8959f91e8bd83b Mon Sep 17 00:00:00 2001 From: Colomban Wendling Date: Mon, 4 Dec 2017 21:36:24 -0800 Subject: [PATCH 3/7] Don't cut the loaded data at the first NUL when passing it to Scintilla --- src/document.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/document.c b/src/document.c index 2f084920c5..6987e9185b 100644 --- a/src/document.c +++ b/src/document.c @@ -1382,7 +1382,7 @@ GeanyDocument *document_open_file_full(GeanyDocument *doc, const gchar *filename /* add the text to the ScintillaObject */ sci_set_readonly(doc->editor->sci, FALSE); /* to allow replacing text */ - sci_set_text(doc->editor->sci, filedata.data); /* NULL terminated data */ + sci_set_text_with_length(doc->editor->sci, filedata.data, filedata.len); queue_colourise(doc); /* Ensure the document gets 
colourised. */ /* detect & set line endings */ From 168476b866e2575ec751281239aecb68562af360 Mon Sep 17 00:00:00 2001 From: Colomban Wendling Date: Mon, 4 Dec 2017 21:37:42 -0800 Subject: [PATCH 4/7] Don't cut UTF-8 documents at the first NUL byte when saving them --- src/document.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/document.c b/src/document.c index 6987e9185b..80bdaa74dc 100644 --- a/src/document.c +++ b/src/document.c @@ -2175,10 +2175,6 @@ gboolean document_save_file(GeanyDocument *doc, gboolean force) return FALSE; } } - else - { - len = strlen(data); - } locale_filename = utils_get_locale_from_utf8(doc->file_name); From d2b1536753d272bb1a325d071c7fee9181cb4b8b Mon Sep 17 00:00:00 2001 From: Colomban Wendling Date: Mon, 4 Dec 2017 21:41:55 -0800 Subject: [PATCH 5/7] encodings: Accept NULs when validating UTF-8 --- src/encodings.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/encodings.c b/src/encodings.c index fb1d497abf..741e84e174 100644 --- a/src/encodings.c +++ b/src/encodings.c @@ -610,6 +610,29 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, } +/* g_utf8_validate() but accepts NULs */ +static gboolean utf8_validate_with_nuls(const gchar *data, gssize size, const gchar **end) +{ + const gchar *endp; + + while (! g_utf8_validate(data, size, &endp)) + { + if (size <= (endp - data) || *endp != 0) + { + if (end) + *end = endp; + return FALSE; + } + + endp++; + size -= endp - data; + data = (gchar *) endp; + } + + return TRUE; +} + + /** * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset. * If @a fast is not set, additional checks to validate the converted string are performed. @@ -642,7 +665,7 @@ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, utf8_content = converted_contents; if (conv_error != NULL) g_error_free(conv_error); } - else if (conv_error != NULL || ! 
g_utf8_validate(converted_contents, bytes_written, NULL)) + else if (conv_error != NULL || ! utf8_validate_with_nuls(converted_contents, bytes_written, NULL)) { if (conv_error != NULL) { @@ -883,7 +906,7 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) if (utils_str_equal(forced_enc, "UTF-8")) { - if (! g_utf8_validate(buffer->data, buffer->len, NULL)) + if (! utf8_validate_with_nuls(buffer->data, buffer->len, NULL)) { return FALSE; } @@ -955,7 +978,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) /* try UTF-8 first */ if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 && - (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL)) + (buffer->size == buffer->len) && utf8_validate_with_nuls(buffer->data, buffer->len, NULL)) { buffer->enc = g_strdup("UTF-8"); } From d957d384f056d514e894b55ef4dc7094d582ae86 Mon Sep 17 00:00:00 2001 From: Colomban Wendling Date: Tue, 5 Dec 2017 00:36:48 -0800 Subject: [PATCH 6/7] Properly load files with embedded NULs Such documents are still marked read-only because most editing features are still likely not to work as expected with NULs. --- src/document.c | 6 +-- src/encodings.c | 89 +++++++++++++++++++++--------------------- src/encodingsprivate.h | 2 +- 3 files changed, 49 insertions(+), 48 deletions(-) diff --git a/src/document.c b/src/document.c index 80bdaa74dc..d36b50f115 100644 --- a/src/document.c +++ b/src/document.c @@ -1026,9 +1026,9 @@ static gboolean load_text_file(const gchar *locale_filename, const gchar *displa if (filedata->readonly) { const gchar *warn_msg = _( - "The file \"%s\" could not be opened properly and has been truncated. " \ - "This can occur if the file contains a NULL byte. " \ - "Be aware that saving it can cause data loss.\nThe file was set to read-only."); + "The file \"%s\" contains a NUL byte. 
" \ + "Geany does not handle such files very well and be aware that editing " + "it can cause data loss.\nThe file was set to read-only."); if (main_status.main_window_realized) dialogs_show_msgbox(GTK_MESSAGE_WARNING, warn_msg, display_filename); diff --git a/src/encodings.c b/src/encodings.c index 741e84e174..7bb764a581 100644 --- a/src/encodings.c +++ b/src/encodings.c @@ -633,21 +633,8 @@ static gboolean utf8_validate_with_nuls(const gchar *data, gssize size, const gc } -/** - * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset. - * If @a fast is not set, additional checks to validate the converted string are performed. - * - * @param buffer The input string to convert. - * @param size The length of the string, or -1 if the string is nul-terminated. - * @param charset The charset to be used for conversion. - * @param fast @c TRUE to only convert the input and skip extended checks on the converted string. - * - * @return If the conversion was successful, a newly allocated nul-terminated string, - * which must be freed with @c g_free(). Otherwise @c NULL. - **/ -GEANY_API_SYMBOL -gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, - const gchar *charset, gboolean fast) +static gchar *encodings_convert_to_utf8_from_charset_internal(const gchar *buffer, gssize size, + gsize *utf8_size, const gchar *charset, gboolean fast) { gchar *utf8_content = NULL; GError *conv_error = NULL; @@ -685,9 +672,31 @@ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, utf8_content = converted_contents; } + if (utf8_content && utf8_size) + *utf8_size = bytes_written; + return utf8_content; } +/** + * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset. + * If @a fast is not set, additional checks to validate the converted string are performed. + * + * @param buffer The input string to convert. 
+ * @param size The length of the string, or -1 if the string is nul-terminated. + * @param charset The charset to be used for conversion. + * @param fast @c TRUE to only convert the input and skip extended checks on the converted string. + * + * @return If the conversion was successful, a newly allocated nul-terminated string, + * which must be freed with @c g_free(). Otherwise @c NULL. + **/ +GEANY_API_SYMBOL +gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, + const gchar *charset, gboolean fast) +{ + return encodings_convert_to_utf8_from_charset_internal(buffer, size, NULL, charset, fast); +} + static gchar *encodings_check_regexes(const gchar *buffer, gsize size) { @@ -705,7 +714,7 @@ static gchar *encodings_check_regexes(const gchar *buffer, gsize size) static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size, - const gchar *suggested_charset, gchar **used_encoding) + const gchar *suggested_charset, gsize *utf8_size, gchar **used_encoding) { const gchar *locale_charset = NULL; const gchar *charset; @@ -772,7 +781,7 @@ static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gss geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.", size, charset); - utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE); + utf8_content = encodings_convert_to_utf8_from_charset_internal(buffer, size, utf8_size, charset, FALSE); if (G_LIKELY(utf8_content != NULL)) { @@ -812,7 +821,7 @@ gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_ /* first try to read the encoding from the file content */ regex_charset = encodings_check_regexes(buffer, size); - utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding); + utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, NULL, used_encoding); g_free(regex_charset); return utf8; @@ -894,7 +903,6 @@ typedef 
struct gsize len; /* string length of data */ gchar *enc; gboolean bom; - gboolean partial; } BufferData; @@ -913,8 +921,9 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) } else { - gchar *converted_text = encodings_convert_to_utf8_from_charset( - buffer->data, buffer->size, forced_enc, FALSE); + gsize converted_text_len; + gchar *converted_text = encodings_convert_to_utf8_from_charset_internal( + buffer->data, buffer->size, &converted_text_len, forced_enc, FALSE); if (converted_text == NULL) { return FALSE; @@ -922,7 +931,7 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) else { SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); + buffer->len = converted_text_len; } } enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL); @@ -955,12 +964,13 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */ { - gchar *converted_text = encodings_convert_to_utf8_from_charset( - buffer->data, buffer->size, buffer->enc, FALSE); + gsize converted_text_len; + gchar *converted_text = encodings_convert_to_utf8_from_charset_internal( + buffer->data, buffer->size, &converted_text_len, buffer->enc, FALSE); if (converted_text != NULL) { SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); + buffer->len = converted_text_len; } else { @@ -985,8 +995,9 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) else { /* detect the encoding */ + gsize converted_text_len; gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data, - buffer->size, regex_charset, &buffer->enc); + buffer->size, regex_charset, &converted_text_len, &buffer->enc); if (converted_text == NULL) { @@ -994,7 +1005,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) return FALSE; } SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); + 
buffer->len = converted_text_len; } g_free(regex_charset); } @@ -1028,16 +1039,6 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) * if we have a BOM */ tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL); - /* check whether the size of the loaded data is equal to the size of the file in the - * filesystem file size may be 0 to allow opening files in /proc/ which have typically a - * file size of 0 bytes */ - if (buffer->len != buffer->size && buffer->size != 0 && ( - tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */ - tmp_enc_idx == GEANY_ENCODING_UTF_7)) /* filter UTF-7/8 where no NULL bytes are allowed */ - { - buffer->partial = TRUE; - } - /* Determine character encoding and convert to UTF-8 */ if (forced_enc != NULL) { @@ -1074,22 +1075,22 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) * @param forced_enc forced encoding to use, or @c NULL * @param used_encoding return location for the actually used encoding, or @c NULL * @param has_bom return location to store whether the data had a BOM, or @c NULL - * @param partial return location to store whether the conversion may be partial, or @c NULL + * @param has_null return location to store whether the converted buffer contains a NUL byte, or @c NULL * * @return @C TRUE if the conversion succeeded, @c FALSE otherwise. */ gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc, - gchar **used_encoding, gboolean *has_bom, gboolean *partial) + gchar **used_encoding, gboolean *has_bom, gboolean *has_null) { BufferData buffer; buffer.data = *buf; buffer.size = *size; - /* use strlen to check for null chars */ - buffer.len = strlen(buffer.data); + /* Special-case a size of 0 to allow opening files in /proc/ which typically have a + * file size of 0 bytes */ + buffer.len = (buffer.size == 0) ? 
strlen(buffer.data) : buffer.size; buffer.enc = NULL; buffer.bom = FALSE; - buffer.partial = FALSE; if (! handle_buffer(&buffer, forced_enc)) return FALSE; @@ -1101,8 +1102,8 @@ gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *f g_free(buffer.enc); if (has_bom) *has_bom = buffer.bom; - if (partial) - *partial = buffer.partial; + if (has_null) + *has_null = memchr(buffer.data, '\0', buffer.len) != NULL; *buf = buffer.data; return TRUE; diff --git a/src/encodingsprivate.h b/src/encodingsprivate.h index 8c574bfefe..efdb9fb945 100644 --- a/src/encodingsprivate.h +++ b/src/encodingsprivate.h @@ -73,7 +73,7 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, GtkCell gboolean encodings_is_unicode_charset(const gchar *string); gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc, - gchar **used_encoding, gboolean *has_bom, gboolean *partial); + gchar **used_encoding, gboolean *has_bom, gboolean *has_null); GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len); From 307335a65dfe9f7102793ea051bd14abc802b537 Mon Sep 17 00:00:00 2001 From: Colomban Wendling Date: Tue, 5 Dec 2017 00:43:21 -0800 Subject: [PATCH 7/7] Show an infobar for files with embedded NULs This is less annoying than a modal dialog while better showing which document is affected. --- src/document.c | 38 +++++++++++++++++++++++++++++--------- src/documentprivate.h | 1 + 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/document.c b/src/document.c index d36b50f115..0e1170be66 100644 --- a/src/document.c +++ b/src/document.c @@ -1025,15 +1025,8 @@ static gboolean load_text_file(const gchar *locale_filename, const gchar *displa if (filedata->readonly) { - const gchar *warn_msg = _( - "The file \"%s\" contains a NUL byte. 
" \ - "Geany does not handle such files very well and be aware that editing " - "it can cause data loss.\nThe file was set to read-only."); - - if (main_status.main_window_realized) - dialogs_show_msgbox(GTK_MESSAGE_WARNING, warn_msg, display_filename); - - ui_set_statusbar(TRUE, warn_msg, display_filename); + msgwin_status_add(_("The file \"%s\" contains a NUL byte, which is not properly " \ + "supported. Editing this file might lead to unexpected behavior."), display_filename); } return TRUE; @@ -1278,6 +1271,13 @@ void document_show_tab(GeanyDocument *doc) } +static void on_document_has_nul_byte_response(GtkWidget *bar, gint response, GeanyDocument *doc) +{ + doc->priv->info_bars[MSG_TYPE_HAS_NUL] = NULL; + gtk_widget_destroy(bar); +} + + /* To open a new file, set doc to NULL; filename should be locale encoded. * To reload a file, set the doc for the document to be reloaded; filename should be NULL. * pos is the cursor position, which can be overridden by --line and --column. @@ -1385,6 +1385,26 @@ GeanyDocument *document_open_file_full(GeanyDocument *doc, const gchar *filename sci_set_text_with_length(doc->editor->sci, filedata.data, filedata.len); queue_colourise(doc); /* Ensure the document gets colourised. */ + /* the file has embedded NULs, warn the user */ + if (filedata.readonly && doc->priv->info_bars[MSG_TYPE_HAS_NUL] == NULL) + { + doc->priv->info_bars[MSG_TYPE_HAS_NUL] = document_show_message( + doc, GTK_MESSAGE_WARNING, + on_document_has_nul_byte_response, + GTK_STOCK_OK, GTK_RESPONSE_ACCEPT, + NULL, 0, + NULL, 0, + _("Geany does not handle such files very well. Be aware that some features " + "might not work as expected and can lead to data loss. " + "The file has been set to read-only."), + _("The file contains a NUL byte.")); + } + else if (! 
filedata.readonly && doc->priv->info_bars[MSG_TYPE_HAS_NUL] != NULL) + { + /* dismiss the info bar if reloading a file that doesn't have embedded NULs anymore */ + gtk_info_bar_response(GTK_INFO_BAR(doc->priv->info_bars[MSG_TYPE_HAS_NUL]), GTK_RESPONSE_CANCEL); + } + /* detect & set line endings */ editor_mode = utils_get_line_endings(filedata.data, filedata.len); if (undo_reload_data) diff --git a/src/documentprivate.h b/src/documentprivate.h index eab2f0dbaf..2ffb3c890e 100644 --- a/src/documentprivate.h +++ b/src/documentprivate.h @@ -67,6 +67,7 @@ enum MSG_TYPE_RELOAD, MSG_TYPE_RESAVE, MSG_TYPE_POST_RELOAD, + MSG_TYPE_HAS_NUL, NUM_MSG_TYPES };