From 4a0ce3edc3effcd8e050484c464201ac9d0027e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrico=20Tr=C3=B6ger?= Date: Sun, 13 Oct 2013 13:16:41 +0200 Subject: [PATCH] Improve stripping of whitespace and punctuation characters (fixes #98) More expensive stripping of whitespace and punctuation characters, including Unicode characters before checking a word. However, this is rather a workaround than a solution. The proper way would be to read words using Scintilla but split a line into words manually with correct handling of Unicode characters. --- spellcheck/src/speller.c | 101 ++++++++++++++++++++++++++++++++++----- 1 file changed, 90 insertions(+), 11 deletions(-) diff --git a/spellcheck/src/speller.c b/spellcheck/src/speller.c index b48f7c51d..d5517f3c5 100644 --- a/spellcheck/src/speller.c +++ b/spellcheck/src/speller.c @@ -52,10 +52,81 @@ static void dict_describe(const gchar* const lang, const gchar* const name, } +static gboolean is_word_sep(gunichar c) +{ + return (g_unichar_isspace(c) || g_unichar_ispunct(c)) && c != (gunichar)'\''; +} + + +/* Strip punctuation and white space, more or less Unicode-safe. + * The offset of the start of the word is stored in offset if non-NULL. */ +static gchar *strip_word(const gchar *word_to_check, gint *result_offset) +{ + gunichar c; + gchar *word = g_strdup(word_to_check); + gchar *word_start = word; + gchar *word_end; + gint offset = 0; + gint word_len; + gint new_word_len; + + /* strip from the left */ + do + { + c = g_utf8_get_char_validated(word, -1); + if (is_word_sep(c)) + { /* skip this character */ + word = g_utf8_next_char(word); + } + else + break; + } while (c != (gunichar) -1 && c != 0 && *word != '\0'); + word_len = strlen(word_to_check); + offset = word - word_start; + new_word_len = word_len - offset; + + if (new_word_len <= 0) + { /* empty or only punctuation in input string */ + *result_offset = 0; + g_free(word_start); + return NULL; + } + /* move the string in-place and truncate it */ + g_memmove(word_start, word, new_word_len); + word = word_start; + word[new_word_len] = '\0'; + if (! NZV(word)) + { + g_free(word); + return NULL; + } + /* strip from the right */ + word_end = word + strlen(word); + do + { + word_end = g_utf8_prev_char(word_end); + c = g_utf8_get_char_validated(word_end, -1); + if (is_word_sep(c)) + { /* skip this character */ + *word_end = '\0'; + } + else + break; + } while (c != (gunichar) -1 && word_end >= word); + + if (result_offset != NULL) + *result_offset = offset; + + return word; +} + + static gint sc_speller_check_word(GeanyDocument *doc, gint line_number, const gchar *word, gint start_pos, gint end_pos) { gsize n_suggs = 0; + gchar *word_to_check; + gint offset; g_return_val_if_fail(sc_speller_dict != NULL, 0); g_return_val_if_fail(doc != NULL, 0); @@ -73,9 +144,24 @@ static gint sc_speller_check_word(GeanyDocument *doc, gint line_number, const gc if (! sc_speller_is_text(doc, start_pos)) return 0; + /* strip punctuation and white space */ + word_to_check = strip_word(word, &offset); + if (! NZV(word_to_check)) + { + g_free(word_to_check); + return 0; + } + + /* recalculate start_pos and end_pos */ + start_pos += offset; + end_pos = start_pos + strlen(word_to_check); + /* early out if the word is spelled correctly */ - if (enchant_dict_check(sc_speller_dict, word, -1) == 0) + if (enchant_dict_check(sc_speller_dict, word_to_check, -1) == 0) + { + g_free(word_to_check); return 0; + } editor_indicator_set_on_range(doc->editor, GEANY_INDICATOR_ERROR, start_pos, end_pos); @@ -86,10 +172,10 @@ static gint sc_speller_check_word(GeanyDocument *doc, gint line_number, const gc GString *str; str = g_string_sized_new(256); - suggs = enchant_dict_suggest(sc_speller_dict, word, -1, &n_suggs); + suggs = enchant_dict_suggest(sc_speller_dict, word_to_check, -1, &n_suggs); if (suggs != NULL) { - g_string_append_printf(str, "line %d: %s | ", line_number + 1, word); + g_string_append_printf(str, "line %d: %s | ", line_number + 1, word_to_check); g_string_append(str, _("Try: ")); @@ -108,6 +194,7 @@ static gint sc_speller_check_word(GeanyDocument *doc, gint line_number, const gc g_string_free(str, TRUE); } + g_free(word_to_check); return n_suggs; } @@ -118,7 +205,6 @@ gint sc_speller_process_line(GeanyDocument *doc, gint line_number, const gchar * gint wstart, wend; GString *str; gint suggestions_found = 0; - gchar c; g_return_val_if_fail(sc_speller_dict != NULL, 0); g_return_val_if_fail(doc != NULL, 0); @@ -135,13 +221,6 @@ gint sc_speller_process_line(GeanyDocument *doc, gint line_number, const gchar * wend = scintilla_send_message(doc->editor->sci, SCI_WORDENDPOSITION, wstart, FALSE); if (wstart == wend) break; - c = sci_get_char_at(doc->editor->sci, wstart); - /* hopefully it's enough to check for these both */ - if (ispunct(c) || isspace(c)) - { - pos_start++; - continue; - } /* ensure the string has enough allocated memory */ if (str->len < (guint)(wend - wstart))