Skip to content

Commit

Permalink
Properly handle non-normalized unicode headwords
Browse files Browse the repository at this point in the history
With this change, users should be able to search for headwords in
any Unicode normalization form. For example:

U+03B5 GREEK SMALL LETTER EPSILON and U+0301 COMBINING ACUTE ACCENT

is considered equal to

U+03AD GREEK SMALL LETTER EPSILON WITH TONOS

No matter which form the headword takes in the dictionary, users will be able to find it,
even when searching with a different form.
  • Loading branch information
Tvangeste committed Jul 6, 2013
1 parent 89755f8 commit 27c4bf7
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 3 deletions.
5 changes: 3 additions & 2 deletions btreeidx.cc
Expand Up @@ -11,6 +11,7 @@
#include <string.h>
#include <stdlib.h>
#include "dprintf.hh"
#include "wstring_qt.hh"

//#define __BTREE_USE_LZO
// LZO mode is experimental and unsupported. Tests didn't show any substantial
Expand Down Expand Up @@ -710,13 +711,13 @@ vector< WordArticleLink > BtreeIndex::readChain( char const * & ptr )
void BtreeIndex::antialias( wstring const & str,
vector< WordArticleLink > & chain )
{
wstring caseFolded = Folding::applySimpleCaseOnly( str );
wstring caseFolded = Folding::applySimpleCaseOnly( gd::normalize( str ) );

for( unsigned x = chain.size(); x--; )
{
// If after applying case folding to each word they wouldn't match, we
// drop the entry.
if ( Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ) !=
if ( Folding::applySimpleCaseOnly( gd::normalize( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ) ) !=
caseFolded )
chain.erase( chain.begin() + x );
else
Expand Down
2 changes: 1 addition & 1 deletion dsl.cc
Expand Up @@ -668,7 +668,7 @@ void DslDictionary::loadArticle( uint32_t address,
string DslDictionary::dslToHtml( wstring const & str )
{
// Normalize the string
wstring normalizedStr = gd::toWString( gd::toQString( str ).normalized( QString::NormalizationForm_C ) );
wstring normalizedStr = gd::normalize( str );

ArticleDom dom( normalizedStr );

Expand Down
6 changes: 6 additions & 0 deletions wstring_qt.cc
Expand Up @@ -33,4 +33,10 @@ namespace gd

return wstring( ( const wchar * ) v.constData(), v.size() );
}

/// Returns a copy of @p str in Unicode Normalization Form C.
/// The string is converted to a QString, normalized there, and converted
/// back to a wstring.
wstring normalize( const wstring & str )
{
QString const composed = gd::toQString( str ).normalized( QString::NormalizationForm_C );

return gd::toWString( composed );
}

}
1 change: 1 addition & 0 deletions wstring_qt.hh
Expand Up @@ -14,6 +14,7 @@ namespace gd
{
QString toQString( wstring const & );
wstring toWString( QString const & );
wstring normalize( wstring const & );
}

#endif

0 comments on commit 27c4bf7

Please sign in to comment.