Skip to content

Commit

Permalink
issue #8375: Lowercase search does not find non-ASCII uppercase pages…
Browse files Browse the repository at this point in the history
… and vice versa
  • Loading branch information
doxygen committed Mar 22, 2021
1 parent fa1897b commit a4ecbee
Show file tree
Hide file tree
Showing 19 changed files with 3,334 additions and 208 deletions.
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ add_library(doxymain STATIC
template.cpp
textdocvisitor.cpp
tooltip.cpp
utf8.cpp
util.cpp
vhdldocgen.cpp
vhdljjparser.cpp
Expand Down
2,903 changes: 2,903 additions & 0 deletions src/caseconvert.h

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions src/caseconvert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# python3 script to generate caseconvert.h.

This comment has been minimized.

Copy link
@albert-github

albert-github Mar 23, 2021

Collaborator

I don't think it is a good idea to have this file in the src directory

  • it is not integrated into e.g. CMakeLists.txt for generation of the caseconvert.h file and is only used to generate the initial caseconvert.h
  • better to have it in a tools directory (note: this might require a small change in the file, as it creates caseconvert.h in the current directory)
# It uses the difference between lower() and upper() on a character to build a mapping
# that maps a given Unicode code point to either a lower or upper case UTF-8 character.
# This also includes multi-byte characters.

import codecs

# Case-conversion tables. They start out empty and are filled by the
# code-point scan further down in this script: each key is a
# single-character string and each value is the result of calling
# str.upper() / str.lower() on it.
toupper = {}
tolower = {}

def writeMapping(file,mapping):
    """Emit one C++ ``case`` label per entry of ``mapping``.

    file    -- any object with a ``write(str)`` method; receives the text.
    mapping -- dict from a source character to its replacement string.

    Entries are written in sorted key order. Each line has the form

        case <hex code point of key> /* key */: return u8"<value>" /* <hex code points of value> */;

    where the trailing comment lists the code point of every character in
    the value, separated by commas. Nothing is written for an empty mapping.
    """
    for k,v in sorted(mapping.items()):
        # Only the first character of the key is used for the case label;
        # the replacement may consist of several characters.
        file.write(u" case %s /* %s */: return u8\"%s\" /* %s */;\n" %
                   (hex(ord(k[0])),k,v,",".join("0x{:02x}".format(ord(c)) for c in v)))

# Create mappings of characters whose upper and lower case differs.
# The scan covers code points 0 .. 0x1FFFE (the end of range() is exclusive).
for codeValue in range(0,0x1FFFF):
    s = chr(codeValue)
    sl = s.lower()
    su = s.upper()
    # A character is recorded when the first character of its converted form
    # differs from the character itself; the full converted string is stored.
    if ord(s[0])!=ord(sl[0]):
        tolower[s]=sl
    if ord(s[0])!=ord(su[0]):
        toupper[s]=su

# Write caseconvert.h into the current working directory. The with-statement
# closes (and thereby flushes) the file even if writing fails part-way,
# instead of relying on interpreter shutdown to do so.
with codecs.open("caseconvert.h", "w", "utf-8") as file:
    # Include guard and the opening of convertUnicodeToUpper().
    file.write(r'''/** This file is generated by python3 caseconvert.py. DO NOT EDIT! */
#ifndef CASECONVERT_H
#define CASECONVERT_H
#include <cstdint>
#include <string>
inline const char *convertUnicodeToUpper(uint32_t code)
{
switch(code)
{
''')
    writeMapping(file,toupper)
    # Close convertUnicodeToUpper() and open convertUnicodeToLower();
    # code points without a mapping yield nullptr.
    file.write(r''' default: return nullptr;
}
}
inline const char *convertUnicodeToLower(uint32_t code)
{
switch(code)
{
''')
    writeMapping(file,tolower)
    # Close convertUnicodeToLower() and the include guard.
    file.write(r''' default: return nullptr;
}
}
#endif
''')
15 changes: 6 additions & 9 deletions src/clangparser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "membername.h"
#include "filename.h"
#include "tooltip.h"
#include "utf8.h"
#endif

//--------------------------------------------------------------------------
Expand Down Expand Up @@ -73,18 +74,14 @@ static QCString detab(const QCString &s)
col++;
break;
default: // non-whitespace => update minIndent
out.addChar(c);
if (c<0 && i<size) // multibyte sequence
{
out.addChar(data[i++]); // >= 2 bytes
if (((uchar)c&0xE0)==0xE0 && i<size)
{
out.addChar(data[i++]); // 3 bytes
}
if (((uchar)c&0xF0)==0xF0 && i<size)
int bytes = getUTF8CharNumBytes(c);
for (int j=0;j<bytes-1 && c!=0; j++)
{
out.addChar(data[i++]); // 4 byres
out.addChar(c);
c = data[i++];
}
out.addChar(c);
}
if (col<minIndent) minIndent=col;
col++;
Expand Down
22 changes: 3 additions & 19 deletions src/definition.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
/******************************************************************************
*
*
*
* Copyright (C) 1997-2015 by Dimitri van Heesch.
* Copyright (C) 1997-2021 by Dimitri van Heesch.
*
* Permission to use, copy, modify, and distribute this software and its
* documentation under the terms of the GNU General Public License is hereby
Expand Down Expand Up @@ -51,7 +49,7 @@
#include "pagedef.h"
#include "bufstr.h"
#include "reflist.h"

#include "utf8.h"

//-----------------------------------------------------------------------------------------

Expand Down Expand Up @@ -503,20 +501,6 @@ void DefinitionImpl::setDocumentation(const char *d,const char *docFile,int docL
_setDocumentation(d,docFile,docLine,stripWhiteSpace,FALSE);
}

#define uni_isupper(c) (QChar(c).category()==QChar::Letter_Uppercase)

// do a UTF-8 aware search for the last real character and return TRUE
// if that is a multibyte one.
static bool lastCharIsMultibyte(const QCString &s)
{
uint l = s.length();
int p = 0;
int pp = -1;
while ((p=nextUtf8CharPosition(s,l,(uint)p))<(int)l) pp=p;
if (pp==-1 || ((uchar)s[pp])<0x80) return FALSE;
return TRUE;
}

void DefinitionImpl::_setBriefDescription(const char *b,const char *briefFile,int briefLine)
{
static QCString outputLanguage = Config_getEnum(OUTPUT_LANGUAGE);
Expand All @@ -536,7 +520,7 @@ void DefinitionImpl::_setBriefDescription(const char *b,const char *briefFile,in
{
case '.': case '!': case '?': case '>': case ':': case ')': break;
default:
if (uni_isupper(brief.at(0)) && !lastCharIsMultibyte(brief)) brief+='.';
if (isUTF8CharUpperCase(brief.str(),0) && !lastUTF8CharIsMultibyte(brief.str())) brief+='.';
break;
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/htmlgen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "growbuf.h"
#include "fileinfo.h"
#include "dir.h"
#include "utf8.h"

//#define DBG_HTML(x) x;
#define DBG_HTML(x)
Expand Down Expand Up @@ -585,7 +586,7 @@ void HtmlCodeGenerator::codify(const char *str)
}
else
{
p=writeUtf8Char(m_t,p-1);
p=writeUTF8Char(m_t,p-1);
m_col++;
}
}
Expand Down
39 changes: 21 additions & 18 deletions src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "namespacedef.h"
#include "filename.h"
#include "tooltip.h"
#include "utf8.h"

#define MAX_ITEMS_BEFORE_MULTIPAGE_INDEX 200
#define MAX_ITEMS_BEFORE_QUICK_INDEX 30
Expand Down Expand Up @@ -2052,13 +2053,12 @@ static void writeAlphabeticalClassList(OutputList &ol, ClassDef::CompoundType ct
if (cd->getLanguage()==SrcLangExt_VHDL && !((VhdlDocGen::VhdlClasses)cd->protection()==VhdlDocGen::ENTITYCLASS ))// no architecture
continue;

// get the first UTF8 character (after the part that should be ignored)
int index = getPrefixIndex(cd->className());
//printf("name=%s index=%d %d\n",cd->className().data(),index,cd->protection());
char charStr[MAX_UTF8_CHAR_SIZE];
if (getUtf8Char(cd->className().data()+index,charStr,CaseModifier::ToUpper)>0)
// get the first UTF8 character (after the part that should be ignored)
std::string letter = getUTF8CharAt(cd->className().str(),index);
if (!letter.empty())
{
indexLettersUsed.insert(charStr);
indexLettersUsed.insert(convertUTF8ToUpper(letter));
}
}
}
Expand Down Expand Up @@ -2092,21 +2092,21 @@ static void writeAlphabeticalClassList(OutputList &ol, ClassDef::CompoundType ct

if (cd->isLinkableInProject() && cd->templateMaster()==0)
{
int index = getPrefixIndex(cd->className());
char charStr[MAX_UTF8_CHAR_SIZE];
if (getUtf8Char(cd->className().data()+index,charStr,CaseModifier::ToUpper)>0)
// get the first UTF8 character (after the part that should be ignored)
QCString className = cd->className();
int index = getPrefixIndex(className);
std::string letter = getUTF8CharAt(className.str(),index);
if (!letter.empty())
{
auto it = classesByLetter.find(charStr);
letter = convertUTF8ToUpper(letter);
auto it = classesByLetter.find(letter);
if (it!=classesByLetter.end()) // add class to the existing list
{
it->second.push_back(cd.get());
}
else // new entry
{
classesByLetter.insert(
std::make_pair(std::string(charStr),
std::vector<const ClassDef*>({ cd.get() })));
std::make_pair(letter, std::vector<const ClassDef*>({ cd.get() })));
}
}
}
Expand Down Expand Up @@ -2633,9 +2633,10 @@ void addClassMemberNameToIndex(const MemberDef *md)
{
QCString n = md->name();
int index = getPrefixIndex(n);
char letter[MAX_UTF8_CHAR_SIZE];
if (getUtf8Char(n.data()+index,letter,CaseModifier::ToLower)>0)
std::string letter = getUTF8CharAt(n.str(),index);
if (!letter.empty())
{
letter = convertUTF8ToLower(letter);
bool isFriendToHide = hideFriendCompounds &&
(QCString(md->typeString())=="friend class" ||
QCString(md->typeString())=="friend struct" ||
Expand Down Expand Up @@ -2711,9 +2712,10 @@ void addNamespaceMemberNameToIndex(const MemberDef *md)
{
QCString n = md->name();
int index = getPrefixIndex(n);
char letter[MAX_UTF8_CHAR_SIZE];
if (getUtf8Char(n.data()+index,letter,CaseModifier::ToLower)>0)
std::string letter = getUTF8CharAt(n.str(),index);
if (!letter.empty())
{
letter = convertUTF8ToLower(letter);
if (!md->isEnumValue() || (md->getEnumScope() && !md->getEnumScope()->isStrong()))
{
MemberIndexMap_add(g_namespaceIndexLetterUsed[NMHL_All],letter,md);
Expand Down Expand Up @@ -2778,9 +2780,10 @@ void addFileMemberNameToIndex(const MemberDef *md)
{
QCString n = md->name();
int index = getPrefixIndex(n);
char letter[MAX_UTF8_CHAR_SIZE];
if (getUtf8Char(n.data()+index,letter,CaseModifier::ToLower)>0)
std::string letter = getUTF8CharAt(n.str(),index);
if (!letter.empty())
{
letter = convertUTF8ToLower(letter);
if (!md->isEnumValue() || (md->getEnumScope() && !md->getEnumScope()->isStrong()))
{
MemberIndexMap_add(g_fileIndexLetterUsed[FMHL_All],letter,md);
Expand Down
18 changes: 4 additions & 14 deletions src/latexgen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "resourcemgr.h"
#include "portable.h"
#include "fileinfo.h"
#include "utf8.h"

static QCString g_header;
static QCString g_footer;
Expand Down Expand Up @@ -117,26 +118,15 @@ void LatexCodeGenerator::codify(const char *str)
#undef COPYCHAR
// helper macro to copy a single utf8 character, dealing with multibyte chars.
#define COPYCHAR() do { \
if (lresult < (i + 5)) \
int bytes = getUTF8CharNumBytes(c); \
if (lresult < (i + bytes + 1)) \
{ \
lresult += 512; \
result = (signed char *)realloc(result, lresult); \
} \
result[i++]=c; p++; \
if (c<0) /* multibyte utf-8 character */ \
for (int j=0; j<bytes && *p; j++) \
{ \
/* 1xxx.xxxx: >=2 byte character */ \
result[i++]=*p++; \
if (((uchar)c&0xE0)==0xE0) \
{ \
/* 111x.xxxx: >=3 byte character */ \
result[i++]=*p++; \
} \
if (((uchar)c&0xF0)==0xF0) \
{ \
/* 1111.xxxx: 4 byte character */ \
result[i++]=*p++; \
} \
} \
m_col++; \
} while(0)
Expand Down
3 changes: 2 additions & 1 deletion src/mangen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "mandocvisitor.h"
#include "language.h"
#include "dir.h"
#include "utf8.h"

static QCString getExtension()
{
Expand Down Expand Up @@ -332,7 +333,7 @@ void ManGenerator::codify(const char *str)
case '\n': t << "\n"; m_firstCol=TRUE; m_col=0; break;
case '\\': t << "\\"; m_col++; break;
case '\"': // no break!
default: p=writeUtf8Char(t,p-1); m_firstCol=FALSE; m_col++; break;
default: p=writeUTF8Char(t,p-1); m_firstCol=FALSE; m_col++; break;
}
}
//printf("%s",str);fflush(stdout);
Expand Down
19 changes: 9 additions & 10 deletions src/markdown.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include "portable.h"
#include "regex.h"
#include "fileinfo.h"
#include "utf8.h"

#if !defined(NDEBUG)
#define ENABLE_TRACING
Expand Down Expand Up @@ -2657,23 +2658,21 @@ QCString Markdown::detab(const QCString &s,int &refIndent)
if (c<0 && i<size) // multibyte sequence
{
// special handling of the UTF-8 nbsp character 0xC2 0xA0
if ((uchar)c == 0xC2 && (uchar)(data[i]) == 0xA0)
int nb = isUTF8NonBreakableSpace(data);
if (nb>0)
{
m_out.addStr(g_doxy_nsbp);
i++;
i+=nb-1;
}
else
{
m_out.addChar(c);
m_out.addChar(data[i++]); // >= 2 bytes
if (((uchar)c&0xE0)==0xE0 && i<size)
{
m_out.addChar(data[i++]); // 3 bytes
}
if (((uchar)c&0xF0)==0xF0 && i<size)
int bytes = getUTF8CharNumBytes(c);
for (int j=0;j<bytes-1 && c;j++)
{
m_out.addChar(data[i++]); // 4 byres
m_out.addChar(c);
c = data[i++];
}
m_out.addChar(c);
}
}
else
Expand Down
3 changes: 2 additions & 1 deletion src/rtfgen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include "filename.h"
#include "namespacedef.h"
#include "dir.h"
#include "utf8.h"


//#define DBG_RTF(x) x;
Expand Down Expand Up @@ -1809,7 +1810,7 @@ void RTFGenerator::codify(const char *str)
case '{': t << "\\{"; m_col++; break;
case '}': t << "\\}"; m_col++; break;
case '\\': t << "\\\\"; m_col++; break;
default: p=(const unsigned char *)writeUtf8Char(t,(const char *)p-1); m_col++; break;
default: p=(const unsigned char *)writeUTF8Char(t,(const char *)p-1); m_col++; break;
}
}
}
Expand Down
Loading

0 comments on commit a4ecbee

Please sign in to comment.