issue #8375: Lowercase search does not find non-ASCII uppercase pages and vice versa
doxygen · Mar 22, 2021 · a4ecbee · albert-github · Mar 23, 2021 · a4ecbee
1 parent fa1897b
commit a4ecbee
Show file tree

Hide file tree

Showing 19 changed files with 3,334 additions and 208 deletions.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -301,6 +301,7 @@ add_library(doxymain STATIC
     template.cpp
     textdocvisitor.cpp
     tooltip.cpp
+    utf8.cpp
     util.cpp
     vhdldocgen.cpp
     vhdljjparser.cpp

diff --git a/src/caseconvert.h b/src/caseconvert.h
diff --git a/src/caseconvert.py b/src/caseconvert.py
@@ -0,0 +1,56 @@
+# Python 3 script to generate caseconvert.h.
+# It uses the difference between lower() and upper() on a character to build a mapping
+# that maps a given Unicode code point to either its lower or upper case UTF-8 character.
+# This also includes multi-byte characters.
+
+import codecs
+
+toupper = {}
+tolower = {}
+
+def writeMapping(file,mapping):
+    for k,v in sorted(mapping.items()):
+        file.write(u"    case %s /* %s */: return u8\"%s\" /* %s */;\n" %
+                (hex(ord(k[0])),k,v,",".join("0x{:02x}".format(ord(c)) for c in v)))
+
+# create mappings for characters whose lower or upper case form differs from the character itself
+for codeValue in range(0,0x1FFFF):
+        s = chr(codeValue)
+        sl = s.lower()
+        su = s.upper()
+        if ord(s[0])!=ord(sl[0]):
+            tolower[s]=sl
+        if ord(s[0])!=ord(su[0]):
+            toupper[s]=su
+
+file = codecs.open("caseconvert.h", "w", "utf-8")
+file.write(r'''/** This file is generated by python3 caseconvert.py. DO NOT EDIT! */
+
+#ifndef CASECONVERT_H
+#define CASECONVERT_H
+
+#include <cstdint>
+#include <string>
+
+inline const char *convertUnicodeToUpper(uint32_t code)
+{
+  switch(code)
+  {
+''');
+writeMapping(file,toupper);
+file.write(r'''    default: return nullptr;
+  }
+}
+
+inline const char *convertUnicodeToLower(uint32_t code)
+{
+  switch(code)
+  {
+''');
+writeMapping(file,tolower);
+file.write(r'''    default: return nullptr;
+  }
+}
+
+#endif
+''');
diff --git a/src/clangparser.cpp b/src/clangparser.cpp
@@ -19,6 +19,7 @@
 #include "membername.h"
 #include "filename.h"
 #include "tooltip.h"
+#include "utf8.h"
 #endif
 
 //--------------------------------------------------------------------------
@@ -73,18 +74,14 @@ static QCString detab(const QCString &s)
         col++;
         break;
       default: // non-whitespace => update minIndent
-        out.addChar(c);
-        if (c<0 && i<size) // multibyte sequence
         {
-          out.addChar(data[i++]); // >= 2 bytes
-          if (((uchar)c&0xE0)==0xE0 && i<size)
-          {
-            out.addChar(data[i++]); // 3 bytes
-          }
-          if (((uchar)c&0xF0)==0xF0 && i<size)
+          int bytes = getUTF8CharNumBytes(c);
+          for (int j=0;j<bytes-1 && c!=0; j++)
           {
-            out.addChar(data[i++]); // 4 byres
+            out.addChar(c);
+            c = data[i++];
           }
+          out.addChar(c);
         }
         if (col<minIndent) minIndent=col;
         col++;

diff --git a/src/definition.cpp b/src/definition.cpp
@@ -1,8 +1,6 @@
 /******************************************************************************
  *
- *
- *
- * Copyright (C) 1997-2015 by Dimitri van Heesch.
+ * Copyright (C) 1997-2021 by Dimitri van Heesch.
  *
  * Permission to use, copy, modify, and distribute this software and its
  * documentation under the terms of the GNU General Public License is hereby
@@ -51,7 +49,7 @@
 #include "pagedef.h"
 #include "bufstr.h"
 #include "reflist.h"
-
+#include "utf8.h"
 
 //-----------------------------------------------------------------------------------------
 
@@ -503,20 +501,6 @@ void DefinitionImpl::setDocumentation(const char *d,const char *docFile,int docL
   _setDocumentation(d,docFile,docLine,stripWhiteSpace,FALSE);
 }
 
-#define uni_isupper(c) (QChar(c).category()==QChar::Letter_Uppercase)
-
-// do a UTF-8 aware search for the last real character and return TRUE
-// if that is a multibyte one.
-static bool lastCharIsMultibyte(const QCString &s)
-{
-  uint l = s.length();
-  int p = 0;
-  int pp = -1;
-  while ((p=nextUtf8CharPosition(s,l,(uint)p))<(int)l) pp=p;
-  if (pp==-1 || ((uchar)s[pp])<0x80) return FALSE;
-  return TRUE;
-}
-
 void DefinitionImpl::_setBriefDescription(const char *b,const char *briefFile,int briefLine)
 {
   static QCString outputLanguage = Config_getEnum(OUTPUT_LANGUAGE);
@@ -536,7 +520,7 @@ void DefinitionImpl::_setBriefDescription(const char *b,const char *briefFile,in
     {
       case '.': case '!': case '?': case '>': case ':': case ')': break;
       default:
-        if (uni_isupper(brief.at(0)) && !lastCharIsMultibyte(brief)) brief+='.';
+        if (isUTF8CharUpperCase(brief.str(),0) && !lastUTF8CharIsMultibyte(brief.str())) brief+='.';
         break;
     }
   }

diff --git a/src/htmlgen.cpp b/src/htmlgen.cpp
@@ -51,6 +51,7 @@
 #include "growbuf.h"
 #include "fileinfo.h"
 #include "dir.h"
+#include "utf8.h"
 
 //#define DBG_HTML(x) x;
 #define DBG_HTML(x)
@@ -585,7 +586,7 @@ void HtmlCodeGenerator::codify(const char *str)
             }
             else
             {
-              p=writeUtf8Char(m_t,p-1);
+              p=writeUTF8Char(m_t,p-1);
               m_col++;
             }
           }

diff --git a/src/index.cpp b/src/index.cpp
@@ -47,6 +47,7 @@
 #include "namespacedef.h"
 #include "filename.h"
 #include "tooltip.h"
+#include "utf8.h"
 
 #define MAX_ITEMS_BEFORE_MULTIPAGE_INDEX 200
 #define MAX_ITEMS_BEFORE_QUICK_INDEX 30
@@ -2052,13 +2053,12 @@ static void writeAlphabeticalClassList(OutputList &ol, ClassDef::CompoundType ct
       if (cd->getLanguage()==SrcLangExt_VHDL && !((VhdlDocGen::VhdlClasses)cd->protection()==VhdlDocGen::ENTITYCLASS ))// no architecture
         continue;
 
+      // get the first UTF8 character (after the part that should be ignored)
       int index = getPrefixIndex(cd->className());
-      //printf("name=%s index=%d %d\n",cd->className().data(),index,cd->protection());
-      char charStr[MAX_UTF8_CHAR_SIZE];
-      if (getUtf8Char(cd->className().data()+index,charStr,CaseModifier::ToUpper)>0)
-         // get the first UTF8 character (after the part that should be ignored)
+      std::string letter = getUTF8CharAt(cd->className().str(),index);
+      if (!letter.empty())
       {
-        indexLettersUsed.insert(charStr);
+        indexLettersUsed.insert(convertUTF8ToUpper(letter));
       }
     }
   }
@@ -2092,21 +2092,21 @@ static void writeAlphabeticalClassList(OutputList &ol, ClassDef::CompoundType ct
 
     if (cd->isLinkableInProject() && cd->templateMaster()==0)
     {
-      int index = getPrefixIndex(cd->className());
-      char charStr[MAX_UTF8_CHAR_SIZE];
-      if (getUtf8Char(cd->className().data()+index,charStr,CaseModifier::ToUpper)>0)
-         // get the first UTF8 character (after the part that should be ignored)
+      QCString className = cd->className();
+      int index = getPrefixIndex(className);
+      std::string letter = getUTF8CharAt(className.str(),index);
+      if (!letter.empty())
       {
-        auto it = classesByLetter.find(charStr);
+        letter = convertUTF8ToUpper(letter);
+        auto it = classesByLetter.find(letter);
         if (it!=classesByLetter.end()) // add class to the existing list
         {
           it->second.push_back(cd.get());
         }
         else // new entry
         {
           classesByLetter.insert(
-              std::make_pair(std::string(charStr),
-                             std::vector<const ClassDef*>({ cd.get() })));
+              std::make_pair(letter, std::vector<const ClassDef*>({ cd.get() })));
         }
       }
     }
@@ -2633,9 +2633,10 @@ void addClassMemberNameToIndex(const MemberDef *md)
   {
     QCString n = md->name();
     int index = getPrefixIndex(n);
-    char letter[MAX_UTF8_CHAR_SIZE];
-    if (getUtf8Char(n.data()+index,letter,CaseModifier::ToLower)>0)
+    std::string letter = getUTF8CharAt(n.str(),index);
+    if (!letter.empty())
     {
+      letter = convertUTF8ToLower(letter);
       bool isFriendToHide = hideFriendCompounds &&
         (QCString(md->typeString())=="friend class" ||
          QCString(md->typeString())=="friend struct" ||
@@ -2711,9 +2712,10 @@ void addNamespaceMemberNameToIndex(const MemberDef *md)
   {
     QCString n = md->name();
     int index = getPrefixIndex(n);
-    char letter[MAX_UTF8_CHAR_SIZE];
-    if (getUtf8Char(n.data()+index,letter,CaseModifier::ToLower)>0)
+    std::string letter = getUTF8CharAt(n.str(),index);
+    if (!letter.empty())
     {
+      letter = convertUTF8ToLower(letter);
       if (!md->isEnumValue() || (md->getEnumScope() && !md->getEnumScope()->isStrong()))
       {
         MemberIndexMap_add(g_namespaceIndexLetterUsed[NMHL_All],letter,md);
@@ -2778,9 +2780,10 @@ void addFileMemberNameToIndex(const MemberDef *md)
   {
     QCString n = md->name();
     int index = getPrefixIndex(n);
-    char letter[MAX_UTF8_CHAR_SIZE];
-    if (getUtf8Char(n.data()+index,letter,CaseModifier::ToLower)>0)
+    std::string letter = getUTF8CharAt(n.str(),index);
+    if (!letter.empty())
     {
+      letter = convertUTF8ToLower(letter);
       if (!md->isEnumValue() || (md->getEnumScope() && !md->getEnumScope()->isStrong()))
       {
         MemberIndexMap_add(g_fileIndexLetterUsed[FMHL_All],letter,md);

diff --git a/src/latexgen.cpp b/src/latexgen.cpp
@@ -44,6 +44,7 @@
 #include "resourcemgr.h"
 #include "portable.h"
 #include "fileinfo.h"
+#include "utf8.h"
 
 static QCString g_header;
 static QCString g_footer;
@@ -117,26 +118,15 @@ void LatexCodeGenerator::codify(const char *str)
 #undef  COPYCHAR
 // helper macro to copy a single utf8 character, dealing with multibyte chars.
 #define COPYCHAR() do {                                           \
-                     if (lresult < (i + 5))                       \
+                     int bytes = getUTF8CharNumBytes(c);          \
+                     if (lresult < (i + bytes + 1))               \
                      {                                            \
                        lresult += 512;                            \
                        result = (signed char *)realloc(result, lresult); \
                      }                                            \
-                     result[i++]=c; p++;                          \
-                     if (c<0) /* multibyte utf-8 character */     \
+                     for (int j=0; j<bytes && *p; j++)            \
                      {                                            \
-                       /* 1xxx.xxxx: >=2 byte character */        \
                        result[i++]=*p++;                          \
-                       if (((uchar)c&0xE0)==0xE0)                 \
-                       {                                          \
-                         /* 111x.xxxx: >=3 byte character */      \
-                         result[i++]=*p++;                        \
-                       }                                          \
-                       if (((uchar)c&0xF0)==0xF0)                 \
-                       {                                          \
-                         /* 1111.xxxx: 4 byte character */        \
-                         result[i++]=*p++;                        \
-                       }                                          \
                      }                                            \
                      m_col++;                                     \
                    } while(0)

diff --git a/src/mangen.cpp b/src/mangen.cpp
@@ -30,6 +30,7 @@
 #include "mandocvisitor.h"
 #include "language.h"
 #include "dir.h"
+#include "utf8.h"
 
 static QCString getExtension()
 {
@@ -332,7 +333,7 @@ void ManGenerator::codify(const char *str)
         case '\n':  t << "\n"; m_firstCol=TRUE; m_col=0; break;
         case '\\':  t << "\\"; m_col++; break;
         case '\"':  // no break!
-        default:    p=writeUtf8Char(t,p-1); m_firstCol=FALSE; m_col++; break;
+        default:    p=writeUTF8Char(t,p-1); m_firstCol=FALSE; m_col++; break;
       }
     }
     //printf("%s",str);fflush(stdout);

diff --git a/src/markdown.cpp b/src/markdown.cpp
@@ -50,6 +50,7 @@
 #include "portable.h"
 #include "regex.h"
 #include "fileinfo.h"
+#include "utf8.h"
 
 #if !defined(NDEBUG)
 #define ENABLE_TRACING
@@ -2657,23 +2658,21 @@ QCString Markdown::detab(const QCString &s,int &refIndent)
         if (c<0 && i<size) // multibyte sequence
         {
           // special handling of the UTF-8 nbsp character 0xC2 0xA0
-          if ((uchar)c == 0xC2 && (uchar)(data[i]) == 0xA0)
+          int nb = isUTF8NonBreakableSpace(data);
+          if (nb>0)
           {
             m_out.addStr(g_doxy_nsbp);
-            i++;
+            i+=nb-1;
           }
           else
           {
-            m_out.addChar(c);
-            m_out.addChar(data[i++]); // >= 2 bytes
-            if (((uchar)c&0xE0)==0xE0 && i<size)
-            {
-              m_out.addChar(data[i++]); // 3 bytes
-            }
-              if (((uchar)c&0xF0)==0xF0 && i<size)
+            int bytes = getUTF8CharNumBytes(c);
+            for (int j=0;j<bytes-1 && c;j++)
             {
-              m_out.addChar(data[i++]); // 4 byres
+              m_out.addChar(c);
+              c = data[i++];
             }
+            m_out.addChar(c);
           }
         }
         else

diff --git a/src/rtfgen.cpp b/src/rtfgen.cpp
@@ -46,6 +46,7 @@
 #include "filename.h"
 #include "namespacedef.h"
 #include "dir.h"
+#include "utf8.h"
 
 
 //#define DBG_RTF(x) x;
@@ -1809,7 +1810,7 @@ void RTFGenerator::codify(const char *str)
         case '{':   t << "\\{"; m_col++;          break;
         case '}':   t << "\\}"; m_col++;          break;
         case '\\':  t << "\\\\"; m_col++;         break;
-        default:    p=(const unsigned char *)writeUtf8Char(t,(const char *)p-1); m_col++; break;
+        default:    p=(const unsigned char *)writeUTF8Char(t,(const char *)p-1); m_col++; break;
       }
     }
   }