diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index dc49e782..c53738d1 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -25,11 +25,11 @@ add_custom_target(lm-predict ALL DEPENDS "${LM_PREDICT_OUTPUT}") install(FILES "${LM_OUTPUT}" RENAME zh_CN.lm DESTINATION "${LIBIME_INSTALL_LIBDATADIR}") install(FILES "${LM_PREDICT_OUTPUT}" RENAME zh_CN.lm.predict DESTINATION "${LIBIME_INSTALL_LIBDATADIR}") -set(DICT_TAR "dict-20250327.tar.zst") +set(DICT_TAR "dict-20260430.tar.zst") set(DICT_URL "https://download.fcitx-im.org/data/${DICT_TAR}") fcitx5_download(dict-download ${DICT_URL} ${DICT_TAR} - 7ca6be4754c0d4c27ba7702c0dce651659bd2ca1faa5cbf2848d81a0053c8c13) + 3edc008d90fcd61b9967b9e590f396189d7a00fa74c45ecde0d9850a0fdd6241) fcitx5_extract(dict-extract ${DICT_TAR} DEPENDS dict-download OUTPUT dict_sc.txt dict_extb.txt) diff --git a/src/libime/pinyin/pinyindata.cpp b/src/libime/pinyin/pinyindata.cpp index 5316b776..2172bf90 100644 --- a/src/libime/pinyin/pinyindata.cpp +++ b/src/libime/pinyin/pinyindata.cpp @@ -25,40 +25,49 @@ const std::vector &getEncodedInitialFinal() { static const auto encodedInitialFinal = []() { std::vector a; const std::unordered_set encodedInitialFinalSet = { - 660, 241, 384, 481, 388, 409, 415, 326, 497, 425, 327, 329, 220, - 331, 55, 332, 336, 350, 352, 253, 43, 255, 799, 256, 417, 257, - 272, 268, 567, 269, 353, 224, 264, 144, 36, 448, 277, 271, 217, - 283, 107, 72, 73, 74, 75, 78, 79, 533, 450, 275, 254, 115, - 80, 85, 90, 182, 583, 360, 87, 91, 237, 330, 95, 77, 410, - 605, 221, 10, 727, 222, 542, 335, 862, 234, 236, 232, 231, 196, - 785, 233, 347, 239, 245, 247, 158, 838, 840, 733, 38, 652, 76, - 44, 619, 162, 328, 228, 18, 49, 54, 51, 218, 37, 52, 57, - 46, 447, 198, 424, 449, 460, 455, 251, 465, 461, 614, 615, 160, - 390, 616, 413, 532, 775, 416, 7, 802, 267, 349, 484, 367, 648, - 620, 798, 756, 262, 589, 485, 280, 548, 419, 749, 386, 451, 411, - 649, 759, 496, 564, 625, 656, 59, 219, 777, 508, 201, 490, 227, - 659, 445, 487, 745, 606, 793, 209, 676, 270, 324, 312, 768, 348, - 795, 184, 767, 482, 834, 314, 412, 512, 654, 454, 815, 639, 223, - 603, 761, 244, 385, 88, 675, 486, 723, 289, 637, 779, 517, 292, - 651, 6, 452, 527, 183, 784, 661, 325, 155, 835, 429, 769, 89, - 207, 789, 483, 653, 195, 724, 489, 229, 551, 531, 576, 171, 792, - 760, 260, 581, 208, 453, 623, 677, 530, 266, 418, 650, 2, 515, - 516, 351, 510, 509, 265, 671, 501, 281, 504, 235, 636, 655, 534, - 640, 528, 186, 511, 641, 263, 371, 582, 387, 553, 721, 588, 673, - 506, 541, 203, 192, 40, 191, 617, 193, 758, 188, 794, 216, 185, - 248, 258, 584, 748, 505, 361, 725, 766, 488, 48, 82, 93, 414, - 149, 491, 181, 284, 180, 577, 600, 747, 159, 205, 743, 732, 731, - 13, 800, 836, 601, 42, 728, 604, 579, 84, 16, 602, 599, 580, - 354, 252, 711, 722, 39, 720, 389, 647, 624, 578, 507, 273, 635, - 529, 446, 383, 317, 372, 368, 366, 365, 364, 566, 363, 362, 313, - 315, 685, 318, 316, 3, 311, 300, 299, 296, 295, 294, 543, 293, - 291, 290, 288, 15, 131, 120, 118, 124, 156, 116, 114, 111, 110, - 108, 837, 833, 169, 173, 172, 167, 161, 165, 157, 152, 151, 150, - 687, 148, 147, 146, 145, 709, 713, 4, 712, 707, 696, 695, 697, - 565, 569, 570, 568, 563, 552, 547, 545, 544, 540, 692, 691, 689, - 688, 686, 684, 23, 21, 19, 8, 1, 0, 832, 831, 830, 829, - 828, 230, 20, 163, 803, 11}; - a.resize(900); + 560, 1067, 561, 569, 521, 581, 584, 1430, 586, 588, 435, + 764, 455, 447, 452, 457, 438, 465, 1465, 466, 159, 373, + 126, 626, 874, 129, 137, 142, 140, 139, 143, 507, 145, + 339, 141, 134, 498, 128, 375, 147, 1182, 132, 376, 378, + 379, 760, 1460, 1201, 1265, 392, 131, 391, 386, 19, 400, + 390, 389, 385, 401, 403, 374, 62, 63, 66, 64, 1479, + 393, 388, 70, 446, 565, 75, 404, 80, 1436, 1120, 439, + 436, 621, 1015, 78, 328, 1252, 77, 960, 587, 81, 83, + 85, 895, 835, 757, 1387, 759, 763, 766, 1434, 767, 1466, + 772, 450, 127, 1253, 520, 773, 1057, 628, 372, 1116, 1306, + 259, 1143, 777, 1059, 1061, 1062, 1066, 442, 1077, 1082, 1467, + 1058, 459, 562, 189, 645, 941, 649, 1081, 1251, 894, 1366, + 827, 1083, 1079, 1240, 444, 1242, 125, 1244, 21, 136, 823, + 453, 643, 898, 758, 893, 1144, 765, 267, 1243, 1307, 1005, + 1321, 407, 266, 311, 65, 992, 839, 264, 701, 1123, 192, + 1056, 829, 1313, 1374, 996, 828, 820, 1119, 72, 337, 1325, + 1330, 935, 1432, 582, 822, 559, 1335, 263, 1438, 454, 1145, + 1304, 999, 1305, 387, 326, 1122, 1367, 397, 1004, 434, 897, + 1129, 1139, 1128, 1020, 437, 316, 1372, 868, 273, 1241, 879, + 312, 0, 1141, 997, 1431, 1365, 377, 896, 993, 1302, 704, + 68, 566, 1315, 1248, 1364, 583, 1127, 252, 451, 881, 74, + 1482, 130, 265, 440, 892, 891, 314, 875, 880, 1470, 463, + 445, 310, 333, 871, 1331, 1323, 1118, 869, 870, 20, 124, + 585, 1314, 873, 1065, 384, 994, 1268, 501, 1267, 1481, 1117, + 335, 1312, 69, 18, 198, 570, 821, 1181, 931, 186, 695, + 824, 1018, 325, 1178, 761, 1022, 834, 277, 1371, 1370, 620, + 448, 1078, 1017, 462, 338, 1000, 1016, 825, 331, 321, 383, + 254, 563, 395, 1019, 872, 323, 380, 953, 322, 1179, 819, + 959, 315, 526, 318, 261, 696, 313, 1263, 564, 1269, 504, + 1121, 1469, 876, 496, 449, 1473, 826, 762, 1021, 1375, 1124, + 523, 622, 998, 1089, 1183, 271, 995, 1247, 1245, 558, 1477, + 711, 648, 715, 710, 942, 705, 1475, 699, 503, 703, 697, + 698, 702, 269, 700, 15, 624, 647, 1429, 627, 650, 646, + 16, 644, 632, 631, 625, 623, 525, 248, 524, 522, 519, + 508, 502, 500, 499, 497, 1207, 209, 196, 1486, 2, 202, + 194, 193, 188, 1435, 6, 1433, 275, 276, 260, 262, 256, + 250, 255, 253, 8, 251, 249, 1203, 1205, 1186, 1206, 1190, + 1480, 1189, 1191, 955, 957, 958, 956, 10, 954, 1468, 943, + 938, 937, 934, 933, 932, 930, 1185, 1180, 23, 11, 13, + 7, 4, 3, 1, 1428, 1427, 1426, 1462, 1463, 1464, 1471, + 1472, 1474, 1476, 1478, 1483, 1484, 1485, 1487, + }; + a.resize(1500); std::fill(a.begin(), a.end(), false); for (auto i : encodedInitialFinalSet) { a[i] = true; @@ -998,6 +1007,58 @@ const PinyinMap &getPinyinMap() { {"an", PinyinInitial::Zero, PinyinFinal::AN, PinyinFuzzyFlag::None}, {"ai", PinyinInitial::Zero, PinyinFinal::AI, PinyinFuzzyFlag::None}, {"a", PinyinInitial::Zero, PinyinFinal::A, PinyinFuzzyFlag::None}, + {"A", PinyinInitial::Zero, PinyinFinal::Letter_A, + PinyinFuzzyFlag::None}, + {"B", PinyinInitial::Zero, PinyinFinal::Letter_B, + PinyinFuzzyFlag::None}, + {"C", PinyinInitial::Zero, PinyinFinal::Letter_C, + PinyinFuzzyFlag::None}, + {"D", PinyinInitial::Zero, PinyinFinal::Letter_D, + PinyinFuzzyFlag::None}, + {"E", PinyinInitial::Zero, PinyinFinal::Letter_E, + PinyinFuzzyFlag::None}, + {"F", PinyinInitial::Zero, PinyinFinal::Letter_F, + PinyinFuzzyFlag::None}, + {"G", PinyinInitial::Zero, PinyinFinal::Letter_G, + PinyinFuzzyFlag::None}, + {"H", PinyinInitial::Zero, PinyinFinal::Letter_H, + PinyinFuzzyFlag::None}, + {"I", PinyinInitial::Zero, PinyinFinal::Letter_I, + PinyinFuzzyFlag::None}, + {"J", PinyinInitial::Zero, PinyinFinal::Letter_J, + PinyinFuzzyFlag::None}, + {"K", PinyinInitial::Zero, PinyinFinal::Letter_K, + PinyinFuzzyFlag::None}, + {"L", PinyinInitial::Zero, PinyinFinal::Letter_L, + PinyinFuzzyFlag::None}, + {"M", PinyinInitial::Zero, PinyinFinal::Letter_M, + PinyinFuzzyFlag::None}, + {"N", PinyinInitial::Zero, PinyinFinal::Letter_N, + PinyinFuzzyFlag::None}, + {"O", PinyinInitial::Zero, PinyinFinal::Letter_O, + PinyinFuzzyFlag::None}, + {"P", PinyinInitial::Zero, PinyinFinal::Letter_P, + PinyinFuzzyFlag::None}, + {"Q", PinyinInitial::Zero, PinyinFinal::Letter_Q, + PinyinFuzzyFlag::None}, + {"R", PinyinInitial::Zero, PinyinFinal::Letter_R, + PinyinFuzzyFlag::None}, + {"S", PinyinInitial::Zero, PinyinFinal::Letter_S, + PinyinFuzzyFlag::None}, + {"T", PinyinInitial::Zero, PinyinFinal::Letter_T, + PinyinFuzzyFlag::None}, + {"U", PinyinInitial::Zero, PinyinFinal::Letter_U, + PinyinFuzzyFlag::None}, + {"V", PinyinInitial::Zero, PinyinFinal::Letter_V, + PinyinFuzzyFlag::None}, + {"W", PinyinInitial::Zero, PinyinFinal::Letter_W, + PinyinFuzzyFlag::None}, + {"X", PinyinInitial::Zero, PinyinFinal::Letter_X, + PinyinFuzzyFlag::None}, + {"Y", PinyinInitial::Zero, PinyinFinal::Letter_Y, + PinyinFuzzyFlag::None}, + {"Z", PinyinInitial::Zero, PinyinFinal::Letter_Z, + PinyinFuzzyFlag::None}, }; return pinyinMap; } diff --git a/src/libime/pinyin/pinyinencoder.cpp b/src/libime/pinyin/pinyinencoder.cpp index ee2712f1..d7dd43d7 100644 --- a/src/libime/pinyin/pinyinencoder.cpp +++ b/src/libime/pinyin/pinyinencoder.cpp @@ -82,24 +82,37 @@ static const auto initialMap = makeBimap({ }); static const auto finalMap = makeBimap({ - {PinyinFinal::A, "a"}, {PinyinFinal::AI, "ai"}, - {PinyinFinal::AN, "an"}, {PinyinFinal::ANG, "ang"}, - {PinyinFinal::AO, "ao"}, {PinyinFinal::E, "e"}, - {PinyinFinal::EI, "ei"}, {PinyinFinal::EN, "en"}, - {PinyinFinal::ENG, "eng"}, {PinyinFinal::ER, "er"}, - {PinyinFinal::O, "o"}, {PinyinFinal::ONG, "ong"}, - {PinyinFinal::OU, "ou"}, {PinyinFinal::I, "i"}, - {PinyinFinal::IA, "ia"}, {PinyinFinal::IE, "ie"}, - {PinyinFinal::IAO, "iao"}, {PinyinFinal::IU, "iu"}, - {PinyinFinal::IAN, "ian"}, {PinyinFinal::IN, "in"}, - {PinyinFinal::IANG, "iang"}, {PinyinFinal::ING, "ing"}, - {PinyinFinal::IONG, "iong"}, {PinyinFinal::U, "u"}, - {PinyinFinal::UA, "ua"}, {PinyinFinal::UO, "uo"}, - {PinyinFinal::UAI, "uai"}, {PinyinFinal::UI, "ui"}, - {PinyinFinal::UAN, "uan"}, {PinyinFinal::UN, "un"}, - {PinyinFinal::UANG, "uang"}, {PinyinFinal::V, "v"}, - {PinyinFinal::UE, "ue"}, {PinyinFinal::VE, "ve"}, - {PinyinFinal::NG, "ng"}, {PinyinFinal::Zero, ""}, + {PinyinFinal::A, "a"}, {PinyinFinal::AI, "ai"}, + {PinyinFinal::AN, "an"}, {PinyinFinal::ANG, "ang"}, + {PinyinFinal::AO, "ao"}, {PinyinFinal::E, "e"}, + {PinyinFinal::EI, "ei"}, {PinyinFinal::EN, "en"}, + {PinyinFinal::ENG, "eng"}, {PinyinFinal::ER, "er"}, + {PinyinFinal::O, "o"}, {PinyinFinal::ONG, "ong"}, + {PinyinFinal::OU, "ou"}, {PinyinFinal::I, "i"}, + {PinyinFinal::IA, "ia"}, {PinyinFinal::IE, "ie"}, + {PinyinFinal::IAO, "iao"}, {PinyinFinal::IU, "iu"}, + {PinyinFinal::IAN, "ian"}, {PinyinFinal::IN, "in"}, + {PinyinFinal::IANG, "iang"}, {PinyinFinal::ING, "ing"}, + {PinyinFinal::IONG, "iong"}, {PinyinFinal::U, "u"}, + {PinyinFinal::UA, "ua"}, {PinyinFinal::UO, "uo"}, + {PinyinFinal::UAI, "uai"}, {PinyinFinal::UI, "ui"}, + {PinyinFinal::UAN, "uan"}, {PinyinFinal::UN, "un"}, + {PinyinFinal::UANG, "uang"}, {PinyinFinal::V, "v"}, + {PinyinFinal::UE, "ue"}, {PinyinFinal::VE, "ve"}, + {PinyinFinal::NG, "ng"}, {PinyinFinal::Zero, ""}, + {PinyinFinal::Letter_A, "A"}, {PinyinFinal::Letter_B, "B"}, + {PinyinFinal::Letter_C, "C"}, {PinyinFinal::Letter_D, "D"}, + {PinyinFinal::Letter_E, "E"}, {PinyinFinal::Letter_F, "F"}, + {PinyinFinal::Letter_G, "G"}, {PinyinFinal::Letter_H, "H"}, + {PinyinFinal::Letter_I, "I"}, {PinyinFinal::Letter_J, "J"}, + {PinyinFinal::Letter_K, "K"}, {PinyinFinal::Letter_L, "L"}, + {PinyinFinal::Letter_M, "M"}, {PinyinFinal::Letter_N, "N"}, + {PinyinFinal::Letter_O, "O"}, {PinyinFinal::Letter_P, "P"}, + {PinyinFinal::Letter_Q, "Q"}, {PinyinFinal::Letter_R, "R"}, + {PinyinFinal::Letter_S, "S"}, {PinyinFinal::Letter_T, "T"}, + {PinyinFinal::Letter_U, "U"}, {PinyinFinal::Letter_V, "V"}, + {PinyinFinal::Letter_W, "W"}, {PinyinFinal::Letter_X, "X"}, + {PinyinFinal::Letter_Y, "Y"}, {PinyinFinal::Letter_Z, "Z"}, }); static const int maxPinyinLength = 6; @@ -129,9 +142,11 @@ template LongestMatchResult longestMatch(Iter iter, Iter end, PinyinFuzzyFlags flags, const PinyinMap &map) { if ((*iter == 'i' || *iter == 'u' || *iter == 'v') && - !flags.test(PinyinFuzzyFlag::Correction)) { - return {false, std::string_view(&*iter, std::distance(iter, end)), - false}; + !flags.testAny(PinyinFuzzyFlags{PinyinFuzzyFlag::Correction, + PinyinFuzzyFlag::Letter})) { + return {.valid = false, + .match = std::string_view(&*iter, std::distance(iter, end)), + .isCompletePinyin = false}; } if (std::distance(iter, end) > maxPinyinLength) { end = iter + maxPinyinLength; @@ -140,13 +155,16 @@ LongestMatchResult longestMatch(Iter iter, Iter end, PinyinFuzzyFlags flags, for (; !range.empty(); range.remove_suffix(1)) { if (hasMatchInMap(map, range, flags)) { // do not consider m/n/r as complete pinyin - return {true, range, - (range != "m" && range != "n" && range != "r")}; + return {.valid = true, + .match = range, + .isCompletePinyin = + (range != "m" && range != "n" && range != "r")}; } if (range.size() <= 2) { auto iter = initialMap.right.find(std::string{range}); if (iter != initialMap.right.end()) { - return {true, range, false}; + return { + .valid = true, .match = range, .isCompletePinyin = false}; } } } @@ -154,7 +172,7 @@ LongestMatchResult longestMatch(Iter iter, Iter end, PinyinFuzzyFlags flags, assert(range.empty()); range = std::string_view(&*iter, 1); - return {false, range, false}; + return {.valid = false, .match = range, .isCompletePinyin = false}; } std::string PinyinSyllable::toString() const { @@ -172,8 +190,6 @@ PinyinEncoder::parseUserPinyin(std::string userPinyin, PinyinFuzzyFlags flags) { SegmentGraph result{std::move(userPinyin)}; auto pinyin = result.data(); - std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(), - fcitx::charutils::tolower); const auto end = pinyin.end(); if (!profile) { @@ -213,6 +229,11 @@ PinyinEncoder::parseUserPinyin(std::string userPinyin, } continue; } + if (fcitx::charutils::isupper(*iter)) { + result.addNext(top, top + 1); + q.push(top + 1); + continue; + } for (const auto fuzzyFlags : flagsToTry) { auto [valid, str, isCompletePinyin] = longestMatch(iter, end, fuzzyFlags, pinyinMap); @@ -225,10 +246,10 @@ PinyinEncoder::parseUserPinyin(std::string userPinyin, // check fuzzy seg // pinyin may end with aegimnoruv(h) // and may start with abcdefghjklmnopqrstwxyz. - // the intersection is aegmnor(h), while for m, it only 'm', so - // don't consider it also, make sure current pinyin does not end - // with a separator, other wise, jin'an may be parsed into ji'n - // because, nextMatch is starts with "'". + // the intersection is aegmnor(h), while for m, it only 'm', + // so don't consider it also, make sure current pinyin does + // not end with a separator, other wise, jin'an may be + // parsed into ji'n because, nextMatch is starts with "'". std::array nextSize; size_t nNextSize = 0; // Check if we can do fuzzy segement, e.g. @@ -251,12 +272,13 @@ PinyinEncoder::parseUserPinyin(std::string userPinyin, str.size() - 1 + nextMatchAlt.match.size(); // comparator is (validPinyin, whole size>= lhs pinyin, - // isCompletePinyin) validPinyin means it's at least some - // pinyin, instead of things startsWith i,u,v. Since - // longestMatch will now treat string startsWith iuv a whole - // segment, we need to compare validity before the length. - // If whole size is equal to lhs pinyin, then it should be - // handled by inner segement flag. + // isCompletePinyin) validPinyin means it's at least + // some pinyin, instead of things startsWith i,u,v. + // Since longestMatch will now treat string startsWith + // iuv a whole segment, we need to compare validity + // before the length. If whole size is equal to lhs + // pinyin, then it should be handled by inner segement + // flag. std::tuple compare( nextMatch.valid, true, nextMatch.isCompletePinyin); std::tuple compareAlt( @@ -309,8 +331,9 @@ PinyinEncoder::parseUserPinyin(std::string userPinyin, } else if (nextPinyin.size() == 2 && flags.test(PinyinFuzzyFlag::InnerShort) && nextPinyin == "ng") { - // Handle ng -> n'g, the condition is so simple so we - // don't make it go through the inner segment lookup. + // Handle ng -> n'g, the condition is so simple so + // we don't make it go through the inner segment + // lookup. result.addNext(top, top + 1); result.addNext(top + 1, top + 2); } @@ -327,8 +350,6 @@ SegmentGraph PinyinEncoder::parseUserShuangpin(std::string userPinyin, flags = flags.unset(PinyinFuzzyFlag::AdvancedTypo); SegmentGraph result{std::move(userPinyin)}; auto pinyin = result.data(); - std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(), - fcitx::charutils::tolower); // assume user always type valid shuangpin first, if not keep one. size_t i = 0; @@ -344,6 +365,11 @@ SegmentGraph PinyinEncoder::parseUserShuangpin(std::string userPinyin, continue; } auto initial = pinyin[i]; + if (fcitx::charutils::isupper(initial)) { + result.addNext(i, i + 1); + i = i + 1; + continue; + } char final = '\0'; if (i + 1 < pinyin.size() && pinyin[i + 1] != '\'') { final = pinyin[i + 1]; @@ -525,15 +551,15 @@ PinyinInitial PinyinEncoder::stringToInitial(const std::string &str) { const std::string &PinyinEncoder::finalToString(PinyinFinal final) { const static std::vector s = []() { std::vector s; - s.resize(lastFinal - firstFinal + 1); - for (char c = firstFinal; c <= lastFinal; c++) { + s.resize(lastLetter - firstFinal + 1); + for (char c = firstFinal; c <= lastLetter; c++) { auto iter = finalMap.left.find(static_cast(c)); s[c - firstFinal] = iter->second; } return s; }(); auto c = static_cast(final); - if (c >= firstFinal && c <= lastFinal) { + if (c >= firstFinal && c <= lastLetter) { return s[c - firstFinal]; } return emptyString; @@ -552,7 +578,7 @@ bool PinyinEncoder::isValidInitialFinal(PinyinInitial initial, if (initial != PinyinInitial::Invalid && final != PinyinFinal::Invalid) { int16_t encode = ((static_cast(initial) - PinyinEncoder::firstInitial) * - (PinyinEncoder::lastFinal - PinyinEncoder::firstFinal + 1)) + + (PinyinEncoder::lastLetter - PinyinEncoder::firstFinal + 1)) + (static_cast(final) - PinyinEncoder::firstFinal); const auto &a = getEncodedInitialFinal(); return encode < static_cast(a.size()) && a[encode]; @@ -718,8 +744,6 @@ stringToSyllablesImpl(std::string_view pinyinView, const PinyinMap &map, PinyinFuzzyFlags flags, const Adjuster &adjuster) { FuzzyPinyinSyllables result; std::string pinyin(pinyinView); - std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(), - fcitx::charutils::tolower); // we only want {M,N,R}/Invalid instead of {M,N,R}/Zero, so we could get // match for everything. if (pinyin != "m" && pinyin != "n" && pinyin != "r") { @@ -736,6 +760,16 @@ stringToSyllablesImpl(std::string_view pinyinView, const PinyinMap &map, } } + if (pinyin.size() == 1 && fcitx::charutils::islower(pinyin[0]) && + flags.test(PinyinFuzzyFlag::Letter)) { + getFuzzy(result, + {PinyinInitial::Zero, PinyinEncoder::letterToFinal(pinyin[0])}, + flags, + /*isSp=*/false, [&adjuster](PinyinFuzzyFlags flags) { + return adjuster(flags | PinyinFuzzyFlag::Letter); + }); + } + auto iter = initialMap.right.find(pinyin); if (initialMap.right.end() != iter) { getFuzzy(result, {iter->second, PinyinFinal::Invalid}, flags, @@ -803,8 +837,7 @@ shuangpinToSyllablesImpl(std::string_view pinyinView, const Adjuster &adjuster) { assert(pinyinView.size() <= 2); std::string pinyin(pinyinView); - std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(), - fcitx::charutils::tolower); + const auto &table = sp.table(); auto iter = table.find(pinyin); @@ -827,6 +860,21 @@ shuangpinToSyllablesImpl(std::string_view pinyinView, } } + if (pinyin.length() == 1 && ((fcitx::charutils::islower(pinyin[0]) && + flags.test(PinyinFuzzyFlag::Letter)) || + fcitx::charutils::isupper(pinyin[0]))) { + bool isLower = fcitx::charutils::islower(pinyin[0]); + getFuzzy(result, + {PinyinInitial::Zero, PinyinEncoder::letterToFinal(pinyin[0])}, + flags, + /*isSp=*/true, [&adjuster, isLower](PinyinFuzzyFlags flags) { + if (isLower) { + flags |= PinyinFuzzyFlag::Letter; + } + return adjuster(flags); + }); + } + if (result.empty()) { result.emplace_back( std::piecewise_construct, @@ -873,4 +921,20 @@ PinyinEncoder::shuangpinToPinyin(std::string_view pinyinView, return ""; } +bool PinyinEncoder::isFinalLetter(PinyinFinal final) { + return final >= PinyinFinal::Letter_A && final <= PinyinFinal::Letter_Z; +} + +PinyinFinal PinyinEncoder::letterToFinal(char c) { + if (c >= 'a' && c <= 'z') { + return static_cast( + static_cast(PinyinFinal::Letter_A) + (c - 'a')); + } + if (c >= 'A' && c <= 'Z') { + return static_cast( + static_cast(PinyinFinal::Letter_A) + (c - 'A')); + } + return PinyinFinal::Invalid; +} + } // namespace libime diff --git a/src/libime/pinyin/pinyinencoder.h b/src/libime/pinyin/pinyinencoder.h index 1a59ea78..b5caa2c3 100644 --- a/src/libime/pinyin/pinyinencoder.h +++ b/src/libime/pinyin/pinyinencoder.h @@ -64,6 +64,10 @@ enum class PinyinFuzzyFlag { * @since 1.1.11 */ L_R = 1 << 20, + /** + * Enable matching for lower case single pinyin as English letter. + */ + Letter = 1 << 21, }; using PinyinFuzzyFlags = fcitx::Flags; @@ -151,7 +155,33 @@ enum class PinyinFinal : char { VE, UE, NG, - Zero + Zero, + Letter_A, + Letter_B, + Letter_C, + Letter_D, + Letter_E, + Letter_F, + Letter_G, + Letter_H, + Letter_I, + Letter_J, + Letter_K, + Letter_L, + Letter_M, + Letter_N, + Letter_O, + Letter_P, + Letter_Q, + Letter_R, + Letter_S, + Letter_T, + Letter_U, + Letter_V, + Letter_W, + Letter_X, + Letter_Y, + Letter_Z, }; inline bool operator<(PinyinFinal l, PinyinFinal r) { @@ -278,6 +308,15 @@ class LIBIMEPINYIN_EXPORT PinyinEncoder { } static bool isValidInitialFinal(PinyinInitial initial, PinyinFinal final); + + /** + * Check if the final is a letter. + * @since 1.1.14 + */ + static bool isFinalLetter(PinyinFinal final); + + static PinyinFinal letterToFinal(char c); + // This will use "ü" when possible. static std::string initialFinalToPinyinString(PinyinInitial initial, PinyinFinal final); @@ -298,10 +337,13 @@ class LIBIMEPINYIN_EXPORT PinyinEncoder { const ShuangpinProfile &sp, PinyinFuzzyFlags flags); - static const char firstInitial = static_cast(PinyinInitial::B); - static const char lastInitial = static_cast(PinyinInitial::Zero); - static const char firstFinal = static_cast(PinyinFinal::A); - static const char lastFinal = static_cast(PinyinFinal::Zero); + static constexpr char firstInitial = static_cast(PinyinInitial::B); + static constexpr char lastInitial = static_cast(PinyinInitial::Zero); + static constexpr char firstFinal = static_cast(PinyinFinal::A); + static constexpr char lastFinal = static_cast(PinyinFinal::Zero); + static constexpr char firstLetter = + static_cast(PinyinFinal::Letter_A); + static constexpr char lastLetter = static_cast(PinyinFinal::Letter_Z); }; } // namespace libime diff --git a/test/testpinyindata.cpp b/test/testpinyindata.cpp index b1e1a141..7f5da32a 100644 --- a/test/testpinyindata.cpp +++ b/test/testpinyindata.cpp @@ -163,7 +163,7 @@ int main() { int16_t encode = ((static_cast(initial) - PinyinEncoder::firstInitial) * - (PinyinEncoder::lastFinal - PinyinEncoder::firstFinal + 1)) + + (PinyinEncoder::lastLetter - PinyinEncoder::firstFinal + 1)) + (static_cast(final) - PinyinEncoder::firstFinal); FCITX_ASSERT(PinyinEncoder::isValidInitialFinal(initial, final)) << " " << encode; diff --git a/test/testpinyindictionary.cpp b/test/testpinyindictionary.cpp index f0da8bf0..bd23d401 100644 --- a/test/testpinyindictionary.cpp +++ b/test/testpinyindictionary.cpp @@ -114,10 +114,27 @@ void testEscape() { << "dump: " << dump.str(); } +void testLetter() { + std::stringstream ss; + constexpr std::string_view input = R"( +X光 X'guang +)"; + + ss << input; + PinyinDictionary dict; + dict.load(PinyinDictionary::SystemDict, ss, PinyinDictFormat::Text); + FCITX_ASSERT( + dict.lookupWord(PinyinDictionary::SystemDict, "X'guang", "X光")); + std::stringstream dump; + dict.save(PinyinDictionary::SystemDict, dump, PinyinDictFormat::Text); + FCITX_ASSERT(dump.str() == "X光 X'guang 0\n") << "dump: " << dump.str(); +} + } // namespace int main() { testBasic(); testEscape(); + testLetter(); return 0; } diff --git a/test/testpinyinencoder.cpp b/test/testpinyinencoder.cpp index cb3200a9..b7cd152e 100644 --- a/test/testpinyinencoder.cpp +++ b/test/testpinyinencoder.cpp @@ -141,6 +141,16 @@ int main() { {PinyinInitial::D, {{PinyinFinal::ING, true}, {PinyinFinal::IN, false}}}}); + FCITX_ASSERT(PinyinEncoder::stringToSyllables("E", PinyinFuzzyFlags{}) == + MatchedPinyinSyllables{ + {PinyinInitial::Zero, {{PinyinFinal::Letter_E, false}}}}); + + FCITX_ASSERT(PinyinEncoder::stringToSyllables( + "e", PinyinFuzzyFlags{PinyinFuzzyFlag::Letter}) == + MatchedPinyinSyllables{{PinyinInitial::Zero, + {{PinyinFinal::E, false}, + {PinyinFinal::Letter_E, true}}}}); + for (const auto &syl : PinyinEncoder::stringToSyllables( "e", PinyinFuzzyFlags{PinyinFuzzyFlag::PartialFinal})) { for (auto f : syl.second) { @@ -165,7 +175,7 @@ int main() { check("xion", PinyinFuzzyFlag::None, {"xi", "o", "n"}); check("xiana", PinyinFuzzyFlag::None, {"xian", "a"}); - check("Nihao", PinyinFuzzyFlag::None, {"Ni", "hao"}); + check("Xguang", PinyinFuzzyFlag::None, {"X", "guang"}); } { diff --git a/test/testpinyinime.cpp b/test/testpinyinime.cpp index 47183ef2..ad524275 100644 --- a/test/testpinyinime.cpp +++ b/test/testpinyinime.cpp @@ -77,6 +77,7 @@ int main(int argc, char *argv[]) { } } else if (word.size() == 1 && (('a' <= word[0] && word[0] <= 'z') || + ('A' <= word[0] && word[0] <= 'Z') || (!c.userInput().empty() && word[0] == '\''))) { c.type(word); } else if (word.size() == 1 && ('0' <= word[0] && word[0] <= '9')) { diff --git a/test/testshuangpinprofile.cpp b/test/testshuangpinprofile.cpp index 8f0fe637..9b6c33a4 100644 --- a/test/testshuangpinprofile.cpp +++ b/test/testshuangpinprofile.cpp @@ -33,6 +33,9 @@ void checkProfile(const ShuangpinProfile &profile, bool hasSemicolon) { std::set validSyls; for (const auto &p : getPinyinMap()) { + if (PinyinEncoder::isFinalLetter(p.final())) { + continue; + } validSyls.emplace(p.initial(), p.final()); } validSyls.erase(PinyinSyllable(PinyinInitial::M, PinyinFinal::Zero));