Skip to content

Commit

Permalink
[ENHANCEMENT] - Better unicode matching (#35)
Browse files Browse the repository at this point in the history
  • Loading branch information
joshmcrae committed Oct 28, 2022
1 parent b9acb69 commit f13cf10
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 17 deletions.
114 changes: 114 additions & 0 deletions bin/generate-unicode-regex.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/usr/bin/env php
<?php

/**
 * Generates src/unicode-patterns.php from src/shortcodes-array.php.
 *
 * Every code point referenced by a shortcode is encoded as UTF-8, left-padded
 * to four bytes and grouped by its first three bytes. Runs of consecutive
 * third bytes are collapsed into one byte-range alternative. This is a
 * simplification that can over-match: each alternative spans the lowest to
 * the highest fourth byte seen anywhere in its run.
 */

/**
 * Formats a single escaped byte, or a character class when the bounds differ.
 *
 * @param string $low  Lower bound as a hex string
 * @param string $high Upper bound as a hex string
 */
$formatRange = static function (string $low, string $high): string {
    if ($low === $high) {
        return sprintf('\x%s', strtoupper($low));
    }

    return sprintf('[\x%s-\x%s]', strtoupper($low), strtoupper($high));
};

/**
 * Builds one regex alternative for a run of third bytes sharing a prefix.
 *
 * @param string   $firstByte  First (possibly padding) byte as hex
 * @param string   $secondByte Second (possibly padding) byte as hex
 * @param int      $first      Lowest third byte of the run
 * @param int      $last       Highest third byte of the run
 * @param string[] $little     Fourth bytes (hex) collected across the run
 */
$buildExpression = static function (
    string $firstByte,
    string $secondByte,
    int $first,
    int $last,
    array $little
) use ($formatRange): string {
    sort($little);

    $littleFirst = array_shift($little);
    $littleLast = array_pop($little);

    // A single-element list leaves nothing to pop; the range is one byte wide.
    if (!$littleLast) {
        $littleLast = $littleFirst;
    }

    $expression = sprintf(
        '\x%s\x%s%s%s',
        strtoupper($firstByte),
        strtoupper($secondByte),
        $formatRange(dechex($first), dechex($last)),
        $formatRange($littleFirst, $littleLast)
    );

    // Drop the 00 bytes that were only added as left padding.
    return preg_replace('/^(\\\\x00)+/', '', $expression);
};

// build ordered list of codepoints
$shortcodes = require(__DIR__ . '/../src/shortcodes-array.php');

$codepoints = [];
foreach ($shortcodes as $hexcodes) {
    foreach (explode('-', $hexcodes) as $hexcode) {
        $codepoints[] = hexdec($hexcode);
    }
}

sort($codepoints);

// convert codepoints to UTF-8 and build structured list
$utf8 = [];
foreach ($codepoints as $dec) {
    $bytes = str_split(bin2hex(mb_chr($dec, 'UTF-8')), 2);

    // Left-pad with 00 so every sequence is addressed by exactly four bytes.
    [$byte1, $byte2, $byte3, $byte4] = array_pad($bytes, -4, '00');

    $utf8[$byte1][$byte2][$byte3][] = $byte4;
}

// build simplified regex
$expressions = [];
foreach ($utf8 as $firstByte => $secondBytes) {
    foreach ($secondBytes as $secondByte => $thirdBytes) {
        $first = $last = null;
        $little = [];

        foreach ($thirdBytes as $thirdByte => $fourthBytes) {
            // Purely numeric hex keys such as "98" come back from PHP as
            // integers, so cast before treating them as hex strings.
            $current = hexdec((string) $thirdByte);

            // NOTE: this truthiness test also fires when the run started at
            // third byte 00 (single-byte/ASCII code points), so that group is
            // replaced by the next one instead of being emitted. The generated
            // pattern relies on this to keep plain ASCII such as digits and
            // "#" out of the match; do not change it to a null check without
            // handling ASCII separately.
            if (!$first) {
                $first = $last = $current;
                $little = $fourthBytes;
                continue;
            }

            // A gap in the third byte ends the current run.
            if (($current - $last) > 1) {
                $expressions[] = $buildExpression(
                    (string) $firstByte,
                    (string) $secondByte,
                    $first,
                    $last,
                    $little
                );

                $first = $current;
                $little = [];
            }

            $last = $current;
            $little = array_merge($little, $fourthBytes);
        }

        // Flush the run that was still open when the third bytes ran out.
        $expressions[] = $buildExpression(
            (string) $firstByte,
            (string) $secondByte,
            $first,
            $last,
            $little
        );
    }
}

$match = sprintf('/(%s)/x', implode(PHP_EOL . '|', $expressions));
$target = __DIR__ . '/../src/unicode-patterns.php';

// Fail loudly so the composer script does not report success on a bad write.
if (file_put_contents($target, sprintf("<?php\nreturn '%s';", $match)) === false) {
    fwrite(STDERR, sprintf("Unable to write %s\n", $target));
    exit(1);
}
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
},
"scripts": {
"update-resources": [
"@php bin/generate-shortcodes-array.php"
"@php bin/generate-shortcodes-array.php",
"@php bin/generate-unicode-regex.php"
]
}
}
29 changes: 14 additions & 15 deletions src/LitEmoji.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,7 @@

class LitEmoji
{
public const MB_REGEX = '/(
\x23\xE2\x83\xA3 # Digits
[\x30-\x39]\xE2\x83\xA3
| \xE2[\x9C-\x9E][\x80-\xBF] # Dingbats
| \xF0\x9F[\x85-\x88][\xA6-\xBF] # Enclosed characters
| \xF0\x9F[\x8C-\x97][\x80-\xBF] # Misc
| \xF0\x9F\x98[\x80-\xBF] # Smilies
| \xF0\x9F\x99[\x80-\x8F]
| \xF0\x9F[\x9A-\x9B][\x80-\xBF] # Transport and map symbols
| \xF0\x9F[\xA4-\xA7][\x80-\xBF] # Supplementary symbols and pictographs
)/x';

private static $regex = null;
private static $shortcodes = [];
private static $shortcodeCodepoints = [];
private static $shortcodeEntities = [];
Expand Down Expand Up @@ -113,7 +102,7 @@ public static function unicodeToShortcode(string $content): string

/* Break content along codepoint boundaries */
$parts = preg_split(
self::MB_REGEX,
self::getRegex(),
$content,
-1,
PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
Expand All @@ -122,7 +111,7 @@ public static function unicodeToShortcode(string $content): string
/* Reconstruct content using shortcodes */
$sequence = [];
foreach ($parts as $offset => $part) {
if (preg_match(self::MB_REGEX, $part)) {
if (preg_match(self::getRegex(), $part)) {
$part = mb_convert_encoding($part, 'UTF-32', $encoding);
$words = unpack('N*', $part);
$codepoint = sprintf('%X', reset($words));
Expand Down Expand Up @@ -188,7 +177,7 @@ public static function config(string $property, $value): void
break;
}
}

/**
* Removes all emoji-sequences from string.
*
Expand All @@ -202,6 +191,16 @@ public static function removeEmoji(string $source): string
return $content;
}

/**
 * Returns the emoji-matching pattern, loading it from
 * unicode-patterns.php on first use and caching it afterwards.
 */
private static function getRegex()
{
    if (self::$regex === null) {
        self::$regex = require(__DIR__ . '/unicode-patterns.php');
    }

    return self::$regex;
}

private static function getShortcodes()
{
if (!empty(self::$shortcodes)) {
Expand Down
22 changes: 22 additions & 0 deletions src/unicode-patterns.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?php
// Generated by bin/generate-unicode-regex.php; regenerate rather than editing
// by hand. Each alternative is a UTF-8 byte sequence (or byte range) and the
// /x flag makes the line breaks between alternatives insignificant.
return '/(\xC2[\xA9-\xAE]
|\xE2[\x80-\x81][\x89-\xBC]
|\xE2[\x83-\x84][\xA2-\xB9]
|\xE2\x86[\x94-\xAA]
|\xE2\x8C[\x9A-\xA8]
|\xE2\x8F[\x8F-\xBA]
|\xE2\x93\x82
|\xE2[\x96-\x9E][\x80-\xBF]
|\xE2\xA4[\xB4-\xB5]
|\xE2[\xAC-\xAD][\x85-\x9C]
|\xE3\x80[\xB0-\xBD]
|\xE3\x8A[\x97-\x99]
|\xEF\xB8\x8F
|\xF0\x9F\x80\x84
|\xF0\x9F\x83\x8F
|\xF0\x9F[\x85-\x89][\x81-\xBF]
|\xF0\x9F[\x8C-\x9B][\x80-\xBF]
|\xF0\x9F\x9F[\xA0-\xB0]
|\xF0\x9F[\xA4-\xA7][\x80-\xBF]
|\xF0\x9F[\xA9-\xAB][\x80-\xBC]
|\xF3\xA0\x81[\xA2-\xBF])/x';
15 changes: 14 additions & 1 deletion tests/LitEmojiTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public function testUnicodeToShortcodeTiming()
$text = LitEmoji::encodeShortcode(file_get_contents(__DIR__ . '/UnicodeIpsum'));
$this->assertEquals(file_get_contents(__DIR__ . '/ShortcodeIpsum'), $text);
}

public function testRemoveEmoji()
{
$text = LitEmoji::removeEmoji('Some text 馃槉 including emoji 馃殌');
Expand All @@ -60,6 +60,19 @@ public function testConfigExcludeShortcodes()
$this->assertEquals(':iphone:', LitEmoji::encodeShortcode('馃摫'));
}

/**
 * Encodes each shortcode from the shortcode table to unicode and checks
 * that unicodeToShortcode() changes the result, i.e. the pattern
 * recognised the emoji.
 */
public function testUnicodeMatching()
{
    // Flipping leaves one shortcode per distinct value in the table.
    $flipped = array_flip(require(__DIR__ . '/../src/shortcodes-array.php'));

    foreach ($flipped as $name) {
        $emoji = LitEmoji::encodeUnicode(sprintf(':%s:', $name));
        $roundTrip = LitEmoji::unicodeToShortcode($emoji);

        $this->assertNotEquals($emoji, $roundTrip);
    }
}

public function testIssue25()
{
$text = LitEmoji::encodeShortcode('馃殌馃洅');
Expand Down

0 comments on commit f13cf10

Please sign in to comment.