Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1650 lines (1462 sloc) 70.1 KB
<?php
/**
* Provides string functions for UTF-8 strings
*
* This class is implemented to provide a UTF-8 version of almost every built-in
* PHP string function. For more information about UTF-8, please visit
* http://flourishlib.com/docs/UTF-8.
*
* @copyright Copyright (c) 2008-2012 Will Bond
* @author Will Bond [wb] <will@flourishlib.com>
* @license http://flourishlib.com/license
*
* @package Flourish
* @link http://flourishlib.com/fUTF8
*
* @version 1.0.0b16
* @changes 1.0.0b16 Added code to ::clean() to use mbstring if available since recent versions of iconv and `//IGNORE` now return `FALSE` for bad encodings [wb, 2012-09-21]
* @changes 1.0.0b15 Fixed a bug with using IBM's iconv implementation on AIX [wb, 2011-07-29]
* @changes 1.0.0b14 Added a workaround for iconv having issues in MAMP 1.9.4+ [wb, 2011-07-26]
* @changes 1.0.0b13 Fixed notices from being thrown when invalid data is sent to ::clean() [wb, 2011-06-10]
* @changes 1.0.0b12 Fixed a variable name typo in ::sub() [wb, 2011-05-09]
* @changes 1.0.0b11 Updated the class to not use phpinfo() to determine the iconv implementation [wb, 2010-11-04]
* @changes 1.0.0b10 Fixed a bug with capitalizing a lowercase i resulting in a dotted upper-case I [wb, 2010-11-01]
* @changes 1.0.0b9 Updated class to use fCore::startErrorCapture() instead of `error_reporting()` [wb, 2010-08-09]
* @changes 1.0.0b8 Removed `e` flag from preg_replace() calls [wb, 2010-06-08]
* @changes 1.0.0b7 Added the methods ::trim(), ::rtrim() and ::ltrim() [wb, 2010-05-11]
* @changes 1.0.0b6 Fixed ::clean() to work with PHP installs that use an iconv library that doesn't support //IGNORE [wb, 2010-03-02]
* @changes 1.0.0b5 Changed ::ucwords() to also uppercase words right after various punctuation [wb, 2009-09-18]
* @changes 1.0.0b4 Changed replacement values in preg_replace() calls to be properly escaped [wb, 2009-06-11]
* @changes 1.0.0b3 Fixed a parameter name in ::rpos() from `$search` to `$needle` [wb, 2009-02-06]
* @changes 1.0.0b2 Fixed a bug in ::explode() with newlines and zero-length delimiters [wb, 2009-02-05]
* @changes 1.0.0b The initial implementation [wb, 2008-06-01]
*/
class fUTF8
{
// The following constants allow for nice looking callbacks to static methods:
// a callback can be written as fUTF8::lower instead of the quoted string
// 'fUTF8::lower'. The value of every constant is the string 'fUTF8::'
// followed by the constant's own name.
const ascii = 'fUTF8::ascii';
const chr = 'fUTF8::chr';
const clean = 'fUTF8::clean';
const cmp = 'fUTF8::cmp';
const explode = 'fUTF8::explode';
const icmp = 'fUTF8::icmp';
const inatcmp = 'fUTF8::inatcmp';
const ipos = 'fUTF8::ipos';
const ireplace = 'fUTF8::ireplace';
const irpos = 'fUTF8::irpos';
const istr = 'fUTF8::istr';
const len = 'fUTF8::len';
const lower = 'fUTF8::lower';
const ltrim = 'fUTF8::ltrim';
const natcmp = 'fUTF8::natcmp';
const ord = 'fUTF8::ord';
const pad = 'fUTF8::pad';
const pos = 'fUTF8::pos';
const replace = 'fUTF8::replace';
const reset = 'fUTF8::reset';
const rev = 'fUTF8::rev';
const rpos = 'fUTF8::rpos';
const rtrim = 'fUTF8::rtrim';
const str = 'fUTF8::str';
const sub = 'fUTF8::sub';
const trim = 'fUTF8::trim';
const ucfirst = 'fUTF8::ucfirst';
const ucwords = 'fUTF8::ucwords';
const upper = 'fUTF8::upper';
const wordwrap = 'fUTF8::wordwrap';
/**
* Depending on how things are compiled, NetBSD and Solaris don't support //IGNORE in iconv()
*
* If //IGNORE support is not provided strings with invalid characters will be truncated
*
* @var boolean
*/
static private $can_ignore_invalid = NULL;
/**
* All lowercase UTF-8 characters mapped to uppercase characters
*
* @var array
*/
static private $lower_to_upper = array(
'a' => 'A', 'b' => 'B', 'c' => 'C', 'd' => 'D', 'e' => 'E', 'f' => 'F',
'g' => 'G', 'h' => 'H', 'i' => 'I', 'j' => 'J', 'k' => 'K', 'l' => 'L',
'm' => 'M', 'n' => 'N', 'o' => 'O', 'p' => 'P', 'q' => 'Q', 'r' => 'R',
's' => 'S', 't' => 'T', 'u' => 'U', 'v' => 'V', 'w' => 'W', 'x' => 'X',
'y' => 'Y', 'z' => 'Z', 'à' => 'À', 'á' => 'Á', 'â' => 'Â', 'ã' => 'Ã',
'ä' => 'Ä', 'å' => 'Å', 'æ' => 'Æ', 'ç' => 'Ç', 'è' => 'È', 'é' => 'É',
'ê' => 'Ê', 'ë' => 'Ë', 'ì' => 'Ì', 'í' => 'Í', 'î' => 'Î', 'ï' => 'Ï',
'ð' => 'Ð', 'ñ' => 'Ñ', 'ò' => 'Ò', 'ó' => 'Ó', 'ô' => 'Ô', 'õ' => 'Õ',
'ö' => 'Ö', 'ø' => 'Ø', 'ù' => 'Ù', 'ú' => 'Ú', 'û' => 'Û', 'ü' => 'Ü',
'ý' => 'Ý', 'þ' => 'Þ', 'ā' => 'Ā', 'ă' => 'Ă', 'ą' => 'Ą', 'ć' => 'Ć',
'ĉ' => 'Ĉ', 'ċ' => 'Ċ', 'č' => 'Č', 'ď' => 'Ď', 'đ' => 'Đ', 'ē' => 'Ē',
'ĕ' => 'Ĕ', 'ė' => 'Ė', 'ę' => 'Ę', 'ě' => 'Ě', 'ĝ' => 'Ĝ', 'ğ' => 'Ğ',
'ġ' => 'Ġ', 'ģ' => 'Ģ', 'ĥ' => 'Ĥ', 'ħ' => 'Ħ', 'ĩ' => 'Ĩ', 'ī' => 'Ī',
'ĭ' => 'Ĭ', 'į' => 'Į', 'ij' => 'IJ', 'ĵ' => 'Ĵ', 'ķ' => 'Ķ', 'ĺ' => 'Ĺ',
'ļ' => 'Ļ', 'ľ' => 'Ľ', 'ŀ' => 'Ŀ', 'ł' => 'Ł', 'ń' => 'Ń', 'ņ' => 'Ņ',
'ň' => 'Ň', 'ŋ' => 'Ŋ', 'ō' => 'Ō', 'ŏ' => 'Ŏ', 'ő' => 'Ő', 'œ' => 'Œ',
'ŕ' => 'Ŕ', 'ŗ' => 'Ŗ', 'ř' => 'Ř', 'ś' => 'Ś', 'ŝ' => 'Ŝ', 'ş' => 'Ş',
'š' => 'Š', 'ţ' => 'Ţ', 'ť' => 'Ť', 'ŧ' => 'Ŧ', 'ũ' => 'Ũ', 'ū' => 'Ū',
'ŭ' => 'Ŭ', 'ů' => 'Ů', 'ű' => 'Ű', 'ų' => 'Ų', 'ŵ' => 'Ŵ', 'ŷ' => 'Ŷ',
'ÿ' => 'Ÿ', 'ź' => 'Ź', 'ż' => 'Ż', 'ž' => 'Ž', 'ɓ' => 'Ɓ', 'ƃ' => 'Ƃ',
'ƅ' => 'Ƅ', 'ɔ' => 'Ɔ', 'ƈ' => 'Ƈ', 'ɗ' => 'Ɗ', 'ƌ' => 'Ƌ', 'ɘ' => 'Ǝ',
'ə' => 'Ə', 'ɛ' => 'Ɛ', 'ƒ' => 'Ƒ', 'ɠ' => 'Ɠ', 'ɣ' => 'Ɣ', 'ɩ' => 'Ɩ',
'ɨ' => 'Ɨ', 'ƙ' => 'Ƙ', 'ɯ' => 'Ɯ', 'ɲ' => 'Ɲ', 'ɵ' => 'Ɵ', 'ơ' => 'Ơ',
'ƣ' => 'Ƣ', 'ƥ' => 'Ƥ', 'ƨ' => 'Ƨ', 'ʃ' => 'Ʃ', 'ƭ' => 'Ƭ', 'ʈ' => 'Ʈ',
'ư' => 'Ư', 'ʊ' => 'Ʊ', 'ʋ' => 'Ʋ', 'ƴ' => 'Ƴ', 'ƶ' => 'Ƶ', 'ʒ' => 'Ʒ',
'ƹ' => 'Ƹ', 'ƽ' => 'Ƽ', 'dž' => 'DŽ', 'dž' => 'Dž', 'lj' => 'LJ', 'lj' => 'Lj',
'nj' => 'NJ', 'nj' => 'Nj', 'ǎ' => 'Ǎ', 'ǐ' => 'Ǐ', 'ǒ' => 'Ǒ', 'ǔ' => 'Ǔ',
'ǖ' => 'Ǖ', 'ǘ' => 'Ǘ', 'ǚ' => 'Ǚ', 'ǜ' => 'Ǜ', 'ǟ' => 'Ǟ', 'ǡ' => 'Ǡ',
'ǣ' => 'Ǣ', 'ǥ' => 'Ǥ', 'ǧ' => 'Ǧ', 'ǩ' => 'Ǩ', 'ǫ' => 'Ǫ', 'ǭ' => 'Ǭ',
'ǯ' => 'Ǯ', 'dz' => 'DZ', 'ǵ' => 'Ǵ', 'ǻ' => 'Ǻ', 'ǽ' => 'Ǽ', 'ǿ' => 'Ǿ',
'ȁ' => 'Ȁ', 'ȃ' => 'Ȃ', 'ȅ' => 'Ȅ', 'ȇ' => 'Ȇ', 'ȉ' => 'Ȉ', 'ȋ' => 'Ȋ',
'ȍ' => 'Ȍ', 'ȏ' => 'Ȏ', 'ȑ' => 'Ȑ', 'ȓ' => 'Ȓ', 'ȕ' => 'Ȕ', 'ȗ' => 'Ȗ',
'ά' => 'Ά', 'έ' => 'Έ', 'ή' => 'Ή', 'ί' => 'Ί', 'ό' => 'Ό', 'ύ' => 'Ύ',
'ώ' => 'Ώ', 'α' => 'Α', 'β' => 'Β', 'γ' => 'Γ', 'δ' => 'Δ', 'ε' => 'Ε',
'ζ' => 'Ζ', 'η' => 'Η', 'θ' => 'Θ', 'ι' => 'Ι', 'κ' => 'Κ', 'λ' => 'Λ',
'μ' => 'Μ', 'ν' => 'Ν', 'ξ' => 'Ξ', 'ο' => 'Ο', 'π' => 'Π', 'ρ' => 'Ρ',
'σ' => 'Σ', 'τ' => 'Τ', 'υ' => 'Υ', 'φ' => 'Φ', 'χ' => 'Χ', 'ψ' => 'Ψ',
'ω' => 'Ω', 'ϊ' => 'Ϊ', 'ϋ' => 'Ϋ', 'ϣ' => 'Ϣ', 'ϥ' => 'Ϥ', 'ϧ' => 'Ϧ',
'ϩ' => 'Ϩ', 'ϫ' => 'Ϫ', 'ϭ' => 'Ϭ', 'ϯ' => 'Ϯ', 'ё' => 'Ё', 'ђ' => 'Ђ',
'ѓ' => 'Ѓ', 'є' => 'Є', 'ѕ' => 'Ѕ', 'і' => 'І', 'ї' => 'Ї', 'ј' => 'Ј',
'љ' => 'Љ', 'њ' => 'Њ', 'ћ' => 'Ћ', 'ќ' => 'Ќ', 'ў' => 'Ў', 'џ' => 'Џ',
'а' => 'А', 'б' => 'Б', 'в' => 'В', 'г' => 'Г', 'д' => 'Д', 'е' => 'Е',
'ж' => 'Ж', 'з' => 'З', 'и' => 'И', 'й' => 'Й', 'к' => 'К', 'л' => 'Л',
'м' => 'М', 'н' => 'Н', 'о' => 'О', 'п' => 'П', 'р' => 'Р', 'с' => 'С',
'т' => 'Т', 'у' => 'У', 'ф' => 'Ф', 'х' => 'Х', 'ц' => 'Ц', 'ч' => 'Ч',
'ш' => 'Ш', 'щ' => 'Щ', 'ъ' => 'Ъ', 'ы' => 'Ы', 'ь' => 'Ь', 'э' => 'Э',
'ю' => 'Ю', 'я' => 'Я', 'ѡ' => 'Ѡ', 'ѣ' => 'Ѣ', 'ѥ' => 'Ѥ', 'ѧ' => 'Ѧ',
'ѩ' => 'Ѩ', 'ѫ' => 'Ѫ', 'ѭ' => 'Ѭ', 'ѯ' => 'Ѯ', 'ѱ' => 'Ѱ', 'ѳ' => 'Ѳ',
'ѵ' => 'Ѵ', 'ѷ' => 'Ѷ', 'ѹ' => 'Ѹ', 'ѻ' => 'Ѻ', 'ѽ' => 'Ѽ', 'ѿ' => 'Ѿ',
'ҁ' => 'Ҁ', 'ґ' => 'Ґ', 'ғ' => 'Ғ', 'ҕ' => 'Ҕ', 'җ' => 'Җ', 'ҙ' => 'Ҙ',
'қ' => 'Қ', 'ҝ' => 'Ҝ', 'ҟ' => 'Ҟ', 'ҡ' => 'Ҡ', 'ң' => 'Ң', 'ҥ' => 'Ҥ',
'ҧ' => 'Ҧ', 'ҩ' => 'Ҩ', 'ҫ' => 'Ҫ', 'ҭ' => 'Ҭ', 'ү' => 'Ү', 'ұ' => 'Ұ',
'ҳ' => 'Ҳ', 'ҵ' => 'Ҵ', 'ҷ' => 'Ҷ', 'ҹ' => 'Ҹ', 'һ' => 'Һ', 'ҽ' => 'Ҽ',
'ҿ' => 'Ҿ', 'ӂ' => 'Ӂ', 'ӄ' => 'Ӄ', 'ӈ' => 'Ӈ', 'ӌ' => 'Ӌ', 'ӑ' => 'Ӑ',
'ӓ' => 'Ӓ', 'ӕ' => 'Ӕ', 'ӗ' => 'Ӗ', 'ә' => 'Ә', 'ӛ' => 'Ӛ', 'ӝ' => 'Ӝ',
'ӟ' => 'Ӟ', 'ӡ' => 'Ӡ', 'ӣ' => 'Ӣ', 'ӥ' => 'Ӥ', 'ӧ' => 'Ӧ', 'ө' => 'Ө',
'ӫ' => 'Ӫ', 'ӯ' => 'Ӯ', 'ӱ' => 'Ӱ', 'ӳ' => 'Ӳ', 'ӵ' => 'Ӵ', 'ӹ' => 'Ӹ',
'ա' => 'Ա', 'բ' => 'Բ', 'գ' => 'Գ', 'դ' => 'Դ', 'ե' => 'Ե', 'զ' => 'Զ',
'է' => 'Է', 'ը' => 'Ը', 'թ' => 'Թ', 'ժ' => 'Ժ', 'ի' => 'Ի', 'լ' => 'Լ',
'խ' => 'Խ', 'ծ' => 'Ծ', 'կ' => 'Կ', 'հ' => 'Հ', 'ձ' => 'Ձ', 'ղ' => 'Ղ',
'ճ' => 'Ճ', 'մ' => 'Մ', 'յ' => 'Յ', 'ն' => 'Ն', 'շ' => 'Շ', 'ո' => 'Ո',
'չ' => 'Չ', 'պ' => 'Պ', 'ջ' => 'Ջ', 'ռ' => 'Ռ', 'ս' => 'Ս', 'վ' => 'Վ',
'տ' => 'Տ', 'ր' => 'Ր', 'ց' => 'Ց', 'ւ' => 'Ւ', 'փ' => 'Փ', 'ք' => 'Ք',
'օ' => 'Օ', 'ֆ' => 'Ֆ', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', 'ḿ' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', 'ṿ' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', 'ế' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => 'Ἷ', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => ''
);
/**
* All lowercase UTF-8 characters not properly handled by [http://php.net/mb_strtoupper mb_strtoupper()] mapped to uppercase characters
*
* @var array
*/
static private $mb_lower_to_upper_fix = array(
'ɘ' => 'Ǝ', 'Dz' => 'DZ', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => ''
);
/**
* All uppercase UTF-8 characters not properly handled by [http://php.net/mb_strtolower mb_strtolower()] mapped to lowercase characters
*
* @var array
*/
static private $mb_upper_to_lower_fix = array(
'ǝ' => 'ɘ', 'Dž' => 'dž', 'Lj' => 'lj', 'Nj' => 'nj', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => ''
);
/**
* All uppercase UTF-8 characters mapped to lowercase characters
*
* @var array
*/
static private $upper_to_lower = array(
'A' => 'a', 'B' => 'b', 'C' => 'c', 'D' => 'd', 'E' => 'e', 'F' => 'f',
'G' => 'g', 'H' => 'h', 'I' => 'i', 'J' => 'j', 'K' => 'k', 'L' => 'l',
'M' => 'm', 'N' => 'n', 'O' => 'o', 'P' => 'p', 'Q' => 'q', 'R' => 'r',
'S' => 's', 'T' => 't', 'U' => 'u', 'V' => 'v', 'W' => 'w', 'X' => 'x',
'Y' => 'y', 'Z' => 'z', 'À' => 'à', 'Á' => 'á', 'Â' => 'â', 'Ã' => 'ã',
'Ä' => 'ä', 'Å' => 'å', 'Æ' => 'æ', 'Ç' => 'ç', 'È' => 'è', 'É' => 'é',
'Ê' => 'ê', 'Ë' => 'ë', 'Ì' => 'ì', 'Í' => 'í', 'Î' => 'î', 'Ï' => 'ï',
'Ð' => 'ð', 'Ñ' => 'ñ', 'Ò' => 'ò', 'Ó' => 'ó', 'Ô' => 'ô', 'Õ' => 'õ',
'Ö' => 'ö', 'Ø' => 'ø', 'Ù' => 'ù', 'Ú' => 'ú', 'Û' => 'û', 'Ü' => 'ü',
'Ý' => 'ý', 'Þ' => 'þ', 'Ā' => 'ā', 'Ă' => 'ă', 'Ą' => 'ą', 'Ć' => 'ć',
'Ĉ' => 'ĉ', 'Ċ' => 'ċ', 'Č' => 'č', 'Ď' => 'ď', 'Đ' => 'đ', 'Ē' => 'ē',
'Ĕ' => 'ĕ', 'Ė' => 'ė', 'Ę' => 'ę', 'Ě' => 'ě', 'Ĝ' => 'ĝ', 'Ğ' => 'ğ',
'Ġ' => 'ġ', 'Ģ' => 'ģ', 'Ĥ' => 'ĥ', 'Ħ' => 'ħ', 'Ĩ' => 'ĩ', 'Ī' => 'ī',
'Ĭ' => 'ĭ', 'Į' => 'į', 'İ' => 'i', 'IJ' => 'ij', 'Ĵ' => 'ĵ', 'Ķ' => 'ķ',
'Ĺ' => 'ĺ', 'Ļ' => 'ļ', 'Ľ' => 'ľ', 'Ŀ' => 'ŀ', 'Ł' => 'ł', 'Ń' => 'ń',
'Ņ' => 'ņ', 'Ň' => 'ň', 'Ŋ' => 'ŋ', 'Ō' => 'ō', 'Ŏ' => 'ŏ', 'Ő' => 'ő',
'Œ' => 'œ', 'Ŕ' => 'ŕ', 'Ŗ' => 'ŗ', 'Ř' => 'ř', 'Ś' => 'ś', 'Ŝ' => 'ŝ',
'Ş' => 'ş', 'Š' => 'š', 'Ţ' => 'ţ', 'Ť' => 'ť', 'Ŧ' => 'ŧ', 'Ũ' => 'ũ',
'Ū' => 'ū', 'Ŭ' => 'ŭ', 'Ů' => 'ů', 'Ű' => 'ű', 'Ų' => 'ų', 'Ŵ' => 'ŵ',
'Ŷ' => 'ŷ', 'Ÿ' => 'ÿ', 'Ź' => 'ź', 'Ż' => 'ż', 'Ž' => 'ž', 'Ɓ' => 'ɓ',
'Ƃ' => 'ƃ', 'Ƅ' => 'ƅ', 'Ɔ' => 'ɔ', 'Ƈ' => 'ƈ', 'Ɗ' => 'ɗ', 'Ƌ' => 'ƌ',
'Ǝ' => 'ɘ', 'Ə' => 'ə', 'Ɛ' => 'ɛ', 'Ƒ' => 'ƒ', 'Ɠ' => 'ɠ', 'Ɣ' => 'ɣ',
'Ɩ' => 'ɩ', 'Ɨ' => 'ɨ', 'Ƙ' => 'ƙ', 'Ɯ' => 'ɯ', 'Ɲ' => 'ɲ', 'Ɵ' => 'ɵ',
'Ơ' => 'ơ', 'Ƣ' => 'ƣ', 'Ƥ' => 'ƥ', 'Ƨ' => 'ƨ', 'Ʃ' => 'ʃ', 'Ƭ' => 'ƭ',
'Ʈ' => 'ʈ', 'Ư' => 'ư', 'Ʊ' => 'ʊ', 'Ʋ' => 'ʋ', 'Ƴ' => 'ƴ', 'Ƶ' => 'ƶ',
'Ʒ' => 'ʒ', 'Ƹ' => 'ƹ', 'Ƽ' => 'ƽ', 'DŽ' => 'dž', 'Dž' => 'dž', 'LJ' => 'lj',
'Lj' => 'lj', 'NJ' => 'nj', 'Nj' => 'nj', 'Ǎ' => 'ǎ', 'Ǐ' => 'ǐ', 'Ǒ' => 'ǒ',
'Ǔ' => 'ǔ', 'Ǖ' => 'ǖ', 'Ǘ' => 'ǘ', 'Ǚ' => 'ǚ', 'Ǜ' => 'ǜ', 'Ǟ' => 'ǟ',
'Ǡ' => 'ǡ', 'Ǣ' => 'ǣ', 'Ǥ' => 'ǥ', 'Ǧ' => 'ǧ', 'Ǩ' => 'ǩ', 'Ǫ' => 'ǫ',
'Ǭ' => 'ǭ', 'Ǯ' => 'ǯ', 'DZ' => 'dz', 'Ǵ' => 'ǵ', 'Ǻ' => 'ǻ', 'Ǽ' => 'ǽ',
'Ǿ' => 'ǿ', 'Ȁ' => 'ȁ', 'Ȃ' => 'ȃ', 'Ȅ' => 'ȅ', 'Ȇ' => 'ȇ', 'Ȉ' => 'ȉ',
'Ȋ' => 'ȋ', 'Ȍ' => 'ȍ', 'Ȏ' => 'ȏ', 'Ȑ' => 'ȑ', 'Ȓ' => 'ȓ', 'Ȕ' => 'ȕ',
'Ȗ' => 'ȗ', 'Ά' => 'ά', 'Έ' => 'έ', 'Ή' => 'ή', 'Ί' => 'ί', 'Ό' => 'ό',
'Ύ' => 'ύ', 'Ώ' => 'ώ', 'Α' => 'α', 'Β' => 'β', 'Γ' => 'γ', 'Δ' => 'δ',
'Ε' => 'ε', 'Ζ' => 'ζ', 'Η' => 'η', 'Θ' => 'θ', 'Ι' => 'ι', 'Κ' => 'κ',
'Λ' => 'λ', 'Μ' => 'μ', 'Ν' => 'ν', 'Ξ' => 'ξ', 'Ο' => 'ο', 'Π' => 'π',
'Ρ' => 'ρ', 'Σ' => 'σ', 'Τ' => 'τ', 'Υ' => 'υ', 'Φ' => 'φ', 'Χ' => 'χ',
'Ψ' => 'ψ', 'Ω' => 'ω', 'Ϊ' => 'ϊ', 'Ϋ' => 'ϋ', 'Ϣ' => 'ϣ', 'Ϥ' => 'ϥ',
'Ϧ' => 'ϧ', 'Ϩ' => 'ϩ', 'Ϫ' => 'ϫ', 'Ϭ' => 'ϭ', 'Ϯ' => 'ϯ', 'Ё' => 'ё',
'Ђ' => 'ђ', 'Ѓ' => 'ѓ', 'Є' => 'є', 'Ѕ' => 'ѕ', 'І' => 'і', 'Ї' => 'ї',
'Ј' => 'ј', 'Љ' => 'љ', 'Њ' => 'њ', 'Ћ' => 'ћ', 'Ќ' => 'ќ', 'Ў' => 'ў',
'Џ' => 'џ', 'А' => 'а', 'Б' => 'б', 'В' => 'в', 'Г' => 'г', 'Д' => 'д',
'Е' => 'е', 'Ж' => 'ж', 'З' => 'з', 'И' => 'и', 'Й' => 'й', 'К' => 'к',
'Л' => 'л', 'М' => 'м', 'Н' => 'н', 'О' => 'о', 'П' => 'п', 'Р' => 'р',
'С' => 'с', 'Т' => 'т', 'У' => 'у', 'Ф' => 'ф', 'Х' => 'х', 'Ц' => 'ц',
'Ч' => 'ч', 'Ш' => 'ш', 'Щ' => 'щ', 'Ъ' => 'ъ', 'Ы' => 'ы', 'Ь' => 'ь',
'Э' => 'э', 'Ю' => 'ю', 'Я' => 'я', 'Ѡ' => 'ѡ', 'Ѣ' => 'ѣ', 'Ѥ' => 'ѥ',
'Ѧ' => 'ѧ', 'Ѩ' => 'ѩ', 'Ѫ' => 'ѫ', 'Ѭ' => 'ѭ', 'Ѯ' => 'ѯ', 'Ѱ' => 'ѱ',
'Ѳ' => 'ѳ', 'Ѵ' => 'ѵ', 'Ѷ' => 'ѷ', 'Ѹ' => 'ѹ', 'Ѻ' => 'ѻ', 'Ѽ' => 'ѽ',
'Ѿ' => 'ѿ', 'Ҁ' => 'ҁ', 'Ґ' => 'ґ', 'Ғ' => 'ғ', 'Ҕ' => 'ҕ', 'Җ' => 'җ',
'Ҙ' => 'ҙ', 'Қ' => 'қ', 'Ҝ' => 'ҝ', 'Ҟ' => 'ҟ', 'Ҡ' => 'ҡ', 'Ң' => 'ң',
'Ҥ' => 'ҥ', 'Ҧ' => 'ҧ', 'Ҩ' => 'ҩ', 'Ҫ' => 'ҫ', 'Ҭ' => 'ҭ', 'Ү' => 'ү',
'Ұ' => 'ұ', 'Ҳ' => 'ҳ', 'Ҵ' => 'ҵ', 'Ҷ' => 'ҷ', 'Ҹ' => 'ҹ', 'Һ' => 'һ',
'Ҽ' => 'ҽ', 'Ҿ' => 'ҿ', 'Ӂ' => 'ӂ', 'Ӄ' => 'ӄ', 'Ӈ' => 'ӈ', 'Ӌ' => 'ӌ',
'Ӑ' => 'ӑ', 'Ӓ' => 'ӓ', 'Ӕ' => 'ӕ', 'Ӗ' => 'ӗ', 'Ә' => 'ә', 'Ӛ' => 'ӛ',
'Ӝ' => 'ӝ', 'Ӟ' => 'ӟ', 'Ӡ' => 'ӡ', 'Ӣ' => 'ӣ', 'Ӥ' => 'ӥ', 'Ӧ' => 'ӧ',
'Ө' => 'ө', 'Ӫ' => 'ӫ', 'Ӯ' => 'ӯ', 'Ӱ' => 'ӱ', 'Ӳ' => 'ӳ', 'Ӵ' => 'ӵ',
'Ӹ' => 'ӹ', 'Ա' => 'ա', 'Բ' => 'բ', 'Գ' => 'գ', 'Դ' => 'դ', 'Ե' => 'ե',
'Զ' => 'զ', 'Է' => 'է', 'Ը' => 'ը', 'Թ' => 'թ', 'Ժ' => 'ժ', 'Ի' => 'ի',
'Լ' => 'լ', 'Խ' => 'խ', 'Ծ' => 'ծ', 'Կ' => 'կ', 'Հ' => 'հ', 'Ձ' => 'ձ',
'Ղ' => 'ղ', 'Ճ' => 'ճ', 'Մ' => 'մ', 'Յ' => 'յ', 'Ն' => 'ն', 'Շ' => 'շ',
'Ո' => 'ո', 'Չ' => 'չ', 'Պ' => 'պ', 'Ջ' => 'ջ', 'Ռ' => 'ռ', 'Ս' => 'ս',
'Վ' => 'վ', 'Տ' => 'տ', 'Ր' => 'ր', 'Ց' => 'ց', 'Ւ' => 'ւ', 'Փ' => 'փ',
'Ք' => 'ք', 'Օ' => 'օ', 'Ֆ' => 'ֆ', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => 'ḿ', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => 'ṿ', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => 'ế',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', 'Ἷ' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
'' => '', '' => '', '' => ''
);
/**
* A mapping of all ASCII-based latin characters, punctuation, symbols and number forms to ASCII.
*
* Includes elements from the following unicode blocks:
*
* - Latin-1 Supplement
* - Latin Extended-A
* - Latin Extended-B
* - IPA Extensions
* - Latin Extended Additional
* - General Punctuation
* - Letterlike symbols
* - Number Forms
*
* @var array
*/
static private $utf8_to_ascii = array(
// Latin-1 Supplement
'©' => '(c)', '«' => '<<', '®' => '(R)', '»' => '>>', '¼' => '1/4',
'½' => '1/2', '¾' => '3/4', 'À' => 'A', 'Á' => 'A', 'Â' => 'A',
'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE', 'Ç' => 'C',
'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I',
'Í' => 'I', 'Î' => 'I', 'Ï' => 'I', 'Ñ' => 'N', 'Ò' => 'O',
'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O', 'Ø' => 'O',
'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U', 'Ý' => 'Y',
'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a',
'å' => 'a', 'æ' => 'ae', 'ç' => 'c', 'è' => 'e', 'é' => 'e',
'ê' => 'e', 'ë' => 'e', 'ì' => 'i', 'í' => 'i', 'î' => 'i',
'ï' => 'i', 'ñ' => 'n', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o',
'õ' => 'o', 'ö' => 'o', 'ø' => 'o', 'ù' => 'u', 'ú' => 'u',
'û' => 'u', 'ü' => 'u', 'ý' => 'y', 'ÿ' => 'y',
// Latin Extended-A
'Ā' => 'A', 'ā' => 'a', 'Ă' => 'A', 'ă' => 'a', 'Ą' => 'A',
'ą' => 'a', 'Ć' => 'C', 'ć' => 'c', 'Ĉ' => 'C', 'ĉ' => 'c',
'Ċ' => 'C', 'ċ' => 'c', 'Č' => 'C', 'č' => 'c', 'Ď' => 'D',
'ď' => 'd', 'Đ' => 'D', 'đ' => 'd', 'Ē' => 'E', 'ē' => 'e',
'Ĕ' => 'E', 'ĕ' => 'e', 'Ė' => 'E', 'ė' => 'e', 'Ę' => 'E',
'ę' => 'e', 'Ě' => 'E', 'ě' => 'e', 'Ĝ' => 'G', 'ĝ' => 'g',
'Ğ' => 'G', 'ğ' => 'g', 'Ġ' => 'G', 'ġ' => 'g', 'Ģ' => 'G',
'ģ' => 'g', 'Ĥ' => 'H', 'ĥ' => 'h', 'Ħ' => 'H', 'ħ' => 'h',
'Ĩ' => 'I', 'ĩ' => 'i', 'Ī' => 'I', 'ī' => 'i', 'Ĭ' => 'I',
'ĭ' => 'i', 'Į' => 'I', 'į' => 'i', 'İ' => 'I', 'ı' => 'i',
'IJ' => 'IJ', 'ij' => 'ij', 'Ĵ' => 'J', 'ĵ' => 'j', 'Ķ' => 'K',
'ķ' => 'k', 'Ĺ' => 'L', 'ĺ' => 'l', 'Ļ' => 'L', 'ļ' => 'l',
'Ľ' => 'L', 'ľ' => 'l', 'Ŀ' => 'L', 'ŀ' => 'l', 'Ł' => 'L',
'ł' => 'l', 'Ń' => 'N', 'ń' => 'n', 'Ņ' => 'N', 'ņ' => 'n',
'Ň' => 'N', 'ň' => 'n', 'ʼn' => "'n", 'Ŋ' => 'N', 'ŋ' => 'n',
'Ō' => 'O', 'ō' => 'o', 'Ŏ' => 'O', 'ŏ' => 'o', 'Ő' => 'O',
'ő' => 'o', 'Œ' => 'OE', 'œ' => 'oe', 'Ŕ' => 'R', 'ŕ' => 'r',
'Ŗ' => 'R', 'ŗ' => 'r', 'Ř' => 'R', 'ř' => 'r', 'Ś' => 'S',
'ś' => 's', 'Ŝ' => 'S', 'ŝ' => 's', 'Ş' => 'S', 'ş' => 's',
'Š' => 'S', 'š' => 's', 'Ţ' => 'T', 'ţ' => 't', 'Ť' => 'T',
'ť' => 't', 'Ŧ' => 'T', 'ŧ' => 't', 'Ũ' => 'U', 'ũ' => 'u',
'Ū' => 'U', 'ū' => 'u', 'Ŭ' => 'U', 'ŭ' => 'u', 'Ů' => 'U',
'ů' => 'u', 'Ű' => 'U', 'ű' => 'u', 'Ų' => 'U', 'ų' => 'u',
'Ŵ' => 'W', 'ŵ' => 'w', 'Ŷ' => 'Y', 'ŷ' => 'y', 'Ÿ' => 'Y',
'Ź' => 'Z', 'ź' => 'z', 'Ż' => 'Z', 'ż' => 'z', 'Ž' => 'Z',
'ž' => 'z',
// Latin Extended-B
'ƀ' => 'b', 'Ɓ' => 'B', 'Ƃ' => 'B', 'ƃ' => 'b', 'Ɔ' => 'O',
'Ƈ' => 'C', 'ƈ' => 'c', 'Ɖ' => 'D', 'Ɗ' => 'D', 'Ƌ' => 'D',
'ƌ' => 'd', 'Ǝ' => 'E', 'Ɛ' => 'E', 'Ƒ' => 'F', 'ƒ' => 'f',
'Ɠ' => 'G', 'Ɨ' => 'I', 'Ƙ' => 'K', 'ƙ' => 'k', 'ƚ' => 'l',
'Ɯ' => 'M', 'Ɲ' => 'N', 'ƞ' => 'n', 'Ɵ' => 'O', 'Ơ' => 'O',
'ơ' => 'o', 'Ƣ' => 'OI', 'ƣ' => 'oi', 'Ƥ' => 'P', 'ƥ' => 'p',
'ƫ' => 't', 'Ƭ' => 'T', 'ƭ' => 't', 'Ʈ' => 'T', 'Ư' => 'U',
'ư' => 'u', 'Ʋ' => 'V', 'Ƴ' => 'Y', 'ƴ' => 'y', 'Ƶ' => 'Z',
'ƶ' => 'z', 'ƻ' => '2', 'DŽ' => 'DZ', 'Dž' => 'Dz', 'dž' => 'dz',
'LJ' => 'LJ', 'Lj' => 'Lj', 'lj' => 'lj', 'NJ' => 'Nj', 'Nj' => 'Nj',
'nj' => 'nj', 'Ǎ' => 'A', 'ǎ' => 'a', 'Ǐ' => 'I', 'ǐ' => 'i',
'Ǒ' => 'O', 'ǒ' => 'o', 'Ǔ' => 'U', 'ǔ' => 'u', 'Ǖ' => 'U',
'ǖ' => 'u', 'Ǘ' => 'U', 'ǘ' => 'u', 'Ǚ' => 'U', 'ǚ' => 'u',
'Ǜ' => 'U', 'ǜ' => 'u', 'ǝ' => 'e', 'Ǟ' => 'A', 'ǟ' => 'a',
'Ǡ' => 'A', 'ǡ' => 'a', 'Ǣ' => 'AE', 'ǣ' => 'ae', 'Ǥ' => 'G',
'ǥ' => 'g', 'Ǧ' => 'G', 'ǧ' => 'g', 'Ǩ' => 'K', 'ǩ' => 'k',
'Ǫ' => 'O', 'ǫ' => 'o', 'Ǭ' => 'O', 'ǭ' => 'o', 'ǰ' => 'j',
'DZ' => 'DZ', 'Dz' => 'Dz', 'dz' => 'dz', 'Ǵ' => 'G', 'ǵ' => 'g',
'Ǹ' => 'N', 'ǹ' => 'n', 'Ǻ' => 'A', 'ǻ' => 'a', 'Ǽ' => 'AE',
'ǽ' => 'ae', 'Ǿ' => 'O', 'ǿ' => 'o', 'Ȁ' => 'A', 'ȁ' => 'a',
'Ȃ' => 'A', 'ȃ' => 'a', 'Ȅ' => 'E', 'ȅ' => 'e', 'Ȇ' => 'E',
'ȇ' => 'e', 'Ȉ' => 'I', 'ȉ' => 'i', 'Ȋ' => 'I', 'ȋ' => 'i',
'Ȍ' => 'O', 'ȍ' => 'o', 'Ȏ' => 'O', 'ȏ' => 'o', 'Ȑ' => 'R',
'ȑ' => 'r', 'Ȓ' => 'R', 'ȓ' => 'r', 'Ȕ' => 'U', 'ȕ' => 'u',
'Ȗ' => 'U', 'ȗ' => 'u', 'Ș' => 'S', 'ș' => 's', 'Ț' => 'T',
'ț' => 't', 'Ȟ' => 'H', 'ȟ' => 'h', 'Ƞ' => 'N', 'ȡ' => 'd',
'Ȥ' => 'Z', 'ȥ' => 'z', 'Ȧ' => 'A', 'ȧ' => 'a', 'Ȩ' => 'E',
'ȩ' => 'e', 'Ȫ' => 'O', 'ȫ' => 'o', 'Ȭ' => 'O', 'ȭ' => 'o',
'Ȯ' => 'O', 'ȯ' => 'o', 'Ȱ' => 'O', 'ȱ' => 'o', 'Ȳ' => 'Y',
'ȳ' => 'y', 'ȴ' => 'l', 'ȵ' => 'n', 'ȶ' => 't', 'ȷ' => 'j',
'ȸ' => 'db', 'ȹ' => 'qp', 'Ⱥ' => 'A', 'Ȼ' => 'C', 'ȼ' => 'c',
'Ƚ' => 'L', 'Ⱦ' => 'T', 'ȿ' => 's', 'ɀ' => 'z', 'Ƀ' => 'B',
'Ʉ' => 'U', 'Ʌ' => 'V', 'Ɇ' => 'E', 'ɇ' => 'e', 'Ɉ' => 'J',
'ɉ' => 'j', 'Ɋ' => 'Q', 'ɋ' => 'q', 'Ɍ' => 'R', 'ɍ' => 'r',
'Ɏ' => 'Y', 'ɏ' => 'y',
// IPA Extensions
'ɐ' => 'a', 'ɓ' => 'b', 'ɔ' => 'o', 'ɕ' => 'c', 'ɖ' => 'd',
'ɗ' => 'd', 'ɘ' => 'e', 'ɛ' => 'e', 'ɜ' => 'e', 'ɝ' => 'e',
'ɞ' => 'e', 'ɟ' => 'j', 'ɠ' => 'g', 'ɡ' => 'g', 'ɢ' => 'G',
'ɥ' => 'h', 'ɦ' => 'h', 'ɨ' => 'i', 'ɪ' => 'I', 'ɫ' => 'l',
'ɬ' => 'l', 'ɭ' => 'l', 'ɯ' => 'm', 'ɰ' => 'm', 'ɱ' => 'm',
'ɲ' => 'n', 'ɳ' => 'n', 'ɴ' => 'N', 'ɵ' => 'o', 'ɶ' => 'OE',
'ɹ' => 'r', 'ɺ' => 'r', 'ɻ' => 'r', 'ɼ' => 'r', 'ɽ' => 'r',
'ɾ' => 'r', 'ɿ' => 'r', 'ʀ' => 'R', 'ʁ' => 'R', 'ʂ' => 's',
'ʇ' => 't', 'ʈ' => 't', 'ʉ' => 'u', 'ʋ' => 'v', 'ʌ' => 'v',
'ʍ' => 'w', 'ʎ' => 'y', 'ʏ' => 'Y', 'ʐ' => 'z', 'ʑ' => 'z',
'ʗ' => 'C', 'ʙ' => 'B', 'ʚ' => 'e', 'ʛ' => 'G', 'ʜ' => 'H',
'ʝ' => 'j', 'ʞ' => 'k', 'ʟ' => 'L', 'ʠ' => 'q', 'ʣ' => 'dz',
'ʥ' => 'dz', 'ʦ' => 'ts', 'ʨ' => 'tc', 'ʪ' => 'ls', 'ʫ' => 'lz',
'ʮ' => 'h', 'ʯ' => 'h',
// Latin Extended Additional
'' => 'A', '' => 'a', '' => 'B', '' => 'b', '' => 'B',
'' => 'b', '' => 'B', '' => 'b', '' => 'C', '' => 'c',
'' => 'D', '' => 'd', '' => 'D', '' => 'd', '' => 'D',
'' => 'd', '' => 'D', '' => 'd', '' => 'D', '' => 'd',
'' => 'E', '' => 'e', '' => 'E', '' => 'e', '' => 'E',
'' => 'e', '' => 'E', '' => 'e', '' => 'E', '' => 'e',
'' => 'F', '' => 'f', '' => 'G', '' => 'g', '' => 'H',
'' => 'h', '' => 'H', '' => 'h', '' => 'H', '' => 'h',
'' => 'H', '' => 'h', '' => 'H', '' => 'h', '' => 'I',
'' => 'i', '' => 'I', '' => 'i', '' => 'K', '' => 'k',
'' => 'K', '' => 'k', '' => 'K', '' => 'k', '' => 'L',
'' => 'l', '' => 'L', '' => 'l', '' => 'L', '' => 'l',
'' => 'L', '' => 'l', '' => 'M', 'ḿ' => 'm', '' => 'M',
'' => 'm', '' => 'M', '' => 'm', '' => 'N', '' => 'n',
'' => 'N', '' => 'n', '' => 'N', '' => 'n', '' => 'N',
'' => 'n', '' => 'O', '' => 'o', '' => 'O', '' => 'o',
'' => 'O', '' => 'o', '' => 'O', '' => 'o', '' => 'P',
'' => 'p', '' => 'P', '' => 'p', '' => 'R', '' => 'r',
'' => 'R', '' => 'r', '' => 'R', '' => 'r', '' => 'R',
'' => 'r', '' => 'S', '' => 's', '' => 'S', '' => 's',
'' => 'S', '' => 's', '' => 'S', '' => 's', '' => 'S',
'' => 's', '' => 'T', '' => 't', '' => 'T', '' => 't',
'' => 'T', '' => 't', '' => 'T', '' => 't', '' => 'U',
'' => 'u', '' => 'U', '' => 'u', '' => 'U', '' => 'u',
'' => 'U', '' => 'u', '' => 'U', '' => 'u', '' => 'V',
'' => 'v', '' => 'V', 'ṿ' => 'v', '' => 'W', '' => 'w',
'' => 'W', '' => 'w', '' => 'W', '' => 'w', '' => 'W',
'' => 'w', '' => 'W', '' => 'w', '' => 'X', '' => 'x',
'' => 'X', '' => 'x', '' => 'Y', '' => 'y', '' => 'Z',
'' => 'z', '' => 'Z', '' => 'z', '' => 'Z', '' => 'z',
'' => 'h', '' => 't', '' => 'w', '' => 'y', '' => 'a',
'' => 'A', '' => 'a', '' => 'A', '' => 'a', '' => 'A',
'' => 'a', '' => 'A', '' => 'a', '' => 'A', '' => 'a',
'' => 'A', '' => 'a', '' => 'A', '' => 'a', '' => 'A',
'' => 'a', '' => 'A', '' => 'a', '' => 'A', '' => 'a',
'' => 'A', '' => 'a', '' => 'A', '' => 'a', '' => 'E',
'' => 'e', '' => 'E', '' => 'e', '' => 'E', '' => 'e',
'' => 'E', 'ế' => 'e', '' => 'E', '' => 'e', '' => 'E',
'' => 'e', '' => 'E', '' => 'e', '' => 'E', '' => 'e',
'' => 'I', '' => 'i', '' => 'I', '' => 'i', '' => 'O',
'' => 'o', '' => 'O', '' => 'o', '' => 'O', '' => 'o',
'' => 'O', '' => 'o', '' => 'O', '' => 'o', '' => 'O',
'' => 'o', '' => 'O', '' => 'o', '' => 'O', '' => 'o',
'' => 'O', '' => 'o', '' => 'O', '' => 'o', '' => 'O',
'' => 'o', '' => 'O', '' => 'o', '' => 'U', '' => 'u',
'' => 'U', '' => 'u', '' => 'U', '' => 'u', '' => 'U',
'' => 'u', '' => 'U', '' => 'u', '' => 'U', '' => 'u',
'' => 'U', '' => 'u', '' => 'Y', '' => 'y', '' => 'Y',
'' => 'y', '' => 'Y', '' => 'y', '' => 'Y', '' => 'y',
// General Punctuation
' ' => ' ', '' => ' ', '' => ' ', '' => ' ', '' => ' ',
'' => ' ', '' => ' ', '' => ' ', '' => ' ', '' => ' ',
'' => ' ', '' => '', '' => '', '' => '', '' => '-',
'' => '-', '' => '-', '' => '-', '' => '-', '' => '-',
'' => '||', '' => "'", '' => "'", '' => ',', '' => "'",
'' => '"', '' => '"', '' => '"', '' => '.', '' => '..',
'' => '...', '' => ' ', '' => "'", '' => '"', '' => '\'"',
'' => "'", '' => '"', '' => '"\'', '' => '<', '' => '>',
'' => '!!', '' => '?!', '' => '/', '' => '?/', '' => '?!',
'' => '!?',
// Letterlike Symbols
'' => 'SM', '' => 'TM',
// Number Forms
'' => '1/3', '' => '2/3', '' => '1/5', '' => '2/5', '' => '3/5',
'' => '4/5', '' => '1/6', '' => '5/6', '' => '1/8', '' => '3/8',
'' => '5/8', '' => '7/8', '' => 'I', '' => 'II', '' => 'III',
'' => 'IV', '' => 'V', '' => 'Vi', '' => 'VII', '' => 'VIII',
'' => 'IX', '' => 'X', '' => 'XI', '' => 'XII', '' => 'L',
'' => 'C', '' => 'D', '' => 'M', '' => 'i', '' => 'ii',
'' => 'iii', '' => 'iv', '' => 'v', '' => 'vi', '' => 'vii',
'' => 'viii','' => 'ix', '' => 'x', '' => 'xi', '' => 'xii',
'' => 'l', '' => 'c', '' => 'd', '' => 'm'
);
/**
* If the [http://php.net/mbstring mbstring] extension is available
*
* @var boolean
*/
static private $mbstring_available = NULL;
/**
* Maps UTF-8 ASCII-based latin characters, punctuation, symbols and number forms to ASCII
*
* Any characters or symbols that can not be translated will be removed.
*
* This function is most useful for situations that only allow ASCII, such
* as in URLs.
*
* Translates elements from the following unicode blocks:
*
* - Latin-1 Supplement
* - Latin Extended-A
* - Latin Extended-B
* - IPA Extensions
* - Latin Extended Additional
* - General Punctuation
* - Letterlike symbols
* - Number Forms
*
* @internal
*
* @param string $string The string to convert
* @return string The input string in pure ASCII
*/
static public function ascii($string)
{
    // Only strings flagged by ::detect() are translated; anything else is
    // handed back exactly as it was received.
    // NOTE(review): ::detect() is defined outside this view - presumably it
    // reports whether the string contains non-ASCII bytes; confirm.
    if (self::detect($string)) {
        // Swap every character found in the lookup table for its ASCII form
        $translated = strtr($string, self::$utf8_to_ascii);

        // Any byte outside the 7-bit range that is still present had no
        // mapping, so it is removed
        return preg_replace('#[^\x00-\x7F]#', '', $translated);
    }

    return $string;
}
/**
* Checks to see if the [http://php.net/mbstring mbstring] extension is available
*
* @return void
*/
static private function checkMbString()
{
    // Ask PHP whether the mbstring extension is loaded and remember the
    // boolean answer in the class-level flag
    $is_loaded = extension_loaded('mbstring');
    self::$mbstring_available = $is_loaded;
}
/**
* Converts a unicode value into a UTF-8 character
*
* @param mixed $unicode_code_point The character to create, either the `U+hex` or decimal code point
* @return string The UTF-8 character
*/
static public function chr($unicode_code_point)
{
    // The `U+hex` notation is converted to its decimal value
    if (is_string($unicode_code_point) && substr($unicode_code_point, 0, 2) == 'U+') {
        $unicode_code_point = hexdec(substr($unicode_code_point, 2));
    }

    // UTF-8 may only encode U+0000 through U+10FFFF, and U+D800 through
    // U+DFFF are reserved for UTF-16 surrogates. Everything else, including
    // non-numeric input, is rejected with an fProgrammerException before
    // any bytes are built.
    if (
        !is_numeric($unicode_code_point) ||
        $unicode_code_point < 0 ||
        $unicode_code_point > 0x10FFFF ||
        ($unicode_code_point >= 0xD800 && $unicode_code_point <= 0xDFFF)
    ) {
        throw new fProgrammerException(
            'The code point specified, %s, is invalid.',
            $unicode_code_point
        );
    }

    $code_point = (int) $unicode_code_point;

    // One byte characters: 0xxxxxxx
    if ($code_point < 0x80) {
        return chr($code_point);
    }

    // Every continuation byte is 10xxxxxx, carrying six bits of the code point
    $last = chr(0x80 | ($code_point & 0x3F));

    // Two byte characters: 110xxxxx 10xxxxxx
    if ($code_point < 0x800) {
        return chr(0xC0 | ($code_point >> 6)) . $last;
    }

    $second_to_last = chr(0x80 | (($code_point >> 6) & 0x3F));

    // Three byte characters: 1110xxxx 10xxxxxx 10xxxxxx
    if ($code_point < 0x10000) {
        return chr(0xE0 | ($code_point >> 12)) . $second_to_last . $last;
    }

    // Four byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr(0xF0 | ($code_point >> 18))
        . chr(0x80 | (($code_point >> 12) & 0x3F))
        . $second_to_last
        . $last;
}
/**
 * Removes any invalid UTF-8 characters from a string or array of strings
 *
 * Arrays are processed recursively with their keys preserved. Scalar values
 * are cast to a string before being cleaned.
 *
 * When the mbstring extension is available it is used for the cleaning,
 * otherwise the value is run through iconv, with `//IGNORE` being appended
 * to the output charset unless `ICONV_IMPL` reports `unknown` or `ibm iconv`.
 *
 * @param  array|string $value  The string or array of strings to clean
 * @return array|string  The cleaned string, or array of cleaned strings
 */
static public function clean($value)
{
    if (is_array($value)) {
        foreach ($value as $key => $item) {
            $value[$key] = self::clean($item);
        }
        return $value;
    }

    // Only detect the extension once, like every other method in the class
    if (self::$mbstring_available === NULL) {
        self::checkMbString();
    }

    if (self::$mbstring_available) {
        // Temporarily make mbstring drop invalid sequences instead of
        // replacing them with a substitute character
        $old_sub = ini_get('mbstring.substitute_character');
        ini_set('mbstring.substitute_character', 'none');
        $value = mb_convert_encoding((string) $value, 'UTF-8', 'UTF-8');
        ini_set('mbstring.substitute_character', $old_sub);
        return $value;
    }

    if (self::$can_ignore_invalid === NULL) {
        self::$can_ignore_invalid = !in_array(strtolower(ICONV_IMPL), array('unknown', 'ibm iconv'));
    }

    // Notices are captured while iconv runs over the possibly invalid data
    fCore::startErrorCapture(E_NOTICE);
    $value = self::iconv('UTF-8', 'UTF-8' . (self::$can_ignore_invalid ? '//IGNORE' : ''), (string) $value);
    fCore::stopErrorCapture();

    return $value;
}
/**
 * Compares strings, with the resulting order having latin characters that are based on ASCII letters placed after the relative ASCII characters
 *
 * The strings are first compared by their ASCII transliterations. Only if
 * those are identical are the original UTF-8 strings compared byte-wise.
 *
 * Please note that this function sorts based on English language sorting
 * rules only. Locale-specific sorting is done by
 * [http://php.net/strcoll strcoll()], however there are technical
 * limitations.
 *
 * @param  string $str1  The first string to compare
 * @param  string $str2  The second string to compare
 * @return integer  < 0 if $str1 < $str2, 0 if they are equal, > 0 if $str1 > $str2
 */
static public function cmp($str1, $str2)
{
    $ascii_result = strcmp(
        strtr($str1, self::$utf8_to_ascii),
        strtr($str2, self::$utf8_to_ascii)
    );

    if ($ascii_result !== 0) {
        return $ascii_result;
    }

    // Tie on the ASCII forms, so fall back to the UTF-8 representations
    return strcmp($str1, $str2);
}
/**
 * Converts an offset in characters to an offset in bytes so that we can use the built-in functions for some operations
 *
 * A positive offset is measured from the start of the string and a negative
 * offset from the end; the sign of the result matches the sign of the input.
 * The returned offset always falls on a character boundary. If the string
 * has fewer characters than requested, the byte length of the string (with
 * the appropriate sign) is returned.
 *
 * @param  string  $string  The string to base the offset on
 * @param  integer $offset  The character offset to convert to bytes
 * @return integer  The converted offset
 */
static private function convertOffsetToBytes($string, $offset)
{
    if ($offset == 0) {
        return 0;
    }

    // Negative offsets are measured by walking the reversed byte string
    $reverse = $offset < 0;
    if ($reverse) {
        $string = strrev($string);
        $offset = abs($offset);
    }

    $len             = strlen($string);
    $byte_offset     = 0;
    $measured_offset = 0;

    while ($byte_offset < $len && $measured_offset < $offset) {
        // ASCII and lead bytes each mark one character, continuation
        // bytes (10xxxxxx) never do
        if ((ord($string[$byte_offset]) & 0xC0) != 0x80) {
            ++$measured_offset;
        }
        ++$byte_offset;
    }

    if ($reverse) {
        // In the reversed string the lead byte is the last byte of a
        // character, so the whole character has already been consumed
        return -$byte_offset;
    }

    // Walking forward we stopped right after a lead byte, so the rest of
    // that character's continuation bytes still need to be skipped
    while ($byte_offset < $len && (ord($string[$byte_offset]) & 0xC0) == 0x80) {
        ++$byte_offset;
    }

    return $byte_offset;
}
/**
 * Detects if a string contains any bytes outside of the ASCII range
 *
 * @param  string $string  The string to check
 * @return boolean  `TRUE` if at least one byte is not in the range 0x00-0x7F
 */
static private function detect($string)
{
    $found = preg_match('#[^\x00-\x7F]#', $string);

    return $found ? TRUE : FALSE;
}
/**
 * Explodes a string on a delimiter
 *
 * When the delimiter is empty (`NULL` or `''`), the string is split into
 * its individual UTF-8 characters instead. A numeric delimiter such as
 * `'0'` is treated as a real delimiter even though it is falsy.
 *
 * @param  string $string     The string to explode
 * @param  string $delimiter  The string to explode on. If `NULL` or `''` this method will return one character per array index.
 * @return array  The exploded string
 */
static public function explode($string, $delimiter=NULL)
{
    $has_delimiter = $delimiter || is_numeric($delimiter);

    if (!$has_delimiter) {
        // The `^\z` alternative makes an empty string produce a single
        // empty element
        preg_match_all('#.|^\z#us', $string, $characters);
        return $characters[0];
    }

    return explode($delimiter, $string);
}
/**
 * Converts a string from one character encoding to another using iconv()
 *
 * This works around a bug in MAMP 1.9.4+ and PHP 5.3 where iconv()
 * does not seem to properly assign the return value to a variable, but
 * does work when returning the value.
 *
 * @param  string $in_charset   The incoming character encoding
 * @param  string $out_charset  The outgoing character encoding
 * @param  string $string       The string to convert
 * @return string  The converted string
 */
static private function iconv($in_charset, $out_charset, $string)
{
// The result is returned directly on purpose - see the description above
return iconv($in_charset, $out_charset, $string);
}
/**
 * Compares strings in a case-insensitive manner, with the resulting order having characters that are based on ASCII letters placed after the relative ASCII characters
 *
 * Both strings are lowercased and then handed to ::cmp().
 *
 * Please note that this function sorts based on English language sorting
 * rules only. Locale-specific sorting is done by
 * [http://php.net/strcoll strcoll()], however there are technical
 * limitations.
 *
 * @param  string $str1  The first string to compare
 * @param  string $str2  The second string to compare
 * @return integer  < 0 if $str1 < $str2, 0 if they are equal, > 0 if $str1 > $str2
 */
static public function icmp($str1, $str2)
{
    return self::cmp(self::lower($str1), self::lower($str2));
}
/**
 * Compares strings using a natural order algorithm in a case-insensitive manner, with the resulting order having latin characters that are based on ASCII letters placed after the relative ASCII characters
 *
 * Both strings are lowercased and then handed to ::natcmp().
 *
 * Please note that this function sorts based on English language sorting
 * rules only. Locale-specific sorting is done by
 * [http://php.net/strcoll strcoll()], however there are technical
 * limitations.
 *
 * @param  string $str1  The first string to compare
 * @param  string $str2  The second string to compare
 * @return integer  `< 0` if `$str1 < $str2`, `0` if they are equal, `> 0` if `$str1 > $str2`
 */
static public function inatcmp($str1, $str2)
{
    return self::natcmp(self::lower($str1), self::lower($str2));
}
/**
 * Finds the first position (in characters) of the search value in the string - case is ignored when performing a match
 *
 * Pure ASCII haystacks are handled by stripos(). Otherwise mb_stripos() is
 * used when available, and as a last resort both strings are lowercased
 * and passed to ::pos().
 *
 * @param  string  $haystack  The string to search in
 * @param  string  $needle    The string to search for. This match will be done in a case-insensitive manner.
 * @param  integer $offset    The character position to start searching from
 * @return mixed  The integer character position of the first occurrence of the needle or `FALSE` if no match
 */
static public function ipos($haystack, $needle, $offset=0)
{
    if (self::detect($haystack)) {
        if (self::$mbstring_available === NULL) {
            self::checkMbString();
        }

        if (self::$mbstring_available && function_exists('mb_stripos')) {
            return mb_stripos($haystack, $needle, $offset, 'UTF-8');
        }

        return self::pos(self::lower($haystack), self::lower($needle), $offset);
    }

    // We get better performance falling back for ASCII strings
    return stripos($haystack, $needle, $offset);
}
/**
 * Replaces matching parts of the string, with matches being done in a case-insensitive manner
 *
 * If `$search` and `$replace` are both arrays and `$replace` is shorter,
 * the extra `$search` string will be replaced with an empty string. If
 * `$search` is an array and `$replace` is a string, all `$search` values
 * will be replaced with the string specified.
 *
 * Search values are matched literally - they are quoted before being used
 * as patterns - and replacement values are inserted literally, with `\` and
 * `$` being escaped so they are not treated as back-references.
 *
 * @param  string $string   The string to perform the replacements on
 * @param  mixed  $search   The string (or array of strings) to search for - see method description for details
 * @param  mixed  $replace  The string (or array of strings) to replace with - see method description for details
 * @return string  The input string with the specified replacements
 */
static public function ireplace($string, $search, $replace)
{
    if (is_array($search)) {
        $patterns = array();
        foreach ($search as $needle) {
            $patterns[] = '#' . preg_quote($needle, '#') . '#ui';
        }
    } else {
        $patterns = '#' . preg_quote($search, '#') . '#ui';
    }

    $escapes = array('\\' => '\\\\', '$' => '\\$');

    // strtr() only accepts strings, so arrays of replacements have to be
    // escaped one element at a time
    if (is_array($replace)) {
        $replacements = array();
        foreach ($replace as $replacement) {
            $replacements[] = strtr($replacement, $escapes);
        }
    } else {
        $replacements = strtr($replace, $escapes);
    }

    return preg_replace($patterns, $replacements, $string);
}
/**
 * Finds the last position (in characters) of the search value in the string - case is ignored when performing a match
 *
 * Pure ASCII haystacks are handled by strripos(). Otherwise mb_strripos()
 * is used when available, and as a last resort both strings are lowercased
 * and passed to ::rpos().
 *
 * @param  string  $haystack  The string to search in
 * @param  string  $needle    The string to search for. This match will be done in a case-insensitive manner.
 * @param  integer $offset    The character position to start searching from. A negative value will stop looking that many characters from the end of the string
 * @return mixed  The integer character position of the last occurrence of the needle or `FALSE` if no match
 */
static public function irpos($haystack, $needle, $offset=0)
{
    if (self::detect($haystack)) {
        if (self::$mbstring_available === NULL) {
            self::checkMbString();
        }

        if (self::$mbstring_available && function_exists('mb_strripos')) {
            return mb_strripos($haystack, $needle, $offset, 'UTF-8');
        }

        return self::rpos(self::lower($haystack), self::lower($needle), $offset);
    }

    // We get better performance falling back for ASCII strings
    return strripos($haystack, $needle, $offset);
}
/**
 * Matches a string needle in the string haystack, returning a substring from the beginning of the needle to the end of the haystack
 *
 * Can optionally return the part of the haystack before the needle. Matching
 * is done in a case-insensitive manner.
 *
 * @param  string  $haystack       The string to search in
 * @param  string  $needle         The string to search for. This match will be done in a case-insensitive manner.
 * @param  boolean $before_needle  If a substring of the haystack before the needle should be returned instead of the substring from the needle to the end of the haystack
 * @return mixed  The specified part of the haystack, or `FALSE` if the needle was not found
 */
static public function istr($haystack, $needle, $before_needle=FALSE)
{
    // We get better performance falling back for ASCII strings
    if ($before_needle == FALSE && !self::detect($haystack)) {
        return stristr($haystack, $needle);
    }

    if (self::$mbstring_available === NULL) {
        self::checkMbString();
    }

    if (self::$mbstring_available && function_exists('mb_stristr')) {
        return mb_stristr($haystack, $needle, $before_needle, 'UTF-8');
    }

    // NOTE(review): the byte position found in the lowercased copy is applied
    // to the original haystack, which assumes lowercasing never changes the
    // byte length of a character - confirm against the case mapping tables
    $pos = strpos(self::lower($haystack), self::lower($needle));

    // Without this check a missing needle would return an empty string or
    // the whole haystack instead of FALSE
    if ($pos === FALSE) {
        return FALSE;
    }

    if ($before_needle) {
        return substr($haystack, 0, $pos);
    }

    return substr($haystack, $pos);
}
/**
 * Determines the length (in characters) of a string
 *
 * Uses mb_strlen() when the mbstring extension is available. Otherwise the
 * string is run through utf8_decode() and the byte length of the result is
 * returned.
 *
 * @param  string $string  The string to measure
 * @return integer  The number of characters in the string
 */
static public function len($string)
{
    if (self::$mbstring_available === NULL) {
        self::checkMbString();
    }

    if (!self::$mbstring_available) {
        return strlen(utf8_decode($string));
    }

    return mb_strlen($string, 'UTF-8');
}
/**
 * Converts all uppercase characters to lowercase
 *
 * Pure ASCII strings are handled by strtolower(). With mbstring available,
 * mb_strtolower() is used and its result is patched with a table of
 * characters it misses; without mbstring the class's own upper-to-lower
 * translation table is applied.
 *
 * @param  string $string  The string to convert
 * @return string  The input string with all uppercase characters in lowercase
 */
static public function lower($string)
{
    // We get better performance falling back for ASCII strings
    if (!self::detect($string)) {
        return strtolower($string);
    }

    if (self::$mbstring_available === NULL) {
        self::checkMbString();
    }

    if (!self::$mbstring_available) {
        return strtr($string, self::$upper_to_lower);
    }

    // For some reason mb_strtolower misses some characters
    return strtr(mb_strtolower($string, 'utf-8'), self::$mb_upper_to_lower_fix);
}
/**
 * Trims whitespace, or any specified characters, from the beginning of a string
 *
 * When no character list is given, the built-in ltrim() is used with its
 * default whitespace list. Otherwise the listed characters are matched in a
 * UTF-8 aware manner, with `..` between two characters indicating a range.
 *
 * @param  string $string    The string to trim
 * @param  string $charlist  The characters to trim, .. indicates a range
 * @return string  The trimmed string
 */
static public function ltrim($string, $charlist=NULL)
{
    if (strlen((string) $charlist) === 0) {
        return ltrim($string);
    }

    $search = preg_quote($charlist, '#');

    // preg_quote() escapes "-" itself as of PHP 5.3. Escaping it a second
    // time would produce an escaped backslash followed by a bare range
    // hyphen, so only do it when preg_quote() left the hyphen alone.
    if (preg_quote('-') === '-') {
        $search = str_replace('-', '\-', $search);
    }

    // A quoted ".." is the range operator of the character list
    $search = str_replace('\.\.', '-', $search);

    return preg_replace('#^[' . $search . ']+#Du', '', $string);
}
/**
 * Compares strings using a natural order algorithm, with the resulting order having latin characters that are based on ASCII letters placed after the relative ASCII characters
 *
 * The strings are first compared by their ASCII transliterations. Only if
 * those are identical are the original UTF-8 strings compared.
 *
 * Please note that this function sorts based on English language sorting
 * rules only. Locale-specific sorting is done by
 * [http://php.net/strcoll strcoll()], however there are technical
 * limitations.
 *
 * @param  string $str1  The first string to compare
 * @param  string $str2  The second string to compare
 * @return integer  `< 0` if `$str1 < $str2`, `0` if they are equal, `> 0` if `$str1 > $str2`
 */
static public function natcmp($str1, $str2)
{
    $ascii_result = strnatcmp(
        strtr($str1, self::$utf8_to_ascii),
        strtr($str2, self::$utf8_to_ascii)
    );

    if ($ascii_result !== 0) {
        return $ascii_result;
    }

    // Tie on the ASCII forms, so fall back to the UTF-8 representations
    return strnatcmp($str1, $str2);
}
/**
 * Converts a UTF-8 character to a unicode code point
 *
 * The input must be exactly one well-formed UTF-8 character. Stray or
 * missing continuation bytes, overlong encodings, surrogates (U+D800 to
 * U+DFFF) and values above U+10FFFF are all rejected.
 *
 * @throws fProgrammerException  When the bytes passed are not a single valid UTF-8 character
 *
 * @param  string $character  The character to decode
 * @return string  The U+hex unicode code point for the character, padded to at least four hex digits
 */
static public function ord($character)
{
    $character  = (string) $character;
    $length     = strlen($character);
    $b          = $length ? array_map('ord', str_split($character)) : array();
    $code_point = 0;

    $valid = $length >= 1 && $length <= 4;

    // Every byte after the first has to be a continuation byte (10xxxxxx)
    for ($i = 1; $valid && $i < $length; $i++) {
        $valid = ($b[$i] & 0xC0) == 0x80;
    }

    if ($valid) {
        switch ($length) {
            case 1:
                $valid      = $b[0] <= 0x7F;
                $code_point = $b[0];
                break;

            case 2:
                // 0xC0 and 0xC1 could only start overlong encodings
                $valid      = $b[0] >= 0xC2 && $b[0] <= 0xDF;
                $code_point = (($b[0] & 0x1F) << 6) | ($b[1] & 0x3F);
                break;

            case 3:
                $code_point = (($b[0] & 0x0F) << 12) | (($b[1] & 0x3F) << 6) | ($b[2] & 0x3F);
                // Below 0x800 is overlong, 0xD800-0xDFFF are surrogates
                $valid      = $b[0] >= 0xE0 && $b[0] <= 0xEF &&
                              $code_point >= 0x800 &&
                              ($code_point < 0xD800 || $code_point > 0xDFFF);
                break;

            case 4:
                $code_point = (($b[0] & 0x07) << 18) | (($b[1] & 0x3F) << 12) |
                              (($b[2] & 0x3F) << 6) | ($b[3] & 0x3F);
                // Below 0x10000 is overlong, above 0x10FFFF is not unicode
                $valid      = $b[0] >= 0xF0 && $b[0] <= 0xF4 &&
                              $code_point >= 0x10000 &&
                              $code_point <= 0x10FFFF;
                break;
        }
    }

    if (!$valid) {
        throw new fProgrammerException(
            'The UTF-8 character specified is invalid'
        );
    }

    $hex = strtoupper(dechex($code_point));
    return 'U+' . str_pad($hex, 4, '0', STR_PAD_LEFT);
}
/**
 * Pads a string to the number of characters specified
 *
 * If both the string and the pad string are pure ASCII, the work is handed
 * to str_pad(). Otherwise the padding is built a whole pad string at a time,
 * with the final piece being shortened to fit. For the `'both'` pad type
 * this method alternates sides, starting with the left.
 *
 * @throws fProgrammerException  When the pad type is not valid, or when padding is required on the non-ASCII code path and the pad string is empty
 *
 * @param  string  $string      The string to pad
 * @param  integer $pad_length  The character length to pad the string to
 * @param  string  $pad_string  The string to pad the source string with
 * @param  string  $pad_type    The type of padding to do: `'left'`, `'right'`, `'both'`
 * @return string  The input string padded to the specified character width
 */
static public function pad($string, $pad_length, $pad_string=' ', $pad_type='right')
{
    $valid_pad_types = array('right', 'left', 'both');
    if (!in_array($pad_type, $valid_pad_types)) {
        throw new fProgrammerException(
            'The pad type specified, %1$s, is not valid. Must be one of: %2$s.',
            $pad_type,
            join(', ', $valid_pad_types)
        );
    }

    // We get better performance falling back for ASCII strings
    if (!self::detect($string) && !self::detect($pad_string)) {
        static $type_map = array(
            'left'  => STR_PAD_LEFT,
            'right' => STR_PAD_RIGHT,
            'both'  => STR_PAD_BOTH
        );
        return str_pad($string, $pad_length, $pad_string, $type_map[$pad_type]);
    }

    $string_length     = self::len($string);
    $pad_string_length = self::len($pad_string);

    $pad_to_length = $pad_length - $string_length;

    if ($pad_to_length < 1) {
        return $string;
    }

    // An empty pad string would never advance the loop below
    if ($pad_string_length < 1) {
        throw new fProgrammerException(
            'The pad string specified is empty, so the string can not be padded'
        );
    }

    $padded           = 0;
    $next_side        = 'left';
    $left_pad_string  = '';
    $right_pad_string = '';

    while ($padded < $pad_to_length) {

        // For pad strings over 1 character long, they may be too long to fit
        if ($pad_to_length - $padded < $pad_string_length) {
            $pad_string = self::sub($pad_string, 0, $pad_to_length - $padded);
        }

        switch (($pad_type != 'both') ? $pad_type : $next_side) {
            case 'right':
                $right_pad_string .= $pad_string;
                $next_side = 'left';
                break;

            case 'left':
                $left_pad_string .= $pad_string;
                $next_side = 'right';
                break;
        }

        // Counting the full pad length is fine even for the shortened final
        // piece, since it only has to end the loop
        $padded += $pad_string_length;
    }

    return $left_pad_string . $string . $right_pad_string;
}
/**
 * Finds the first position (in characters) of the search value in the string
 *
 * Uses mb_strpos() when the mbstring extension is available. Otherwise the
 * character offset is converted to bytes, the search is done with strpos()
 * and the resulting byte position is converted back to characters.
 *
 * @param  string  $haystack  The string to search in
 * @param  string  $needle    The string to search for
 * @param  integer $offset    The character position to start searching from
 * @return mixed  The integer character position of the first occurrence of the needle or `FALSE` if no match
 */
static public function pos($haystack, $needle, $offset=0)
{
    if (self::$mbstring_available === NULL) {
        self::checkMbString();
    }

    if (!self::$mbstring_available) {
        $byte_offset   = self::convertOffsetToBytes($haystack, $offset);
        $byte_position = strpos($haystack, $needle, $byte_offset);

        if ($byte_position !== FALSE) {
            // Count the characters that precede the match
            return strlen(utf8_decode(substr($haystack, 0, $byte_position)));
        }

        return FALSE;
    }

    return mb_strpos($haystack, $needle, $offset, 'UTF-8');
}
/**
 * Replaces matching parts of the string
 *
 * This is a thin wrapper around str_replace() with the subject string moved
 * to the first parameter.
 *
 * If `$search` and `$replace` are both arrays and `$replace` is shorter,
 * the extra `$search` string will be replaced with an empty string. If
 * `$search` is an array and `$replace` is a string, all `$search` values
 * will be replaced with the string specified.
 *
 * @param  string $string   The string to perform the replacements on
 * @param  mixed  $search   The string (or array of strings) to search for - see method description for details
 * @param  mixed  $replace  The string (or array of strings) to replace with - see method description for details
 * @return string  The input string with the specified replacements
 */
static public function replace($string, $search, $replace)
{
    $replaced = str_replace($search, $replace, $string);

    return $replaced;
}
/**
 * Resets the configuration of the class
 *
 * Clears the cached result of the mbstring availability check so that it
 * will be performed again the next time it is needed.
 *
 * @internal
 *
 * @return void
 */
static public function reset()
{
self::$mbstring_available = NULL;
}
/**
 * Reverses a string
 *
 * The string is reversed character by character, keeping the bytes of each
 * multi-byte character in their original order. The byte length of each
 * character is taken from its lead byte. Continuation bytes that are not
 * preceded by a lead byte are dropped, and a multi-byte sequence cut short
 * by the end of the string is kept as-is.
 *
 * @param  string $string  The string to reverse
 * @return string  The reversed string
 */
static public function rev($string)
{
    $len        = strlen($string);
    $characters = array();

    for ($i = 0; $i < $len; $i++) {
        $byte = ord($string[$i]);

        if ($byte < 0x80) {
            $char_len = 1;
        } elseif ($byte >= 0xF0) {
            $char_len = 4;
        } elseif ($byte >= 0xE0) {
            $char_len = 3;
        } elseif ($byte >= 0xC0) {
            $char_len = 2;
        } else {
            // A continuation byte without a lead byte
            continue;
        }

        // substr() stops at the end of the string, so a truncated trailing
        // sequence does not read past the last byte
        $characters[] = substr($string, $i, $char_len);
        $i += $char_len - 1;
    }

    return join('', array_reverse($characters));
}
/**
 * Finds the last position (in characters) of the search value in the string
 *
 * Pure ASCII haystacks are handled directly by strrpos(). For anything else
 * the character offset is converted to bytes, the search is done with
 * strrpos() and the resulting byte position is converted back to characters.
 *
 * @param  string  $haystack  The string to search in
 * @param  string  $needle    The string to search for.
 * @param  integer $offset    The character position to start searching from. A negative value will stop looking that many characters from the end of the string
 * @return mixed  The integer character position of the last occurrence of the needle or `FALSE` if no match
 */
static public function rpos($haystack, $needle, $offset=0)
{
    if (self::detect($haystack)) {
        // We don't even bother trying mb_strrpos since this method is faster
        $byte_offset   = self::convertOffsetToBytes($haystack, $offset);
        $byte_position = strrpos($haystack, $needle, $byte_offset);

        if ($byte_position !== FALSE) {
            // Count the characters that precede the match
            return strlen(utf8_decode(substr($haystack, 0, $byte_position)));
        }

        return FALSE;
    }

    // We get better performance falling back for ASCII strings
    return strrpos($haystack, $needle, $offset);
}
/**
 * Trims whitespace, or any specified characters, from the end of a string
 *
 * When no character list is given, the built-in rtrim() is used with its
 * default whitespace list. Otherwise the listed characters are matched in a
 * UTF-8 aware manner, with `..` between two characters indicating a range.
 *
 * @param  string $string    The string to trim
 * @param  string $charlist  The characters to trim, .. indicates a range
 * @return string  The trimmed string
 */
static public function rtrim($string, $charlist=NULL)
{
    if (strlen((string) $charlist) === 0) {
        return rtrim($string);
    }

    $search = preg_quote($charlist, '#');

    // preg_quote() escapes "-" itself as of PHP 5.3. Escaping it a second
    // time would produce an escaped backslash followed by a bare range
    // hyphen, so only do it when preg_quote() left the hyphen alone.
    if (preg_quote('-') === '-') {
        $search = str_replace('-', '\-', $search);
    }

    // A quoted ".." is the range operator of the character list
    $search = str_replace('\.\.', '-', $search);

    return preg_replace('#[' . $search . ']+$#Du', '', $string);
}
/**
 * Matches a string needle in the string haystack, returning a substring from the beginning of the needle to the end of the haystack
 *
 * Can optionally return the part of the haystack before the needle. Uses
 * mb_strstr() when available, otherwise the haystack is cut at the byte
 * position reported by strpos().
 *
 * @param  string  $haystack       The string to search in
 * @param  string  $needle         The string to search for
 * @param  boolean $before_needle  If a substring of the haystack before the needle should be returned instead of the substring from the needle to the end of the haystack
 * @return mixed  The specified part of the haystack, or `FALSE` if the needle was not found
 */
static public function str($haystack, $needle, $before_needle=FALSE)
{
    if (self::$mbstring_available === NULL) {
        self::checkMbString();
    }

    if (self::$mbstring_available && function_exists('mb_strstr')) {
        return mb_strstr($haystack, $needle, $before_needle, 'UTF-8');
    }

    $byte_position = strpos($haystack, $needle);

    if ($byte_position === FALSE) {
        return FALSE;
    }

    return $before_needle
        ? substr($haystack, 0, $byte_position)
        : substr($haystack, $byte_position);
}
/**
 * Extracts part of a string
 *
 * Uses mb_substr() when the mbstring extension is available and substr()
 * for pure ASCII strings. For other strings the character offsets are
 * converted into byte offsets and the extraction is done with substr().
 *
 * @param  string  $string  The string to extract from
 * @param  integer $start   The zero-based starting index to extract from. Negative values will start the extraction that many characters from the end of the string.
 * @param  integer $length  The length of the string to extract. If `NULL` is provided, the remainder of the string will be returned.
 * @return mixed  The extracted substring or `FALSE` if the start is out of bounds
 */
static public function sub($string, $start, $length=NULL)
{
    if (self::$mbstring_available === NULL) {
        self::checkMbString();
    }

    if (self::$mbstring_available) {
        $str_len = mb_strlen($string, 'UTF-8');
        if (abs($start) > $str_len) {
            return FALSE;
        }
        if ($length === NULL) {
            if ($start >= 0) {
                $length = $str_len-$start;
            } else {
                $length = abs($start);
            }
        }
        return mb_substr($string, $start, $length, 'UTF-8');
    }

    // We get better performance falling back for ASCII strings
    if (!self::detect($string)) {
        if ($length === NULL) {
            if ($start >= 0) {
                $length = strlen($string)-$start;
            } else {
                $length = abs($start);
            }
        }
        return substr($string, $start, $length);
    }

    // This is the slowest version
    $str_len = strlen(utf8_decode($string));
    if (abs($start) > $str_len) {
        return FALSE;
    }

    // A start right at the end of the string leaves nothing to extract.
    // This has to be handled before the optimization below, which would
    // otherwise turn the start into 0 and return the whole string.
    if ($start == $str_len) {
        return '';
    }

    // Optimize looking by changing to negative start positions if the
    // start is in the second half of the string
    if ($start > $str_len/2) {
        $start = 0-($str_len-$start);
    }

    // Substrings to the end of the string are pretty simple
    $start  = self::convertOffsetToBytes($string, $start);
    $string = substr($string, $start);

    if ($length === NULL) {
        return $string;
    }

    $length = self::convertOffsetToBytes($string, $length);
    return substr($string, 0, $length);
}
/**
 * Trims whitespace, or any specified characters, from the beginning and end of a string
 *
 * When no character list is given, the built-in trim() is used with its
 * default whitespace list. Otherwise the listed characters are matched in a
 * UTF-8 aware manner, with `..` between two characters indicating a range.
 *
 * @param  string $string    The string to trim
 * @param  string $charlist  The characters to trim, .. indicates a range
 * @return string  The trimmed string
 */
static public function trim($string, $charlist=NULL)
{
    if (strlen((string) $charlist) === 0) {
        return trim($string);
    }

    $search = preg_quote($charlist, '#');

    // preg_quote() escapes "-" itself as of PHP 5.3. Escaping it a second
    // time would produce an escaped backslash followed by a bare range
    // hyphen, so only do it when preg_quote() left the hyphen alone.
    if (preg_quote('-') === '-') {
        $search = str_replace('-', '\-', $search);
    }

    // A quoted ".." is the range operator of the character list
    $search = str_replace('\.\.', '-', $search);

    return preg_replace('#^[' . $search . ']+|[' . $search . ']+$#Du', '', $string);
}
/**
 * Converts the first character of the string to uppercase
 *
 * The first character is run through ::upper() and the rest of the string
 * is appended unchanged.
 *
 * @param  string $string  The string to process
 * @return string  The processed string
 */
static public function ucfirst($string)
{
    $first_character = self::sub($string, 0, 1);
    $remainder       = self::sub($string, 1);

    return self::upper($first_character) . $remainder;
}
/**
 * Converts the first character of every word to uppercase
 *
 * A word is considered to start at the beginning of the string, after any
 * whitespace character (including U+2000 through U+200A), after one of the
 * characters `/`, `-`, `(`, `[`, `{`, `|`, `"`, after a left single or
 * double curly quote, or after a straight single quote that is itself at
 * the beginning of the string or preceded by whitespace.
 *
 * @param  string $string  The string to process
 * @return string  The processed string
 */
static public function ucwords($string)
{
    // The look-behind lists everything that may precede the first
    // character of a word
    $word_start = '#(?<=^|\s|[\x{2000}-\x{200A}]|/|-|\(|\[|\{|\||"|^\'|\s\'|‘|“)(.)#u';
    $callback   = array('self', 'ucwordsCallback');

    return preg_replace_callback($word_start, $callback, $string);
}
/**
 * Handles converting a character to uppercase for ::ucwords()
 *
 * @param  array $match  The regex match from ::ucwords() - index `1` holds the captured character
 * @return string  The uppercase character
 */
static private function ucwordsCallback($match)
{
    $character = $match[1];

    return self::upper($character);
}
/**
 * Converts all lowercase characters to uppercase
 *
 * Pure ASCII strings are handled by strtoupper(). With mbstring available,
 * mb_strtoupper() is used and its result is patched with a table of
 * characters it misses; without mbstring the class's own lower-to-upper
 * translation table is applied.
 *
 * @param  string $string  The string to convert
 * @return string  The input string with all lowercase characters in uppercase
 */
static public function upper($string)
{
    // We get better performance falling back for ASCII strings
    if (!self::detect($string)) {
        return strtoupper($string);
    }

    if (self::$mbstring_available === NULL) {
        self::checkMbString();
    }

    if (!self::$mbstring_available) {
        return strtr($string, self::$lower_to_upper);
    }

    // For some reason mb_strtoupper misses some characters
    return strtr(mb_strtoupper($string, 'utf-8'), self::$mb_lower_to_upper_fix);
}
/**
 * Wraps a string to a specific character width
 *
 * Pure ASCII strings are handed to the built-in wordwrap(). Other strings
 * are split after every whitespace character (including U+2000 through
 * U+200A) and the break string is inserted before any word that would push
 * the current line past the width. The whitespace that follows a word is
 * kept and counts towards the length of that word.
 *
 * @param  string  $string  The string to wrap
 * @param  integer $width   The character width to wrap to
 * @param  string  $break   The string to insert as a break
 * @param  boolean $cut     If words longer than the character width should be split to fit
 * @return string  The input string with breaks inserted so lines fit within the width
 */
static public function wordwrap($string, $width=75, $break="\n", $cut=FALSE)
{
    // We get better performance falling back for ASCII strings
    if (!self::detect($string)) {
        return wordwrap($string, $width, $break, $cut);
    }

    // The `e` modifier only ever applied to preg_replace() and makes the
    // pattern fail to compile on PHP 7+, so it must not be used here
    $words = preg_split('#(?<=\s|[\x{2000}-\x{200A}])#u', $string);

    $output   = '';
    $line_len = 0;

    foreach ($words as $word) {
        $word_len = self::len($word);

        // Shorten up words that are too long. A width below 1 can never be
        // satisfied, so such words are left uncut instead of looping forever.
        while ($cut && $width > 0 && $word_len > $width) {
            // Only break if something is already on the current line, to
            // prevent the output from starting with a break
            if ($line_len) {
                $output .= $break;
            }
            $output  .= self::sub($word, 0, $width);
            $line_len = $width;
            $word     = self::sub($word, $width);
            $word_len = self::len($word);
        }

        if ($line_len && $line_len + $word_len > $width) {
            $output  .= $break;
            $line_len = 0;
        }

        $output   .= $word;
        $line_len += $word_len;
    }

    return $output;
}
/**
 * Forces use as a static class
 *
 * The constructor is private and empty, so instances can not be created
 * from outside of the class.
 *
 * @return fUTF8
 */
private function __construct() { }
}
/**
* Copyright (c) 2008-2012 Will Bond <will@flourishlib.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/