use translitcodec to speed up Unicode diacritic suppression

fazalmajid · Mar 8, 2017 · bc7870f · bc7870f
1 parent db8c96b
commit bc7870f
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 9 deletions.
diff --git a/setup.py b/setup.py
@@ -16,6 +16,7 @@
     'html5lib',
     'passlib',
     'argon2_cffi',
+    'translitcodec',
     'yappi'
   ],
   description='The Temboz RSS/Atom feed reader and aggregator.',

diff --git a/tembozapp/normalize.py b/tembozapp/normalize.py
@@ -16,15 +16,20 @@
 # strip out class attributes from articles
 feedparser._HTMLSanitizer.acceptable_attributes.remove('class')
 
-# strip diacritics. Unicode normalization form D (NFD) maps letters with
-# diacritics into the base letter followed by a combining diacritic, all
-# we need to do is get rid of the combining diacritics
-# this probably does not work with exotic characters like
-# U+FDF2 (Arabic ligature Allah)
-def stripc(c):
-  return unicodedata.normalize('NFD', c)[0]
-def strip_diacritics(s):
-  return u''.join(map(stripc, s))
+try:
+  import translitcodec
+  def strip_diacritics(s):
+    return translitcodec.short_encode(s)[0]
+except ImportError:
+  # Fallback: strip diacritics. Unicode normalization form D (NFD) maps
+  # letters with diacritics into the base letter followed by combining
+  # diacritics, so keeping only the first code point of each character's
+  # decomposition drops them. This probably does not work with exotic
+  # characters like U+FDF2 (Arabic ligature Allah).
+  def stripc(c):
+    return unicodedata.normalize('NFD', c)[0]
+  def strip_diacritics(s):
+    return u''.join(map(stripc, s))
 
 stop_words = ['i', 't', 'am', 'no', 'do', 's', 'my', 'don', 'm', 'on',
               'get', 'in', 'you', 'me', 'd', 've']