Permalink
Browse files

use translitcodec to speed up Unicode diacritic suppression

  • Loading branch information...
Fazal Majid
Fazal Majid committed Mar 8, 2017
1 parent db8c96b commit bc7870fcb1538b9e8340c052b1233d9b0ceb959c
Showing with 15 additions and 9 deletions.
  1. +1 −0 setup.py
  2. +14 −9 tembozapp/normalize.py
View
@@ -16,6 +16,7 @@
'html5lib',
'passlib',
'argon2_cffi',
'translitcodec',
'yappi'
],
description='The Temboz RSS/Atom feed reader and aggregator.',
View
@@ -16,15 +16,20 @@
# Strip out class attributes from articles by removing 'class' from the
# acceptable_attributes collection of feedparser's _HTMLSanitizer.
# NOTE(review): this mutates a private feedparser attribute in place and
# appears to run at module import -- confirm the attribute still exists
# (and still contains 'class') when upgrading feedparser.
feedparser._HTMLSanitizer.acceptable_attributes.remove('class')
# Strip diacritics. Unicode normalization form D (NFD) maps letters with
# diacritics into the base letter followed by a combining diacritic; all
# we need to do is get rid of the combining diacritics.
# This probably does not work with exotic characters like
# U+FDF2 (Arabic ligature Allah).
def stripc(c):
    """Return the character *c* with its combining diacritics removed.

    *c* is canonically decomposed (NFD), every code point with a non-zero
    canonical combining class is dropped, and the remainder is recomposed
    (NFC).

    Taking only the first code point of the decomposition would truncate
    characters that decompose into several non-combining code points
    (e.g. a Hangul syllable into its jamo) and would fail on an empty
    string; filtering and recomposing avoids both.

    c -- a unicode string, normally one character long.
    Returns a unicode string (empty if *c* is empty or a lone combining mark).
    """
    decomposed = unicodedata.normalize('NFD', c)
    base = u''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Recompose so multi-starter decompositions come back as one character.
    return unicodedata.normalize('NFC', base)
def strip_diacritics(s):
    """Return the unicode string *s* with combining diacritics removed.

    The whole string is canonically decomposed (NFD) in one pass, every
    code point with a non-zero canonical combining class is dropped, and
    the result is recomposed (NFC).

    Working on the whole string rather than character by character means
    input that is already decomposed (a base letter followed by a separate
    combining mark) is stripped too, and characters whose decomposition
    has several non-combining parts (e.g. Hangul syllables) are preserved.

    s -- a unicode string.
    Returns a unicode string.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = u''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Recompose so multi-starter decompositions come back as one character.
    return unicodedata.normalize('NFC', kept)
try:
    # Optional dependency: a faster, table-driven transliteration codec.
    import translitcodec
except ImportError:
    # Fallback: strip diacritics with the standard library only. Unicode
    # normalization form D (NFD) maps letters with diacritics into the base
    # letter followed by a combining diacritic; we drop the combining code
    # points and recompose (NFC) what is left.
    # This probably does not work with exotic characters like
    # U+FDF2 (Arabic ligature Allah), which NFD does not decompose.
    def stripc(c):
        """Return the character *c* with its combining diacritics removed.

        Returns an empty string if *c* is empty or a lone combining mark.
        """
        decomposed = unicodedata.normalize('NFD', c)
        base = u''.join(
            ch for ch in decomposed if not unicodedata.combining(ch))
        # Recompose so multi-starter decompositions (e.g. Hangul jamo)
        # come back as a single character instead of being truncated.
        return unicodedata.normalize('NFC', base)

    def strip_diacritics(s):
        """Return the unicode string *s* with combining diacritics removed.

        The string is normalized as a whole, so already-decomposed input
        (base letter followed by a separate combining mark) is stripped too.
        """
        decomposed = unicodedata.normalize('NFD', s)
        kept = u''.join(
            ch for ch in decomposed if not unicodedata.combining(ch))
        return unicodedata.normalize('NFC', kept)
else:
    def strip_diacritics(s):
        """Return *s* transliterated by translitcodec's short encoder.

        short_encode returns a tuple whose first item is the transliterated
        text; only that item is returned.
        """
        return translitcodec.short_encode(s)[0]
# Stop words: very short English words and word fragments.
# NOTE(review): how this list is consumed is not visible here -- confirm
# against the callers before adding or removing entries.
stop_words = [
    'i', 't', 'am', 'no',
    'do', 's', 'my', 'don',
    'm', 'on', 'get', 'in',
    'you', 'me', 'd', 've',
]

0 comments on commit bc7870f

Please sign in to comment.