Skip to content

Commit

Permalink
use translitcodec to speed up Unicode diacritic suppression
Browse files Browse the repository at this point in the history
  • Loading branch information
fazalmajid committed Mar 8, 2017
1 parent db8c96b commit bc7870f
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 9 deletions.
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -16,6 +16,7 @@
'html5lib',
'passlib',
'argon2_cffi',
'translitcodec',
'yappi'
],
description='The Temboz RSS/Atom feed reader and aggregator.',
Expand Down
23 changes: 14 additions & 9 deletions tembozapp/normalize.py
Expand Up @@ -16,15 +16,20 @@
# Strip out class attributes from articles by removing 'class' from the
# list of attributes that feedparser's HTML sanitizer accepts.
# NOTE(review): _HTMLSanitizer is an underscore-prefixed feedparser name, so
# this presumably depends on feedparser internals -- confirm it still exists
# in the feedparser version in use.
feedparser._HTMLSanitizer.acceptable_attributes.remove('class')

# Strip diacritics. Unicode normalization form D (NFD) maps a letter with
# diacritics to its base letter followed by combining marks, so all we need
# to do is drop the combining marks.
# This probably does not work with exotic characters like
# U+FDF2 (Arabic ligature Allah).
def stripc(c):
    """Return the first code point of the NFD form of *c*.

    For a single precomposed letter this is its base letter. An empty
    string is returned unchanged instead of raising IndexError.
    """
    return unicodedata.normalize('NFD', c)[:1]


def strip_diacritics(s):
    """Return the Unicode string *s* with its combining marks removed.

    The whole string is decomposed at once, so marks are removed both from
    precomposed letters (u'\\xe9') and from input that is already decomposed
    (u'e\\u0301'). Only code points with a non-zero canonical combining class
    are dropped, so characters whose decomposition has several starters
    (e.g. Hangul syllables) are no longer truncated to their first code
    point; the final NFC pass puts such sequences back together.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = u''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unicodedata.normalize('NFC', kept)
# Prefer translitcodec when it is installed; otherwise fall back to a
# unicodedata-based implementation.
try:
    import translitcodec
except ImportError:
    # Strip diacritics. Unicode normalization form D (NFD) maps a letter with
    # diacritics to its base letter followed by combining marks, so all we
    # need to do is drop the combining marks.
    # This probably does not work with exotic characters like
    # U+FDF2 (Arabic ligature Allah).
    def stripc(c):
        """Return the first code point of the NFD form of *c*.

        For a single precomposed letter this is its base letter. An empty
        string is returned unchanged instead of raising IndexError.
        """
        return unicodedata.normalize('NFD', c)[:1]

    def strip_diacritics(s):
        """Return the Unicode string *s* with its combining marks removed.

        The whole string is decomposed at once, so marks are removed both
        from precomposed letters and from input that is already decomposed.
        Only code points with a non-zero canonical combining class are
        dropped, so characters whose decomposition has several starters
        (e.g. Hangul syllables) are not truncated; the final NFC pass puts
        such sequences back together.
        """
        decomposed = unicodedata.normalize('NFD', s)
        kept = u''.join(
            ch for ch in decomposed if not unicodedata.combining(ch))
        return unicodedata.normalize('NFC', kept)
else:
    def strip_diacritics(s):
        """Return *s* transliterated by translitcodec's short encoder.

        short_encode() returns a tuple; its first element is used as the
        result.
        """
        return translitcodec.short_encode(s)[0]

# Short words and word fragments listed as stop words.
# NOTE(review): entries such as 't', 's', 'don', 'm', 'd' and 've' look like
# the pieces left over when contractions are split -- confirm against the
# code that consumes this list.
stop_words = 'i t am no do s my don m on get in you me d ve'.split()
Expand Down

0 comments on commit bc7870f

Please sign in to comment.