Skip to content

Commit

Permalink
ugly fix for unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
cl4u2 committed Jun 17, 2012
1 parent 744fb9b commit 36ea01b
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions resources.py
Expand Up @@ -86,10 +86,20 @@ def makeTags(self):
tmptagsnew.append("".join(currentword))
tmptags += tmptagsnew


# Try to deal with Unicode: encode every tag as UTF-8, mapping failures to ""
def utf8tag(tag):
    """Return *tag* encoded as UTF-8, or "" when it cannot be encoded.

    Characters that cannot be encoded are dropped ('ignore' error handler).
    The empty-string fallback is best-effort: callers are expected to
    filter out empty tags afterwards.
    """
    try:
        return tag.encode('utf-8', 'ignore')
    except (UnicodeError, AttributeError):
        # UnicodeError: the encode step itself failed (presumably the
        # implicit ASCII decode of a non-ASCII byte string on Python 2 --
        # TODO confirm against real input).
        # AttributeError: the value has no .encode() method at all.
        # Anything else (KeyboardInterrupt, SystemExit, real bugs) must
        # propagate instead of being silently turned into "".
        return ""
tmptags = [utf8tag(e) for e in tmptags]

# delete duplicates and the empty string
tmptags = list(set(tmptags))
stopwords = ['THE', 'IL', 'UN', 'UNA', 'GLI', 'LE', 'LO', 'A', 'E', 'I', 'O', 'L', 'OF']
tmptags = [e for e in tmptags if len(e) > 0 and not e.upper() in stopwords]

self.addTags(tmptags)

def tokenize(self):
Expand Down

0 comments on commit 36ea01b

Please sign in to comment.