Skip to content

Commit

Permalink
support flag '_'; ignore white space
Browse files Browse the repository at this point in the history
  • Loading branch information
fxsjy committed Apr 12, 2013
1 parent afdcb8a commit 45591bb
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 5 deletions.
5 changes: 3 additions & 2 deletions jieba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def cut(sentence,cut_all=False):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
- re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)")
+ re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)")
if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)
Expand All @@ -169,7 +169,8 @@ def cut(sentence,cut_all=False):
tmp = re_skip.split(blk)
for x in tmp:
if re_skip.match(x):
- yield x
+ if x!=' ':
+     yield x
else:
for xx in x:
yield xx
Expand Down
2 changes: 1 addition & 1 deletion jieba/posseg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def cut(sentence):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
- re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)")
+ re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)")
re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
Expand Down
10 changes: 10 additions & 0 deletions test/test_userdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,13 @@

for w in result:
print w.word, "/", w.flag, ", ",

print "\n========"

terms = jieba.cut('easy_install is great')
for t in terms:
print t
print '-------------------------'
terms = jieba.cut('python 的正则表达式是好用的')
for t in terms:
print t
6 changes: 4 additions & 2 deletions test/userdict.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
- 云计算 5
+ 云计算 5
李小福 2 nr
- 创新办 3 i
+ 创新办 3 i
+ easy_install 3 eng
+ 好用 300

0 comments on commit 45591bb

Please sign in to comment.