Permalink
Browse files

Support user-defined word tags

  • Loading branch information...
1 parent 44e19a2 commit 58c363655c5b0c327b40608925b5967beb01e83d @fxsjy committed Mar 25, 2013
Showing with 31 additions and 2 deletions.
  1. +9 −2 jieba/__init__.py
  2. +3 −0 jieba/posseg/__init__.py
  3. +16 −0 test/test_userdict.py
  4. +3 −0 test/userdict.txt
View
@@ -11,7 +11,7 @@
FREQ = {}
total =0.0
-
+user_word_tag_tab={}
def gen_trie(f_name):
lfreq = {}
@@ -191,9 +191,16 @@ def load_userdict(f):
if isinstance(f, (str, unicode)):
f = open(f, 'rb')
content = f.read().decode('utf-8')
+ line_no = 0
for line in content.split("\n"):
+ line_no+=1
if line.rstrip()=='': continue
- word,freq = line.split(" ")
+ tup =line.split(" ")
+ word,freq = tup[0],tup[1]
+ if line_no==1:
+ word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
+ if len(tup)==3:
+ user_word_tag_tab[word]=tup[2].strip()
freq = float(freq)
FREQ[word] = log(freq / total)
p = trie
View
@@ -27,6 +27,9 @@ def load_model(f_name):
char_state_tab = load_model("char_state_tab.py")
word_tag_tab = load_model("../dict.txt")
+if jieba.user_word_tag_tab:
+ word_tag_tab.update(jieba.user_word_tag_tab)
+
class pair(object):
def __init__(self,word,flag):
self.word = word
View
@@ -0,0 +1,16 @@
+#encoding=utf-8
+import sys
+sys.path.append("../")
+import jieba
+jieba.load_userdict("userdict.txt")
+import jieba.posseg as pseg
+
+test_sent = "李小福是创新办主任也是云计算方面的专家"
+words = jieba.cut(test_sent)
+for w in words:
+ print w
+
+result = pseg.cut(test_sent)
+
+for w in result:
+ print w.word, "/", w.flag, ", ",
View
@@ -0,0 +1,3 @@
+云计算 5
+李小福 2 nr
+创新办 3 i

0 comments on commit 58c3636

Please sign in to comment.