Skip to content

Commit

Permalink
Support user-defined word tags
Browse files Browse the repository at this point in the history
  • Loading branch information
fxsjy committed Mar 25, 2013
1 parent 44e19a2 commit 58c3636
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 2 deletions.
11 changes: 9 additions & 2 deletions jieba/__init__.py
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


FREQ = {} FREQ = {}
total =0.0 total =0.0

user_word_tag_tab={}


def gen_trie(f_name): def gen_trie(f_name):
lfreq = {} lfreq = {}
Expand Down Expand Up @@ -191,9 +191,16 @@ def load_userdict(f):
if isinstance(f, (str, unicode)): if isinstance(f, (str, unicode)):
f = open(f, 'rb') f = open(f, 'rb')
content = f.read().decode('utf-8') content = f.read().decode('utf-8')
line_no = 0
for line in content.split("\n"): for line in content.split("\n"):
line_no+=1
if line.rstrip()=='': continue if line.rstrip()=='': continue
word,freq = line.split(" ") tup =line.split(" ")
word,freq = tup[0],tup[1]
if line_no==1:
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
if len(tup)==3:
user_word_tag_tab[word]=tup[2].strip()
freq = float(freq) freq = float(freq)
FREQ[word] = log(freq / total) FREQ[word] = log(freq / total)
p = trie p = trie
Expand Down
3 changes: 3 additions & 0 deletions jieba/posseg/__init__.py
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ def load_model(f_name):
char_state_tab = load_model("char_state_tab.py") char_state_tab = load_model("char_state_tab.py")
word_tag_tab = load_model("../dict.txt") word_tag_tab = load_model("../dict.txt")


# Overlay the tags collected by jieba.load_userdict() onto the tag table
# loaded above. This statement runs once, when this module is imported, so
# only user tags already present in jieba.user_word_tag_tab at that moment
# are merged. An empty table merges nothing.
word_tag_tab.update(jieba.user_word_tag_tab or {})

class pair(object): class pair(object):
def __init__(self,word,flag): def __init__(self,word,flag):
self.word = word self.word = word
Expand Down
16 changes: 16 additions & 0 deletions test/test_userdict.py
Original file line number Original file line Diff line number Diff line change
@@ -0,0 +1,16 @@
#encoding=utf-8
# Demo script for user dictionaries: loads userdict.txt (next to this file),
# then prints the plain segmentation of a sample sentence followed by its
# part-of-speech tagged segmentation.
import os
import sys

# Resolve paths from this file's location instead of the current working
# directory, so the script also works when launched from outside test/.
_here = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(_here, ".."))
import jieba

# Load the user dictionary BEFORE importing jieba.posseg: posseg merges
# jieba.user_word_tag_tab into its own tag table once, at import time.
jieba.load_userdict(os.path.join(_here, "userdict.txt"))
import jieba.posseg as pseg

test_sent = "李小福是创新办主任也是云计算方面的专家"

# Plain segmentation: one token per line.
words = jieba.cut(test_sent)
for w in words:
    print(w)

# Tagged segmentation: every "word / flag , " pair on a single line,
# separated by one space (same text the original print statement produced).
result = pseg.cut(test_sent)
print(" ".join("%s / %s , " % (w.word, w.flag) for w in result))
3 changes: 3 additions & 0 deletions test/userdict.txt
Original file line number Original file line Diff line number Diff line change
@@ -0,0 +1,3 @@
云计算 5
李小福 2 nr
创新办 3 i

0 comments on commit 58c3636

Please sign in to comment.