Permalink
Browse files

first commit

  • Loading branch information...
0 parents commit 6f6e812afb08cf794a4cb9771bf72b788d27d949 @fxsjy committed Sep 29, 2012
Showing with 395,965 additions and 0 deletions.
  1. +22 −0 .gitattributes
  2. +163 −0 .gitignore
  3. +60 −0 jieba/__init__.py
  4. +395,705 −0 jieba/dict.txt
  5. +15 −0 test.py
22 .gitattributes
@@ -0,0 +1,22 @@
+# Auto detect text files and perform LF normalization
+* text=auto
+
+# Custom for Visual Studio
+*.cs diff=csharp
+*.sln merge=union
+*.csproj merge=union
+*.vbproj merge=union
+*.fsproj merge=union
+*.dbproj merge=union
+
+# Standard to msysgit
+*.doc diff=astextplain
+*.DOC diff=astextplain
+*.docx diff=astextplain
+*.DOCX diff=astextplain
+*.dot diff=astextplain
+*.DOT diff=astextplain
+*.pdf diff=astextplain
+*.PDF diff=astextplain
+*.rtf diff=astextplain
+*.RTF diff=astextplain
163 .gitignore
@@ -0,0 +1,163 @@
+#################
+## Eclipse
+#################
+
+*.pydevproject
+.project
+.metadata
+bin/
+tmp/
+*.tmp
+*.bak
+*.swp
+*~.nib
+local.properties
+.classpath
+.settings/
+.loadpath
+
+# External tool builders
+.externalToolBuilders/
+
+# Locally stored "Eclipse launch configurations"
+*.launch
+
+# CDT-specific
+.cproject
+
+# PDT-specific
+.buildpath
+
+
+#################
+## Visual Studio
+#################
+
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.sln.docstates
+
+# Build results
+[Dd]ebug/
+[Rr]elease/
+*_i.c
+*_p.c
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.vspscc
+.builds
+*.dotCover
+
+## TODO: If you have NuGet Package Restore enabled, uncomment this
+#packages/
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+
+# Visual Studio profiler
+*.psess
+*.vsp
+
+# ReSharper is a .NET coding add-in
+_ReSharper*
+
+# Installshield output folder
+[Ee]xpress
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish
+
+# Others
+[Bb]in
+[Oo]bj
+sql
+TestResults
+*.Cache
+ClientBin
+stylecop.*
+~$*
+*.dbmdl
+# Added for RIA/Silverlight projects
+Generated_Code
+
+# Backup & report files from converting an old project file to a newer
+# Visual Studio version. Backup files are not needed, because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+
+
+
+############
+## Windows
+############
+
+# Windows image file caches
+Thumbs.db
+
+# Folder config file
+Desktop.ini
+
+
+#############
+## Python
+#############
+
+*.py[co]
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+
+#Translations
+*.mo
+
+#Mr Developer
+.mr.developer.cfg
+
+# Mac crap
+.DS_Store
60 jieba/__init__.py
@@ -0,0 +1,60 @@
+import re
+import math
+import os,sys
+import pprint
+
def gen_trie(f_name):
    """Build a character trie from a UTF-8 dictionary file.

    Each non-blank line of ``f_name`` is split on single spaces and the
    first field is taken as the word; any remaining fields (e.g. a
    frequency) are ignored.

    Returns a nested dict in which every key is one character of a word
    and maps to the child node.  A node that completes a word carries the
    extra entry ``'' -> ''`` as its end-of-word flag.
    """
    # Local import keeps this block self-contained; io.open decodes while
    # reading, so the same code yields text on Python 2 and Python 3.
    import io

    trie = {}
    # The context manager closes the file even if a line fails to decode.
    with io.open(f_name, encoding='utf-8') as dict_file:
        for line in dict_file:
            word = line.strip().split(" ")[0]
            if not word:
                # Blank line: nothing to insert.
                continue
            node = trie
            for char in word:
                node = node.setdefault(char, {})
            node[''] = ''  # ending flag
    return trie
+
# Directory of this module, resolved against the current working directory
# at import time; dict.txt is looked up inside it.
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

# The dictionary is loaded eagerly at import time.  Progress goes to stderr
# via write() so the messages work on both Python 2 and Python 3.
sys.stderr.write("loading dictionary...\n")
trie = gen_trie(os.path.join(_curpath, "dict.txt"))
sys.stderr.write("done.\n")
+
def __cut(sentence):
    """Yield every dictionary word that occurs in ``sentence``.

    For each start index ``i`` the module-level ``trie`` is walked along
    ``sentence[i:]``; ``sentence[i:j+1]`` is yielded each time the node
    reached carries the ``''`` end-of-word flag.  When the walk leaves the
    trie or runs off the end of the sentence, scanning restarts at
    ``i + 1``.  Overlapping words are all reported.
    """
    N = len(sentence)
    i = j = 0
    p = trie
    while i < N:
        c = sentence[j]
        if c in p:
            p = p[c]
            if '' in p:
                yield sentence[i:j + 1]
            j += 1
            if j < N:
                continue
        # Restart from the next start position.  The trie cursor must be
        # reset here in both cases (mismatch and end of sentence);
        # otherwise the next position is matched against a stale node and
        # words starting there are missed.
        i += 1
        j = i
        p = trie
+
# Text type of the running interpreter (``unicode`` on Python 2, ``str`` on 3).
_TEXT_TYPE = type(u"")
# A run of characters in the range U+4E00..U+9FA5; the capture group makes
# split() keep the matched runs in its result.
_RE_HAN = re.compile(u"([\u4E00-\u9FA5]+)")
# Any character that is not an ASCII letter, a digit, '+', '#' or a newline.
_RE_SKIP = re.compile(u"[^a-zA-Z0-9+#\n]")


def cut(sentence):
    """Segment ``sentence`` and yield the pieces one by one.

    A non-text ``sentence`` is decoded as UTF-8 first; if that fails it is
    decoded as GBK with undecodable bytes dropped.  The text is then split
    into runs matching ``_RE_HAN`` and everything in between:

    * runs matching ``_RE_HAN`` are passed to ``__cut`` and every word it
      produces is yielded;
    * other text is split on the characters matched by ``_RE_SKIP`` and
      each non-empty fragment is yielded.
    """
    if not isinstance(sentence, _TEXT_TYPE):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to GBK, ignoring bad bytes.
            sentence = sentence.decode('gbk', 'ignore')

    for blk in _RE_HAN.split(sentence):
        if _RE_HAN.match(blk):
            for word in __cut(blk):
                yield word
        else:
            for piece in _RE_SKIP.split(blk):
                if piece:
                    yield piece
395,705 jieba/dict.txt
395,705 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
15 test.py
@@ -0,0 +1,15 @@
+#encoding=utf-8
+import sys
+import jieba
+
def cuttest(test_sent):
    """Segment ``test_sent`` with ``jieba.cut`` and print the words on one line.

    Every word is followed by " / " and the line is terminated with a
    newline.  Output goes through ``sys.stdout.write`` so the function
    runs on both Python 2 and Python 3.
    """
    for word in jieba.cut(test_sent):
        sys.stdout.write(word)
        sys.stdout.write(" / ")
    sys.stdout.write("\n")
+
+
if __name__ == "__main__":
    # Sample sentences, segmented and printed in order when run as a script.
    for sample in (
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "永和服装饰品有限公司",
        "我爱北京天安门",
    ):
        cuttest(sample)

0 comments on commit 6f6e812

Please sign in to comment.