## 安裝 jieba 套件

In [1]:
!pip install jieba



## 建立新目錄，存放辭典檔案

In [2]:
!mkdir jieba_data

mkdir: cannot create directory ‘jieba_data’: File exists


## 下載繁體字的辭典檔

In [3]:
!wget https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big -O jieba_data/dict.txt.big

--2020-07-24 07:17:50--  https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/fxsjy/jieba/master/extra_dict/dict.txt.big [following]
--2020-07-24 07:17:51--  https://raw.githubusercontent.com/fxsjy/jieba/master/extra_dict/dict.txt.big
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8583143 (8.2M) [text/plain]
Saving to: ‘jieba_data/dict.txt.big’


2020-07-24 07:17:55 (2.33 MB/s) - ‘jieba_data/dict.txt.big’ saved [8583143/8583143]



## 載入 Jieba 套件

In [4]:
import jieba

### 檢查 Jieba 版本

In [5]:
jieba.__version__

'0.42.1'

## 指定辭典檔

In [6]:
jieba.set_dictionary('jieba_data/dict.txt.big')

In [7]:
!head -10 jieba_data/dict.txt.big

1号店 3 n
1號店 3 n
4S店 3 n
4s店 3 n
AA制 3 n
AB型 3 n
AT&T 3 nz
A型 3 n
A座 3 n
A股 3 n


## 分詞

In [8]:
text_str = '今天天氣真好'

In [9]:
?jieba.cut

### 精確模式分詞 (cut_all=False)

In [10]:
seg_result = jieba.cut(text_str, cut_all=False)
print(' / '.join(list(seg_result)))

Building prefix dict from /home/jovyan/work/Text_wordcloud/jieba_data/dict.txt.big ...
Dumping model to file cache /tmp/jieba.u240ff8c1e70462be159af457c3f6d652.cache
Loading model cost 3.419 seconds.
Prefix dict has been built successfully.


今天天氣 / 真 / 好


### 全模式分詞 (cut_all=True)

In [11]:
seg_result = jieba.cut(text_str, cut_all=True)
print(' / '.join(list(seg_result)))

今天 / 今天天氣 / 天天 / 天氣 / 真好


### 搜尋引擎模式分詞

In [12]:
?jieba.cut_for_search

In [13]:
seg_result = jieba.cut_for_search(text_str, HMM=True)
print(' / '.join(list(seg_result)))

今天 / 天天 / 天氣 / 今天天氣 / 真 / 好


In [14]:
seg_result = jieba.cut_for_search(text_str, HMM=False)
print(' / '.join(list(seg_result)))

今天 / 天天 / 天氣 / 今天天氣 / 真 / 好


### Paddle 模式分詞 (use_paddle=True)

In [8]:
!pip install --upgrade paddlepaddle-tiny

[31mERROR: Could not find a version that satisfies the requirement paddlepaddle-tiny (from versions: none)[0m
[31mERROR: No matching distribution found for paddlepaddle-tiny[0m


In [17]:
jieba.enable_paddle()
seg_result = jieba.cut(text_str, use_paddle=True)
print(' / '.join(list(seg_result)))

Installing paddle-tiny, please wait a minute......
Import paddle error, please use command to install: pip install paddlepaddle-tiny==1.6.1.Now, back to jieba basic cut......


UnboundLocalError: local variable 'paddle' referenced before assignment

## 詞性標註

#### 詞性說明: https://gist.github.com/luw2007/6016931

In [18]:
import jieba.posseg as pseg
jieba.enable_paddle()
seg_result = pseg.lcut(text_str, use_paddle=True)
for w, p in seg_result:
    print("%s, %s"%(w, p))

Installing paddle-tiny, please wait a minute......
Import paddle error, please use command to install: pip install paddlepaddle-tiny==1.6.1.Now, back to jieba basic cut......


UnboundLocalError: local variable 'paddle' referenced before assignment

## 自定義詞庫

In [None]:
text_str_2 = '是在哈囉嗎?'

In [None]:
!cat jieba_data/mydict.txt

In [None]:
seg_result = jieba.lcut(text_str_2)
print(' / '.join(seg_result))

In [None]:
jieba.load_userdict('jieba_data/mydict.txt')

In [None]:
seg_result = jieba.lcut(text_str_2)
print(' / '.join(seg_result))

In [None]:
!cat jieba_data/mydict.txt

## 停止字

In [None]:
!wget https://raw.githubusercontent.com/fxsjy/jieba/master/extra_dict/stop_words.txt -O jieba_data/stop_words.txt

In [None]:
!tail -5 jieba_data/stop_words.txt

In [None]:
# 開檔後一次讀一行
stop_words_list = []
with open(file='jieba_data/stop_words.txt',mode='r', encoding="UTF-8") as file:
    for line in file:
        line = line.strip()
        stop_words_list.append(line)
stop_words_list

In [None]:
# 開檔後一次讀完，再進行切割
with open(file='jieba_data/stop_words.txt',mode='r', encoding="UTF-8") as file:
    #依照換行字元 \n 進行切割，切完為 list
    stop_words = file.read().split('\n')
    print(type(stop_words))
    print(stop_words)

In [None]:
text_str_3 = '我是一位小學生，從小學習鋼琴，希望成為youtuber'
seg_result = jieba.lcut(text_str_3)
seg_result_stopword = []
for term in seg_result:
    if term not in stop_words:
        seg_result_stopword.append(term)
seg_result_stopword

## 其他

In [None]:
print(jieba.DEFAULT_DICT_NAME)
# check jieba/jieba/__init__.py