In [1]:
import os
import jieba
import numpy as np
from collections import Counter
# pip install gensim
from gensim.corpora import Dictionary

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


### 讀入stopwords

In [2]:
# Configure jieba with the main dictionary and the user dictionary kept
# under ./jieba_data.
jieba.set_dictionary('./jieba_data/dict.txt.big')
jieba.load_userdict('./jieba_data/userdict.txt')

# Load the stop-word list: one entry per line, surrounding whitespace removed.
stopwords = []
with open('./jieba_data/stopwords.txt', 'r', encoding='UTF-8') as stopword_file:
    stopwords.extend(line.strip() for line in stopword_file)

Building prefix dict from /Users/alexis/Documents/GitHub/HW1--data-mining_bak/HW2/jieba_data/dict.txt.big ...
Loading model from cache /var/folders/y9/vfmh1hx93jj2rp98mgt85dl00000gn/T/jieba.ub3af8d1fe5443e291930f157fa54cb87.cache
Loading model cost 1.111 seconds.
Prefix dict has been built succesfully.


In [3]:
def getTokens(text):
    """Segment ``text`` with jieba and drop stop words.

    Args:
        text: The string to segment. ``None`` or an empty string yields an
            empty list.

    Returns:
        list[str]: The segments in their original order, each stripped of
        surrounding whitespace, excluding empty strings and every entry
        found in the module-level ``stopwords`` collection.
    """
    if not text:
        return []

    tokens = []
    for segment in jieba.cut(text, cut_all=False):
        token = segment.strip()
        # Whitespace-only segments (spaces, line breaks) strip down to ''
        # and are not words, so they must not be counted as tokens.
        if token and token not in stopwords:
            tokens.append(token)
    return tokens

In [4]:
def getText(filename, encoding='UTF-8'):
    """Return the title line and body line of a news file as one string.

    The title is read from the third line of the file and the body from the
    fourth line; both keep whatever line terminator the file has.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8,
            matching how the stop-word file is opened, so the result does
            not depend on the platform's default encoding.

    Returns:
        str: The title line immediately followed by the body line.

    Raises:
        IndexError: If the file has fewer than four lines.
    """
    with open(filename, 'r', encoding=encoding) as file:
        lines = file.readlines()

    news_title = lines[2]
    news_body = lines[3]
    return news_title + news_body

### 取出全部文件(180個)的內容做斷詞

In [5]:
dir_path = './ctee_TMC'

# Collect the title+body text of every regular, non-hidden file.
# os.listdir returns entries in arbitrary order, so the names are sorted:
# the concatenation order fixes the order in which tokens are first seen,
# which is how Counter.most_common breaks ties between equally frequent
# tokens. Sorting keeps the top-N cut reproducible across machines.
doc_texts = []
for file in sorted(os.listdir(dir_path)):
    file_path = os.path.join(dir_path, file)
    if os.path.isfile(file_path) and not file.startswith('.'):
        doc_texts.append(getText(file_path))

# Join once instead of growing a string inside the loop.
text = ''.join(doc_texts)

tokens = getTokens(text)

### 共有約1.8萬個詞(去除停用詞後)

In [6]:
# Number of entries in `tokens` (the whole-corpus token list when the cells
# are run in order).
print(len(tokens))

17595


### 共有約5000個不重複詞

In [7]:
# Tally how often each token occurs; the number of keys in the tally is the
# number of distinct tokens.
counter = Counter()
counter.update(tokens)
print(len(counter))

5065


### 取出詞頻最高前500個詞

In [8]:
# The 500 most frequent (token, count) pairs, most frequent first.
top_tokens_tuple = counter.most_common(500)

# Keep only the token of each pair; the ranking order is preserved.
all_tokens = [pair[0] for pair in top_tokens_tuple]

print(all_tokens)

['元', '億元', '％', '奈米', '營收', '外資', '製程', '市場', ',', '晶圓', '台股', '法人', '成長', '去年', '科技', '股價', '代工', '10', '脈動', '股利', '預期', '(', ')', '現金', '龍頭', '半導體', '約', '2330', '點', '買超', '新高', '後', '新', '第一季', '-', '台灣', '高', '晶片', '市值', '蘋果', '中', '指出', '影響', '大', '產業', '技術', '12', '達', '先進', '～', '可望', '客戶', '上漲', '仍', '公司', '每股', '前', '設計', '持續', '11', '昨', '營運', '第四季', '全球', '證券', '投資', '獲利', '產能', '台積', '手機', '董事長', '訂單', '預估', '台幣', '歷史', '指數', '合併', '股', '處理器', '說', '已', '大立光', '更', '上', '張忠謀', '未來', '減少', '2019', '明年', '董事會', '廠', '財經', '毛利率', '億美元', '2018', '逾', '個股', '展望', '帶動', '包括', '季', '美元', '發展', '全年', '不', '增加', '業績', '表現', '聯發科', '需求', '受惠', '2020', '下', '產品', '出貨', '兆元', 'iPhone', '事件', '行情', '漲幅', '5G', '創新', '第二季', '創意', '配發', '量產', '人', '盤中', '大陸', '季減', '再', '看好', '稅後', '震盪', '營業', '占', '優於', '企業', '淨利', '紀錄', '16', '英特爾', '評等', '運算', '發放', '加上', '新台幣', '去年同期', '權值', '資金', '營收達', '動能', '好', '比特', '鴻海', '超過', '金額', '利益', '今', '張', '核准', '南京', '法說', 'AI', '利率', '上半年', '大漲', '

### 過濾掉沒有在top 500裡的詞

In [9]:
def filterTokens(tokens, vocabulary=None):
    """Keep only the tokens that belong to a vocabulary, preserving order.

    Args:
        tokens: Iterable of hashable tokens (strings in this notebook).
        vocabulary: Optional collection of allowed tokens. When omitted, the
            module-level ``all_tokens`` list is used.

    Returns:
        list: The members of ``tokens`` found in the vocabulary, in their
        original order and with duplicates kept.
    """
    if vocabulary is None:
        vocabulary = all_tokens
    # Build the lookup once so each membership test is O(1) instead of a
    # linear scan over the vocabulary list.
    allowed = set(vocabulary)
    return [token for token in tokens if token in allowed]

In [10]:
# One entry per document: that document's tokens restricted to the top-token
# vocabulary, in reading order.
doc_tokens = []
dir_path = './ctee_TMC'
for file in os.listdir(dir_path):
    file_path = os.path.join(dir_path, file)
    if os.path.isfile(file_path) and not file.startswith('.'):
        # Per-document names are used on purpose so the corpus-level `text`
        # and `tokens` built earlier are not overwritten by this loop.
        doc_text = getText(file_path)
        doc_tokens.append(filterTokens(getTokens(doc_text)))

### 稀疏矩陣(文件-詞 0/1 矩陣:每列一篇文件,每欄一個 top 500 詞)

In [11]:
# Binary document-term table: one row per document, one column per entry of
# `all_tokens`; a cell is 1 when the document contains that token, else 0.
sparse_matrix = []

dir_path = './ctee_TMC'
for file in os.listdir(dir_path):
    file_path = os.path.join(dir_path, file)
    if os.path.isfile(file_path) and not file.startswith('.'):
        # A set makes each of the per-term membership tests O(1); the
        # per-document name also leaves the corpus-level `text` and `tokens`
        # untouched.
        doc_vocabulary = set(getTokens(getText(file_path)))
        sparse_matrix.append(
            [1 if term in doc_vocabulary else 0 for term in all_tokens]
        )

### 共現矩陣

In [12]:
def co_occurrence_matrix(ls):
    """Add the pairwise counts of one id sequence to the global ``matrix``.

    For every ordered pair of distinct positions in ``ls``, the cell addressed
    by (id at the first position, id at the second position) is incremented
    by one. Positions are compared, not ids, so an id that occurs more than
    once in ``ls`` also increments its own diagonal cell.

    Args:
        ls: Sequence of integer ids used as row/column indices of ``matrix``.

    Returns:
        None. The module-level ``matrix`` is updated in place.
    """
    global matrix
    for row_pos, row_id in enumerate(ls):
        for col_pos, col_id in enumerate(ls):
            if row_pos == col_pos:
                # Never pair a position with itself.
                continue
            matrix[[row_id], [col_id]] += 1

In [13]:
# Square, zero-filled counter table with one row and one column per entry of
# `all_tokens`.
dimension = len(all_tokens)
matrix = np.matrix([[0 for _ in range(dimension)] for _ in range(dimension)])

# Map every distinct token of the filtered documents to an integer id, then
# rewrite each document as the list of its token ids.
token_dictionary = Dictionary(doc_tokens)
dt = token_dictionary.token2id
doc_token_ids = []
for words in doc_tokens:
    doc_token_ids.append([dt[word] for word in words])

# Accumulate every document's pairwise counts into the shared matrix.
for ids in doc_token_ids:
    co_occurrence_matrix(ids)

In [15]:
# Show the token -> integer id mapping (`token2id`) whose ids are used as the
# row/column indices of the co-occurrence matrix.
print(dt)

{'28': 0, '前': 1, '大陸': 2, '奈米': 3, '政策': 4, '營業': 5, '相關': 6, '財經': 7, '-': 8, '10': 9, '15': 10, '17': 11, '18': 12, '20': 13, '2019': 14, '2020': 15, '5G': 16, 'AI': 17, 'HPC': 18, 'iPhone': 19, '上半年': 20, '下修': 21, '下半年': 22, '下滑': 23, '人工智慧': 24, '仍': 25, '低': 26, '來到': 27, '保守': 28, '倍': 29, '元': 30, '先前': 31, '全年': 32, '分析師': 33, '利率': 34, '利用率': 35, '加上': 36, '區間': 37, '反彈': 38, '受惠': 39, '可望': 40, '台': 41, '同步': 42, '圈': 43, '增': 44, '外資': 45, '季': 46, '季減': 47, '客戶': 48, '年減': 49, '強勁': 50, '後': 51, '成': 52, '投資': 53, '指出': 54, '推測': 55, '支撐': 56, '整體': 57, '新': 58, '時': 59, '智慧': 60, '樂觀': 61, '殖': 62, '毛利率': 63, '法': 64, '法說': 65, '減少': 66, '營收': 67, '現金': 68, '產能': 69, '發放': 70, '目標': 71, '約': 72, '美元': 73, '考量': 74, '股價': 75, '股利': 76, '衰退': 77, '製程': 78, '觀察': 79, '評等': 80, '說': 81, '證券': 82, '財務': 83, '貢獻': 84, '超過': 85, '車用': 86, '運算': 87, '重要': 88, '長線': 89, '需求': 90, '預期': 91, '首季': 92, '％': 93, '～': 94, 'EUV': 95, '三星': 96, '代工': 97, '先進': 98, '卻': 99, '台積': 100, '大

In [14]:
# Show the accumulated co-occurrence counts; rows and columns are indexed by
# the ids in `dt`.
print(matrix)

[[ 4 14  1 ...  0  0  0]
 [14 18  4 ...  0  1  0]
 [ 1  4 28 ... 24  9  0]
 ...
 [ 0  0 24 ... 62 30  0]
 [ 0  1  9 ... 30  8  0]
 [ 0  0  0 ...  0  0 56]]
