# Set Up The Environment

In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import re
import os
import time
from tqdm.auto import tqdm
from ckiptagger import WS, POS, NER, construct_dictionary # tokenization

# Setup work directory
# The project root defaults to the original author's checkout, but it can be
# overridden with the MANIFESTO_PROJECT_DIR environment variable so the
# notebook can run on another machine without editing this cell.
# The chdir has to happen before the `Utils` import and before any of the
# relative paths used later (./Data/..., ../CKIP_TAGGER), which presumably
# all resolve against this directory.
# (The former bare `os.listdir()` call was removed: its result was discarded
# because it was not the last expression of the cell.)
PROJECT_DIR = os.environ.get(
    'MANIFESTO_PROJECT_DIR',
    '/Users/deankuo/Desktop/python/dissertation/Taiwan-Manifesto-Analysis-A-ML-Approach',
)
os.chdir(PROJECT_DIR)

# Import utils.py
from Utils.utils_token import (
    flatten,
    text_select,
    load_text,
    split_content,
    tokenization,
    postprocess_dataframe,
    
)
# Use Times New Roman as the font family for matplotlib figures.
matplotlib.rcParams['font.family'] = 'Times New Roman'

# CKIP module
# CKIP_PATH is a relative path, so it is resolved against the working
# directory selected by the os.chdir() call earlier in this cell.
CKIP_PATH = "../CKIP_TAGGER"
# Load the three CKIP models in order:
#   ws  - word segmentation
#   pos - part-of-speech tagging
#   ner - named-entity recognition
ws, pos, ner = WS(CKIP_PATH), POS(CKIP_PATH), NER(CKIP_PATH)

2024-03-11 20:27:52.995108: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
2024-03-11 20:28:08.325787: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-11 20:28:08.377698: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:357] MLIR V1 optimization pass is not enabled
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, na

In [2]:
# Custom dictionary: terms mapped to a weight (every term currently uses
# weight 1). The mapping is handed to construct_dictionary() right below.
# The original literal listed 'TPP' twice; the duplicate key was removed.
# Insertion order and the resulting mapping are otherwise unchanged.
word_to_weight = {
    '823': 1, 'ECFA': 1, '2300': 1, '台26線': 1, '台74線': 1, '12年國教': 1,
    'BOT': 1, '88快速道路': 1, '台27線': 1, '台61線': 1, '十二年國教': 1, '國道10號': 1,
    '台88號': 1, 'M型': 1, '205兵工廠': 1, '北二高': 1, '台65線': 1, 'CEPA': 1,
    'FTA': 1, '科學園區': 1, '228': 1, 'MIT': 1, '202兵工廠': 1, '86快速道路': 1,
    '國道8號': 1,
    '台64': 1, '台66': 1, 'iBike': 1, 'MRT': 1, 'TPP': 1, 'TIFA': 1,
    '台22': 1, '台29': 1, '國10': 1, '國1': 1, '318': 1, 'NCC': 1,
    'PM2.5': 1, 'YouBike': 1,
    '台68': 1, '快速道路': 1, 'NGO': 1, 'NPO': 1, 'U-Bike': 1, 'LGBTQ': 1,
    '三七五減租': 1, '小三通': 1, '大三通': 1, '基礎建設': 1, '戒急用忍': 1,
    '社會役': 1, '非核家園': 1,
    '教育券': 1, '九二共識': 1,
}
# Convert the custom term weights into the structure built by ckiptagger's
# construct_dictionary().
# NOTE(review): `dictionary` is not passed to any call visible in this
# notebook; presumably it is consumed elsewhere. TODO confirm.
dictionary = construct_dictionary(word_to_weight)

# Stop words
with open("./Data/stopwords_zh-tw.txt", encoding="utf-8") as stopword_file:
    stopword_lines = stopword_file.read().split("\n")
# The first line of the file is skipped (presumably a header; verify against
# the data file). Because the text is split on "\n", a file that ends with a
# newline leaves an empty string as the last entry.
stopwords = stopword_lines[1:]

In [2]:
# Years for which a merged workbook exists under ./Data/merge_data/.
years = [1992, 1995, 1998, 2001, 2004, 2008, 2012, 2016, 2020, 2024]

# Read one workbook per year into `dfs` (keyed by year), printing each
# frame's (rows, columns) shape as it is loaded.
dfs = {}
for year in years:
    yearly_frame = pd.read_excel(f"./Data/merge_data/{year}.xlsx")
    print(yearly_frame.shape)
    dfs[year] = yearly_frame

(330, 16)
(314, 16)
(383, 16)
(434, 16)
(368, 16)
(283, 16)
(267, 16)
(354, 16)
(410, 16)
(309, 16)


In [12]:
# Replace every yearly frame with the result of split_content().
# Only the values of existing keys are reassigned, so updating `dfs` while
# iterating over it is safe.
# NOTE(review): re-running this cell feeds already-split frames back into
# split_content(); confirm that is harmless or reload the data first.
for year, frame in dfs.items():
    dfs[year] = split_content(frame)

# Tokenization

In [13]:
# Smoke-test load_text() on a single statement (row label 100 of the 2024 frame).
text = dfs[2024].loc[100, 'CONTENT']

# load_text() returns, in this order: the segmented sentence, the tokens
# annotated with POS tags, the named entities, and the POS-filtered tokens.
sentence_, pos_, ner_, token_ = load_text(text, test=1)

# (heading, value) pairs, printed one after another between two '=====' rulers.
report = [
    ("原文：", text),
    ("斷詞後：", sentence_),
    ("斷詞後+詞性標注：", pos_),
    ("透過詞性篩選的斷詞: ", token_),
    ("命名實體辨識: ", ner_),
]

print()
print('=====')
for heading, payload in report:
    print(heading)
    print(payload)
print('=====')


=====
原文：
２.加強實現中山區幸福工程建造計劃，定期舉辦１８場講座，讓每個人身懷絕技，擁有１８般武藝。
斷詞後：
加強 實現 中山區 幸福 工程 建造 計劃 定期 舉辦 １８ 講座 絕技 擁有 １８般武藝
斷詞後+詞性標注：
['加強(VC)', '實現(VC)', '中山區(Nc)', '幸福(VH)', '工程(Na)', '建造(Nv)', '計劃(Na)', '定期(D)', '舉辦(VC)', '講座(Na)', '絕技(Na)', '擁有(VJ)', '１８般武藝(Na)']
透過詞性篩選的斷詞: 
加強 實現 中山區 幸福 工程 建造 計劃 定期 舉辦 講座 絕技 擁有 １８般武藝
命名實體辨識: 
[{(0, 1, 'CARDINAL', '２'), (22, 24, 'CARDINAL', '１８'), (6, 9, 'LOC', '中山區')}]
=====


## Iterate Each Year Data

In [14]:
# Tokenize every year's frame and store the result back under the same key.
# The key list is snapshotted first; only values are replaced inside the loop.
# The recorded output shows each year taking several minutes.
for year in list(dfs):
    dfs[year] = tokenization(year, dfs[year])

Tokenizing 1992 election statements:   0%|          | 0/4070 [00:00<?, ?it/s]

1992年選舉公報的斷詞運算時間為: 8.29 分


Tokenizing 1995 election statements:   0%|          | 0/3633 [00:00<?, ?it/s]

1995年選舉公報的斷詞運算時間為: 7.49 分


Tokenizing 1998 election statements:   0%|          | 0/4357 [00:00<?, ?it/s]

1998年選舉公報的斷詞運算時間為: 9.97 分


Tokenizing 2001 election statements:   0%|          | 0/4593 [00:00<?, ?it/s]

2001年選舉公報的斷詞運算時間為: 9.92 分


Tokenizing 2004 election statements:   0%|          | 0/3926 [00:00<?, ?it/s]

2004年選舉公報的斷詞運算時間為: 8.16 分


Tokenizing 2008 election statements:   0%|          | 0/3724 [00:00<?, ?it/s]

2008年選舉公報的斷詞運算時間為: 7.55 分


Tokenizing 2012 election statements:   0%|          | 0/2877 [00:00<?, ?it/s]

2012年選舉公報的斷詞運算時間為: 7.45 分


Tokenizing 2016 election statements:   0%|          | 0/3988 [00:00<?, ?it/s]

2016年選舉公報的斷詞運算時間為: 9.72 分


Tokenizing 2020 election statements:   0%|          | 0/5531 [00:00<?, ?it/s]

2020年選舉公報的斷詞運算時間為: 11.04 分


Tokenizing 2024 election statements:   0%|          | 0/3876 [00:00<?, ?it/s]

2024年選舉公報的斷詞運算時間為: 9.02 分


## Post Processing

In [3]:
# Rebuild `dfs` from the per-year CSV files under ./Data/Dataset/, discarding
# whatever `dfs` held before.
years = [1992, 1995, 1998, 2001, 2004, 2008, 2012, 2016, 2020, 2024]
dfs = {
    year: pd.read_csv(f"./Data/Dataset/{year}.csv")
    for year in years
}

In [4]:
# Run postprocess_dataframe() on each yearly frame, keeping the result under
# the same year key.
for year in tuple(dfs.keys()):
    processed = postprocess_dataframe(dfs[year])
    dfs[year] = processed

In [5]:
# Total number of rows across all post-processed yearly frames.
# The counter is named `total_rows` instead of `sum`: assigning to `sum`
# shadows the built-in sum() for the rest of the kernel session, so any later
# call such as sum(...) would fail with "'int' object is not callable".
total_rows = sum(len(frame) for frame in dfs.values())

print(total_rows)

40557


In [7]:
# Write each post-processed frame back to ./Data/Dataset/<year>.csv.
# 'utf-8-sig' writes a UTF-8 byte-order mark at the start of each file.
# NOTE(review): these are the same files the post-processing section reads,
# so re-running that section applies postprocess_dataframe() to data that was
# already post-processed. Confirm that is intended.
for year, frame in dfs.items():
    frame.to_csv(f'./Data/Dataset/{year}.csv', index=False, encoding='utf-8-sig')

In [8]:
# Stack all yearly frames, in the dict's insertion order, into one frame with
# a fresh 0..n-1 index, then save it as a single CSV.
yearly_frames = list(dfs.values())
combined_df = pd.concat(yearly_frames, ignore_index=True)
combined_df.to_csv(
    './Data/Manifesto_Dataset.csv',
    index=False,
    encoding='utf-8-sig',
)