# Lour's Pork Barrel Classifier (羅老師肉桶文本分類器)🐖
## Convolutional Neural Networks for Pork Barrel Project Classification 

-------------------------

### Stage 1: Libaries & Dependencies

In [1]:
# built-in library
import math
import re
import collections
import zipfile
import random
from itertools import chain

# ML & Deep Learning/ NLP toolkit
import pandas as pd
import numpy as np
import jieba
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard

#from keras.layers import Flatten

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

-------------------------

### Stage 2: Data Preprocessing (Training Data: Introduction of Bills and Legislation from 6th Session to 7th Session, 2004-2012)

#### (1) Read file 

In [2]:
# read file
df = pd.read_csv('data/Pork Bill - 2021-05-20.csv',encoding='utf-8')

# combine abstract of bill and title 
df['text'] =  df['Title'] + df['Content'].fillna(df['Title'])

# drop conten without having any characters
# view na's row: df[df['text'].isnull()==True]
data = df[['text', 'pork_bill']].dropna(subset=['text'])

In [3]:
print(" Pork Legislation", data['pork_bill'].value_counts()[1],'\n', 
      "None-Pork Legislation", data['pork_bill'].value_counts()[0])

 Pork Legislation 2510 
 None-Pork Legislation 4733


#### (2) Balancing the Training Samples

random.seed(24)
data = data[data['pork_bill']==1].append(data[data['pork_bill']==0].reset_index(drop=True).loc[[random.randint(0, 2500) for i in range(2500)]]).reset_index(drop=True)

print(" Pork Legislation", data['pork_bill'].value_counts()[1],'\n', 
      "None-Pork Legislation", data['pork_bill'].value_counts()[0])

#### (2) Tokenization

In [4]:
import collections
import numpy as np
import jieba
from itertools import chain


def jieba_cut(filename):
    """
    cut Chinese and remove stop words
    Reference: https://www.cnblogs.com/Luv-GEM/p/10836454.html
    Stopwords: https://www.kaggle.com/rikdifos/english-and-chinese-stopwords?select=cn_stopwords.txt
    """
    stop_list = [i.strip() for i in open('cn_stopwords.txt','r',encoding='utf-8')]  
    news_cut = []
    news_list = []
    for line in filename:    
        if line:
            news_cut = list(jieba.cut(''.join(line),cut_all=False,HMM=True))  
            news_list.append([word.strip() for word in news_cut if word not in stop_list and len(word.strip())>0]) 
    news_list = list(chain.from_iterable(news_list))  
    return news_list

def clearPucts(context):
    """
    remove punctuation
    ref: https://chenyuzuoo.github.io/posts/28001/
    """
    context = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", "", context)
    context = re.sub("[【】╮╯▽╰╭★→「」]+","", context)
    context = re.sub("！，❤。～《》：（）【】「」？”“；：、","",context)
    context = re.sub("\s","",context)
    return context

def seg_char(sent):
    """
    cut Chinese and remove stop words
    ref: https://blog.csdn.net/renyuanfang/article/details/86487367
    """
    # split
    pattern_char_1 = re.compile(r'([\W])')
    parts = pattern_char_1.split(sent)
    parts = [p for p in parts if len(p.strip())>0]
    # cut sentence
    pattern = re.compile(r'([\u4e00-\u9fa5])')
    chars = pattern.split(sent)
    chars = [w for w in chars if len(w.strip())>0]
    chars = ' '.join(chars)
    return chars

In [5]:
data_clean = [seg_char(text) for text in [clearPucts(text) for text in data.text]]

tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**18)

data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

#### (3) Padding

In [6]:
MAX_LEN = max([len(sentence) for sentence in data_clean])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)


#### (4) Spliting Training/ Testing Set

In [7]:
data_labels = data.pork_bill.values

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    data_inputs, data_labels, test_size=0.33, random_state=42)

print("Shape of X Train:", train_inputs.shape, '\n'
      "Shape of X Test :", test_inputs.shape,'\n'
      "Shape of Y Trian:", train_labels.shape , '\n'
      "Shape of Y Test :", test_labels.shape )

Shape of X Train: (4852, 785) 
Shape of X Test : (2391, 785) 
Shape of Y Trian: (4852,) 
Shape of Y Test : (2391,)


In [9]:
unique_train_labels, counts_train_labels = np.unique(train_labels, return_counts=True)

print(' Num of Train Set:' , train_inputs.shape[0], '\n'
      ,'Not Pork vs Pork:', dict(zip(unique_train_labels, counts_train_labels)))

 Num of Train Set: 4852 
 Not Pork vs Pork: {0: 3167, 1: 1685}


In [10]:
unique_test_labels, counts_test_labels = np.unique(test_labels, return_counts=True)

print(' Num of Test Set:' , test_inputs.shape[0], '\n'
      ,'Not Pork vs Pork:', dict(zip(unique_test_labels, counts_test_labels)))

 Num of Test Set: 2391 
 Not Pork vs Pork: {0: 1566, 1: 825}


-------------------------


### Stage 3: Model and Building

#### (1) Using the Subclassing API to Build Dynamic Model

In [11]:
class PorkCNN(tf.keras.Model):
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=100,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(PorkCNN, self).__init__(name=name)
        self.embedding=layers.Embedding(vocab_size, emb_dim, input_length=30, input_shape=(BATCH_SIZE, 30))
        self.bigram=layers.Conv1D(filters=nb_filters, kernel_size=2, strides = 2, padding="valid", activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,kernel_size=3, strides = 2, padding="valid",activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,kernel_size=4, strides = 1, padding="valid",activation="relu")
        self.fivegram = layers.Conv1D(filters=nb_filters,kernel_size=5,strides = 1, padding="valid",activation="relu")
        self.sixgram = layers.Conv1D(filters=nb_filters,kernel_size=6,strides = 3, padding="valid",activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.last_dense = layers.Dense(units=1, activation="sigmoid") 
        #self.flaten =  layers.Flatten()
        #self.batch_nor = layers.BatchNormalization()
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        x_4 = self.fourgram(x)
        x_4 = self.pool(x_4) 
        x_5 = self.fivegram(x)
        x_5 = self.pool(x_5)
        x_6 = self.sixgram(x)
        x_6 = self.pool(x_6)          
        merged = tf.concat([x_1, x_2, x_3, x_4, x_5, x_6], axis=-1) 
        #merged = self.batch_nor(merged)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        #merged = self.flaten(merged)
        output = self.last_dense(merged)        
        return output

In [12]:
VOCAB_SIZE = tokenizer.vocab_size  # 5000 tokenizer.vocab_size
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2 #len(set(train_labels))
DROPOUT_RATE = 0.4
BATCH_SIZE = 230
NB_EPOCHS = 80

Dcnn = PorkCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

#### (2) Compile and Summary of the Model

In [13]:
Dcnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
Dcnn.build(input_shape = (BATCH_SIZE, 30)) # (BATCH_SIZE, input_length=30)
Dcnn.summary()

Model: "dcnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  586600    
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  80100     
_________________________________________________________________
conv1d_3 (Conv1D)            multiple                  100100    
_________________________________________________________________
conv1d_4 (Conv1D)            multiple                  120100    
_________________________________________________________________
global_max_pooling1d (Global multiple                  0      

#### (3) Check Point Path

In [14]:
checkpoint_path = "checkpoint_recode/ckpt_manager"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=10)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Checkpoint Located!!")

-------------------------

### Stage 4: Train the Model

In [15]:
early_stop = EarlyStopping(monitor='val_loss',patience=5)

In [16]:
Dcnn.fit(train_inputs, train_labels,
         validation_data=(test_inputs, test_labels),
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS,
         callbacks=[early_stop])
ckpt_manager.save()

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80


'checkpoint_recode/ckpt_manager/ckpt-1'

-------------------------

### Stage 5: Evaluation


In [31]:
losses = pd.DataFrame(Dcnn.history.history)
losses.head()

In [34]:
losses[['accuracy','val_accuracy']].plot()
losses[['loss','val_loss']].plot()

KeyError: "None of [Index(['accuracy', 'val_accuracy'], dtype='object')] are in the [columns]"

In [33]:
losses.plot(figsize=(8,5))
plt.grid(True)
plt.gca().set_ylim(0,1)
plt.xlabel("Learning Curves: the mean training loss and accuracy measured over each epochs")
#plt.savefig('images/learning_curves.png')
plt.show()


TypeError: no numeric data to plot

In [20]:
from sklearn.metrics import classification_report,confusion_matrix

In [21]:
evaluation_model = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(evaluation_model)

[0.1876908540725708, 0.9385194182395935]


In [22]:
# pd.DataFrame(confusion_matrix(test_labels,predictions))
predictions = Dcnn.predict(test_inputs)
predictions = np.where(predictions >0.8 , 1, 0)
print(classification_report(test_labels, predictions))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96      1566
           1       0.94      0.89      0.91       825

    accuracy                           0.94      2391
   macro avg       0.94      0.93      0.94      2391
weighted avg       0.94      0.94      0.94      2391



In [23]:
t = pd.DataFrame(confusion_matrix(test_labels,predictions), 
             columns=['Predictions: Not Pork(0)','Predictions:Pork(1)'])
t.index = ['Acutal: Not Pork(0)', 'Acutal: Pork (1)']
t

Unnamed: 0,Predictions: Not Pork(0),Predictions:Pork(1)
Acutal: Not Pork(0),1517,49
Acutal: Pork (1),89,736


-------------------------

### Stage 6: Try the Model with New Data and Export an End-to-end Model


#### (1) Test New Dataset from 2000 samples from Legislative Questions from 6th Session 

In [24]:
def as_num(x):
    """
    keep 10 decimals
    """
    y = '{:.15f}'.format(x) 
    return y

In [25]:
import pandas as pd
import random

# test the trined model using new text from 200 samples from Leislative Questions
LQ6 = pd.read_csv('data/LQ_6th.csv')

# drop '0': legislor  never have proposed any legisaltive questions. 
LQ6 = LQ6[LQ6['title'].isin(['0'])== False]

# select text with number of charecters within EMB_DIM; len(sub_set) = 11162
# sample_df = LQ6.loc[[len(char_num) for char_num in LQ6.title if len(char_num) < MAX_LEN], ['legislator','title', 'topic', 'category', 'keywords', 'ques_type']].reset_index(drop=True)
random.seed(42)
sample_df = LQ6.iloc[random.sample(range(len(LQ6)), 2000)][['legislator', 'title', 'topic', 'category', 'keywords', 'ques_type']]
sub_set = [seg_char(text) for text in [clearPucts(text) for text in sample_df.title]]

##### Top 10 of 2000 Samples

In [26]:
pd.DataFrame(
    {'legislator': sample_df.legislator,
     'Pork Value(Constituency Interest)':[as_num(Dcnn(np.array([tokenizer.encode(line)]), training=True).numpy()[0][0]) for line in sub_set],
     'Legislative Questions ': sample_df.title,
     'Topic': sample_df.topic,
     'Key Word':sample_df.keywords}).sort_values(by=['Pork Value(Constituency Interest)'], ascending = False).head(15)

Unnamed: 0,legislator,Pork Value(Constituency Interest),Legislative Questions,Topic,Key Word
3999,林正峰,0.997782468795776,針對政府準備修法推動「二代健保」，健保保費採取「年度所得總額」為計算基礎，而非採用扣除免稅額...,國民 ; 健康保險 ; 保險費,二代健保 ; 年度所得總額
1641,彭添富,0.99675726890564,針對「豪雨成災，農作物損失補償」問題，特向行政院提出質詢。,農業補助,豪雨 ; 農作物
1761,彭添富,0.992317736148834,針對「辦理九十四年原住民中低收入戶家庭租屋補助計畫」專案補助計畫，特向行政院提出質詢。,原住民族生活,原住民中低收入戶 ; 租屋補助
4435,林重謨,0.99196469783783,針對彰化縣教育經費不足，急需中央補助，特向行政院提出緊急質詢。,地方財政 ; 教育經費,彰化縣 ; 教育經費
10264,陳根德,0.991881608963013,鑒於桃園縣民眾陳情抗議中正路延伸工程拆遷補償不公，並聚集在中正路尾工地現場，希望盡速發放地上...,道路工程 ; 政府補助,工程 ; 補償
3304,林建榮,0.986285328865051,為宜蘭縣冬山鄉南富段土地因鄰近冬山河，遭100米風景線逕為分割，致原符合農發條例規定可興建農...,農地 ; 農村住宅 ; 政府補助,冬山鄉 ; 農舍 ; 徵收補償
619,吳成典,0.986227869987488,鑑於居家安全及消防常識已逐漸受到民眾的重視，請中央寬列消防機關人事費用，補助金、馬兩縣規劃合...,住宅安全 ; 消防設施,居家安全 ; 消防常識
4553,林鴻池,0.985814332962036,針對行政院宣布補助弱勢家庭子女就讀私校高中職之學費，卻將資格限制在「非自願就讀者」，有畫餅充...,低收入戶 ; 教育補助,弱勢家庭 ; 私立高中職
9892,陳啟昱,0.980187714099884,鑑於現行《所得稅法》第十七條規定特別扣除額教育支出部分，僅以納稅義務人之子女就讀大專院校為限...,所得稅 ; 教育費用 ; 扣除額,所得稅法 ; 特別扣除額 ; 教育支出
2275,曾華德,0.969020545482635,為政府應特別重視山地離島地區居民健康問題，讓原住民之醫療照護在政府妥善照顧下獲得改善，特向行...,醫療政策 ; 離島政務,離島地區 ; 健康 ; 醫療照護


##### Last 10 Rows of 2000 Samples

In [27]:
pd.DataFrame(
    {'legislator': sample_df.legislator,
     'Pork Value(Constituency Interest)':[as_num(Dcnn(np.array([tokenizer.encode(line)]), training=True).numpy()[0][0]) for line in sub_set],
     'Legislative Questions ': sample_df.title,
     'Topic': sample_df.topic,
     'Key Word':sample_df.keywords}).sort_values(by=['Pork Value(Constituency Interest)'], ascending = False).tail(15)

Unnamed: 0,legislator,Pork Value(Constituency Interest),Legislative Questions,Topic,Key Word
5952,王塗發,6.480477623e-06,針對行政院金融監督管理委員會（金管會）宣稱，台灣上市上櫃公司投資中國累計匯回資金比例達7.9...,大陸政策 ; 對外投資,投資中國 ; 台商資金匯回
6636,王幸男,5.92473225e-06,針對「莊嚴肅穆」的行憲紀念日，在威權時代「今天應懸掛國旗」，不過，在愈來愈多人的真實生活中，...,紀念活動 ; 憲政運作,聖誕節 ; 行憲紀念日 ; 制憲權
10120,陳朝龍,5.769047675e-06,針對英國政府宣稱台灣出口至該國禽鳥，檢驗出感染禽流感H5N1病毒死亡。由於我國迄今並未發現有...,傳染病防治 ; 走私,英國政府 ; 台灣禽鳥 ; 禽流感 ; 相思鳥 ; 走私
862,吳育昇,5.473021247e-06,為接獲入出境管理局員工陳情，質疑入出境管理局移撥移民署之人事分發不公，遂公開提出質疑，入出境...,移民 ; 政府組織 ; 人事行政,移民署 ; 人事分發
6273,王幸男,4.969437214e-06,針對目前斃死豬陰影持續籠罩全台：屏東再度查獲私宰販賣斃死豬的常業慣犯，十一名嫌犯落網，但卻不...,肉品衛生 ; 食品管理,斃死豬
9039,邱毅,3.410448016e-06,針對新聞局認定TVBS應為綜合台非新聞台乙案，準備將TVBS轉頻一事，日前行政院新聞局認定T...,新聞自由 ; 電視臺,TVBS ; 股權結構 ; 外資 ; 國家通訊傳播委員會組織法 ; NCC ; 報導自由
573,吳志揚,3.13701662e-06,"針對內政部規劃於今年七月換發國民身分證的同時，「依法」要求全民按捺指紋並錄存, 此舉引發了行...",國民身分證 ; 指紋,國民身分證 ; 指紋
1504,張顯耀,2.912579703e-06,針對整合警力資源、擴大警政服務範疇，建議行政院將現行警政署、海岸巡防署、消防署及入出境管理局...,政府組織,警力資源 ; 警政服務 ; 警政總署
6347,王幸男,2.851536692e-06,針對本院交通委員會在台中市長夫人邵曉鈴女士車禍事件之後，以超高的效率通過了「道路交通管理處罰...,交通規則,道路交通管理處罰條例 ; 安全帶
6532,王幸男,2.288767064e-06,針對陳總統日昨重申，有信心一定能夠在二○○八年卸任總統之前，為台灣催生一部合身、合時、合用的...,憲法修正,新憲法 ; 修改憲法


#### (2) Export an End-to-end Model


In [28]:
from tensorflow.keras.models import load_model

In [29]:
# Get model (Sequential, Functional Model, or Model subclass)
Dcnn.save('lour_pork_model')

INFO:tensorflow:Assets written to: lour_pork_model/assets


INFO:tensorflow:Assets written to: lour_pork_model/assets


In [30]:
from tensorflow import keras
model = keras.models.load_model('lour_pork_model')

import pandas as pd
import random

# test the trined model using new text from 200 samples from Leislative Questions
LQ5 = pd.read_csv('LQ_5th.csv')

# drop legislor whose speech token less than 15
LQ5 = LQ5[[len(x)>15 for x in LQ5.title]]

# split the sentence with space
full_set = [seg_char(text) for text in [clearPucts(text) for text in LQ5.title]]

def recode_as_pork(pork):
    if pork>0.75:
        return 1
    else:
        return 0

# fit the model    
LQ5['pork_value'] = pd.to_numeric([as_num(Dcnn(np.array([tokenizer.encode(line)]), training=True).numpy()[0][0]) for line in full_set])

# classify pork_value>0.75 into pork 
LQ5['pork_dummy'] = [recode_as_pork(x) for x in LQ5['pork_value']]

# save
LQ5.to_csv('LQ5_pork.csv', index=False)