#### source: https://github.com/google-research/bert/blob/master/README.md

### BERT fine-tune 實作分成三部分
### 1. 準備資料
### 2. 修改run_classifier.py 中的processor
### 3. 執行run_classifier.py 與設定參數

# ---------------------------------------------------

#### 1. 準備資料

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle
import uuid

In [2]:
df = pd.read_csv('./data/19999_question_category_a_v10.csv')

In [3]:
# df=df[((df.category_a_target != 7) & (df.category_a_target != 1) 
#        & (df.category_a_target != 2) & (df.category_a_target != 3))]

#df=df[((df.category_a_target != 7))]

In [4]:
df.head(2)

Unnamed: 0,description,category_a,category_b,category_c,category_d,category,description_clean,category_target
0,"無法會員登入,顯示訊息:無效的帳號或使用者不存在。帳號:96122401",CAMP,CAMP,CAMP,CAMP,資訊系統,無法會員登入 顯示訊息 無效的帳號或不存在 帳號,6
1,因今天有事請假，要補登先前的加班，但開啟camp後，出勤表單顯示的內容看起來像測試的，沒有最...,CAMP,CAMP,CAMP,CAMP,資訊系統,因有事請假 要補登先前的加班 但開啟camp後 出勤表單顯示的內容像測試的 沒有的時間也沒辦...,6


In [5]:
# t_df=df.loc[df['description']=='無法登入雲端']
# t_df

# t_df=df.loc[df['description']=='網管 Outlook一直Lag,無法作業。']
# t_df

In [6]:
df.shape

(10163, 8)

In [7]:
df.groupby(['category','category_target']).size()

category    category_target
Outlook與郵件  0                  1778
SAP         1                  1445
VPN連線       2                   311
其他          3                  2621
商務用Skype    4                   362
網路          5                  2008
資訊系統        6                  1380
電腦防毒        7                   258
dtype: int64

In [8]:
#drop category
# df=df.loc[(df['category_target'] != 2) & (df['category_target'] != 4) & (df['category_target'] != 7)]

In [9]:
# Keep only the raw text, the category name and the numeric category id;
# the remaining category levels and the pre-cleaned text are discarded.
df = df.drop(
    columns=[
        'category_a',
        'category_b',
        'category_c',
        'category_d',
        'description_clean',
    ]
)
df.head(5)

Unnamed: 0,description,category,category_target
0,"無法會員登入,顯示訊息:無效的帳號或使用者不存在。帳號:96122401",資訊系統,6
1,因今天有事請假，要補登先前的加班，但開啟camp後，出勤表單顯示的內容看起來像測試的，沒有最...,資訊系統,6
2,"手機簽核BPM有問題,都顯示亂碼 #18227",資訊系統,6
3,無法在手機上面使用CAMP進行表單簽核動作。分機：17132,資訊系統,6
4,"出差同仁無法登入camp, 委請同事詢問",資訊系統,6


In [10]:
df.columns = ['description','category_a','category_a_target']

In [11]:
df.head(5)

Unnamed: 0,description,category_a,category_a_target
0,"無法會員登入,顯示訊息:無效的帳號或使用者不存在。帳號:96122401",資訊系統,6
1,因今天有事請假，要補登先前的加班，但開啟camp後，出勤表單顯示的內容看起來像測試的，沒有最...,資訊系統,6
2,"手機簽核BPM有問題,都顯示亂碼 #18227",資訊系統,6
3,無法在手機上面使用CAMP進行表單簽核動作。分機：17132,資訊系統,6
4,"出差同仁無法登入camp, 委請同事詢問",資訊系統,6


In [12]:
sorted(df.category_a_target.unique())

[0, 1, 2, 3, 4, 5, 6, 7]

In [13]:
pd.unique(df[['category_a', 'category_a_target']].values.ravel('K'))

array(['資訊系統', 'SAP', '電腦防毒', '其他', '網路', 'Outlook與郵件', 'VPN連線',
       '商務用Skype', 6, 1, 7, 3, 5, 0, 2, 4], dtype=object)

In [14]:
pd.concat([df['category_a'], df['category_a_target']]).unique()

array(['資訊系統', 'SAP', '電腦防毒', '其他', '網路', 'Outlook與郵件', 'VPN連線',
       '商務用Skype', 6, 1, 7, 3, 5, 0, 2, 4], dtype=object)

In [15]:
# Lookup table of the distinct (category name, numeric id) pairs in the data.
category_df = df.loc[:, ['category_a', 'category_a_target']].drop_duplicates()
category_df

Unnamed: 0,category_a,category_a_target
0,資訊系統,6
14,SAP,1
96,電腦防毒,7
97,其他,3
539,網路,5
598,Outlook與郵件,0
629,VPN連線,2
1642,商務用Skype,4


In [16]:
df.shape

(10163, 3)

In [17]:
df.groupby(['category_a','category_a_target']).size()

category_a  category_a_target
Outlook與郵件  0                    1778
SAP         1                    1445
VPN連線       2                     311
其他          3                    2621
商務用Skype    4                     362
網路          5                    2008
資訊系統        6                    1380
電腦防毒        7                     258
dtype: int64

### 現在我們將資料轉換成 BERT 需要的資料形式。根據官方fine tune教學，需要給的資料有四個column，分別是:
### (1) guid: Unique id for the example.

### (2) text_a: string. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified.
### (3) text_b(Optional): string. The untokenized text of the second sequence. Only must be specified for sequence pair tasks.
### (4) label(Optional): string. The label of the example. This should be specified for train and dev examples, but not for test examples.

In [18]:
# Build the records handed to BERT as (guid, text_a, text_b, label) tuples.
# The two columns are extracted once; calling df.iloc[i, :] for every field of
# every row constructs a new Series each time and is needlessly slow.
descriptions = df['description'].tolist()
labels = df['category_a_target'].tolist()  # tolist() yields plain Python ints

# guid is the 0-based row position; text_b is None because each example
# consists of a single text.
data_for_bert = [
    (guid, text, None, label)
    for guid, (text, label) in enumerate(zip(descriptions, labels))
]

# (guid, label) pairs in the same order as data_for_bert; the labels are used
# later to stratify the train/dev split.
strat_list = [(guid, label) for guid, label in enumerate(labels)]

In [19]:
len(data_for_bert)

10163

### 產生test.csv,下列兩項擇一

In [20]:
# #generate test data from 19999_test_question_v1.csv
# test_df = pd.read_csv('./data/19999_test_question_v1.csv')
# test_df=test_df.drop(['category_a','category_a_b','description_clean'], axis=1)
# test_df
# test_df.columns = ['description','label']
# test_df.to_csv('tmp/call_center_input/test.csv', index=False, header=True)

In [21]:
# Hold out a random 1% of the records as the test set and write it to test.csv.
import os

test_df = pd.DataFrame(
    data_for_bert,
    columns=['id', 'description', 'text_b', 'label'],
)
# frac is the fraction of rows to return. No random_state is passed, so a
# different subset is drawn every time this cell runs.
test_df = test_df.sample(frac=0.01)

# Only the text and its label are written out; 'id' stays in test_df so the
# sampled records can be identified afterwards.
test_df_2 = test_df.drop(['id', 'text_b'], axis=1)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('tmp/call_center_input', exist_ok=True)
test_df_2.to_csv('tmp/call_center_input/test.csv', index=False, header=True)

### 產生test.tsv

In [22]:
# Re-read test.csv and write the same table tab-separated as test.tsv.
import pandas as pd

df_temp = pd.read_csv('tmp/call_center_input/test.csv')
df_temp.to_csv(
    'tmp/call_center_input/test.tsv',
    sep='\t',
    header=True,
    index=False,
)

In [23]:
#test_df.head(5)
len(test_df)

102

In [24]:
len(data_for_bert)

10163

In [25]:
# Remove the held-out test records from the training pool.
# Match on the guid (first tuple element) only: testing `s not in k` against
# the whole (guid, text_a, text_b, label) tuple also compares s with the label,
# so a sampled id in the label range would drop every record of that class.
# A set makes each membership test O(1) instead of rebuilding the list per id.
held_out_ids = set(test_df.id.tolist())
data_for_bert = [k for k in data_for_bert if k[0] not in held_out_ids]

In [26]:
len(data_for_bert)

10061

In [27]:
# Remove the held-out test records from the (guid, label) list as well.
# Match on the guid (first tuple element) only: testing `s not in k` against
# the whole pair also compares s with the label, so a sampled id in the label
# range would drop every entry of that class.
held_out_ids = set(test_df.id.tolist())
strat_list = [k for k in strat_list if k[0] not in held_out_ids]

In [28]:
len(strat_list)

10061

In [29]:
# Reduce each (guid, label) pair to its bare label.
strat_list_temp = [pair[1] for pair in strat_list]
strat_list = strat_list_temp

### 產生pickle_file

In [30]:
# 80/20 train/dev split, stratified on the labels held in strat_list.
train, dev = train_test_split(
    data_for_bert,
    stratify=strat_list,
    test_size=0.2,
)

In [31]:
print('length for fine-tune training data: ',len(train))
print('length for fine-tune development data: ',len(dev))
print('first instance:', data_for_bert[0])

length for fine-tune training data:  8048
length for fine-tune development data:  2013
first instance: (0, '無法會員登入,顯示訊息:無效的帳號或使用者不存在。帳號:96122401', None, 6)


In [32]:
# Serialize the train and dev splits as pickle files in the working directory.
for split_path, split_rows in (('bert_train.p', train), ('bert_dev.p', dev)):
    with open(split_path, 'wb') as pickle_file:
        pickle.dump(split_rows, pickle_file)


#### 2. 於影片中說明如何修改 run_classifier.py 中的 processor

#### 3. 執行run_classifier.py 並設定參數

In [33]:
# import os
# os.environ["TF_CPP_MIN_LOG_LEVEL"]="3" 
#'1' # 這是默認的顯示等級，顯示所有信息
#'2' # 只顯示 warning 和 Error
#'3' # 只顯示 Error

In [34]:
!python run_classifier.py \
  --task_name=call_center \
  --do_train=true \
  --do_eval=true \
  --data_dir=. \
  --vocab_file=chinese_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=chinese_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint= chinese_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=128 \
  --output_dir=tmp/call_center_output/ \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \

  from ._conv import register_converters as _register_converters
INFO:tensorflow:Using config: {'_model_dir': 'tmp/call_center_output/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3b07a6e978>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}
INFO:tensorflow:_TPUContext: eval_on_tpu True
INFO:tensorflow:Writing example 0 of 8048
IN

### predict test.tsv

In [35]:
# Run run_classifier.py in prediction mode on the data in tmp/call_center_input,
# using tmp/call_center_output/ as the output directory.
# NOTE(review): --init_checkpoint points at the pretrained base checkpoint, not
# a fine-tuned one; presumably the fine-tuned weights are picked up from
# --output_dir -- confirm.
!python run_classifier.py \
  --task_name=call_center \
  --do_predict=true \
  --data_dir=tmp/call_center_input \
  --vocab_file=chinese_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=chinese_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=chinese_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=128 \
  --output_dir=tmp/call_center_output/

  from ._conv import register_converters as _register_converters
INFO:tensorflow:Using config: {'_model_dir': 'tmp/call_center_output/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9bb23c8518>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}
INFO:tensorflow:_TPUContext: eval_on_tpu True
INFO:tensorflow:['description', 'label']
INF

### 預測準確率

In [36]:
#test_df
#test_df2 = pd.read_csv('tmp/call_center_input/test.csv')
test_label = test_df['label'].tolist()
len(test_label)

102

In [37]:
test_predict_df = pd.read_csv('tmp/call_center_output/test_results.tsv',header=None,sep = '\t')
#test_predict_df

In [38]:
# a=np.argmax(test_predict_df.as_matrix(), axis=-1)  
# a

In [39]:
# #[5, 0, 4, 6]
# for n, i in enumerate(a):
#     if i == 0:
#         a[n] = 5
#     if i == 1:
#         a[n] = 0
#     if i == 2:
#         a[n] = 4  
#     if i == 3:
#         a[n] = 6
# a

In [40]:
# Predicted class for each test row = index of the column with the largest
# value in test_results.tsv (presumably one score per class -- confirm against
# run_classifier.py).
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; .values
# returns the same ndarray on old and new pandas versions.
test_predict_label = np.argmax(test_predict_df.values, axis=-1)

# 1 where the prediction equals the true label, 0 otherwise.
result = [1 if x == y else 0 for x, y in zip(test_label, test_predict_label)]

# Accuracy: fraction of test rows predicted correctly.
sum(result) / len(result)

  """Entry point for launching an IPython kernel.


0.8235294117647058

In [41]:
# category_df[category_df['category_a_target']==8].category_a.iloc[0]

In [42]:
# Collect the misclassified test examples as
# (position in the test set, predicted category name, true category name, text).

# Build the id -> name mapping once instead of filtering category_df twice per
# example. setdefault keeps the first name seen for an id, matching the
# original `.iloc[0]` lookup. An id absent from category_df raises KeyError.
category_name_by_id = {}
for cat_id, cat_name in zip(category_df['category_a_target'],
                            category_df['category_a']):
    category_name_by_id.setdefault(cat_id, cat_name)

test_descriptions = test_df['description'].tolist()

predict_no = [
    (i, category_name_by_id[predicted], category_name_by_id[actual], text)
    for i, (predicted, actual, text)
    in enumerate(zip(test_predict_label, test_label, test_descriptions))
    if predicted != actual
]
predict_no

[(7,
  'Outlook與郵件',
  '電腦防毒',
  '#15233  懷疑電腦中毒，時常跳出訊息，也有影響到outlook的使用  請幫忙處理一下 感謝'),
 (9, '資訊系統', '電腦防毒', '系統不明執行檔NTRTSCAN.EXE  執行影響CPU效能問題'),
 (12, 'Outlook與郵件', '資訊系統', '收到郵件的簽核通知，點進去之後卻顯示沒權限'),
 (13, '網路', '其他', '換座位，無線網路很慢，想開通有線網路的網點，PM22 C286'),
 (17, '其他', 'SAP', '執行到特定階段卡住~無法關閉'),
 (21, 'Outlook與郵件', '其他', '客人寄來的PDF檔案無法正確開啟'),
 (27, '其他', '網路', '舊電腦最近拿來使用都需要敲入帳號密碼。#15610'),
 (29,
  '網路',
  '其他',
  'User want to install OS via network installation. But he has IP problem'),
 (32, '資訊系統', '其他', '工作需求想申請Email群組 #13619'),
 (41, '資訊系統', '其他', '無法開啟BPM系統網頁'),
 (49, '網路', '其他', '在國外需要連intra system，請問操作文件放的網址  ext:13022'),
 (58,
  '資訊系統',
  '商務用Skype',
  '請協助查看有2位同仁的Lync無法登入的問題。人員資料如下：  工號：10403104 / 10404041   姓名：鍾意宸 / 魏秀珊'),
 (60,
  '資訊系統',
  '電腦防毒',
  'QCN Report    We are unable to make VOIP calls from our end. This affects calls to all - QCI, QCB, QCG   We get busy signals and disconnects.   This happens since yesterday.'),
 (77,
  '資訊系統',
  '其他',
  '你好 我在使用 ,員工服務專區 的會議室管理系統, 在預約會

In [43]:
#t_df=df.loc[df['description']=='人在QCA出差 密碼過期無法收發Mail']

# t_df=df.loc[df['description']=='ProE 在使用  很容易被踢出  不確定是否是License 問題  或多人在使用']
# t_df

In [44]:
print(len(test_predict_label))
print(len(test_label))

102
102


In [45]:
# print(test_predict_label)
# print(test_label)