In [1]:
import os
import re
import warnings

import numpy as np
import pandas as pd
from ckiptagger import WS, POS, NER, construct_dictionary
from zhon.hanzi import punctuation
warnings.filterwarnings('ignore')

In [2]:
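# One-time setup: uncomment the two lines below to download the CKIP tagger model files.
# (presumably they end up in ./data, which is the directory WS("./data") loads from — confirm)
# from ckiptagger import data_utils
# data_utils.download_data_gdown("./")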

In [3]:
# Global keyword dictionary, filled as a side effect of replace_synonyms so that
# its signature stays small. It maps every canonical (longest) keyword to
# `keyword_weight`.
keyword_dict = {}
keyword_weight = 2

# Cache of compiled synonym-group patterns, keyed by the tuple of words in the
# group, so the same group is not recompiled for every paragraph.
_synonym_patterns = {}


def _synonym_pattern(words):
    """Return a compiled regex matching any word of one synonym group, longest first."""
    key = tuple(words)
    pattern = _synonym_patterns.get(key)
    if pattern is None:
        # Longest-first alternation: at any position the longest synonym wins, so
        # text that already contains the canonical word is matched as a whole
        # instead of having an embedded shorter synonym expanded again.
        ordered = sorted(dict.fromkeys(key), key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(word) for word in ordered))
        _synonym_patterns[key] = pattern
    return pattern


def replace_synonyms(para, chem_syn, crop_syn, pest_syn):
    """
    Replace every synonym in `para` by the longest word of its synonym group.

    Input:
        para: one sentence / paragraph (str)
        chem_syn, crop_syn, pest_syn: iterables of synonym groups, each group
            being a list of strings (e.g. a pandas Series of lists)
    Output:
        the paragraph with every synonym replaced by the longest word of its group

    Side effect: the longest word of every non-empty group is stored in the
    global `keyword_dict` with the value `keyword_weight`.

    Each group is applied in a single pass, so replaced text is never scanned
    again by the same group. Groups are still applied one after another, in the
    order chem -> crop -> pest.
    """
    global keyword_dict
    global keyword_weight

    for syn_groups in (chem_syn, crop_syn, pest_syn):
        for group in syn_groups:
            # An empty-string synonym would match between every character.
            words = [word for word in group if word != ""]
            if not words:
                # Nothing to normalise for an empty group.
                continue
            base_word = max(words, key=len)
            # A function replacement keeps backslashes in base_word literal.
            para = _synonym_pattern(words).sub(lambda _match, _base=base_word: _base, para)
            keyword_dict[base_word] = keyword_weight

    return para

def get_text(directory):
    """
    Load the cleaned content of every `.txt` file found in `directory`.

    input : directory holding the text files
    output : list with one cleaned string per `.txt` file, in `os.listdir` order

    Cleaning removes newlines, every character of `zhon.hanzi.punctuation`
    and the DEL control character (0x7f).
    """
    documents = []
    for entry in os.listdir(directory):
        # Only plain text files are part of the corpus
        if not entry.endswith(".txt"):
            continue
        with open(f"{directory}/{entry}", 'r', encoding="utf-8") as handle:
            content = handle.read().replace("\n", "")
        for mark in punctuation:
            content = content.replace(mark, '')
        documents.append(content.replace("\x7f", ""))
    return documents

def unify(text_data, syn_directory):
    """
    Normalise the synonyms of every sentence in `text_data`.

    input: `text_data`, a list of sentences, and `syn_directory`, the directory
           holding the three keyword spreadsheets (chem, crop, pest)
    output: a list with the synonym-replaced sentences, in the same order
    """
    def _load_groups(xlsx_path):
        # The spreadsheets have no header row; every row is one synonym group
        # and blank cells are dropped, giving a Series of lists.
        sheet = pd.read_excel(xlsx_path, header=None)
        return sheet.apply(lambda row: row.dropna().tolist(), axis=1)

    chem_syn = _load_groups(f"{syn_directory}/02chem.list.xlsx")
    crop_syn = _load_groups(f"{syn_directory}/02crop.list.xlsx")
    pest_syn = _load_groups(f"{syn_directory}/02pest.list.xlsx")

    return [replace_synonyms(sentence, chem_syn, crop_syn, pest_syn) for sentence in text_data]

def preprocessing(directory, syn_directory, model_dir="./data"):
    """
    Read all text files in `directory`, replace every keyword listed in
    `syn_directory` with its longest synonym, then tokenize the result.

    input:
        directory: directory of the `.txt` files
        syn_directory: directory of the synonym keyword spreadsheets
        model_dir: directory of the CKIP tagger model files (default "./data")
    output:
        a DataFrame with one row per document and the columns
        `doc_index` (file name up to its first "."), `raw_text` (cleaned text),
        `tokens_rep` (token list of the synonym-replaced text) and
        `tokens_num` (number of tokens)
    """
    global keyword_dict

    # Get text data
    all_data = get_text(directory)
    rep_data = unify(all_data, syn_directory)

    # Tokenize
    ws = WS(model_dir)
    # We use coerce dictionary to force the segmenter to leave our keywords untouched
    word_spilt = ws(rep_data, coerce_dictionary=construct_dictionary(keyword_dict))

    # Document ids.
    # NOTE(review): this assumes os.listdir returns the files in the same order
    # as it did inside get_text; otherwise ids and texts would be misaligned.
    id_name = [name.split(".")[0] for name in os.listdir(directory) if name.endswith(".txt")]

    processed_df = pd.DataFrame({'doc_index': id_name, 'raw_text': all_data, 'tokens_rep': word_spilt})

    # Build the column in one assignment; element-wise chained assignment
    # (df[col][i] = value) is not guaranteed to write into the frame.
    processed_df['tokens_num'] = [len(tokens) for tokens in word_spilt]

    return processed_df

In [4]:
# Example usage: run the whole preprocessing pipeline on the training set
path = "../AI_Cup/dataTrainComplete" # directory containing the training .txt files
syn_path = "../AI_Cup/Keywords" # directory containing the keyword (synonym) .xlsx files
data = preprocessing(path, syn_path)

In [5]:
# Show the preprocessed frame (the rendered output below elides the middle rows)
data

Unnamed: 0,doc_index,raw_text,tokens_rep,tokens_num
0,1,梅雨季來臨文旦黑點病易發生請注意病徵以及早加強防治措施5月已進入梅雨季節近日連續降雨為文旦黑...,"[梅, 雨季, 來臨, 麻豆文旦, 黑點病, 易, 發生, 請, 注意, 病徵, 以, 及早...",150
1,10,天氣多變溫差大近山區及偏施氮肥田區稻熱病發病較為嚴重籲請農友注意防治花蓮區農改場防檢局及田邊...,"[天氣, 多變, 溫差, 大近, 山區, 及, 偏施, 氮肥田區, 稻熱病, 發病, 較為,...",288
2,1000,新聞稿-稻熱病進入好發季節防檢局籲請農友加強防範行政院農業委員會動植物防疫檢疫局以下簡稱防檢...,"[新聞稿, -, 稻熱病, 進入, 好, 發, 季節, 防檢局, 籲請, 農友, 加強, 防...",420
3,1005,稻熱病進入好發季節防檢局籲請農友加強防治農委會防檢局表示自3月起全國各地水稻生長陸續進入分蘗...,"[稻熱病, 進入, 好, 發, 季節, 防檢局, 籲請, 農友, 加強, 防治, 農委會, ...",408
4,1007,乍暖還寒防檢局籲請農友加強防治稻熱病農委會防檢局表示全國各地水稻生長陸續進入分蘗期因逢暖冬且...,"[乍暖還寒, 防檢局, 籲請, 農友, 加強, 防治, 稻熱病, 農委會, 防檢局, 表示,...",379
...,...,...,...,...
555,986,苗栗區農業改良場發佈水稻白葉枯病警報糧食作物病蟲害發生警報中華民國90年9月21日發佈第3號...,"[苗栗區, 農業, 改良場, 發佈, 水稻, 白葉枯病, 警報, 糧食, 作物, 病蟲害, ...",277
556,988,雨後適合稻熱病發生請持續進行監測並指導農民防治依據氣象預報今年自五月中旬起臺灣地區即進入梅雨...,"[雨, 後, 適合, 稻熱病, 發生, 請, 持續, 進行, 監測, 並, 指導, 農民, ...",87
557,992,新入侵果實蠅緊急撲滅模擬演習 新聞稿新入侵植物害蟲緊急撲滅演習產官學總動員嚴防外來疫病蟲害...,"[新, 入侵, 果實蠅, 緊急, 撲滅, 模擬, 演習, 多, 保, 鏈黴素溶液, 可濕性,...",432
558,997,梨木蝨危害請農友注意防範梨木蝨危害請農友注意防範行政院農業委員會動植物防疫檢疫局(以下簡稱防...,"[中國梨, 木蝨, 危害, 請, 農友, 注意, 防範, 中國梨, 木蝨, 危害, 請, 農...",396


In [6]:
def form_training_ref(df, train_directory):
    """
    Add a `reference` column to `df` (training data only; skip for test data).

    input:
        df: DataFrame with a `doc_index` column of document ids (numbers or
            numeric strings)
        train_directory: directory containing `TrainLabel.csv`, which has the
            columns `Test` (document id) and `Reference` (referenced id)
    output:
        the same `df` object, modified in place: `reference` holds, for every
        row, the list of `Reference` values whose `Test` equals the row's
        `doc_index`, in file order (an empty list when there is none)

    Ids are compared numerically. A `doc_index` that is not a number gets an
    empty list.
    """
    train_label = pd.read_csv(f"{train_directory}/TrainLabel.csv")

    # Group the labels once instead of scanning the whole table for every
    # document; groupby keeps the original row order inside each group.
    refs_by_doc = {
        doc_id: refs.tolist()
        for doc_id, refs in train_label.groupby('Test')['Reference']
    }

    # Look the ids up as numbers rather than pasting them into a query string,
    # which breaks on ids such as "007" and on non-numeric ids.
    doc_ids = pd.to_numeric(df['doc_index'], errors='coerce')
    # list(...) gives every row its own list object.
    df['reference'] = doc_ids.map(lambda doc_id: list(refs_by_doc.get(doc_id, ())))
    return df

In [7]:
# Attach the reference labels. form_training_ref adds the column to `data` itself
# and returns that same object, so `train_df` and `data` are one DataFrame.
train_path = "../AI_Cup" # directory containing TrainLabel.csv
train_df = form_training_ref(data, train_path)

In [8]:
# Show the training frame, now including the `reference` column
train_df

Unnamed: 0,doc_index,raw_text,tokens_rep,tokens_num,reference
0,1,梅雨季來臨文旦黑點病易發生請注意病徵以及早加強防治措施5月已進入梅雨季節近日連續降雨為文旦黑...,"[梅, 雨季, 來臨, 麻豆文旦, 黑點病, 易, 發生, 請, 注意, 病徵, 以, 及早...",150,[]
1,10,天氣多變溫差大近山區及偏施氮肥田區稻熱病發病較為嚴重籲請農友注意防治花蓮區農改場防檢局及田邊...,"[天氣, 多變, 溫差, 大近, 山區, 及, 偏施, 氮肥田區, 稻熱病, 發病, 較為,...",288,[]
2,1000,新聞稿-稻熱病進入好發季節防檢局籲請農友加強防範行政院農業委員會動植物防疫檢疫局以下簡稱防檢...,"[新聞稿, -, 稻熱病, 進入, 好, 發, 季節, 防檢局, 籲請, 農友, 加強, 防...",420,"[1005, 1023]"
3,1005,稻熱病進入好發季節防檢局籲請農友加強防治農委會防檢局表示自3月起全國各地水稻生長陸續進入分蘗...,"[稻熱病, 進入, 好, 發, 季節, 防檢局, 籲請, 農友, 加強, 防治, 農委會, ...",408,"[1000, 1023]"
4,1007,乍暖還寒防檢局籲請農友加強防治稻熱病農委會防檢局表示全國各地水稻生長陸續進入分蘗期因逢暖冬且...,"[乍暖還寒, 防檢局, 籲請, 農友, 加強, 防治, 稻熱病, 農委會, 防檢局, 表示,...",379,"[438, 893, 1000, 1005, 1015, 1023]"
...,...,...,...,...,...
555,986,苗栗區農業改良場發佈水稻白葉枯病警報糧食作物病蟲害發生警報中華民國90年9月21日發佈第3號...,"[苗栗區, 農業, 改良場, 發佈, 水稻, 白葉枯病, 警報, 糧食, 作物, 病蟲害, ...",277,"[27, 46, 49, 95, 129, 163, 344, 375, 470, 476,..."
556,988,雨後適合稻熱病發生請持續進行監測並指導農民防治依據氣象預報今年自五月中旬起臺灣地區即進入梅雨...,"[雨, 後, 適合, 稻熱病, 發生, 請, 持續, 進行, 監測, 並, 指導, 農民, ...",87,"[188, 286, 289, 296, 438, 809, 891, 893, 913, ..."
557,992,新入侵果實蠅緊急撲滅模擬演習 新聞稿新入侵植物害蟲緊急撲滅演習產官學總動員嚴防外來疫病蟲害...,"[新, 入侵, 果實蠅, 緊急, 撲滅, 模擬, 演習, 多, 保, 鏈黴素溶液, 可濕性,...",432,[]
558,997,梨木蝨危害請農友注意防範梨木蝨危害請農友注意防範行政院農業委員會動植物防疫檢疫局(以下簡稱防...,"[中國梨, 木蝨, 危害, 請, 農友, 注意, 防範, 中國梨, 木蝨, 危害, 請, 農...",396,[]


In [9]:
# Save the processed training frame to a CSV file, without the row index.
# Note: list-valued columns (tokens_rep, reference) are written as text and
# must be parsed back into lists when the file is reloaded.
train_df.to_csv("./processed_data.csv", index=False)

In [10]:
# Inspect the canonical keywords collected by replace_synonyms (keyword -> weight)
print(keyword_dict)

{'貝芬硫可濕性粉劑': 2, '腈硫醌可濕性粉劑': 2, '鋅錳乃浦可濕性粉劑': 2, '費洛蒙緩釋劑': 2, '蘇力菌水分散性粒劑': 2, '比多農可濕性粉劑': 2, '夏油(乙)': 2, '亞磷酸稀釋液': 2, '葵花油乳化液': 2, '窄域油': 2, '核多角體病毒': 2, '苦楝油': 2, '微生物製劑': 2, '三賽唑可濕性粉劑': 2, '亞賜圃可濕性粉劑': 2, '嘉賜黴素可溼性粉劑': 2, '喜樂克拉乳劑': 2, '護粒松乳劑': 2, '芬殺松乳劑稀釋': 2, '加保利可濕性粉劑可濕性粉劑': 2, '加保扶水懸劑稀釋': 2, '第滅寧水懸劑': 2, '甲基多保淨可濕性粉劑': 2, '嘉賜三賽唑可濕性粉劑': 2, '護粒三賽唑可濕性粉劑': 2, '撲殺培丹粒劑': 2, '撲殺熱藥劑': 2, '保米熱必斯可濕性粉劑': 2, '肉桂油乳化液': 2, '陶斯松乳劑或水基乳劑': 2, '賽洛寧膠囊懸著劑': 2, '苦楝油+皂素': 2, '丙基喜樂松乳劑': 2, '富米熱斯可濕性粉劑': 2, '賜諾特水懸劑': 2, '益達胺水懸劑': 2, '布芬淨可濕性粉劑': 2, '含毒蛋白質酵母錠': 2, '賜諾殺濃餌劑': 2, '加普胺水懸劑': 2, '鏈四環黴素水溶性粉劑': 2, '克枯爛可濕性粉劑': 2, '培丹水溶性粉劑': 2, '芬普尼水懸劑': 2, '撲滅松乳劑': 2, '賽達松乳劑': 2, '布芬第滅寧水懸劑': 2, '腐絕快得寧可濕性粉劑': 2, '待克利水分散性粒劑': 2, '百克敏乳劑': 2, '嘉賜克枯欄可濕性粉劑': 2, '克枯三賽唑可濕性粉劑': 2, '可尼丁水溶性粒劑': 2, '益滅松可濕性粉劑': 2, '布得芬諾可濕性粉劑': 2, '賓克隆可濕性粉劑': 2, '福多寧水懸劑': 2, '菲克利水懸劑': 2, '賽氟滅水懸劑': 2, '依普座水懸劑': 2, '滅蝨丹粒劑': 2, '四克利水基乳': 2, '保米黴素可濕性粉劑': 2, '三氟敏': 2, '免扶克粒劑': 2, '丁基加保扶可濕性粉劑': 2, '賽速安水溶性粒劑': 2, '氟尼胺水分散性粒劑': 2, '達特南水溶性粒劑': 2, '矽護芬水基乳劑': 2, 