In [1]:
import warnings
import pandas as pd
import numpy as np
import os
from zhon.hanzi import punctuation
from ckiptagger import WS, POS, NER
warnings.filterwarnings('ignore')

In [67]:
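# One-time setup (intentionally left commented out): uncomment and run once to
# download the CKIP model files into the current directory -- presumably the
# "./data" folder that WS("./data") loads later; confirm before relying on it.
# from ckiptagger import data_utils
# data_utils.download_data_gdown("./")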

In [2]:
def _replace_with_longest(para, syn_df):
    """
    Rewrite every synonym listed in `syn_df` to the longest synonym of its row.
    Input: one sentence and one keyword dataframe (one row per concept, one
           synonym per column; empty cells hold a non-string placeholder)
    Output: the sentence with each row's synonyms replaced by the longest one
    """
    import re  # function-scope import: `re` is not imported at the top of the file

    for row in syn_df.to_dict('records'):
        # Only non-empty strings are real synonyms; empty cells arrive as the
        # 0 placeholder produced by `fillna(0)` (or as NaN if it was skipped).
        words = [w for w in row.values() if isinstance(w, str) and w]
        if len(words) < 2:
            continue  # nothing to unify for this row

        # Start from `synonym1` and switch only to a strictly longer word, so
        # ties keep the earliest column.
        first = row.get('synonym1')
        base_word = first if isinstance(first, str) and first else words[0]
        for word in words:
            if len(word) > len(base_word):
                base_word = word

        # Single pass with the longest alternatives first. Each stretch of text
        # is matched once, so a short synonym that is a substring of the
        # canonical word (e.g. "AB" inside "XAB") can no longer be re-expanded
        # into "XXAB" the way sequential str.replace calls did.
        ordered = sorted(dict.fromkeys(words), key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(word) for word in ordered))
        # A callable replacement keeps backslashes in `base_word` literal.
        para = pattern.sub(lambda _match, _base=base_word: _base, para)
    return para


def replace_synonyms(para, chem_syn, crop_syn, pest_syn):
    """
    Replace all the synonyms by the longest one.
    Input: one sentence and all three keyword dataframes
    Output: replaced sentence (with the longest synonym of each row)

    The tables are applied in the order chemical, crop, pest; rows are applied
    top to bottom within each table.
    """
    for syn_df in (chem_syn, crop_syn, pest_syn):
        para = _replace_with_longest(para, syn_df)
    return para

def get_text(directory):
    """
    Read every `.txt` file found in `directory`.
    input : directory that holds the text files
    output : list with one cleaned string per file, in `os.listdir` order

    Cleaning removes newlines, every character in `punctuation`, and the
    "\x7f" control character.
    """
    documents = []
    for entry in os.listdir(directory):
        if not entry.endswith(".txt"):
            continue
        with open(directory + "/" + entry, 'r', encoding="utf-8") as handle:
            content = handle.read()
        content = content.replace("\n", "")
        for mark in punctuation:
            content = content.replace(mark, '')
        documents.append(content.replace("\x7f", ""))
    return documents

def unify(text_data, syn_directory):
    """
    Apply `replace_synonyms` to every sentence of `text_data`.
    input: a list of sentences, `text_data`, and the directory holding the
           three keyword workbooks
    output: a list with the replaced sentences, in the same order
    """
    # Empty cells are filled with 0 after loading each workbook.
    chem_syn, crop_syn, pest_syn = [
        pd.read_excel(syn_directory + workbook).fillna(0)
        for workbook in ("/02chem.list.xlsx", "/02crop.list.xlsx", "/02pest.list.xlsx")
    ]
    return [
        replace_synonyms(sentence, chem_syn, crop_syn, pest_syn)
        for sentence in text_data
    ]

def preprocessing(directory, syn_directory, model_dir="./data"):
    """
    Get all text data from `directory`, replace every keyword found in
    `syn_directory` with its longest synonym, then tokenize the result.
    input: directory of text files, directory of synonym keywords, and
           (optional) the directory handed to `WS` (default "./data")
    output: a DataFrame with one row per document and the columns
            `doc_index` (file name up to the first "."), `raw_text`
            (cleaned text before synonym replacement), `tokens_rep`
            (tokens of the synonym-replaced text) and `tokens_num`
            (number of tokens)
    """
    # get text data
    all_data = get_text(directory)
    rep_data = unify(all_data, syn_directory)

    # Tokenize
    ws = WS(model_dir)
    word_split = ws(rep_data)

    # id_name
    # NOTE(review): this assumes os.listdir returns the files in the same order
    # as the listing done inside get_text -- confirm, or have one listing feed both.
    id_name = [
        file_name.split(".")[0]
        for file_name in os.listdir(directory)
        if file_name.endswith(".txt")
    ]

    processed_df = pd.DataFrame({'doc_index': id_name, 'raw_text': all_data, 'tokens_rep': word_split})

    # One column-wise assignment instead of `df[col][i] = ...`: chained
    # assignment writes to a temporary under pandas Copy-on-Write and would
    # leave the column unfilled.
    processed_df['tokens_num'] = processed_df['tokens_rep'].map(len)

    return processed_df

In [3]:
# Usage example: build the token table from the text files.
# NOTE(review): both paths are absolute and machine-specific -- adjust them
# before running this notebook anywhere else.
path = "C:/Users/WangHongWen/Desktop/data_mining/final_project/competition_data" # directory holding the training documents (.txt)
syn_path = "C:/Users/WangHongWen/Desktop/data_mining/final_project/other_info/Keywords" # directory holding the keyword workbooks (.xlsx)
data = preprocessing(path, syn_path)

In [4]:
# Show the preprocessed table returned by preprocessing().
data

Unnamed: 0,doc_index,raw_text,tokens_rep,tokens_num
0,1,梅雨季來臨文旦黑點病易發生請注意病徵以及早加強防治措施5月已進入梅雨季節近日連續降雨為文旦黑...,"[梅雨季, 來臨, 麻豆, 文旦, 黑點病, 易, 發生, 請, 注意, 病徵, 以, 及早...",160
1,10,天氣多變溫差大近山區及偏施氮肥田區稻熱病發病較為嚴重籲請農友注意防治花蓮區農改場防檢局及田邊...,"[天氣, 多變, 溫差, 大近, 山區, 及, 偏施, 氮肥田區, 稻熱病, 發病, 較為,...",302
2,1000,新聞稿-稻熱病進入好發季節防檢局籲請農友加強防範行政院農業委員會動植物防疫檢疫局以下簡稱防檢...,"[新聞稿, -, 稻熱病, 進入, 好, 發, 季節, 防檢局, 籲請, 農友, 加強, 防...",421
3,1005,稻熱病進入好發季節防檢局籲請農友加強防治農委會防檢局表示自3月起全國各地水稻生長陸續進入分蘗...,"[稻熱病, 進入, 好, 發, 季節, 防檢局, 籲請, 農友, 加強, 防治, 農委會, ...",409
4,1007,乍暖還寒防檢局籲請農友加強防治稻熱病農委會防檢局表示全國各地水稻生長陸續進入分蘗期因逢暖冬且...,"[乍暖還寒, 防檢局, 籲請, 農友, 加強, 防治, 稻熱病, 農委會, 防檢局, 表示,...",379
...,...,...,...,...
555,986,苗栗區農業改良場發佈水稻白葉枯病警報糧食作物病蟲害發生警報中華民國90年9月21日發佈第3號...,"[苗栗區, 農業, 改良場, 發佈, 水稻, 白葉, 枯病, 警報, 糧食, 作物, 病蟲害...",299
556,988,雨後適合稻熱病發生請持續進行監測並指導農民防治依據氣象預報今年自五月中旬起臺灣地區即進入梅雨...,"[雨, 後, 適合, 稻熱病, 發生, 請, 持續, 進行, 監測, 並, 指導, 農民, ...",86
557,992,新入侵果實蠅緊急撲滅模擬演習 新聞稿新入侵植物害蟲緊急撲滅演習產官學總動員嚴防外來疫病蟲害...,"[新, 入侵, 果實, 蠅, 緊急, 撲滅, 模擬, 演習, 多, 保鏈, 黴素, 溶液, ...",439
558,997,梨木蝨危害請農友注意防範梨木蝨危害請農友注意防範行政院農業委員會動植物防疫檢疫局(以下簡稱防...,"[中國, 梨木蝨, 危害, 請, 農友, 注意, 防範, 中國, 梨木蝨, 危害, 請, 農...",400


In [5]:
def form_training_ref(df, train_directory):
    """
    Only for constructing training data: add the `reference` column.
    For test data, skip this step.

    input: `df`, a DataFrame with a `doc_index` column, and the directory that
           contains `TrainLabel.csv` (columns `Test` and `Reference` are read)
    output: the same `df` object, modified in place, where `reference` holds
            for each row the list of `Reference` values whose `Test` equals
            that row's `doc_index` (an empty list when there is none)
    """
    train_label = pd.read_csv(train_directory + "/TrainLabel.csv")

    references = []
    for idx in df['doc_index']:
        # `@idx` makes query() read the local loop variable.
        references.append(list(train_label.query('Test == @idx')['Reference']))

    # Assign the whole column at once. The former `df['reference'][i] = ...`
    # was chained assignment, which writes to a temporary under pandas
    # Copy-on-Write, and its label lookup required a default RangeIndex.
    df['reference'] = pd.Series(references, index=df.index, dtype=object)
    return df

In [6]:
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
train_path = "C:/Users/WangHongWen/Desktop/data_mining/final_project/other_info" # directory containing TrainLabel.csv
# form_training_ref adds the `reference` column to `data` itself and returns it,
# so `train_df` and `data` refer to the same DataFrame afterwards.
train_df = form_training_ref(data, train_path)

In [7]:
# Show the training table, now including the `reference` column.
train_df

Unnamed: 0,doc_index,raw_text,tokens_rep,tokens_num,reference
0,1,梅雨季來臨文旦黑點病易發生請注意病徵以及早加強防治措施5月已進入梅雨季節近日連續降雨為文旦黑...,"[梅雨季, 來臨, 麻豆, 文旦, 黑點病, 易, 發生, 請, 注意, 病徵, 以, 及早...",160,[]
1,10,天氣多變溫差大近山區及偏施氮肥田區稻熱病發病較為嚴重籲請農友注意防治花蓮區農改場防檢局及田邊...,"[天氣, 多變, 溫差, 大近, 山區, 及, 偏施, 氮肥田區, 稻熱病, 發病, 較為,...",302,[]
2,1000,新聞稿-稻熱病進入好發季節防檢局籲請農友加強防範行政院農業委員會動植物防疫檢疫局以下簡稱防檢...,"[新聞稿, -, 稻熱病, 進入, 好, 發, 季節, 防檢局, 籲請, 農友, 加強, 防...",421,"[1005, 1023]"
3,1005,稻熱病進入好發季節防檢局籲請農友加強防治農委會防檢局表示自3月起全國各地水稻生長陸續進入分蘗...,"[稻熱病, 進入, 好, 發, 季節, 防檢局, 籲請, 農友, 加強, 防治, 農委會, ...",409,"[1000, 1023]"
4,1007,乍暖還寒防檢局籲請農友加強防治稻熱病農委會防檢局表示全國各地水稻生長陸續進入分蘗期因逢暖冬且...,"[乍暖還寒, 防檢局, 籲請, 農友, 加強, 防治, 稻熱病, 農委會, 防檢局, 表示,...",379,"[438, 893, 1000, 1005, 1015, 1023]"
...,...,...,...,...,...
555,986,苗栗區農業改良場發佈水稻白葉枯病警報糧食作物病蟲害發生警報中華民國90年9月21日發佈第3號...,"[苗栗區, 農業, 改良場, 發佈, 水稻, 白葉, 枯病, 警報, 糧食, 作物, 病蟲害...",299,"[27, 46, 49, 95, 129, 163, 344, 375, 470, 476,..."
556,988,雨後適合稻熱病發生請持續進行監測並指導農民防治依據氣象預報今年自五月中旬起臺灣地區即進入梅雨...,"[雨, 後, 適合, 稻熱病, 發生, 請, 持續, 進行, 監測, 並, 指導, 農民, ...",86,"[188, 286, 289, 296, 438, 809, 891, 893, 913, ..."
557,992,新入侵果實蠅緊急撲滅模擬演習 新聞稿新入侵植物害蟲緊急撲滅演習產官學總動員嚴防外來疫病蟲害...,"[新, 入侵, 果實, 蠅, 緊急, 撲滅, 模擬, 演習, 多, 保鏈, 黴素, 溶液, ...",439,[]
558,997,梨木蝨危害請農友注意防範梨木蝨危害請農友注意防範行政院農業委員會動植物防疫檢疫局(以下簡稱防...,"[中國, 梨木蝨, 危害, 請, 農友, 注意, 防範, 中國, 梨木蝨, 危害, 請, 農...",400,[]
