In [11]:
import re
import os
import string
import nltk
import pandas as pd
from nltk import word_tokenize, pos_tag, corpus
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# 讀資料貼標

In [2]:
# Dataset split to process (train / test / validation). The value is used both
# as the input directory name and as part of the output pickle file names.
# target = "Train_Textual"
target = "Test_Intuitive"

# File names of the documents in the selected directory.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# row order of the saved DataFrames reproducible across runs and machines.
listdirs = sorted(os.listdir(target))

In [3]:
# Load every document as a (lower-cased text, label) pair:
# [(text, y), (text, y), ...]
data = []
for filename in listdirs:
    path = os.path.join(target, filename)
    # Skip sub-directories (e.g. Jupyter's ".ipynb_checkpoints"): open() cannot
    # read them and a single one would otherwise abort the whole load.
    if os.path.isdir(path):
        continue
    y = filename.split("_")[0]        # label = file-name prefix before the first "_"
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm the corpus encoding before running on another machine.
    with open(path, "r") as f:
        text = f.read()
    data.append((text.lower(), y))

In [12]:
# Strip punctuation and digits
def remove_punctuation(line):
    """Return ``line`` with every non-ASCII-letter character replaced by a space.

    Anything outside ``a-z`` / ``A-Z`` (punctuation, digits, whitespace such as
    newlines, non-ASCII characters) becomes a single space, one for one, so the
    result always has the same length as the input.

    Args:
        line: The text to clean.

    Returns:
        The cleaned string.
    """
    return re.sub(r"[^a-zA-Z]", ' ', line)

In [13]:
# Map a POS tag to the matching WordNet part-of-speech constant
def get_wordnet_pos(tag):
    """Translate a POS tag string into a ``wordnet`` part-of-speech constant.

    The decision is made on the tag's leading letter, checked in this order:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``,
    ``R`` -> ``wordnet.ADV``.

    Args:
        tag: POS tag string (e.g. the second item of a ``pos_tag`` pair).

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag starts
        with none of the four letters above.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return None

# 資料清整

In [13]:
# Loop-invariant resources, built once instead of once per document.
# A set gives constant-time stop-word membership tests; the NLTK list would be
# scanned linearly for every token.
nltk_stopwords = set(nltk.corpus.stopwords.words("english"))
wnl = WordNetLemmatizer()

temp = []     # (y, tokens with repeats kept)
temp1 = []    # (y, unique tokens)
for text, label in data:
    y = 1 if label == "Y" else 0    # "Y" -> 1, anything else -> 0

    # Replace punctuation/digits with spaces, then tokenize.
    text = remove_punctuation(text)
    words = word_tokenize(text)

    # POS-tag the tokens so each can be lemmatized with the right part of speech.
    words_tags = pos_tag(words)

    # Lemmatize, then drop stop words and lemmas of 2 characters or fewer.
    clean_words = []
    for word, tag in words_tags:
        # Tags without a WordNet equivalent fall back to NOUN.
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        origin_word = wnl.lemmatize(word, pos=wordnet_pos)

        if origin_word not in nltk_stopwords and len(origin_word) > 2:
            clean_words.append(origin_word)

    # Word2Vec needs the repeated words kept, otherwise similar-word
    # computation is distorted.
    temp.append((y, clean_words))
    # Weight computation uses each word once. dict.fromkeys() de-duplicates
    # while keeping first-occurrence order, so the output is reproducible
    # (set() ordering of strings changes between interpreter runs).
    temp1.append((y, list(dict.fromkeys(clean_words))))
                 

In [14]:
# Persist both variants of the cleaned corpus as pickled DataFrames.
# pandas is already imported in the notebook's import cell, so it is not
# re-imported here.
#   clean_<target>.pk     -> token lists with repeats kept (temp)
#   clean_<target>_VAL.pk -> de-duplicated token lists (temp1)
pd.DataFrame(temp, columns=["y", "clean_words"]).to_pickle("clean_{}.pk".format(target))
pd.DataFrame(temp1, columns=["y", "clean_words"]).to_pickle("clean_{}_VAL.pk".format(target))

# validation也要整理

In [19]:
# NOTE: this rebinds `target`; cells above that use it must not be re-run after
# this point without resetting it.
target = "Validation"

# File names of the validation documents.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# row order of the saved DataFrames reproducible.
listdirs = sorted(os.listdir(target))

# No label is parsed from the validation file names; the file name itself is
# kept to identify each row:
# [(filename, text), (filename, text), ...]
data = []
for filename in listdirs:
    path = os.path.join(target, filename)
    # Skip sub-directories (e.g. Jupyter's ".ipynb_checkpoints"): open() cannot
    # read them and a single one would otherwise abort the whole load.
    if os.path.isdir(path):
        continue
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm the corpus encoding before running on another machine.
    with open(path, "r") as f:
        text = f.read()
    data.append((filename, text.lower()))

In [21]:
# Loop-invariant resources, built once instead of once per document.
# A set gives constant-time stop-word membership tests; the NLTK list would be
# scanned linearly for every token.
nltk_stopwords = set(nltk.corpus.stopwords.words("english"))
wnl = WordNetLemmatizer()

temp = []     # (filename, tokens with repeats kept)
temp1 = []    # (filename, unique tokens)
for filename, text in data:
    # Replace punctuation/digits with spaces, then tokenize.
    text = remove_punctuation(text)
    words = word_tokenize(text)

    # POS-tag the tokens so each can be lemmatized with the right part of speech.
    words_tags = pos_tag(words)

    # Lemmatize, then drop stop words and lemmas of 2 characters or fewer.
    clean_words = []
    for word, tag in words_tags:
        # Tags without a WordNet equivalent fall back to NOUN.
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        origin_word = wnl.lemmatize(word, pos=wordnet_pos)

        if origin_word not in nltk_stopwords and len(origin_word) > 2:
            clean_words.append(origin_word)

    # Word2Vec needs the repeated words kept, otherwise similar-word
    # computation is distorted. Rows are keyed by file name; no label or
    # prediction column is stored here.
    temp.append((filename, clean_words))
    # Weight computation uses each word once. dict.fromkeys() de-duplicates
    # while keeping first-occurrence order, so the output is reproducible
    # (set() ordering of strings changes between interpreter runs).
    temp1.append((filename, list(dict.fromkeys(clean_words))))

In [22]:
# Persist both variants of the cleaned validation set as pickled DataFrames.
# pandas is already imported in the notebook's import cell, so it is not
# re-imported here.
#   clean_<target>.pk     -> token lists with repeats kept (temp)
#   clean_<target>_VAL.pk -> de-duplicated token lists (temp1)
pd.DataFrame(temp, columns=["Filename", "clean_words"]).to_pickle("clean_{}.pk".format(target))
pd.DataFrame(temp1, columns=["Filename", "clean_words"]).to_pickle("clean_{}_VAL.pk".format(target))