원래 statistical feature를 design할 때는 어떤 linguistic knowledge도 적용하지 않는 게 원칙이지만, <br>
여기서는 전처리할 때 stemming 대신 lemmatization(POS 사용)을 사용한다. <br>
그리고 n-gram 사전을 구축할 때, unknown_token을 단순히 1개로 정의하는 게 아니라, <br>
POS tagging을 먼저 실시하여 unknown_noun, unknown_verb 등과 같이 세부적으로 나눌 수 있도록 해준다.

In [55]:
# Filtering parameters.
# Normalisation mode: 0 = stemming, 1 = lemmatisation.
stemmming_lemmatisation = 0

# Frequency thresholds for the word-level n-gram vocabularies
# (an n-gram is kept only when its frequency exceeds the threshold).
thr_1gram, thr_2gram, thr_3gram, thr_5gram = 1, 2, 2, 1

# Per-class thresholds, one per annotation class; all are set to 1.
thr_component = thr_refinement_of_component = 1
thr_action = thr_refinement_of_action = 1
thr_condition = thr_priority = thr_motivation = thr_role = 1
thr_object = thr_refinement_of_object = 1
thr_sub_action = thr_sub_argument_of_action = 1
thr_sub_priority = thr_sub_role = 1
thr_sub_object = thr_sub_refinement_of_object = 1
thr_none = 1


##################################################################
import numpy as np
import pandas as pd
import operator
import pandas as pd 
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import words
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter
import matplotlib.pyplot as plt
from data_handler import *

# Directory prefix for the frequency-distribution text files written below.
PATH = 'assets/freq-dist/'

# Load the four dataset splits, in the same order as before
# (load() presumably comes from the data_handler star-import).
X_train, Y_train, X_test, Y_test = (
    load(split_name)
    for split_name in ('X_train', 'Y_train', 'X_test', 'Y_test')
)
print('train:', len(X_train), ',  test:', len(X_test))

train: 563 ,  test: 141


# Preprocessing

* 전처리 과정에서 가장 중요한 점이 sparsity 문제를 해결하는 것이다. (NLP의 기본적인 전처리)

In [56]:
# def count_token(train):
#     cnt = 0
#     for i, sentence in enumerate(train): # for training data
#         for j, token in enumerate(sentence):
#              cnt += 1
#     return cnt
# print('before removing some tokens = %d' % count_token(X_train))

#############
""" Lower """
#############
## Lower-casing is best done first in preprocessing, because a single token
## may contain capital letters in unpredictable patterns.
## A capitalised first letter breaks later normalisation
## (e.g. "Members" would not be reduced to "member").
## Caveat: "I" becomes "i", which POS tagging does not handle well; this is minor.
for dataset in (X_train, X_test):  # training data first, then testing data
    for sentence in dataset:
        for position, token in enumerate(sentence):
            sentence[position] = token.lower()

# ###############
# """ Removal """
# ###############
# ### for reducing complexity (=high-dimensional) of our data and for dealing with sparse problem
# ### these tend to lead for our model to be underfitted (if we have infinite number of data, we don't need to care about these)
# ### so, we need to do these:
# """ Remove puntuation (e.g.,  .  , ( ) ? ' ; ) """
# puntation_list = ['.', ',', '(', ')', '*', ':', '``', "''", ';', '-', '--', '?', '.', ',', '$']
# puntation_list = ['.', ',', '(', ')', '*', ':']
# puntation_list = []
# """ Remove unnecessary token or not (e.g., 'i.e', 'e.g') """
# unnecessary_token_list = ['i.e', 'e.g', '1.0', "n't", "'s", ')']
# unnecessary_token_list = ['i.e', 'e.g', 'the']
# unnecessary_token_list = []
# for i, sentence in enumerate(X_train): # for training data
#     for j, token in enumerate(sentence):
#         if any(token in t for t in puntation_list):
#             sentence.remove(sentence[j])  # X delete
#             del Y_train[i][j]             # Y delete
#         elif any(token in t for t in unnecessary_token_list):
#             sentence.remove(sentence[j])
#             del Y_train[i][j]

# for i, sentence in enumerate(X_test): # for testing data
#     for j, token in enumerate(sentence):
#         if any(token in t for t in puntation_list):
#             sentence.remove(sentence[j])
#             del Y_test[i][j]
#         elif any(token in t for t in unnecessary_token_list):
#             sentence.remove(sentence[j])
#             del Y_test[i][j]
# print('after removing some tokens = %d' % count_token(X_train))

In [57]:
##################################
""" lemmatisation and stemming """
##################################
## Lemmatisation is WordNet-based, so it is good at unifying word forms into one.
## However, nouns and verbs must be distinguished, so to do it accurately we have to
## POS-tag first and then lemmatise according to noun/verb; otherwise it cannot even
## tell singular from plural nouns.
## For now only stemming is applied; lemmatisation can be tried later. The concern with
## lemmatisation is that it uses syntactic information (through POS tagging).
## To improve performance, treat every word at or below a frequency cut-off
## (words that occur only once) as an unknown token.
## If similar words could be clustered and represented by a single word, this might
## even outperform word embeddings.
## That would be hard clustering rather than soft clustering, which may be more useful
## on a small dataset.
## Open question: should syntactic / semantic information be allowed up to the
## dictionary-building stage only?
## The lemmatiser defaults to the noun ("n") context. (It is tied to WordNet, so it is
## much slower than stemming.)
## Main reason stemming is still needed: to keep the token "as" intact.
from nltk.corpus import wordnet as wn
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer() # WordNet Lemmatizer

def is_noun(tag):
    """Return True when *tag* is one of the Penn Treebank noun tags."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags
def is_verb(tag):
    """Return True when *tag* is one of the Penn Treebank verb tags."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags
def is_adverb(tag):
    """Return True when *tag* is one of the Penn Treebank adverb tags."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags
def is_adjective(tag):
    """Return True when *tag* should be treated as an adjective.

    Besides the adjective tags this deliberately accepts 'IN': plain
    lemmatisation turns the IN-tagged word "as" into "a", so IN is routed
    through the adjective category instead.
    """
    adjective_like_tags = ('JJ', 'JJR', 'JJS', 'IN')
    return tag in adjective_like_tags
def penn_to_wn(tag):
    """Translate a Penn Treebank tag into a WordNet POS constant.

    The categories are checked in the order adjective, noun, adverb, verb.
    Returns None when the tag belongs to none of them.
    """
    if is_adjective(tag):
        return wn.ADJ
    if is_noun(tag):
        return wn.NOUN
    if is_adverb(tag):
        return wn.ADV
    if is_verb(tag):
        return wn.VERB
    return None

def lemmatise(tuple):
    """Lemmatise one POS-tagged token.

    The POS tag matters because the lemmatiser gives different results for
    the same surface form depending on whether it is a noun, a verb, etc.

    Args:
        tuple: A ``(token, penn_tag)`` pair, such as one element of the list
            returned by ``nltk.pos_tag``. The parameter name shadows the
            builtin and is kept only for backward compatibility.

    Returns:
        str: The lemma of ``token``. When the tag maps to no WordNet
        category, the lemmatiser is called without an explicit POS.
    """
    token, penn_tag = tuple[0], tuple[1]
    # Resolve the WordNet category once instead of once per branch.
    wn_pos = penn_to_wn(penn_tag)
    if wn_pos is None:
        return str(lemmatiser.lemmatize(token))
    return str(lemmatiser.lemmatize(token, wn_pos))

if stemmming_lemmatisation == 0:
    # Mode 0: stem every token in place (training data first, then testing data).
    for dataset in (X_train, X_test):
        for sentence in dataset:
            for position, token in enumerate(sentence):
                sentence[position] = str(stemmer.stem(token))
else:
    # Any other mode: POS-tag each sentence, then lemmatise token by token
    # (training data first, then testing data).
    for dataset in (X_train, X_test):
        for sentence in dataset:
            # Tag the whole sentence before any of its tokens is overwritten.
            tagged_sentence = nltk.pos_tag(sentence)
            for position, tagged_token in enumerate(tagged_sentence):
                sentence[position] = lemmatise(tagged_token)  # input: (token, tag)

## Build frequency dictionary (using only training data)
### (before making Look-up Table)

unknown token은 training data를 직접 수정해서 표시하지 않고, <br>
곧바로 vocabulary set을 만들어서 나중에 feature extraction할 때 <br>
사전에 없는 것은 자동으로 unknown token/phrase로 처리되도록 하면 된다.

### 1. Word-level One-gram

In [58]:
""" Define unknown token using only training data """
## 데이터 셋이 작아서 그런지 생각보다 freq 1인 단어들이 많다. 
## stemming말고 lemmatisation을 통해 word 어원의 형태를 잡고, wordnet과 같은 외부 자원을 사용해서
## freq 1인 단어라도 wordnet에 해당되는 단어는 unknown token으로 처리하지 않는 방식도 생각해보자.
## Stemming 후, word freq 1 비율: ('ratio of unknown_word:', 48.505434782608695)

# Word frequency dictionary, built from the training data only.
# Counter keeps keys in first-occurrence order, like a manual dict count.
word_count_dic = dict(Counter(token for sentence in X_train for token in sentence))

# Write the frequency distribution, most frequent word first.
# The context manager closes the file even if a write raises.
# (The file name's spelling is left untouched on purpose.)
with open(PATH+"freqeucny_distribution_1gram.txt", "w") as text_file:
    for w in sorted(word_count_dic, key=word_count_dic.get, reverse=True):
        text_file.write(w)
        text_file.write('\t')
        text_file.write(str(word_count_dic[w]))
        text_file.write('\n')

# Split the word types by frequency: at or below thr_1gram -> unknown word,
# above thr_1gram -> 1-gram vocabulary.
unknown_word_list = []
word_one_gram_voca_set = []
for key, value in word_count_dic.items():
    if value <= thr_1gram:
        unknown_word_list.append(key)
    else:
        word_one_gram_voca_set.append(key)

# Guard the ratio against an empty training set (no word types at all).
total_word_types = len(word_count_dic)
unknown_ratio = len(unknown_word_list) / float(total_word_types) * 100 if total_word_types else 0.0
print("ratio of (word-level) 1-gram unknown_word:", unknown_ratio)

# The training data itself is not rewritten here: only the vocabulary is built.
print('1-gram voca size: ', len(word_one_gram_voca_set))

ratio of (word-level) 1-gram unknown_word: 47.66355140186916
1-gram voca size:  728


### 2. Word-level Bi-gram

In [59]:
### create bi-gram set
# bigram_list keeps the per-sentence (bigram, frequency) pairs;
# bigram_counter accumulates the corpus-wide totals in a single pass.
bigram_list = []
bigram_counter = Counter()
for sentence in X_train:
    # Frequency distribution of the bigrams of one sentence
    sentence_freq = nltk.FreqDist(nltk.ngrams(sentence, 2))
    bigram_list += sentence_freq.items()
    bigram_counter.update(sentence_freq)

### merge the same tuple
# One [bigram, total_frequency] entry per distinct bigram, in first-occurrence
# order. The Counter replaces a nested scan over all entries seen so far.
merged_bigram_freq_list = [[bigram, freq] for bigram, freq in bigram_counter.items()]

### Write Frequency Distribution
# The context manager closes the file even if a write raises.
with open(PATH+"freqeucny_distribution_2gram.txt", "w") as text_file:
    for bigram, freq in sorted(merged_bigram_freq_list, key=lambda x: x[1], reverse=True):
        text_file.write(f"{bigram}\t{freq}\n")

### Filter and make bigram voca set
# Keep only bigrams whose total frequency exceeds thr_2gram.
word_bi_gram_voca_set = [bigram for bigram, freq in merged_bigram_freq_list if freq > thr_2gram]

print('2-gram voca size: ', len(word_bi_gram_voca_set))

2-gram voca size:  537


### 3. Word-level Tri-gram

In [60]:
### create tri-gram set
# trigram_list keeps the per-sentence (trigram, frequency) pairs;
# trigram_counter accumulates the corpus-wide totals in a single pass.
trigram_list = []
trigram_counter = Counter()
for sentence in X_train:
    # Frequency distribution of the trigrams of one sentence
    sentence_freq = nltk.FreqDist(nltk.ngrams(sentence, 3))
    trigram_list += sentence_freq.items()
    trigram_counter.update(sentence_freq)

### merge the same tuple
# One [trigram, total_frequency] entry per distinct trigram, in first-occurrence
# order. The Counter replaces a nested scan over all entries seen so far.
merged_trigram_freq_list = [[trigram, freq] for trigram, freq in trigram_counter.items()]

### Write Frequency Distribution
# The context manager closes the file even if a write raises.
with open(PATH+"freqeucny_distribution_3gram.txt", "w") as text_file:
    for trigram, freq in sorted(merged_trigram_freq_list, key=lambda x: x[1], reverse=True):
        text_file.write(f"{trigram}\t{freq}\n")

### Filter and make trigram voca set
# Keep only trigrams whose total frequency exceeds thr_3gram.
word_tri_gram_voca_set = [trigram for trigram, freq in merged_trigram_freq_list if freq > thr_3gram]
print('3-gram voca size: ', len(word_tri_gram_voca_set))

3-gram voca size:  311


### 4. Word-level Five-gram

In [61]:
### create five-gram set
# fivegram_list keeps the per-sentence (five-gram, frequency) pairs;
# fivegram_counter accumulates the corpus-wide totals in a single pass.
fivegram_list = []
fivegram_counter = Counter()
for sentence in X_train:
    # Frequency distribution of the five-grams of one sentence
    sentence_freq = nltk.FreqDist(nltk.ngrams(sentence, 5))
    fivegram_list += sentence_freq.items()
    fivegram_counter.update(sentence_freq)

### merge the same tuple
# One [five-gram, total_frequency] entry per distinct five-gram, in
# first-occurrence order. The Counter replaces a nested scan over all
# entries seen so far.
merged_fivegram_freq_list = [[fivegram, freq] for fivegram, freq in fivegram_counter.items()]

### Write Frequency Distribution
# The context manager closes the file even if a write raises.
with open(PATH+"freqeucny_distribution_5gram.txt", "w") as text_file:
    for fivegram, freq in sorted(merged_fivegram_freq_list, key=lambda x: x[1], reverse=True):
        text_file.write(f"{fivegram}\t{freq}\n")

### Filtering out
# Keep only five-grams whose total frequency exceeds thr_5gram.
word_five_gram_voca_set = [fivegram for fivegram, freq in merged_fivegram_freq_list if freq > thr_5gram]
print('5-gram voca size: ', len(word_five_gram_voca_set))

5-gram voca size:  379


### Word List per class

In [166]:
# ann_info = {0: 'component', 1: 'refinement_of_component', 2: 'action',
#                 3: 'refinement_of_action',
#                 4: 'condition', 5: 'priority', 6: 'motivation', 7: 'role',
#                 8: 'object', 9: 'refinement_of_object',
#                 10: 'sub_action', 11: 'sub_argument_of_action', 12: 'sub_priority',
#                 13: 'sub_role', 14: 'sub_object',
#                 15: 'sub_refinement_of_object', 16: 'none'}

# word_count_classes = [[] for i in range(len(ann_info))]    

# for i, sentence in enumerate(Y_train): # y label
#     for j, token in enumerate(sentence):
#         for k in range(0, len(ann_info)):
#             if token == k:
#                 word_count_classes[k].append(X_train[i][j]) # x data 
                
# ### Write out             
# def dict_wirte_to_txt(class_list, file_name):
#     text_file = open(file_name, "w")
#     class_dict = dict(Counter(class_list))
#     for w in sorted(class_dict, key=class_dict.get, reverse=True):
#         text_file.write(w)
#         text_file.write('\t')
#         text_file.write(str(class_dict[w]))
#         text_file.write('\n')
#     text_file.close()
#     return class_dict

# dict_classes = [None] * len(ann_info)
# for k in range(0, len(ann_info)):
#     dict_classes[k] = dict_wirte_to_txt(word_count_classes[k], PATH+str(ann_info[k])+'.txt')
    
# ### Filtering out and make voca set
# def namestr(obj, namespace):
#     return [name for name in namespace if namespace[name] is obj]

# def filtering_out_for_dict(dict):
#     temp_voca_set = []
#     for key, value in dict.items():
#         if value > thr_1gram: # if only 1 frequency
#             temp_voca_set.append(key)
#     return temp_voca_set

# vocab_classes = [None] * len(ann_info)
# for k in range(0, len(ann_info)):
#     vocab_classes[k] = filtering_out_for_dict(dict_classes[k])

In [62]:
# One word list per annotation class; every training token is appended to the
# list that matches its label.
component_w_li = []
refinement_of_component_w_li = []
action_w_li = []
refinement_of_action_w_li = []
condition_w_li = []
priority_w_li = []
motivation_w_li = []
role_w_li = []
object_w_li = []
refinement_of_object_w_li = []
sub_action_w_li = []
sub_argument_of_action_w_li = []
sub_priority_w_li = []
sub_role_w_li = []
sub_object_w_li = []
sub_refinement_of_object_w_li = []
none_w_li = []

# Position k holds the list for label k (0..15); any other label goes to none_w_li.
_labelled_word_lists = [
    component_w_li,
    refinement_of_component_w_li,
    action_w_li,
    refinement_of_action_w_li,
    condition_w_li,
    priority_w_li,
    motivation_w_li,
    role_w_li,
    object_w_li,
    refinement_of_object_w_li,
    sub_action_w_li,
    sub_argument_of_action_w_li,
    sub_priority_w_li,
    sub_role_w_li,
    sub_object_w_li,
    sub_refinement_of_object_w_li,
]

for sent_idx, label_row in enumerate(Y_train):
    for tok_idx, label in enumerate(label_row):
        # Compare against 0, 1, ..., 15 in order and stop at the first match.
        for class_id, word_list in enumerate(_labelled_word_lists):
            if label == class_id:
                word_list.append(X_train[sent_idx][tok_idx])
                break
        else:
            none_w_li.append(X_train[sent_idx][tok_idx])
            

### Write out             
def dict_wirte_to_txt(class_list, file_name):
    """Count the items of *class_list* and write the counts to *file_name*.

    One ``item<TAB>count`` line is written per distinct item, highest count
    first. Items with equal counts keep the order in which the counter
    holds them (first occurrence on Python 3.7+).

    Args:
        class_list: Iterable of hashable string tokens.
        file_name: Path of the text file to create or overwrite.

    Returns:
        dict: A plain dict mapping each distinct item to its count.
    """
    counts = Counter(class_list)
    # The context manager closes the handle even if a write raises.
    with open(file_name, "w") as text_file:
        for word, count in counts.most_common():
            text_file.write(word)
            text_file.write('\t' + str(count) + '\n')
    return dict(counts)

# Write one frequency file per annotation class and keep the returned
# word -> count dict under its own global name.
_class_freq_prefix = PATH + 'frequency_distribution_class_'

component_dict = dict_wirte_to_txt(
    component_w_li, _class_freq_prefix + 'component.txt')
refinement_of_component_dict = dict_wirte_to_txt(
    refinement_of_component_w_li, _class_freq_prefix + 'refinement_of_component.txt')
action_dict = dict_wirte_to_txt(
    action_w_li, _class_freq_prefix + 'action.txt')
refinement_of_action_dict = dict_wirte_to_txt(
    refinement_of_action_w_li, _class_freq_prefix + 'refinement_of_action.txt')
condition_dict = dict_wirte_to_txt(
    condition_w_li, _class_freq_prefix + 'condition.txt')
priority_dict = dict_wirte_to_txt(
    priority_w_li, _class_freq_prefix + 'priority.txt')
motivation_dict = dict_wirte_to_txt(
    motivation_w_li, _class_freq_prefix + 'motivation.txt')
role_dict = dict_wirte_to_txt(
    role_w_li, _class_freq_prefix + 'role.txt')
object_dict = dict_wirte_to_txt(
    object_w_li, _class_freq_prefix + 'object.txt')
refinement_of_object_dict = dict_wirte_to_txt(
    refinement_of_object_w_li, _class_freq_prefix + 'refinement_of_object.txt')
sub_action_dict = dict_wirte_to_txt(
    sub_action_w_li, _class_freq_prefix + 'sub_action.txt')
sub_argument_of_action_dict = dict_wirte_to_txt(
    sub_argument_of_action_w_li, _class_freq_prefix + 'sub_argument_of_action.txt')
sub_priority_dict = dict_wirte_to_txt(
    sub_priority_w_li, _class_freq_prefix + 'sub_priority.txt')
sub_role_dict = dict_wirte_to_txt(
    sub_role_w_li, _class_freq_prefix + 'sub_role.txt')
sub_object_dict = dict_wirte_to_txt(
    sub_object_w_li, _class_freq_prefix + 'sub_object.txt')
sub_refinement_of_object_dict = dict_wirte_to_txt(
    sub_refinement_of_object_w_li, _class_freq_prefix + 'sub_refinement_of_object.txt')
none_dict = dict_wirte_to_txt(
    none_w_li, _class_freq_prefix + 'none.txt')

### Filtering out and make voca set
def namestr(obj, namespace):
    """Return every name in *namespace* whose value is *obj* (identity test)."""
    matching_names = []
    for candidate in namespace:
        if namespace[candidate] is obj:
            matching_names.append(candidate)
    return matching_names

def filtering_out_for_dict(dict, threshold=None):
    """Return the keys of a frequency dict whose count exceeds a threshold.

    Also prints the size of the resulting vocabulary, labelled with the first
    module-level name bound to the passed dict (an identity lookup, the same
    one ``namestr(dict, globals())`` performs).

    Args:
        dict: Mapping of word -> frequency. The parameter name shadows the
            builtin and is kept only for backward compatibility.
        threshold: Keys are kept when ``frequency > threshold``. Defaults to
            the module-level ``thr_1gram`` when omitted.

    Returns:
        list: The kept keys, in the dict's iteration order.
    """
    if threshold is None:
        threshold = thr_1gram
    temp_voca_set = [key for key, value in dict.items() if value > threshold]
    # The label is only for the log line; fall back to a placeholder when the
    # dict is not bound to any global name instead of raising IndexError.
    module_globals = globals()
    bound_names = [name for name in module_globals if module_globals[name] is dict]
    label = bound_names[0] if bound_names else '<unnamed dict>'
    print(len(temp_voca_set), '\t: ' , label,"'s voca size")
    return temp_voca_set

# Build the filtered vocabulary list of every annotation class.
# Each call also prints the vocabulary size, labelled with the global variable
# name of the dict passed in (resolved by identity through namestr), so every
# dict has to stay bound to its own global name.
# NOTE: filtering_out_for_dict compares against thr_1gram for every class; the
# per-class thr_* values are not used by these calls.
voca_set_class_component = filtering_out_for_dict(component_dict)
voca_set_class_refinement_of_component = filtering_out_for_dict(refinement_of_component_dict)
voca_set_class_action = filtering_out_for_dict(action_dict)
voca_set_class_refinement_of_action = filtering_out_for_dict(refinement_of_action_dict)
voca_set_class_condition = filtering_out_for_dict(condition_dict)
voca_set_class_priority = filtering_out_for_dict(priority_dict)
voca_set_class_motivation = filtering_out_for_dict(motivation_dict)
voca_set_class_role = filtering_out_for_dict(role_dict)
voca_set_class_object = filtering_out_for_dict(object_dict)
voca_set_class_refinement_of_object = filtering_out_for_dict(refinement_of_object_dict)
voca_set_class_sub_action = filtering_out_for_dict(sub_action_dict)
voca_set_class_sub_argument_of_action = filtering_out_for_dict(sub_argument_of_action_dict)
voca_set_class_sub_priority = filtering_out_for_dict(sub_priority_dict)
voca_set_class_sub_role = filtering_out_for_dict(sub_role_dict)
voca_set_class_sub_object = filtering_out_for_dict(sub_object_dict)
voca_set_class_sub_refinement_of_object = filtering_out_for_dict(sub_refinement_of_object_dict)
voca_set_class_none = filtering_out_for_dict(none_dict)

38 	: 	 component_dict 's voca size
12 	: 	 refinement_of_component_dict 's voca size
92 	: 	 action_dict 's voca size
138 	: 	 refinement_of_action_dict 's voca size
104 	: 	 condition_dict 's voca size
40 	: 	 priority_dict 's voca size
66 	: 	 motivation_dict 's voca size
32 	: 	 role_dict 's voca size
178 	: 	 object_dict 's voca size
147 	: 	 refinement_of_object_dict 's voca size
24 	: 	 sub_action_dict 's voca size
33 	: 	 sub_argument_of_action_dict 's voca size
3 	: 	 sub_priority_dict 's voca size
2 	: 	 sub_role_dict 's voca size
22 	: 	 sub_object_dict 's voca size
28 	: 	 sub_refinement_of_object_dict 's voca size
74 	: 	 none_dict 's voca size


# Build Lookup Table

* Lookup Table을 만들고 마지막 element에 unknown token/phrase 넣어주기.
* dic index는 0이 아니라 1부터 시작하자. 0은 나중에 sentence boundary 때문에 예외 처리되는 경우를 위해 남겨 둔다. 즉, [0,0,0,...,0,0,0] 벡터.
* 일단 모든 voca_set를 대상으로 unknown token을 명시해서 lookup table을 만들자. 그러고 나중에 unknown token을 사용하고 싶지 않을 때는 나중에 feature extraction할 때, 맨 뒤의 element만 삭제하도록 하자. (unknown token은 항상 lookup table list의 맨 뒤이다)
* unknown을 모든 lookup table에 포함시키는 것은 생각해볼 여지가 있다. 
* class lookup table같은 경우는 unknown token은 제외시켜야 될 것 같다... 
* 즉, 상대적으로 voca set 크기가 적은 lookup table같은 경우는 unknown token을 포함하는게 좋을지 의문이 든다. 
* 적은 크기의 voca set의 lookup table에 unknown token을 넣지 않는 것은 잘못된 판단일수도 있다. 모든 class lookup table에 unknown token을 추가하니 2% 가량 성능이 올랐다.

In [63]:
### Word 1-gram lookup table: indices start at 1, the unknown token comes last
word_one_gram_lookup_table = {
    entry: index for index, entry in enumerate(word_one_gram_voca_set, start=1)
}
word_one_gram_lookup_table['unknown_token'] = len(word_one_gram_lookup_table)+1
print('1-gram voca size: ',len(word_one_gram_lookup_table))

### Word 2-gram lookup table, with an all-unknown bigram as the last entry
word_bi_gram_lookup_table = {
    entry: index for index, entry in enumerate(word_bi_gram_voca_set, start=1)
}
word_bi_gram_lookup_table[('unknown_token',) * 2] = len(word_bi_gram_lookup_table)+1
print('2-gram voca size: ', len(word_bi_gram_lookup_table))

### Word 3-gram lookup table, with an all-unknown trigram as the last entry
word_tri_gram_lookup_table = {
    entry: index for index, entry in enumerate(word_tri_gram_voca_set, start=1)
}
word_tri_gram_lookup_table[('unknown_token',) * 3] = len(word_tri_gram_lookup_table)+1
print('3-gram voca size: ', len(word_tri_gram_lookup_table))

### Word 5-gram lookup table, with an all-unknown five-gram as the last entry
word_five_gram_lookup_table = {
    entry: index for index, entry in enumerate(word_five_gram_voca_set, start=1)
}
word_five_gram_lookup_table[('unknown_token',) * 5] = len(word_five_gram_lookup_table)+1
print('5-gram voca size: ', len(word_five_gram_lookup_table))

1-gram voca size:  729
2-gram voca size:  538
3-gram voca size:  312
5-gram voca size:  380


In [64]:
def create_lookup_table_1gram(voca_set_1gram):
    """Build a token -> index lookup dict from a 1-gram vocabulary.

    Indices start at 1; the notes in this file reserve 0 for the all-zero
    vector used when nothing in the dictionary applies. The key
    'unknown_token' is set last, to the table size plus one.
    """
    lookup = {entry: index for index, entry in enumerate(voca_set_1gram, start=1)}
    lookup['unknown_token'] = len(lookup)+1
    return lookup

# Build one lookup table per annotation class from its filtered vocabulary;
# targets and sources are listed in the same class order.
(
    lookup_table_class_component,
    lookup_table_class_refinement_of_component,
    lookup_table_class_action,
    lookup_table_class_refinement_of_action,
    lookup_table_class_condition,
    lookup_table_class_priority,
    lookup_table_class_motivation,
    lookup_table_class_role,
    lookup_table_class_object,
    lookup_table_class_refinement_of_object,
    lookup_table_class_sub_action,
    lookup_table_class_sub_argument_of_action,
    lookup_table_class_sub_priority,
    lookup_table_class_sub_role,
    lookup_table_class_sub_object,
    lookup_table_class_sub_refinement_of_object,
    lookup_table_class_none,
) = [
    create_lookup_table_1gram(class_voca_set)
    for class_voca_set in (
        voca_set_class_component,
        voca_set_class_refinement_of_component,
        voca_set_class_action,
        voca_set_class_refinement_of_action,
        voca_set_class_condition,
        voca_set_class_priority,
        voca_set_class_motivation,
        voca_set_class_role,
        voca_set_class_object,
        voca_set_class_refinement_of_object,
        voca_set_class_sub_action,
        voca_set_class_sub_argument_of_action,
        voca_set_class_sub_priority,
        voca_set_class_sub_role,
        voca_set_class_sub_object,
        voca_set_class_sub_refinement_of_object,
        voca_set_class_none,
    )
]

# Write out output files

In [65]:
# Persist the preprocessed data and every lookup table, in this order:
# data splits, word-level n-gram tables, then the per-class tables.
_dump_jobs = [
    (X_train, 'pre_X_train'),
    (X_test, 'pre_X_test'),
    (Y_train, 'pre_Y_train'),
    (Y_test, 'pre_Y_test'),
    (word_one_gram_lookup_table, 'word_one_gram_lookup_table'),
    (word_bi_gram_lookup_table, 'word_bi_gram_lookup_table'),
    (word_tri_gram_lookup_table, 'word_tri_gram_lookup_table'),
    (word_five_gram_lookup_table, 'word_five_gram_lookup_table'),
    (lookup_table_class_component, 'lookup_table_class_component'),
    (lookup_table_class_refinement_of_component, 'lookup_table_class_refinement_of_component'),
    (lookup_table_class_action, 'lookup_table_class_action'),
    (lookup_table_class_refinement_of_action, 'lookup_table_class_refinement_of_action'),
    (lookup_table_class_condition, 'lookup_table_class_condition'),
    (lookup_table_class_priority, 'lookup_table_class_priority'),
    (lookup_table_class_motivation, 'lookup_table_class_motivation'),
    (lookup_table_class_role, 'lookup_table_class_role'),
    (lookup_table_class_object, 'lookup_table_class_object'),
    (lookup_table_class_refinement_of_object, 'lookup_table_class_refinement_of_object'),
    (lookup_table_class_sub_action, 'lookup_table_class_sub_action'),
    (lookup_table_class_sub_argument_of_action, 'lookup_table_class_sub_argument_of_action'),
    (lookup_table_class_sub_priority, 'lookup_table_class_sub_priority'),
    (lookup_table_class_sub_role, 'lookup_table_class_sub_role'),
    (lookup_table_class_sub_object, 'lookup_table_class_sub_object'),
    (lookup_table_class_sub_refinement_of_object, 'lookup_table_class_sub_refinement_of_object'),
    (lookup_table_class_none, 'lookup_table_class_none'),
]
for _payload, _target_name in _dump_jobs:
    dump(_payload, _target_name)

## Vectorize Data
Transform numerical values into vectorized values.

In [66]:
# max(word2idx.items(), key=operator.itemgetter(1))[1] # vocabulary size

In [67]:
# len(X)

In [68]:
# for i, point in enumerate(X):
#     binary = "{0:{fill}12b}".format(point, fill='0')
#     X[i] = list(binary)
    
# # for i, point in enumerate(Xtest):
# #     binary = "{0:{fill}12b}".format(point, fill='0')
# #     Xtest[i] = list(binary)

In [69]:
# np.shape(X)

## Write data array to csv file

In [70]:
# X_df = pd.DataFrame(X)
# X_df.to_csv("X.csv", sep=',', header=False, index=False)

# Y_df = pd.DataFrame(Y)
# Y_df.to_csv("Y.csv", sep=',', header=False, index=False)

# # Xtest_df = pd.DataFrame(Xtest)
# # Xtest_df.to_csv("Xtest.csv", sep=',', header=False, index=False)

# # Ytest_df = pd.DataFrame(Ytest)
# # Ytest_df.to_csv("Ytest.csv", sep=',', header=False, index=False)