In [2]:
import mxnet as mx
import gensim
import tqdm
import gluonnlp

import pandas as pd
import numpy as np

C:\dev\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\dev\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
  stacklevel=1)


In [3]:
from mxnet import gluon
from mxnet.gluon import nn,rnn

In [148]:
class auto_spacing(gluon.HybridBlock) :
    """Character-level spacing model: embedding -> n-gram convolutions -> bi-GRU -> dense.

    Five parallel ``Conv2D`` layers with kernel heights 1..5 (each spanning the
    full embedding width) are applied to the embedded input, concatenated along
    the channel axis, passed through a bidirectional GRU and two dense layers.
    The last dense layer has a single sigmoid unit per time step.

    Parameters
    ----------
    n_hidden : int
        Hidden size of each GRU direction.
    vocab_size : int
        Number of rows of the embedding table.
    embed_dim : int
        Width of each embedding vector (also the width of every conv kernel).
    max_seq_length : int
        Sequence length the n-gram feature maps are sliced / reshaped to.
        Assumes every input batch is padded to exactly this length -- TODO confirm
        against the data pipeline.
    """
    def __init__(self, n_hidden, vocab_size, embed_dim, max_seq_length,**kwargs) :
        super(auto_spacing, self).__init__(**kwargs)
        
        self.in_seq_len = max_seq_length
        self.out_seq_len = max_seq_length
        self.n_hidden = n_hidden
        self.vocab_size = vocab_size
        self.max_seq_length = max_seq_length
        self.embed_dim = embed_dim
        
        with self.name_scope() :
            self.embedding = nn.Embedding(input_dim=self.vocab_size,output_dim=self.embed_dim)
            
            # Each kernel covers the whole embedding width, so the last axis of
            # every feature map collapses to 1. Even kernel heights (2, 4) are
            # padded so that they yield one extra step, trimmed in hybrid_forward.
            self.conv_unigram = nn.Conv2D(channels=128, kernel_size=(1, self.embed_dim))
            self.conv_bigram = nn.Conv2D(channels=256, kernel_size=(2, self.embed_dim), padding=(1, 0))
            self.conv_trigram = nn.Conv2D(channels=128, kernel_size=(3, self.embed_dim), padding=(1, 0))
            self.conv_forthgram = nn.Conv2D(channels=64, kernel_size=(4, self.embed_dim), padding=(2, 0))
            self.conv_fifthgram = nn.Conv2D(channels=32, kernel_size=(5, self.embed_dim), padding=(2, 0))

            self.bi_gru = rnn.GRU(hidden_size=self.n_hidden, layout='NTC', bidirectional=True)
            self.dense_sh = nn.Dense(100, activation='relu', flatten=False)
            self.dense = nn.Dense(1, activation='sigmoid', flatten=False)
    
    def hybrid_forward(self, F, inputs) :
        """Run the network on a batch of token indices.

        ``inputs`` is presumably shaped (batch, max_seq_length) -- TODO confirm.
        Returns the output of the final sigmoid dense layer (one value per
        time step, last axis of size 1).
        """
        embed = self.embedding(inputs)
        # Add a channel axis so the embedded sequence can be fed to Conv2D.
        embed = F.expand_dims(embed, axis=1)
        unigram = self.conv_unigram(embed)
        bigram = self.conv_bigram(embed)
        trigram = self.conv_trigram(embed)
        forthgram = self.conv_forthgram(embed)
        fifthgram = self.conv_fifthgram(embed)
        
        # Trim the padded feature maps to max_seq_length along the time axis and
        # stack all n-gram features along the channel axis.
        grams = F.concat(unigram, F.slice_axis(bigram, axis=2, begin=0,end=self.max_seq_length),
                        trigram, F.slice_axis(forthgram, axis=2, begin=0,end=self.max_seq_length), 
                        F.slice_axis(fifthgram, axis=2, begin=0,end=self.max_seq_length), dim=1)
        
        # Move channels last and merge the trailing two axes so the GRU
        # receives an 'NTC' tensor.
        grams = F.transpose(grams, (0,2,3,1))
        grams = F.reshape(grams, (-1,self.max_seq_length,-3))
        # Fixed: the GRU output used to be bound to the misspelled name `grmas`
        # and thrown away, so the dense layers never saw the recurrent features.
        # NOTE: this changes the input width of `dense_sh`, so parameters saved
        # from the old (buggy) graph will not load into this one.
        grams = self.bi_gru(grams)
        fc1 = self.dense_sh(grams)
        return (self.dense(fc1))


In [152]:
def model_init(n_hidden,vocab_size, embed_dim, max_seq_length, ctx, embed_weights) :
    """Build the spacing model together with its loss and trainer.

    Parameters
    ----------
    n_hidden, vocab_size, embed_dim, max_seq_length
        Forwarded unchanged to ``auto_spacing``.
    ctx : mx.Context or list of mx.Context
        Device(s) on which the parameters are initialised. An empty list falls
        back to ``[mx.cpu()]``.
    embed_weights
        Pre-trained embedding matrix copied into ``model.embedding.weight``.

    Returns
    -------
    tuple
        ``(model, loss, trainer)``.
    """
    # A list comprehension over zero GPUs yields []; initialising with no
    # context at all would leave the parameters without any device copy, so
    # default to the CPU in that case.
    if isinstance(ctx, (list, tuple)) and len(ctx) == 0 :
        ctx = [mx.cpu()]

    model = auto_spacing(n_hidden, vocab_size, embed_dim, max_seq_length)
    
    model.collect_params().initialize(mx.init.Xavier(), ctx=ctx)
    # Overwrite the Xavier-initialised embedding table with the pre-trained vectors.
    model.embedding.weight.set_data(embed_weights)
    model.hybridize(static_alloc=True)
    
    # Freeze the embedding: no gradient is computed for it.
    model.embedding.collect_params().setattr('grad_req', 'null')
    trainer = gluon.Trainer(model.collect_params(), 'rmsprop')
    # The network already ends in a sigmoid, hence from_sigmoid=True.
    loss = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    loss.hybridize(static_alloc=True)
    return (model, loss, trainer)

In [17]:
def load_vocab(path) :
    """Read a JSON vocabulary file and return it with its inverse mapping.

    Parameters
    ----------
    path : str
        Location of a JSON file holding a single object (token -> index).

    Returns
    -------
    tuple of dict
        ``(word2idx, idx2word)`` where ``idx2word`` swaps keys and values of
        ``word2idx``.
    """
    import json

    with open(path, 'r') as handle :
        word2idx = json.load(handle)

    # Invert the mapping; later duplicates of a value overwrite earlier ones.
    idx2word = {index: token for token, index in word2idx.items()}
    return word2idx, idx2word

In [18]:
# Load the vocabulary from disk: word2idx is the JSON mapping itself,
# idx2word is the same mapping with keys and values swapped.
word2idx, idx2word = load_vocab('data/tmp/newDict.dic')

In [28]:
# Sanity check: show which token is stored under index 1.
idx2word[1]

'«'

In [21]:
from gensim.models import Word2Vec

In [92]:
def string2spacingChar(input_) :
    """Convert a sentence into a space-separated string of character tokens.

    Surrounding whitespace is stripped, every remaining blank (' ') is
    replaced by '^', the text is wrapped in the markers '«' and '»', and a
    single space is inserted between all characters.
    """
    # Replace blanks with '^' so that spacing survives tokenisation.
    marked = input_.strip().replace(' ','^')

    # SOS : «, EOS : »
    framed = "«" + marked + "»"

    # Joining a string iterates over its characters.
    return ' '.join(framed)

In [101]:
class MySentenceGenerator(object) :
    """Re-iterable stream of character-token lists read from a CSV file.

    Every value of the ``origin`` column is converted with
    ``string2spacingChar`` and split on single spaces, yielding one list of
    tokens per row. A fresh pass over the file starts on each ``iter()`` call.

    Parameters
    ----------
    fname : str
        Path of the CSV file; it must contain an ``origin`` column.
    """

    def __init__(self, fname) :
        self.fname = fname

    def __iter__(self) :
        # Imported here because a class-body import only creates a class
        # attribute and is not visible as a bare name inside methods.
        import pandas as pd

        # Fixed: read the file passed to the constructor. The path used to be
        # hard-coded to 'data/sejong_corpus/spacing.csv' and `fname` was ignored.
        df_origin = pd.read_csv(self.fname).origin
        for sentence in df_origin :
            yield string2spacingChar(sentence.strip()).split(' ')

In [99]:
# Scratch variable: the `origin` column of the spacing CSV.
tmp = pd.read_csv('data/sejong_corpus/spacing.csv').origin

In [122]:
def create_embedding(data_dir, model_file, embeddings_file, vocab_file, splitc=' ',**params) :
    """Train a Word2Vec model on character tokens and export weights and vocabulary.

    Every line of every file in ``data_dir`` is converted with
    ``string2spacingChar`` and split on ``splitc`` to form one training sentence.

    Parameters
    ----------
    data_dir : str
        Directory whose entries are all opened as UTF-8 text files.
    model_file : str
        Where the trained Word2Vec model is saved.
    embeddings_file : str
        Where the embedding matrix is written with ``np.save``. Two rows are
        appended to the trained vectors: their mean (for '__ETC__') and an
        all-zero row (for '__PAD__').
    vocab_file : str
        Where the token -> row-index mapping is written as JSON.
    splitc : str
        Separator used to split the converted line into tokens.
    **params
        Extra keyword arguments forwarded to ``Word2Vec``.
    """
    import json
    # `os` must be imported in function scope: an import placed in the nested
    # class body is not visible from inside its methods.
    import os
    from gensim.models import Word2Vec
    
    class SentenceGenerator(object) :
        """Re-iterable stream of token lists over all files of a directory."""
        def __init__(self, dirname) :
            self.dirname = dirname
        def __iter__(self) :
            for fname in os.listdir(self.dirname) :
                print("Processing~ '{}'".format(fname))
                # os.path.join works whether or not dirname ends with a separator.
                with open(os.path.join(self.dirname, fname),'r',encoding='utf8') as f :
                    # Stream the file line by line instead of loading it whole.
                    for line in f :
                        yield string2spacingChar(line.strip()).split(splitc)

    sentences = SentenceGenerator(data_dir)
    
    model = Word2Vec(sentences,**params)
    
    model.save(model_file)
    
    # Newer gensim exposes the matrix as `wv.vectors`; fall back to the legacy
    # `wv.syn0` name when that attribute does not exist.
    weights = getattr(model.wv, 'vectors', None)
    if weights is None :
        weights = model.wv.syn0
    default_vec = np.mean(weights, axis=0, keepdims = True)
    padding_vec = np.zeros((1,weights.shape[1]))
    
    weights_default = np.concatenate([weights, default_vec, padding_vec], axis=0)
    
    # Write through an explicitly managed handle so it is closed afterwards and
    # the file name is used verbatim (np.save adds '.npy' to bare path names).
    with open(embeddings_file, 'wb') as f :
        np.save(f, weights_default)
    
    # Newer gensim provides `wv.key_to_index`; older releases store the index
    # on the entries of `wv.vocab`.
    key_to_index = getattr(model.wv, 'key_to_index', None)
    if key_to_index is None :
        vocab = dict([(k,v.index) for k,v in model.wv.vocab.items()])
    else :
        vocab = dict(key_to_index)
    vocab['__ETC__'] = weights_default.shape[0] -2
    vocab['__PAD__'] = weights_default.shape[0] -1
    
    with open(vocab_file,'w') as f :
        f.write(json.dumps(vocab))


In [124]:
# Input directory and output locations for the embedding build.
data_dir = 'data/sejong_corpus/conv_raw/'
model_file = 'data/sejong_corpus/util_2/model.mdl'
embeddings_file = 'data/sejong_corpus/util_2/emb.np'
vocab_file = 'data/sejong_corpus/util_2/w2idx.dic'

# Train Word2Vec with its default parameters and write model, weights and vocabulary.
create_embedding(data_dir, model_file, embeddings_file, vocab_file)

Processing~ '4CM00003.txt'
Processing~ '4CM00005.txt'
Processing~ '4CM00006.txt'
Processing~ '4CM00011.txt'
Processing~ '4CM00013.txt'
Processing~ '4CM00014.txt'
Processing~ '4CM00018.txt'
Processing~ '4CM00019.txt'
Processing~ '4CM00020.txt'
Processing~ '4CM00021.txt'
Processing~ '4CM00022.txt'
Processing~ '4CM00023.txt'
Processing~ '4CM00025.txt'
Processing~ '4CM00027.txt'
Processing~ '4CM00028.txt'
Processing~ '4CM00029.txt'
Processing~ '4CM00030.txt'
Processing~ '4CM00034.txt'
Processing~ '4CM00041.txt'
Processing~ '4CM00046.txt'
Processing~ '4CM00047.txt'
Processing~ '4CM00048.txt'
Processing~ '4CM00050.txt'
Processing~ '4CM00051.txt'
Processing~ '4CM00054.txt'
Processing~ '4CM00055.txt'
Processing~ '4CM00066.txt'
Processing~ '4CM00075.txt'
Processing~ '4CM00077.txt'
Processing~ '4CM00085.txt'
Processing~ '4CM00086.txt'
Processing~ '4CM00089.txt'
Processing~ '4CM00090.txt'
Processing~ '4CM00091.txt'
Processing~ '4CM00092.txt'
Processing~ '4CM00093.txt'
Processing~ '4CM00094.txt'
P

Processing~ '6CM00009.txt'
Processing~ '6CM00010.txt'
Processing~ '6CM00011.txt'
Processing~ '6CM00013.txt'
Processing~ '6CM00014.txt'
Processing~ '6CM00015.txt'
Processing~ '6CM00016.txt'
Processing~ '6CM00017.txt'
Processing~ '6CM00018.txt'
Processing~ '6CM00019.txt'
Processing~ '6CM00020.txt'
Processing~ '6CM00022.txt'
Processing~ '6CM00023.txt'
Processing~ '6CM00024.txt'
Processing~ '6CM00025.txt'
Processing~ '6CM00028.txt'
Processing~ '6CM00029.txt'
Processing~ '6CM00030.txt'
Processing~ '6CM00031.txt'
Processing~ '6CM00032.txt'
Processing~ '6CM00034.txt'
Processing~ '6CM00036.txt'
Processing~ '6CM00037.txt'
Processing~ '6CM00038.txt'
Processing~ '6CM00039.txt'
Processing~ '6CM00040.txt'
Processing~ '6CM00042.txt'
Processing~ '6CM00043.txt'
Processing~ '6CM00044.txt'
Processing~ '6CM00045.txt'
Processing~ '6CM00046.txt'
Processing~ '6CM00047.txt'
Processing~ '6CM00048.txt'
Processing~ '6CM00051.txt'
Processing~ '6CM00054.txt'
Processing~ '6CM00056.txt'
Processing~ '6CM00057.txt'
P

Processing~ '4CM00091.txt'
Processing~ '4CM00092.txt'
Processing~ '4CM00093.txt'
Processing~ '4CM00094.txt'
Processing~ '4CM00097.txt'
Processing~ '4CM00098.txt'
Processing~ '4CM00099.txt'
Processing~ '4CM00100.txt'
Processing~ '4CM00101.txt'
Processing~ '4CM00102.txt'
Processing~ '4CM00103.txt'
Processing~ '4CM00104.txt'
Processing~ '4CM00105.txt'
Processing~ '4CM00106.txt'
Processing~ '4CM00107.txt'
Processing~ '4CM00108.txt'
Processing~ '4CM00109.txt'
Processing~ '4CM00110.txt'
Processing~ '4CM00111.txt'
Processing~ '4CM00112.txt'
Processing~ '4CM00113.txt'
Processing~ '4CM00114.txt'
Processing~ '4CM00115.txt'
Processing~ '4CM00116.txt'
Processing~ '4CM00117.txt'
Processing~ '4CM00118.txt'
Processing~ '4CM00119.txt'
Processing~ '5CM00016.txt'
Processing~ '5CM00040.txt'
Processing~ '5CM00041.txt'
Processing~ '5CM00042.txt'
Processing~ '5CM00043.txt'
Processing~ '5CM00044.txt'
Processing~ '5CM00045.txt'
Processing~ '5CM00046.txt'
Processing~ '5CM00047.txt'
Processing~ '5CM00048.txt'
P

Processing~ '6CM00098.txt'
Processing~ '6CM00099.txt'
Processing~ '6CM00103.txt'
Processing~ '6CM00104.txt'
Processing~ '6CM00105.txt'
Processing~ '6CM00107.txt'
Processing~ '7CM00001.txt'
Processing~ '7CM00002.txt'
Processing~ '7CM00003.txt'
Processing~ '7CM00004.txt'
Processing~ '7CM00005.txt'
Processing~ '7CM00006.txt'
Processing~ '7CM00008.txt'
Processing~ '7CM00009.txt'
Processing~ '7CM00010.txt'
Processing~ '7CM00011.txt'
Processing~ '7CM00026.txt'
Processing~ '7CM00028.txt'
Processing~ '7CM00039.txt'
Processing~ '7CM00042.txt'
Processing~ '7CM00044.txt'
Processing~ '7CM00045.txt'
Processing~ '7CM00054.txt'
Processing~ '7CM00055.txt'
Processing~ '8CK00001.txt'
Processing~ '8CK00002.txt'
Processing~ '8CL00001.txt'
Processing~ '8CL00002.txt'
Processing~ '8CM00002.txt'
Processing~ '8CM00007.txt'
Processing~ '8CM00011.txt'
Processing~ '8CM00012.txt'
Processing~ '8CM00013.txt'
Processing~ '8CM00014.txt'
Processing~ '8CM00015.txt'
Processing~ '8CM00049.txt'
Processing~ '8CM00050.txt'
P



In [119]:
def pad_sequences(sequences,maxlen=None,dtype='int32',padding='pre',truncating='pre',value=0.):
    """Pad / truncate a list of sequences to one common length.

    Parameters
    ----------
    sequences : sized iterable of sized iterables
        The sequences to align.
    maxlen : int or None
        Target length; the longest sequence length is used when None.
    dtype : str or numpy dtype
        Type of the returned array.
    padding : {'pre', 'post'}
        Whether the fill value goes before or after each sequence.
    truncating : {'pre', 'post'}
        Whether over-long sequences lose their beginning or their end.
    value : scalar
        Fill value.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), maxlen) + sample_shape``.

    Raises
    ------
    ValueError
        For unsized input, an unknown ``padding`` / ``truncating`` mode, or a
        sequence whose per-step shape differs from the first non-empty one.
    """
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')

    seq_lengths = []
    for seq in sequences:
        if not hasattr(seq, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(seq))
        seq_lengths.append(len(seq))

    n_rows = len(sequences)
    width = np.max(seq_lengths) if maxlen is None else maxlen

    # The per-step shape is dictated by the first non-empty sequence; every
    # other sequence is validated against it inside the fill loop.
    first_filled = next((seq for seq in sequences if len(seq) > 0), None)
    if first_filled is None:
        step_shape = tuple()
    else:
        step_shape = np.asarray(first_filled).shape[1:]

    padded = (np.ones((n_rows, width) + step_shape) * value).astype(dtype)

    for row, seq in enumerate(sequences):
        if len(seq) == 0:
            # Nothing to copy: the row stays entirely filled with `value`.
            continue

        if truncating == 'pre':
            kept = seq[-width:]
        elif truncating == 'post':
            kept = seq[:width]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        kept = np.asarray(kept, dtype=dtype)
        if kept.shape[1:] != step_shape:
            raise ValueError(
                'Shape of sample %s of sequence at position %s is different from expected shape %s'
                % (kept.shape[1:], row, step_shape))

        n_kept = len(kept)
        if padding == 'post':
            padded[row, :n_kept] = kept
        elif padding == 'pre':
            padded[row, -n_kept:] = kept
        else:
            raise ValueError('Padding type "%s" not understood' % padding)

    return padded

In [120]:
def encoding_and_padding(word2dx_dic, sequences,**params) :
    """Map token sequences to index sequences and pad them to a common length.

    Parameters
    ----------
    word2dx_dic : dict
        Token -> index mapping; must contain the keys '__ETC__' (index used for
        unknown tokens) and '__PAD__' (index used as fill value). The spelling
        of the parameter name is kept as-is for keyword-argument compatibility.
    sequences : iterable of iterables
        Token sequences to encode.
    **params
        Forwarded to ``pad_sequences``; ``value`` is always overridden with
        the '__PAD__' index.

    Returns
    -------
    numpy.ndarray
        The result of ``pad_sequences`` on the encoded sequences.
    """
    # Fixed: the body used to reference `word2idx_dic` and `word2idx_idx`,
    # neither of which is defined, so every call raised NameError.
    word2idx_dic = word2dx_dic
    etc_idx = word2idx_dic['__ETC__']
    seq_idx = [[word2idx_dic.get(token, etc_idx) for token in seq] for seq in sequences]
    params['value'] = word2idx_dic['__PAD__']
    return pad_sequences(seq_idx, **params)
    

In [114]:
# Reload the vocabulary (token -> index) and its inverse from the JSON file.
word2idx, idx2word = load_vocab('data/tmp/newDict.dic')

In [117]:
# Load the vocabulary stored under util/ into separate names (w2i / i2w).
w2i, i2w = load_vocab('data/sejong_corpus/util/w2idx.dic')

In [127]:
def load_embedding(embeddings_file):
    """Return the array stored in ``embeddings_file`` (read with ``np.load``)."""
    embedding_matrix = np.load(embeddings_file)
    return embedding_matrix

In [154]:
# train 

# Vocabulary and pre-trained embedding matrix written by create_embedding.
w2idx, idx2w = load_vocab('data/sejong_corpus/util_2/w2idx.dic')

weights = load_embedding('data/sejong_corpus/util_2/emb.np')

# The model dimensions follow the embedding matrix (rows = vocabulary, columns = vector width).
vocab_size = weights.shape[0]
embed_dim = weights.shape[1]

max_seq_len = 50
n_hidden = 50
gpu_count = 0
# Fixed: with gpu_count == 0 the comprehension alone produced an empty context
# list, leaving the model without any device; fall back to the CPU instead.
ctx = [mx.gpu(i) for i in range(gpu_count)] if gpu_count > 0 else [mx.cpu()]

model, loss, trainer = model_init(n_hidden=n_hidden, vocab_size=vocab_size,
                                  embed_dim=embed_dim, max_seq_length=max_seq_len,ctx=ctx, embed_weights=weights)


In [None]:
def input_data(input_path) :
    dir_list =  [txt for txt in os.listdir(input_path) if txt.endswith(".txt")]
    
    for txt in dir_list :
        