In [1]:
import mxnet as mx
import gensim
import tqdm
import gluonnlp

import pandas as pd
import numpy as np

C:\dev\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\dev\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
from mxnet import gluon
from mxnet.gluon import nn,rnn

In [136]:
class auto_spacing(gluon.HybridBlock) :
    def __init__(self, n_hidden, vocab_size, embed_dim, max_seq_length,**kwargs) :
        super(auto_spacing, self).__init__(**kwargs)
        
        self.in_seq_len = max_seq_length
        self.out_seq_len = max_seq_length
        self.n_hidden = n_hidden
        self.vocab_size = vocab_size
        self.max_seq_length = max_seq_length
        self.embed_dim = embed_dim
        
        with self.name_scope() :
            self.embedding = nn.Embedding(input_dim=self.vocab_size,output_dim=self.embed_dim)
            
            self.conv_unigram = nn.Conv2D(channels=128, kernel_size=(1, self.embed_dim))
            self.conv_bigram = nn.Conv2D(channels=256, kernel_size=(2, self.embed_dim), padding=(1, 0))
            self.conv_trigram = nn.Conv2D(channels=128, kernel_size=(3, self.embed_dim), padding=(1, 0))
            self.conv_forthgram = nn.Conv2D(channels=64, kernel_size=(4, self.embed_dim), padding=(2, 0))
            self.conv_fifthgram = nn.Conv2D(channels=32, kernel_size=(5, self.embed_dim), padding=(2, 0))

            self.bi_gru = rnn.GRU(hidden_size=self.n_hidden, layout='NTC', bidirectional=True)
            self.dense_sh = nn.Dense(100, activation='relu', flatten=False)
            self.dense = nn.Dense(1, activation='sigmoid', flatten=False)
    
    def hybrid_forward(self, F, inputs) :
        embed = self.embedding(inputs)
        embed = F.expand_dims(embed, axis=1)
        unigram = self.conv_unigram(embed)
        bigram = self.conv_bigram(embed)
        trigram = self.conv_trigram(embed)
        forthgram = self.conv_forthgram(embed)
        fifthgram = self.conv_fifthgram(embed)
        
        grams = F.concat(unigram, F.slice_axis(bigram, axis=2, begin=0,end=self.max_seq_length),
                        trigram, F.slice_axis(forthgram, axis=2, begin=0,end=self.max_seq_length), 
                        F.slice_axis(fifthgram, axis=2, begin=0,end=self.max_seq_length), dim=1)
        
        grams = F.transpose(grams, (0,2,3,1))
        grams = F.reshape(grams, (-1,self.max_seq_length,-3))
        grmas = self.bi_gru(grams)
        fc1 = self.dense_sh(grams)
        return (self.dense(fc1))


In [134]:
def model_init(n_hidden,vocab_size, embed_dim, max_seq_length, ctx, embed_weights) :
    model = auto_spacing(n_hidden, vocab_size, embed_dim, max_seq_length)
    
    model.collect_params().initialize(mx.init.Xavier(), ctx=ctx)
    model.embedding.weight.set_data(embed_weights)
    model.hybridize(static_alloc=True)
    
    model.embedding.collect_params().setattr('grad_req', 'null')
    trainer = gluon.Trainer(model.collect_params(), 'rmsprop')
    loss = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    loss.hybridize(static_alloc=True)
    return (model, loss, trainer)

In [114]:
def load_vocab(path) :
    import json
    with open(path, 'r') as f:
        data = json.loads(f.read())
        word2idx = data
        idx2word = dict([(v,k) for (k,v) in data.items()])
    
    return word2idx, idx2word

In [18]:
word2idx, idx2word = load_vocab('data/tmp/newDict.dic')

In [28]:
idx2word[1]

'«'

In [21]:
from gensim.models import Word2Vec

In [15]:
def string2spacingChar(input_) :
    # 빈칸을 '^'로 변환하기
    chars = input_.strip().replace(' ','^')
    
    # SOS : «, EOS : »
    tagged_chars = "«"+chars+"»"
    
    char_list = ' '.join(list(tagged_chars))
    
    return char_list

In [17]:
def pre_processing(input_) :
    ch_list = []
    
    for cl in input_ :
        ch_list.append(string2spacingChar(cl).replace(' ',''))
        
    return ch_list        

In [101]:
class MySentenceGenerator(object) :
    import os
    import pandas as pd
    
    def __init__(self, fname) :
        self.fname = fname
    def __iter__(self) :
        df_origin = pd.read_csv('data/sejong_corpus/spacing.csv').origin
        for idx in range(len(df_origin)) :
            yield string2spacingChar(df_origin[idx].strip()).split(' ')

In [99]:
tmp = pd.read_csv('data/sejong_corpus/spacing.csv').origin

In [122]:
def create_embedding(data_dir, model_file, embeddings_file, vocab_file, splitc=' ',**params) :
    import json
    from gensim.models import Word2Vec
    
    class SentenceGenerator(object) :
        import os
        def __init__(self, dirname) :
            self.dirname = dirname
        def __iter__(self) :
            for fname in os.listdir(self.dirname) :
                print("Processing~ '{}'".format(fname))
                with open(self.dirname+fname,'r',encoding='utf8') as f :
                    for line in f.readlines() :
                        yield string2spacingChar(line.strip()).split(splitc)
                        
    class SentenceGenerator_another(object) :
        import os
        import pandas as pd

        def __init__(self, fname) :
            self.fname = fname
        def __iter__(self) :
            df_origin = pd.read_csv('data/sejong_corpus/spacing.csv').origin
            for idx in range(len(df_origin)) :
                yield string2spacingChar(df_origin[idx].strip()).split(' ')

    sentences = SentenceGenerator(data_dir)
    
    model = Word2Vec(sentences,**params)
    
    model.save(model_file)
    
    weights = model.wv.syn0
    default_vec = np.mean(weights, axis=0, keepdims = True)
    padding_vec = np.zeros((1,weights.shape[1]))
    
    weights_default = np.concatenate([weights, default_vec, padding_vec], axis=0)
    
    np.save(open(embeddings_file, 'wb'), weights_default)
    
    vocab = dict([(k,v.index) for k,v in model.wv.vocab.items()])
    vocab['__ETC__'] = weights_default.shape[0] -2
    vocab['__PAD__'] = weights_default.shape[0] -1
    
    with open(vocab_file,'w') as f :
        f.write(json.dumps(vocab))


In [124]:
data_dir = 'data/sejong_corpus/conv_raw/'
model_file = 'data/sejong_corpus/util_2/model.mdl'
embeddings_file = 'data/sejong_corpus/util_2/emb.np'
vocab_file = 'data/sejong_corpus/util_2/w2idx.dic'

create_embedding(data_dir, model_file, embeddings_file, vocab_file)

Processing~ '4CM00003.txt'
Processing~ '4CM00005.txt'
Processing~ '4CM00006.txt'
Processing~ '4CM00011.txt'
Processing~ '4CM00013.txt'
Processing~ '4CM00014.txt'
Processing~ '4CM00018.txt'
Processing~ '4CM00019.txt'
Processing~ '4CM00020.txt'
Processing~ '4CM00021.txt'
Processing~ '4CM00022.txt'
Processing~ '4CM00023.txt'
Processing~ '4CM00025.txt'
Processing~ '4CM00027.txt'
Processing~ '4CM00028.txt'
Processing~ '4CM00029.txt'
Processing~ '4CM00030.txt'
Processing~ '4CM00034.txt'
Processing~ '4CM00041.txt'
Processing~ '4CM00046.txt'
Processing~ '4CM00047.txt'
Processing~ '4CM00048.txt'
Processing~ '4CM00050.txt'
Processing~ '4CM00051.txt'
Processing~ '4CM00054.txt'
Processing~ '4CM00055.txt'
Processing~ '4CM00066.txt'
Processing~ '4CM00075.txt'
Processing~ '4CM00077.txt'
Processing~ '4CM00085.txt'
Processing~ '4CM00086.txt'
Processing~ '4CM00089.txt'
Processing~ '4CM00090.txt'
Processing~ '4CM00091.txt'
Processing~ '4CM00092.txt'
Processing~ '4CM00093.txt'
Processing~ '4CM00094.txt'
P

Processing~ '4CM00091.txt'
Processing~ '4CM00092.txt'
Processing~ '4CM00093.txt'
Processing~ '4CM00094.txt'
Processing~ '4CM00097.txt'
Processing~ '4CM00098.txt'
Processing~ '4CM00099.txt'
Processing~ '4CM00100.txt'
Processing~ '4CM00101.txt'
Processing~ '4CM00102.txt'
Processing~ '4CM00103.txt'
Processing~ '4CM00104.txt'
Processing~ '4CM00105.txt'
Processing~ '4CM00106.txt'
Processing~ '4CM00107.txt'
Processing~ '4CM00108.txt'
Processing~ '4CM00109.txt'
Processing~ '4CM00110.txt'
Processing~ '4CM00111.txt'
Processing~ '4CM00112.txt'
Processing~ '4CM00113.txt'
Processing~ '4CM00114.txt'
Processing~ '4CM00115.txt'
Processing~ '4CM00116.txt'
Processing~ '4CM00117.txt'
Processing~ '4CM00118.txt'
Processing~ '4CM00119.txt'
Processing~ '5CM00016.txt'
Processing~ '5CM00040.txt'
Processing~ '5CM00041.txt'
Processing~ '5CM00042.txt'
Processing~ '5CM00043.txt'
Processing~ '5CM00044.txt'
Processing~ '5CM00045.txt'
Processing~ '5CM00046.txt'
Processing~ '5CM00047.txt'
Processing~ '5CM00048.txt'
P

Processing~ '6CM00098.txt'
Processing~ '6CM00099.txt'
Processing~ '6CM00103.txt'
Processing~ '6CM00104.txt'
Processing~ '6CM00105.txt'
Processing~ '6CM00107.txt'
Processing~ '7CM00001.txt'
Processing~ '7CM00002.txt'
Processing~ '7CM00003.txt'
Processing~ '7CM00004.txt'
Processing~ '7CM00005.txt'
Processing~ '7CM00006.txt'
Processing~ '7CM00008.txt'
Processing~ '7CM00009.txt'
Processing~ '7CM00010.txt'
Processing~ '7CM00011.txt'
Processing~ '7CM00026.txt'
Processing~ '7CM00028.txt'
Processing~ '7CM00039.txt'
Processing~ '7CM00042.txt'
Processing~ '7CM00044.txt'
Processing~ '7CM00045.txt'
Processing~ '7CM00054.txt'
Processing~ '7CM00055.txt'
Processing~ '8CK00001.txt'
Processing~ '8CK00002.txt'
Processing~ '8CL00001.txt'
Processing~ '8CL00002.txt'
Processing~ '8CM00002.txt'
Processing~ '8CM00007.txt'
Processing~ '8CM00011.txt'
Processing~ '8CM00012.txt'
Processing~ '8CM00013.txt'
Processing~ '8CM00014.txt'
Processing~ '8CM00015.txt'
Processing~ '8CM00049.txt'
Processing~ '8CM00050.txt'
P



In [117]:
def pad_sequences(sequences,maxlen=None,dtype='int32',padding='pre',truncating='pre',value=0.):

    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    lengths = []
    for x in sequences:
        if not hasattr(x, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))
        lengths.append(len(x))

    num_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((num_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' %truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError(
                'Shape of sample %s of sequence at position %s is different from expected shape %s'
                % (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x

In [121]:
def encoding_and_padding(word2idx_dic, sequences,**params) :
    seq_idx = [[word2idx_dic.get(a,word2idx_dic['__ETC__']) for a in i] for i in sequences]
    params['value'] = word2idx_dic['__PAD__']
    return pad_sequences(seq_idx, **params)
    

In [114]:
word2idx, idx2word = load_vocab('data/tmp/newDict.dic')

In [117]:
w2i, i2w = load_vocab('data/sejong_corpus/util/w2idx.dic')

In [132]:
def load_embedding(embeddings_file):
    return (np.load(embeddings_file))

In [105]:
def replaceEx(input_) :
    
    res = []
    for str_ in input_ :
        res.append(str_.replace('::','').replace('{','').replace('-',''))
    
    return res    

In [106]:
def y_encoding(n_grams, maxlen=50):
    init_mat = np.zeros(shape=(len(n_grams), maxlen), dtype=np.int8)
    for i in range(len(n_grams)):
        init_mat[i, np.cumsum([len(j) for j in n_grams[i]]) - 1] = 1
    return init_mat

In [128]:
def get_transDtype(x,y,batch_size) :
    from mxnet import gluon
    tr_set = gluon.data.ArrayDataset(x,y.astype('float32'))
    tr_data_iterator = gluon.data.DataLoader(tr_set,batch_size=batch_size)
    
    return tr_data_iterator

In [138]:
def split_train_set(x_train, p=0.98):
    
    import numpy as np
    train_idx = np.random.choice(range(x_train.shape[0]),int(x_train.shape[0] * p),replace=False)
    set_tr_idx = set(train_idx)
    test_index = [i for i in range(x_train.shape[0]) if i not in set_tr_idx]
    return ((train_idx, np.array(test_index)))

In [143]:
def input_data(input_path, train_ratio = 1,isMake_dif_set=False,max_seq_len=200,vocab_path='data/sejong_corpus/util_2/w2idx.dic',batch_size=100) :
    import os
    
    # get_rawData
    dir_list =  [txt for txt in os.listdir(input_path) if txt.endswith(".txt")]
    
    X_origin = []
    for txt in dir_list :
        with open(input_path+txt, 'r', encoding='utf8') as f:
            X_origin.extend(f.readlines())
            
    # del '::','{' ..
    X_origin = replaceEx(X_origin)
    # add <SoS>, <EoS>, replace(' ','^')
    processed_seq_ = pre_processing(X_origin)
    # del blank_list
    processed_seq_ = list(filter(lambda x : x != "«»",processed_seq_))
    # shuffle sequence list
    samp_idx = np.random.choice(range(len(processed_seq_)), int(len(processed_seq_) * train_ratio), replace=False)
    
    processed_seq = [tmp_test[i] for i in samp_idx]
    sp_sents = [i.split('^') for i in processed_seq]
    
    # 8어절로 나눠서 테스트
    if isMake_dif_set is True:
        n_gram = [[k, v, z, a, c, d, e, f] for sent in sp_sents for k, v, z, a, c, d, e, f in zip(sent, sent[1:], sent[2:],
                                                               sent[3:], sent[4:], sent[5:],sent[6:], sent[7:])]
    else:
        n_gram = sp_sents
    
    # make_target(space = 1, others = 0)
    n_gram_y = y_encoding(n_gram, max_seq_len)
    
    w2idx, idx2w = load_vocab(vocab_path)
    
    # input seq - del blank
    ngram_encoded_padded = encoding_and_padding(word2idx_dic=w2idx,
                                                sequences=[''.join(gram) for gram in n_gram], maxlen=max_seq_len,
                                                padding='post', truncating='post')
    
    if train_ratio < 1:
        # 학습셋 테스트셋 생성
        tr_idx, te_idx = split_train_set(ngram_encoded_padded, train_ratio)

        y_train = n_gram_y[tr_idx, ]
        x_train = ngram_encoded_padded[tr_idx, ]

        y_test = n_gram_y[te_idx, ]
        x_test = ngram_encoded_padded[te_idx, ]

        # train generator
        train_generator = get_transDtype(x_train, y_train, batch_size)
        valid_generator = get_transDtype(x_test, y_test, 500)
        return (train_generator, valid_generator)
    else :
        return get_transDtype(ngram_encoded_padded,n_gram_y,batch_size)
            
        

In [182]:
import threading
from mxnet import ndarray

def split_data(data, num_slice, batch_axis=0, even_split=True):    
    size = data.shape[batch_axis]
    if even_split and size % num_slice != 0:
        raise ValueError(
            "data with shape %s cannot be evenly split into %d slices along axis %d. " \
            "Use a batch size that's multiple of %d or set even_split=False to allow " \
            "uneven partitioning of data."%(
                str(data.shape), num_slice, batch_axis, num_slice))
        
    if num_slice == 0 :
        num_slice = 1
    step = size // num_slice

    # If size < num_slice, make fewer slices
    if not even_split and size < num_slice:
        step = 1
        num_slice = size

    if batch_axis == 0:
        slices = [data[i*step:(i+1)*step] if i < num_slice - 1 else data[i*step:size]
                  for i in range(num_slice)]
    elif even_split:
        if is_np_array():
            slices = _mx_np.split(data, indices_or_sections=num_slice, axis=batch_axis)
        else:
            slices = ndarray.split(data, num_outputs=num_slice, axis=batch_axis)
    else:
        if is_np_array():
            indices = [step * i for i in range(1, num_slice)]
            slices = _mx_np.split(data, indices_or_sections=indices, axis=batch_axis)
        else:
            slices = [ndarray.slice_axis(data, batch_axis, i*step, (i+1)*step)
                      if i < num_slice - 1 else
                      ndarray.slice_axis(data, batch_axis, i*step, size)
                      for i in range(num_slice)]
    return slices


def split_and_load(data, ctx_list, batch_axis=0, even_split=True):
    array_fn = _mx_np.array if is_np_array() else ndarray.array
    if not isinstance(data, ndarray.NDArray):
        data = array_fn(data, ctx=ctx_list[0])
    if len(ctx_list) == 1:
        return [data.as_in_context(ctx_list[0])]

    slices = split_data(data, len(ctx_list), batch_axis, even_split)
    return [i.as_in_context(ctx) for i, ctx in zip(slices, ctx_list)]

class _NumpyArrayScope(object):
    _current = threading.local()

    def __init__(self, is_np_array):  # pylint: disable=redefined-outer-name
        self._old_scope = None
        self._is_np_array = is_np_array

    def __enter__(self):
        if not hasattr(_NumpyArrayScope._current, "value"):
            _NumpyArrayScope._current.value = _NumpyArrayScope(False)
        self._old_scope = _NumpyArrayScope._current.value
        _NumpyArrayScope._current.value = self
        return self

    def __exit__(self, ptype, value, trace):
        assert self._old_scope
        _NumpyArrayScope._current.value = self._old_scope


def is_np_array():
    
    return _NumpyArrayScope._current.value._is_np_array if hasattr(
        _NumpyArrayScope._current, "value") else False

In [184]:
# train_function

def train(epochs,train_data,test_data, vali_data, model, loss, trainer,pad_idx,ctx, decay=False,mdl_desc='spacing_model') :
    import time
    from tqdm import tqdm
    from mxnet import gluon
    import mxnet.autograd as autograd
    
    output = 'data/sejong_corpus/util_2'
    
    tot_test_acc = []
    tot_train_loss = []
    
    for e in range(epochs) :
        tic = time.time()
        
        if e > 1 and decay :
            trainer.set_learning_rate(trainer.learning_rate * 0.7)
            
        train_loss = []
        
        iter_tqdm = tqdm(train_data, 'Batches')
        
        for  i, (x_data,y_data) in enumerate(iter_tqdm) :
            x_data_l = split_and_load(x_data, ctx, even_split=False)
            y_data_l = split_and_load(y_data, ctx, even_split=False)
            
            with autograd.record() :
                losses = [loss(model(x), y) for x,y in zip(x_data_l, y_data_l)]
                
            for l in losses :
                l.backward()
            
            trainer.step(x_data.shape[0])
            curr_loss = np.mean([mx.nd.mean(l).asscalar() for l in losses])
            train_loss.append(curr_loss)
            iter_tqdm.set_description("loss {}".format(curr_loss))
            mx.nd.waitall()

        # caculate test loss
        test_acc = evaluate_accuracy(test_data,model,pad_idx,
                                     ctx=ctx[0] if isinstance(ctx, list) else mx.gpu(0))
        valid_acc = evaluate_accuracy(vali_data,model,pad_idx,
                                      ctx=ctx[0] if isinstance(ctx, list) else mx.gpu(0))
        logger.info('[Epoch %d] time cost: %f' % (e, time.time() - tic))
        logger.info("[Epoch %d] Train Loss: %f, Test acc : %f Valid acc : %f" %
                    (e, np.mean(train_loss), test_acc, valid_acc))
        tot_test_acc.append(test_acc)
        tot_train_loss.append(np.mean(train_loss))
        model.save_parameters(outputs + '/' + "{}_{}.params".format(mdl_desc, e))
    return (tot_test_acc, tot_train_loss)

In [127]:
input_path = 'data/sejong_corpus/conv_raw/'

train_gen = input_data(input_path)

In [158]:
mx.gpu(0)

gpu(0)

In [162]:
# train 

# default_parameter
w2idx, idx2w = load_vocab('data/sejong_corpus/util_2/w2idx.dic')

weights = load_embedding('data/sejong_corpus/util_2/emb.np')
train_data_path = 'data/sejong_corpus/conv_raw/'
test_data_path = 'data/sejong_corpus/conv_test/'

vocab_size = weights.shape[0]
embed_dim = weights.shape[1]

max_seq_len = 50
n_hidden = 50
gpu_count = 0
ctx = [mx.gpu(i) for i in range(gpu_count)] 

# train dataset_init
train_generator, vali_generator = input_data(train_data_path,train_ratio=0.9,isMake_dif_set=True)

# test dataset_init
test_generator = input_data(test_data_path)

# model_init
model, loss, trainer = model_init(n_hidden=n_hidden, vocab_size=vocab_size,
                                  embed_dim=embed_dim, max_seq_length=max_seq_len,ctx=ctx, embed_weights=weights)



In [185]:
train(epochs=100, train_data=train_generator,test_data=test_generator,
      vali_data= vali_generator,model=model,loss=loss,trainer=trainer,
      pad_idx=w2idx['__PAD__'],ctx=ctx)

Batches:   0%|          | 0/598 [00:00<?, ?it/s]


IndexError: list index out of range

In [None]:
def train(epochs,train_data,test_data, vali_data, 
          model, loss, trainer,pad_idx,ctx, decay=False,mdl_desc='spacing_model') :

In [169]:
6//4

1