In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Skip this cell if mxnet is already installed
pip install mxnet

In [0]:
pip install mxnet-mkl

In [0]:
!nvcc --version

In [0]:
pip install mxnet-cu101

In [0]:
pip install mxnet-cu101mkl

In [0]:
pip install gluonnlp

In [0]:
# import package

import mxnet as mx
import gensim
import tqdm
import gluonnlp
from mxnet import gluon
from mxnet.gluon import nn,rnn
import threading
import logging

from mxnet import ndarray

import pandas as pd
import numpy as np
import os
from gensim.models import Word2Vec


In [0]:
# hyper parameter

# Root folder (relative to the working directory) that holds the dataset and outputs.
base_path = 'drive/My Drive/pythonWorkspace/DeepLearning/AutoGenerating_VariableName/'

# The root logger defaults to WARNING, which silently drops every logger.info()
# call made during training and evaluation; enable INFO explicitly.
# basicConfig is a no-op when the environment already installed a handler,
# so the level is also set on the logger itself.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)


In [0]:
# modeling class

class auto_spacing(gluon.HybridBlock) :
    """Spacing model: embedding -> parallel n-gram convolutions -> bi-GRU -> dense layers.

    Parameters
    ----------
    n_hidden : int
        Hidden size of each GRU direction.
    vocab_size : int
        Number of rows of the embedding table.
    embed_dim : int
        Width of each embedding vector (also the width of every conv kernel).
    max_seq_length : int
        Sequence length the network is built for; inputs are reshaped to it.

    The forward pass returns one sigmoid output per position
    (last dense layer has a single unit and ``flatten=False``).

    NOTE: earlier revisions assigned the GRU output to a misspelled variable,
    so the GRU result was discarded and ``dense_sh`` consumed the raw
    convolution features (``in_units=608``). The GRU output is now used, which
    changes ``dense_sh`` to ``in_units=2 * n_hidden``; parameter files saved
    from the previous network are therefore not loadable into this one.
    """
    def __init__(self, n_hidden, vocab_size, embed_dim, max_seq_length,**kwargs) :
        super(auto_spacing, self).__init__(**kwargs)

        self.in_seq_len = max_seq_length
        self.out_seq_len = max_seq_length
        self.n_hidden = n_hidden
        self.vocab_size = vocab_size
        self.max_seq_length = max_seq_length
        self.embed_dim = embed_dim

        # Total number of channels produced by the five convolutions below.
        conv_channels = 128 + 256 + 128 + 64 + 32  # = 608

        with self.name_scope() :
            self.embedding = nn.Embedding(input_dim=self.vocab_size,output_dim=self.embed_dim)

            # Each kernel spans the full embedding width and 1..5 positions.
            self.conv_unigram = nn.Conv2D(channels=128,in_channels=1, kernel_size=(1, self.embed_dim))
            self.conv_bigram = nn.Conv2D(channels=256, in_channels=1, kernel_size=(2, self.embed_dim), padding=(1, 0))
            self.conv_trigram = nn.Conv2D(channels=128, in_channels=1, kernel_size=(3, self.embed_dim), padding=(1, 0))
            self.conv_forthgram = nn.Conv2D(channels=64, in_channels=1,  kernel_size=(4, self.embed_dim), padding=(2, 0))
            self.conv_fifthgram = nn.Conv2D(channels=32,in_channels=1,  kernel_size=(5, self.embed_dim), padding=(2, 0))

            self.bi_gru = rnn.GRU(hidden_size=self.n_hidden,input_size=conv_channels, layout='NTC', bidirectional=True)
            # A bidirectional GRU emits 2 * n_hidden features per time step.
            self.dense_sh = nn.Dense(100, in_units=2 * self.n_hidden, activation='relu', flatten=False)
            self.dense = nn.Dense(1,in_units=100, activation='sigmoid', flatten=False)

    def hybrid_forward(self, F, inputs) :
        """Run the network on a batch of index sequences and return per-position sigmoid outputs."""
        embed = self.embedding(inputs)
        # Insert a singleton axis at position 1 to serve as the conv input channel.
        embed = F.expand_dims(embed, axis=1)

        unigram = self.conv_unigram(embed)
        bigram = self.conv_bigram(embed)
        trigram = self.conv_trigram(embed)
        forthgram = self.conv_forthgram(embed)
        fifthgram = self.conv_fifthgram(embed)

        # Even-sized kernels with this padding yield one extra row along axis 2;
        # slice every padded output back to max_seq_length before concatenating
        # along the channel axis.
        grams = F.concat(unigram, F.slice_axis(bigram, axis=2, begin=0,end=self.max_seq_length),
                        trigram, F.slice_axis(forthgram, axis=2, begin=0,end=self.max_seq_length),
                        F.slice_axis(fifthgram, axis=2, begin=0,end=self.max_seq_length), dim=1)

        # Move channels last and merge the trailing two axes so the GRU
        # (layout 'NTC') receives (batch, max_seq_length, channels).
        grams = F.transpose(grams, (0,2,3,1))
        grams = F.reshape(grams, (-1,self.max_seq_length,-3))
        # Fixed: the result used to be bound to the misspelled name `grmas`
        # and was never consumed.
        grams = self.bi_gru(grams)
        fc1 = self.dense_sh(grams)
        return (self.dense(fc1))


def model_init(n_hidden,vocab_size, embed_dim, max_seq_length, ctx, embed_weights) :
    """Create the spacing network together with its loss and trainer.

    The network is Xavier-initialised on ``ctx``, its embedding weight is then
    overwritten with ``embed_weights`` and excluded from gradient computation
    (``grad_req='null'``). Both the network and the loss are hybridized.

    Returns
    -------
    tuple
        ``(model, loss, trainer)`` where the trainer uses the 'rmsprop' optimizer
        and the loss is a sigmoid binary cross-entropy expecting sigmoid outputs.
    """
    net = auto_spacing(n_hidden, vocab_size, embed_dim, max_seq_length)

    # Initialise everything first, then replace the embedding table.
    net.collect_params().initialize(mx.init.Xavier(), ctx=ctx)
    net.embedding.weight.set_data(embed_weights)
    net.hybridize(static_alloc=True)

    # Embedding parameters receive no gradient.
    net.embedding.collect_params().setattr('grad_req', 'null')

    criterion = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    criterion.hybridize(static_alloc=True)

    optimizer = gluon.Trainer(net.collect_params(), 'rmsprop')
    return (net, criterion, optimizer)



In [0]:
# util

def string2spacingChar(input_) :
    """Turn a sentence into a space-separated character string with boundary tags.

    Surrounding whitespace is stripped, every blank becomes '^', the result is
    wrapped in '«' (start) and '»' (end), and the characters are joined with
    single spaces.
    """
    # Replace blanks with '^'
    marked = input_.strip().replace(' ', '^')

    # SOS : «, EOS : »
    return ' '.join("«" + marked + "»")

def load_vocab(path) :
    """Load a JSON vocabulary file and build the inverse mapping.

    Parameters
    ----------
    path : str
        Path of a JSON file containing a ``{token: index}`` object.

    Returns
    -------
    tuple
        ``(word2idx, idx2word)`` where ``idx2word`` maps each index back to
        its token.
    """
    import json
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # so vocabularies containing non-ASCII tokens load the same everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        word2idx = json.load(f)
    idx2word = {idx: word for word, idx in word2idx.items()}

    return word2idx, idx2word

def pre_processing(input_) :
    """Tag every sentence via ``string2spacingChar`` and drop the separating spaces.

    Returns a list with one processed string per element of ``input_``.
    """
    return [string2spacingChar(sentence).replace(' ', '') for sentence in input_]

def load_embedding(embeddings_file):
    """Read the array stored at ``embeddings_file`` with ``np.load`` and return it."""
    embedding_matrix = np.load(embeddings_file)
    return embedding_matrix

def replaceEx(input_) :
    """Remove the substrings '::', '{' and '-' (in that order) from every string.

    Returns a new list; the input list is not modified.
    """
    # Strip unwanted symbols
    unwanted = ('::', '{', '-')
    cleaned = []
    for text in input_ :
        for token in unwanted :
            text = text.replace(token, '')
        cleaned.append(text)
    return cleaned

def mkdirs_(output_path) :
    """Create ``output_path`` (including missing parent directories) if needed.

    Best-effort: a filesystem failure is reported on stdout instead of being
    raised, matching the previous behaviour. Only ``OSError`` is caught so
    that unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
    """
    try :
        # makedirs also creates intermediate folders; exist_ok makes the call
        # idempotent when the directory is already there.
        os.makedirs(output_path, exist_ok=True)
    except OSError :
        print('Error : '+ output_path)



In [0]:
# train

class MySentenceGenerator(object) :
    """Re-iterable source of character-token lists read from the spacing CSV.

    Each iteration re-reads ``base_path + 'dataset/spacing.csv'`` and yields,
    for every value of its ``origin`` column, the list of characters produced
    by ``string2spacingChar``.

    NOTE(review): ``fname`` is stored but never used; the CSV path is fixed.
    Confirm whether the constructor argument was meant to select the file.
    """

    def __init__(self, fname) :
        self.fname = fname

    def __iter__(self) :
        df_origin = pd.read_csv(base_path+'dataset/spacing.csv').origin
        # Iterate the column values directly instead of indexing by label.
        for sentence in df_origin :
            yield string2spacingChar(sentence.strip()).split(' ')

def create_embedding(data_dir, model_file, embeddings_file, vocab_file, splitc=' ',**params) :
    """Train a character-level Word2Vec model and export its weights and vocabulary.

    Parameters
    ----------
    data_dir : str
        Directory whose files are read line by line as training sentences.
    model_file : str
        Where the trained Word2Vec model is saved.
    embeddings_file : str
        Where the embedding matrix is written with ``np.save``. Two extra rows
        are appended: the mean vector (used for ``__ETC__``) and an all-zero
        vector (used for ``__PAD__``).
    vocab_file : str
        Where the ``{token: row index}`` mapping is written as JSON.
    splitc : str
        Separator used to split the output of ``string2spacingChar`` into
        tokens (that helper joins characters with a single space).
    **params
        Forwarded unchanged to ``Word2Vec``.
    """
    import json
    from gensim.models import Word2Vec

    class SentenceGenerator(object) :
        """Re-iterable corpus: yields one token list per line of every file in ``dirname``."""
        def __init__(self, dirname) :
            self.dirname = dirname
        def __iter__(self) :
            for fname in os.listdir(self.dirname) :
                print("Processing~ '{}'".format(fname))
                with open(os.path.join(self.dirname, fname),'r',encoding='utf8') as f :
                    # Stream the file instead of loading every line at once.
                    for line in f :
                        yield string2spacingChar(line.strip()).split(splitc)

    sentences = SentenceGenerator(data_dir)

    model = Word2Vec(sentences,**params)

    model.save(model_file)

    keyed = model.wv
    # gensim >= 4 removed `syn0` / `vocab` in favour of `vectors` / `key_to_index`;
    # fall back to the old attribute names on older releases.
    weights = keyed.vectors if hasattr(keyed, 'vectors') else keyed.syn0
    default_vec = np.mean(weights, axis=0, keepdims = True)
    padding_vec = np.zeros((1,weights.shape[1]))

    weights_default = np.concatenate([weights, default_vec, padding_vec], axis=0)

    # Use a context manager so the file handle is flushed and closed.
    with open(embeddings_file, 'wb') as f :
        np.save(f, weights_default)

    if hasattr(keyed, 'key_to_index') :
        vocab = dict([(k, int(idx)) for k, idx in keyed.key_to_index.items()])
    else :
        vocab = dict([(k,v.index) for k,v in keyed.vocab.items()])
    # The two appended rows are addressed through these reserved tokens.
    vocab['__ETC__'] = weights_default.shape[0] -2
    vocab['__PAD__'] = weights_default.shape[0] -1

    with open(vocab_file,'w') as f :
        f.write(json.dumps(vocab))


def pad_sequences(sequences,maxlen=None,dtype='int32',padding='pre',truncating='pre',value=0.):
    """Pad or truncate every sequence to a common length and stack them into one array.

    Parameters
    ----------
    sequences : list of sequences
        Items to pad; each must support ``len``.
    maxlen : int or None
        Target length; defaults to the longest sequence.
    dtype : str
        dtype of the returned array.
    padding : {'pre', 'post'}
        Whether the fill value goes before or after the data.
    truncating : {'pre', 'post'}
        Whether overly long sequences lose items at the start or at the end.
    value : scalar
        Fill value.

    Returns
    -------
    numpy.ndarray
        Array whose first two dimensions are ``(len(sequences), maxlen)``.

    Raises
    ------
    ValueError
        On non-sized input, inconsistent item shapes, or an unknown
        ``padding`` / ``truncating`` mode.
    """
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')

    seq_lengths = []
    for seq in sequences:
        if not hasattr(seq, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(seq))
        seq_lengths.append(len(seq))

    if maxlen is None:
        maxlen = np.max(seq_lengths)

    # The per-item shape is taken from the first non-empty sequence and is
    # checked against every other sequence in the loop below.
    first_non_empty = next((seq for seq in sequences if len(seq) > 0), None)
    if first_non_empty is None:
        item_shape = tuple()
    else:
        item_shape = np.asarray(first_non_empty).shape[1:]

    padded = (np.ones((len(sequences), maxlen) + item_shape) * value).astype(dtype)
    for row, seq in enumerate(sequences):
        if len(seq) == 0:
            continue  # nothing to copy for an empty sequence

        if truncating == 'pre':
            kept = seq[-maxlen:]
        elif truncating == 'post':
            kept = seq[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' %truncating)

        # make sure the kept part has the expected per-item shape
        kept = np.asarray(kept, dtype=dtype)
        if kept.shape[1:] != item_shape:
            raise ValueError(
                'Shape of sample %s of sequence at position %s is different from expected shape %s'
                % (kept.shape[1:], row, item_shape))

        if padding == 'post':
            padded[row, :len(kept)] = kept
        elif padding == 'pre':
            padded[row, -len(kept):] = kept
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return padded

def encoding_and_padding(word2idx_dic, sequences,**params) :
    """Map every token to its vocabulary index and pad the result.

    Tokens missing from ``word2idx_dic`` fall back to the ``'__ETC__'`` index;
    the pad value is always the ``'__PAD__'`` index (any ``value`` passed in
    ``params`` is overridden). Remaining ``params`` go to ``pad_sequences``.
    """
    encoded = []
    for seq in sequences :
        encoded.append([word2idx_dic.get(token, word2idx_dic['__ETC__']) for token in seq])

    padding_options = dict(params, value=word2idx_dic['__PAD__'])
    return pad_sequences(encoded, **padding_options)

def get_transDtype(x,y,batch_size) :
    """Wrap features ``x`` and labels ``y`` (cast to float32) in a Gluon ``DataLoader``."""
    from mxnet import gluon
    labels = y.astype('float32')
    dataset = gluon.data.ArrayDataset(x, labels)
    return gluon.data.DataLoader(dataset, batch_size=batch_size)

    
def y_encoding(n_grams, maxlen=50):
    """Build the 0/1 target matrix marking the last character of every word.

    Parameters
    ----------
    n_grams : sequence of sequences of str
        One entry per sample; each entry is the list of words of that sample.
    maxlen : int
        Number of columns of the returned matrix.

    Returns
    -------
    numpy.ndarray
        ``int8`` array of shape ``(len(n_grams), maxlen)`` with a 1 at the
        position of each word's final character (positions are counted over
        the concatenated words) and 0 elsewhere.

    Word ends that fall at or beyond ``maxlen`` are ignored instead of raising
    ``IndexError``, and a leading empty word no longer marks the last column
    through the negative index ``-1``.
    """
    init_mat = np.zeros(shape=(len(n_grams), maxlen), dtype=np.int8)
    for i, words in enumerate(n_grams):
        # Explicit integer dtype keeps an empty word list usable as an index.
        word_ends = np.cumsum([len(w) for w in words], dtype=np.int64) - 1
        # Keep only positions that exist in the (possibly truncated) row.
        word_ends = word_ends[(word_ends >= 0) & (word_ends < maxlen)]
        init_mat[i, word_ends] = 1
    return init_mat

def split_train_set(x_train, p=0.98):
    """Randomly split the row indices of ``x_train`` into two index arrays.

    ``int(rows * p)`` indices are drawn without replacement for the first
    array; the second array holds the remaining indices in ascending order.
    """
    import numpy as np

    n_rows = x_train.shape[0]
    train_idx = np.random.choice(range(n_rows), int(n_rows * p), replace=False)

    chosen = set(train_idx)
    held_out = []
    for row in range(n_rows):
        if row not in chosen:
            held_out.append(row)
    return ((train_idx, np.array(held_out)))

def input_data(input_path, train_ratio = 1,isMake_dif_set=False,max_seq_len=300,vocab_path=base_path+'ipynbCode/util/w2idx.dic',batch_size=100) :
    """Read the ``.txt`` files of a folder and turn them into Gluon data loaders.

    Pipeline: read all lines -> ``replaceEx`` -> ``pre_processing`` -> drop
    empty sentences -> random sample of ``train_ratio`` of them -> split on
    '^' into words -> optional sliding windows of 8 words -> targets via
    ``y_encoding`` and index-encoded, post-padded inputs.

    Returns
    -------
    ``(train_loader, valid_loader)`` when ``train_ratio < 1`` (the validation
    loader uses a batch size of 500), otherwise a single loader over all data.
    """
    import os

    # Gather the raw lines of every .txt file in the folder
    raw_lines = []
    for file_name in os.listdir(input_path) :
        if not file_name.endswith(".txt") :
            continue
        with open(input_path+file_name, 'r', encoding='utf8') as handle:
            raw_lines.extend(handle.readlines())

    # Remove '::', '{', ... then add <SoS>/<EoS> and replace ' ' with '^';
    # sentences that are empty apart from the tags are dropped
    tagged = [seq for seq in pre_processing(replaceEx(raw_lines)) if seq != "«»"]

    # Shuffle by sampling indices without replacement
    n_keep = int(len(tagged) * train_ratio)
    picked = np.random.choice(range(len(tagged)), n_keep, replace=False)
    word_lists = [tagged[i].split('^') for i in picked]

    # Optionally cut every sentence into sliding windows of 8 words
    if isMake_dif_set is True:
        n_gram = [list(window)
                  for words in word_lists
                  for window in zip(*(words[offset:] for offset in range(8)))]
    else:
        n_gram = word_lists

    # Targets (space = 1, others = 0)
    n_gram_y = y_encoding(n_gram, max_seq_len)

    vocab, _ = load_vocab(vocab_path)

    # Inputs: words joined without blanks, encoded and padded at the end
    joined = [''.join(gram) for gram in n_gram]
    ngram_encoded_padded = encoding_and_padding(word2idx_dic=vocab,
                                                sequences=joined, maxlen=max_seq_len,
                                                padding='post', truncating='post')

    if not train_ratio < 1:
        return get_transDtype(ngram_encoded_padded,n_gram_y,batch_size)

    # Split into a training part and a held-out part
    tr_idx, te_idx = split_train_set(ngram_encoded_padded, train_ratio)
    train_generator = get_transDtype(ngram_encoded_padded[tr_idx, ], n_gram_y[tr_idx, ], batch_size)
    valid_generator = get_transDtype(ngram_encoded_padded[te_idx, ], n_gram_y[te_idx, ], 500)
    return (train_generator, valid_generator)

def input_data_csv(input_path, train_ratio = 1,isMake_dif_set=False,max_seq_len=300,vocab_path=base_path+'ipynbCode/util/w2idx.dic',batch_size=100) :
    """Build data loaders exactly like ``input_data``.

    This function used to be a line-for-line copy of ``input_data`` (despite
    its name it reads the ``.txt`` files of ``input_path``, not a CSV). It now
    delegates to ``input_data`` so the two cannot drift apart; arguments and
    return value are unchanged.

    Returns
    -------
    ``(train_loader, valid_loader)`` when ``train_ratio < 1``, otherwise a
    single loader over all data.
    """
    return input_data(input_path, train_ratio=train_ratio, isMake_dif_set=isMake_dif_set,
                      max_seq_len=max_seq_len, vocab_path=vocab_path, batch_size=batch_size)


def evaluate_accuracy(data_iterator, net, pad_idx, ctx, n=5000):
    """Measure per-character accuracy over the unpadded part of each sequence.

    Parameters
    ----------
    data_iterator : iterable of (data, label) batches
    net : callable
        Model returning one score per position with a trailing axis of size 1.
    pad_idx : int
        Index used for padding; the first occurrence in a row marks its length.
    ctx : mxnet Context
        Device the batches are moved to.
    n : int
        Evaluation stops after the batch in which at least ``n`` samples have
        been seen.

    Returns
    -------
    float
        Accuracy reported by ``mx.metric.Accuracy``.

    Fixes over the previous version: the sample counter grew by the batch
    size for every sample (so evaluation stopped after the first batch), and
    rows without any padding were given length 0 and therefore never scored.
    """
    # Walks every sequence up to its own length; not optimized.
    acc = mx.metric.Accuracy(axis=0)
    num_of_test = 0
    for data, label in data_iterator:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        # Sentence length = position of the first pad; a row with no pad at
        # all spans the full width (argmax alone would report 0 for it).
        data_np = data.asnumpy()
        is_pad = (data_np == pad_idx)
        lengths = np.where(is_pad.any(axis=1), is_pad.argmax(axis=1), data_np.shape[1])
        output = net(data)
        pred_label = output.squeeze(axis=2) > 0.5

        for row in range(data.shape[0]):
            num_of_test += 1
            seq_len = int(lengths[row])
            if seq_len == 0:
                continue  # nothing but padding: no positions to score
            acc.update(preds=pred_label[row, :seq_len],
                       labels=label[row, :seq_len])
        if num_of_test >= n:
            break
    return acc.get()[1]

def train(epochs,train_data,test_data, vali_data, model, loss, trainer,pad_idx,ctx, decay=False,mdl_desc='spacing_model') :
    """Train ``model`` and save its parameters after every epoch.

    Parameters
    ----------
    epochs : int
        Number of passes over ``train_data``.
    train_data, test_data, vali_data : iterables of (data, label) batches
    model, loss, trainer
        Network, loss block and Gluon trainer (e.g. from ``model_init``).
    pad_idx : int
        Padding index forwarded to ``evaluate_accuracy``.
    ctx : Context or list of Context
        Device(s) to train on; evaluation runs on the first one.
    decay : bool
        When true, the learning rate is multiplied by 0.7 at every epoch
        after the second.
    mdl_desc : str
        Prefix of the saved parameter files (``<mdl_desc>_<epoch>.params``).

    Returns
    -------
    tuple
        ``(test accuracies, mean training losses)``, one entry per epoch.
    """
    import time
    from tqdm import tqdm
    from mxnet import gluon
    import mxnet.autograd as autograd

    outputs = base_path + 'ipynbCode/output/'
    mkdirs_(outputs)

    # Accept a single context as well as a list; previously a non-list value
    # was silently replaced by mx.gpu(0) for evaluation.
    ctx_list = list(ctx) if isinstance(ctx, (list, tuple)) else [ctx]
    eval_ctx = ctx_list[0]

    tot_test_acc = []
    tot_train_loss = []

    for e in range(epochs) :
        tic = time.time()

        if e > 1 and decay :
            trainer.set_learning_rate(trainer.learning_rate * 0.7)

        train_loss = []

        iter_tqdm = tqdm(train_data, 'Batches')

        for  i, (x_data,y_data) in enumerate(iter_tqdm) :
            # Split the batch across the available devices.
            x_data_l = gluon.utils.split_and_load(x_data, ctx_list, even_split=False)
            y_data_l = gluon.utils.split_and_load(y_data, ctx_list, even_split=False)

            with autograd.record() :
                losses = [loss(model(x), y) for x,y in zip(x_data_l, y_data_l)]

            for l in losses :
                l.backward()

            # Normalise the update by the full batch size.
            trainer.step(x_data.shape[0],ignore_stale_grad=True)
            curr_loss = np.mean([mx.nd.mean(l).asscalar() for l in losses])
            train_loss.append(curr_loss)
            iter_tqdm.set_description("loss {}".format(curr_loss))
            mx.nd.waitall()

        # calculate test and validation accuracy
        test_acc = evaluate_accuracy(test_data,model,pad_idx, ctx=eval_ctx)
        valid_acc = evaluate_accuracy(vali_data,model,pad_idx, ctx=eval_ctx)
        logger.info('[Epoch %d] time cost: %f' % (e, time.time() - tic))
        logger.info("[Epoch %d] Train Loss: %f, Test acc : %f Valid acc : %f" %
                    (e, np.mean(train_loss), test_acc, valid_acc))
        tot_test_acc.append(test_acc)
        tot_train_loss.append(np.mean(train_loss))
        # os.path.join avoids the doubled separator of the old string concat.
        model.save_parameters(os.path.join(outputs, "{}_{}.params".format(mdl_desc, e+1)))
    return (tot_test_acc, tot_train_loss)

In [0]:
# main_run

# default_path
# Input folders for training and test text, and the folder for derived artifacts.
train_data_path = base_path + 'dataset/train_data/'
test_data_path = base_path + 'dataset/test_data/'
util_path = base_path + 'ipynbCode/util/'

# Make sure the artifact folder exists before anything is written into it.
mkdirs_(util_path)

# Artifacts written by create_embedding: Word2Vec model, embedding matrix, vocabulary.
w2idx_model = util_path + 'model.mdl'
w2idx_embed = util_path + 'emb.np'
vocab_path = util_path + 'w2idx.dic'


In [0]:

# create embedding_files
create_embedding(train_data_path, w2idx_model, w2idx_embed, vocab_path )


In [0]:

# parameter and dataset, model train
# Vocabulary mappings (token -> index and index -> token).
w2idx, idx2w = load_vocab(vocab_path)

# Embedding matrix saved by create_embedding.
weights = load_embedding(w2idx_embed)

# The model's embedding layer is sized from the matrix itself.
vocab_size = weights.shape[0]
embed_dim = weights.shape[1]

# Sequence length, GRU hidden size, device count and batch size used below.
max_seq_len = 300
n_hidden = 50
gpu_count = 1
batch_size = 100
# One GPU context per device; the training code expects a list.
ctx = [mx.gpu(i) for i in range(gpu_count)] 

In [0]:

# dataset, modeling, train
# train dataset_init
# train_ratio < 1 makes input_data return a (train, validation) pair of loaders;
# isMake_dif_set=True cuts every sentence into sliding 8-word windows.
train_generator, vali_generator = input_data(train_data_path,train_ratio=0.9,
                                             isMake_dif_set=True,max_seq_len = max_seq_len,batch_size=batch_size)

# Test loader: built with input_data's defaults, so a single loader is returned.
test_generator = input_data(test_data_path)

# Network, loss and trainer; the embedding layer starts from the Word2Vec weights.
model, loss, trainer = model_init(n_hidden=n_hidden, vocab_size=vocab_size,
                                  embed_dim=embed_dim, max_seq_length=max_seq_len,ctx=ctx, embed_weights=weights)


# Train for 100 epochs; parameters are saved after every epoch by train().
train(epochs=100, train_data=train_generator,test_data=test_generator,
      vali_data= vali_generator,model=model,loss=loss,trainer=trainer,
      pad_idx=w2idx['__PAD__'],ctx=ctx)

In [0]:
# predict_spacing

from functools import lru_cache
import re

class pred_spacing:
    """Insert spaces into a sentence using a trained spacing model.

    Parameters
    ----------
    model : callable
        Network returning one score per input position.
    w2idx : dict
        Token -> index vocabulary containing ``'__ETC__'`` and ``'__PAD__'``.
    """
    def __init__(self, model, w2idx):
        self.model = model
        self.w2idx = w2idx
        # Used to collapse runs of whitespace in the final sentence.
        self.pattern = re.compile(r'\s+')

    # NOTE(review): lru_cache on a method keys on (self, raw_sent), is
    # unbounded, and keeps the instance alive for the life of the cache.
    @lru_cache(maxsize=None)
    def get_spaced_sent(self, raw_sent):
        """Return ``raw_sent`` with a space after every position scored above 0.5."""
        raw_sent_ = "«" + raw_sent + "»"
        raw_sent_ = raw_sent_.replace(' ', '^')

        # Pad to the length the model was built for; fall back to the
        # notebook-level max_seq_len only when the model does not expose it.
        maxlen = getattr(self.model, 'max_seq_length', None)
        if maxlen is None:
            maxlen = max_seq_len

        mat_in = encoding_and_padding(word2idx_dic=self.w2idx, sequences=[raw_sent_], maxlen=maxlen,
                                      padding='post', truncating='post')
        mat_in = mx.nd.array(mat_in, ctx=mx.cpu(0))
        results = self.model(mat_in)
        # Copy the scores of the single sample to NumPy once instead of
        # comparing one NDArray element at a time.
        scores = results[0].asnumpy().ravel()[:len(raw_sent_)]
        preds = np.array(['1' if score > 0.5 else '0' for score in scores])
        return self.make_pred_sents(raw_sent_, preds)

    def make_pred_sents(self, x_sents, y_pred):
        """Rebuild the sentence: add a space after each character flagged '1'.

        '^' markers become spaces, whitespace runs collapse to one space and
        the '«' / '»' boundary tags are removed.
        """
        res_sent = []
        for ch, flag in zip(x_sents, y_pred):
            res_sent.append(ch)
            if flag == '1':
                res_sent.append(' ')
        subs = re.sub(self.pattern, ' ', ''.join(res_sent).replace('^', ' '))
        subs = subs.replace('«', '')
        subs = subs.replace('»', '')
        return subs

In [0]:
import os

model_params_dir = base_path + 'ipynbCode/output/'

# os.listdir returns entries in arbitrary order, so taking the last element
# does not reliably select the latest checkpoint. Keep only parameter files
# and order them by modification time (oldest first).
model_weights = sorted(
    (name for name in os.listdir(model_params_dir) if name.endswith('.params')),
    key=lambda name: os.path.getmtime(os.path.join(model_params_dir, name)),
)

# Most recently written checkpoint.
model_params = model_params_dir + model_weights[-1]

In [0]:

# predict_modeling

model = auto_spacing(n_hidden, vocab_size, embed_dim, max_seq_len)

# model.collect_params().initialize(mx.init.Xavier(), ctx=mx.cpu(0))
# model.embedding.weight.set_data(weights)
model.load_parameters(model_params, ctx=mx.cpu(0))
predictor = pred_spacing(model, w2idx)

# Interactive loop: an empty line ends it so the cells below can still run.
while True:
    sent = input("sent > ")
    if not sent.strip():
        break
    print(sent)
    spaced = predictor.get_spaced_sent(sent)
    # The prediction used to be computed but never displayed.
    print(spaced)

In [0]:
logger.info("calculate accuracy!")

# Fresh network with the same hyper-parameters; weights come from the checkpoint.
model = auto_spacing(n_hidden, vocab_size, embed_dim, max_seq_len)

# model.initialize(ctx=ctx[0] if isinstance(ctx, list) else mx.gpu(0))
model.load_parameters(model_params,
                        ctx=ctx[0] if isinstance(ctx, list) else mx.gpu(0))
# train_ratio=1 makes input_data return a single loader over the test folder.
valid_generator = input_data(test_data_path,train_ratio=1, isMake_dif_set=True,max_seq_len = max_seq_len, batch_size=100)


# Accuracy over the unpadded positions, with the sample budget n raised to 30000.
valid_acc = evaluate_accuracy(valid_generator, model, w2idx['__PAD__'], ctx=ctx[0] if isinstance(ctx, list) else mx.gpu(0), n=30000)
logger.info('valid accuracy : {}'.format(valid_acc))

In [0]:
valid_acc

In [0]:
# NOTE(review): without call parentheses this only displays the bound method;
# presumably a call with an example input batch was intended — confirm.
model.summary

In [0]:
# Call the method so the parameter dictionary itself is displayed
# (the bare attribute only showed the bound method object).
model.collect_params()