In [11]:
import pandas as pd
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
import preprocessor as p
import re
import os
import random

# Configure the `preprocessor` module (imported as `p`) with the token categories
# URL, MENTION, NUMBER, RESERVED and SMILEY. `cleaning()` below calls `p.clean()`,
# which presumably acts on exactly these categories -- TODO confirm against the
# library's documentation.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER, p.OPT.RESERVED, p.OPT.SMILEY) 

데이터 불러오기

In [3]:
# Read the train/test CSV files into DataFrames. The paths are relative, so they
# resolve against the notebook's current working directory.
train = pd.read_csv('../data/nlp-getting-started/train.csv')
test = pd.read_csv('../data/nlp-getting-started/test.csv')

cleaning 함수 정의

In [None]:
def cleaning(tweet):
    """Normalize a raw tweet string before tokenization.

    Steps, in order:
      1. ``p.clean`` is applied (its behavior depends on the options set via
         ``p.set_options`` elsewhere in the notebook).
      2. Every character that is not a word character, whitespace or a period
         is removed. Periods are kept on purpose so that step 3 still has
         ellipses to work with.
      3. ``...`` is padded with spaces; if the tweet contains no ``...`` at all,
         ``..`` is rewritten to a padded ``...`` instead.

    Args:
        tweet: The raw tweet text (str).

    Returns:
        The cleaned tweet (str).
    """
    tweet = p.clean(tweet)
    # The original statement read from and assigned to an undefined local
    # `text`, which raised UnboundLocalError on every call and discarded the
    # result. Operate on `tweet`, and keep '.' so the ellipsis handling below
    # is not turned into dead code.
    tweet = re.sub(r'[^\w\s.]', '', tweet)

    tweet = tweet.replace('...', ' ... ')
    if '...' not in tweet:
        # Only reached when the tweet had no three-dot ellipsis at all.
        tweet = tweet.replace('..', ' ... ')

    return tweet

In [None]:
from konlpy.tag import Okt, Komoran, Mecab, Hannanum, Kkma

In [None]:
def get_tokenizer(tokenizer_name):
    """Return a new tokenizer instance selected by name.

    Args:
        tokenizer_name: One of "komoran", "okt", "mecab", "hannanum", "kkma"
            or "khaiii". Any other value falls back to Mecab.

    Returns:
        A freshly constructed tokenizer object of the selected class.

    Raises:
        ImportError: If "khaiii" is requested but the ``khaiii`` package is not
            installed.
    """
    if tokenizer_name == "komoran":
        tokenizer = Komoran()
    elif tokenizer_name == "okt":
        tokenizer = Okt()
    elif tokenizer_name == "mecab":
        tokenizer = Mecab()
    elif tokenizer_name == "hannanum":
        tokenizer = Hannanum()
    elif tokenizer_name == "kkma":
        tokenizer = Kkma()
    elif tokenizer_name == "khaiii":
        # KhaiiiApi was referenced without ever being imported, so this branch
        # raised NameError. Import it lazily so the other tokenizers keep
        # working when khaiii is not installed.
        from khaiii import KhaiiiApi
        tokenizer = KhaiiiApi()
    else:
        # Unknown names deliberately fall back to Mecab.
        tokenizer = Mecab()
    return tokenizer

def post_processing(tokens):
    """Split every digit out of each token into its own element.

    Each digit in a token is surrounded with single spaces, the token is then
    split on the space character, and the non-empty pieces are appended to the
    result in order.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A flat list of the resulting pieces.
    """
    digit_pattern = re.compile(r"(\d)")
    flattened = []
    for raw_token in tokens:
        # Pad every digit with spaces so it separates from its neighbours.
        spaced = digit_pattern.sub(r" \1 ", raw_token)
        for piece in spaced.split(" "):
            if piece:
                flattened.append(piece)
    return flattened

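수정 필요! 아래 `FullTokenizer`는 이 노트북에 정의되어 있지 않은 `load_vocab`, `BasicTokenizer`, `WordpieceTokenizer`, `convert_by_vocab`에 의존하므로, 실행하기 전에 이 이름들을 먼저 정의하거나 import 해야 함 (예: BERT의 `tokenization.py`).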

In [None]:
class FullTokenizer(object):
    """Tokenizer that chains a basic tokenizer with a WordPiece tokenizer.

    NOTE(review): `load_vocab`, `BasicTokenizer`, `WordpieceTokenizer` and
    `convert_by_vocab` are not defined in the visible part of this notebook;
    they must be in scope before this class is used.
    """

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary from `vocab_file` and build both sub-tokenizers."""
        vocab = load_vocab(vocab_file)
        self.vocab = vocab
        # Reverse mapping (value -> key) used by convert_ids_to_tokens.
        self.inv_vocab = dict((index, piece) for piece, index in vocab.items())
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab)

    def tokenize(self, text):
        """Return the sub-tokens of `text` as one flat list, in order."""
        return [
            piece
            for word in self.basic_tokenizer.tokenize(text)
            for piece in self.wordpiece_tokenizer.tokenize(word)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Map `tokens` through the vocabulary via `convert_by_vocab`."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Map `ids` through the inverse vocabulary via `convert_by_vocab`."""
        return convert_by_vocab(self.inv_vocab, ids)


파이프라인 클래스

In [None]:
class tweet_classifier_model(object):
    """Base pipeline for training and validating a tweet classifier.

    The constructor loads (or tokenizes and caches) the train and test corpora.
    Subclasses must implement ``make_input`` and ``tune``; ``validation`` also
    reads ``self.logits`` and ``self.loss``, which are not set here and are
    therefore expected to be provided by the subclass.
    """

    def __init__(self,
                train_corpus_fname=None,  # training corpus
                tokenized_train_corpus_fname=None,  # tokenized training corpus (cache file)
                test_corpus_fname=None,  # test corpus
                tokenized_test_corpus_fname=None,  # tokenized test corpus (cache file)
                model_name='bert',
                model_save_path=None,
                vocab_fname=None,
                eval_every=1000,
                batch_size=32, num_epoch=10, dropout_keep_prob_rate=0.9,
                model_ckpt_path=None):
        """Store the settings, build the tokenizer and load both corpora."""
        # Settings
        self.model_name = model_name
        self.eval_every = eval_every
        self.model_ckpt_path = model_ckpt_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.num_epoch = num_epoch
        self.dropout_keep_prob_rate = dropout_keep_prob_rate
        self.best_valid_score = 0.0

        # Tokenizer: WordPiece for BERT, Mecab for every other model.
        if self.model_name == 'bert':
            self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
        else:
            self.tokenizer = get_tokenizer('mecab')

        # Load and tokenize the corpora.
        self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
        self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)

    def load_or_tokenize_corpus(self, corpus_fname, tokenized_corqus_fname):
        """Load a tokenized corpus, tokenizing the raw corpus if no cache exists.

        Called from ``__init__``. If ``tokenized_corqus_fname`` exists it is
        read directly. Otherwise ``corpus_fname`` is read (its first line is
        skipped as a header), each sentence is tokenized, and the result is
        written to ``tokenized_corqus_fname`` when that path is given.

        Both files hold one example per line, with the text and the label
        separated by U+241E (SYMBOL FOR RECORD SEPARATOR). Labels read from the
        raw corpus are binarized: ``>= 1`` becomes 1, everything else 0.

        Args:
            corpus_fname: Path of the raw corpus file.
            tokenized_corqus_fname: Path of the tokenized cache file, or None
                to disable caching.

        Returns:
            A ``(data_set, size)`` tuple where ``data_set`` is a list of
            ``[tokens, label]`` pairs and ``size`` is ``len(data_set)``.
        """
        data_set = []
        # os.path.exists(None) raises TypeError, so treat None as "no cache".
        use_cache = tokenized_corqus_fname is not None
        if use_cache and os.path.exists(tokenized_corqus_fname):
            tf.compat.v1.logging.info('토크나이징 말뭉치 : ' + tokenized_corqus_fname)
            # Explicit UTF-8: the separator and the text are non-ASCII, and the
            # platform default encoding is not guaranteed to handle them.
            with open(tokenized_corqus_fname, 'r', encoding='utf-8') as file1:
                for line in file1:
                    if not line.strip():
                        continue
                    # \u241E : Symbol for Record Separator
                    tokens, label = line.strip().split('\u241E')
                    if len(tokens) > 0:
                        data_set.append([tokens.split(" "), int(label)])
        else:
            with open(corpus_fname, 'r', encoding='utf-8') as file2:
                next(file2, None)  # skip the header line (tolerates an empty file)
                for line in file2:
                    if not line.strip():
                        continue
                    sentence, label = line.strip().split('\u241E')
                    if self.model_name == 'bert':
                        tokens = self.tokenizer.tokenize(sentence)
                    else:
                        tokens = self.tokenizer.morphs(sentence)
                        tokens = post_processing(tokens)
                    int_label = 1 if int(label) >= 1 else 0
                    data_set.append([tokens, int_label])
            if use_cache:
                with open(tokenized_corqus_fname, 'w', encoding='utf-8') as file3:
                    # Write each example's own tokens; the original loop bound
                    # `token` but joined the stale `tokens` of the last example.
                    for example_tokens, label in data_set:
                        file3.write(' '.join(example_tokens) + '\u241E' + str(label) + '\n')

        return data_set, len(data_set)

    def get_batch(self, data, num_epoch, is_training=True):
        """Yield model inputs for ``num_epoch`` shuffled passes over ``data``.

        Args:
            data: List of ``[tokens, label]`` pairs.
            num_epoch: Number of passes over ``data``.
            is_training: Forwarded unchanged to ``make_input``.

        Yields:
            Whatever ``make_input(batch_sentences, batch_labels, is_training)``
            returns, once per batch.
        """
        # Size the batches by the data actually passed in.
        data_size = len(data)
        # Ceiling division; gives 0 batches (not one empty batch) for no data.
        num_batches_per_epoch = (data_size + self.batch_size - 1) // self.batch_size
        for epoch in range(num_epoch):
            idx = random.sample(range(data_size), data_size)
            # Reorder with plain list indexing: np.array() on ragged
            # [tokens, label] pairs is rejected by recent NumPy versions.
            data = [data[i] for i in idx]
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * self.batch_size
                end_index = min((batch_num + 1) * self.batch_size, data_size)
                features = data[start_index:end_index]

                batch_sentences = []
                batch_labels = []
                for sentence, label in features:
                    batch_sentences.append(sentence)
                    batch_labels.append(int(label))
                yield self.make_input(batch_sentences, batch_labels, is_training)

    def train(self, sess, saver, global_step, output_feed):
        """Run the training loop, validating every ``eval_every`` global steps.

        Args:
            sess: Session used to run ``output_feed``.
            saver: Saver forwarded to ``validation``.
            global_step: Object whose ``eval(sess)`` gives the current step.
            output_feed: Fetches passed to ``sess.run``; the run must return
                four values, the last of which is the loss.
        """
        train_batches = self.get_batch(self.train_data, self.num_epoch, is_training=True)

        checkpoint_loss = 0.0
        for current_input_feed in train_batches:
            _, _, _, current_loss = sess.run(output_feed, current_input_feed)
            checkpoint_loss += current_loss
            current_step = global_step.eval(sess)
            if current_step % self.eval_every == 0:
                tf.compat.v1.logging.info('global step %d train loss %.4f' % (current_step, checkpoint_loss / self.eval_every))
                checkpoint_loss = 0.0
                self.validation(sess, saver, global_step)

    def validation(self, sess, saver, global_step):
        """Evaluate on the test data and save a checkpoint when accuracy improves.

        Expects ``make_input`` to return an ``(input_feed, labels)`` pair when
        called with ``is_training=False``. The logged loss is the sum of the
        per-batch losses.
        """
        valid_loss, valid_pred, valid_num_data = 0, 0, 0
        output_feed = [self.logits, self.loss]
        test_batches = self.get_batch(self.test_data,
                                     num_epoch=1,
                                     is_training=False)
        for current_input_feed, current_labels in test_batches:
            current_logits, current_loss = sess.run(output_feed, current_input_feed)
            current_preds = np.argmax(current_logits, axis=-1)
            valid_loss += current_loss
            valid_num_data += len(current_labels)
            for pred, label in zip(current_preds, current_labels):
                if pred == label:
                    valid_pred += 1

        # Avoid ZeroDivisionError when there is no validation data.
        valid_score = valid_pred / valid_num_data if valid_num_data else 0.0

        tf.compat.v1.logging.info('valid loss %.4f valid score %.4f' % (valid_loss, valid_score))

        if valid_score > self.best_valid_score:
            self.best_valid_score = valid_score
            path = self.model_save_path + '/' + str(valid_score)
            saver.save(sess, path, global_step=global_step)

    def make_input(self, sentence, labels, is_training):
        """Build the model input for one batch; must be overridden."""
        raise NotImplementedError

    def tune(self):
        """Run fine-tuning; must be overridden."""
        raise NotImplementedError

ELMo 파인 튜닝 네트워크 그래프
https://github.com/allenai/bilm-tf

In [29]:
from bilm.model import BidirectionalLanguageModel
from bilm.data import Batcher
from bilm.elmo import weight_layers

def make_elmo_graph(options_fname,
                    pretrain_model_fname,
                    max_characters_per_token,
                    num_labels,
                    tune=False):
    """Build the ELMo fine-tuning graph (biLM -> BiLSTM -> attention -> classifier).

    Tensors:
        ids_placeholder: input of the ELMo network (character ids)
            - shape: [batch_size, unroll_steps, max_character_byte_length]
        elmo_embeddings: input of the fine-tuning network (output of ELMo)
            - shape: [batch_size, unroll_steps, dimension]
        labels_placeholder: target of the fine-tuning network
            (e.g. positive=1 / negative=0)
            - shape: [batch_size]
        loss: loss of the fine-tuning network

    Args:
        options_fname: Passed to ``BidirectionalLanguageModel`` as its options file.
        pretrain_model_fname: Passed to ``BidirectionalLanguageModel`` as the
            pretrained weights file.
        max_characters_per_token: Size of the last axis of ``ids_placeholder``.
        num_labels: Number of output classes (width of the logits layer).
        tune: If True, build the training graph (labels, dropout placeholder
            and loss); otherwise build the inference graph.

    Returns:
        If ``tune`` is True:
            ``(ids_placeholder, labels_placeholder, dropout_keep_prob, logits, loss)``
        otherwise:
            ``(ids_placeholder, elmo_embeddings, probs)``
    """
    # Build the biLM graph.
    # Load the pretrained ELMo model.
    bilm = BidirectionalLanguageModel(options_fname, pretrain_model_fname)
    # Input placeholder fed to the biLM.
    ids_placeholder = tf.placeholder(tf.int32,
                                     shape=(None, None, max_characters_per_token),
                                     name = 'input')

    if tune:
        # `(None)` is just None (completely unknown shape); use the 1-tuple so
        # the placeholder is rank 1, [batch_size], as documented above.
        labels_placeholder = tf.placeholder(tf.int32,
                                           shape=(None,))
    else:
        labels_placeholder = None

    embeddings_op = bilm(ids_placeholder)
    input_lengths = embeddings_op['lengths']

    if tune:
        dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
    else:
        # Inference: keep probability 1.0 makes every dropout below a no-op.
        dropout_keep_prob = tf.constant(1.0, dtype=tf.float32)

    # ELMo layer
    # shape = [batch_size, unroll_steps, dimension]
    elmo_embeddings = weight_layers('elmo_embeddings',
                                    embeddings_op,
                                    l2_coef=0.0,
                                    use_top_only=False,
                                    do_layer_norm=True)

    # Input of the fine-tuning network.
    features = tf.nn.dropout(elmo_embeddings['weighted_op'], dropout_keep_prob)
    # Bidirectional LSTM
    lstm_cell_fw = tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=512,
                                                     cell_clip=5,
                                                     proj_clip=5)
    lstm_cell_bw = tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=512,
                                                     cell_clip=5,
                                                     proj_clip=5)
    # The original passed the bare name `float32`, which is undefined
    # (NameError); the dtype must be `tf.float32`.
    lstm_output, _ = tf.compat.v1.nn.bidirectional_dynamic_rnn(cell_fw=lstm_cell_fw,
                                                               cell_bw=lstm_cell_bw,
                                                               inputs=features,
                                                               sequence_length=input_lengths,
                                                               dtype=tf.float32)

    # Attention Layer
    # The forward and backward outputs are summed (not concatenated).
    output_fw, output_bw = lstm_output
    H = tf.contrib.layers.fully_connected(inputs = output_fw + output_bw,
                                          num_outputs = 256,
                                          activation_fn = tf.nn.tanh)
    # One score per time step, normalized with a softmax over axis 1.
    attention_score = tf.nn.softmax(tf.contrib.layers.fully_connected(inputs=H, num_outputs=1, activation_fn=None), axis=1)
    # Attention-weighted sum of H over axis 1; the trailing size-1 axis is squeezed away.
    attention_output = tf.squeeze(tf.matmul(tf.transpose(H, perm=[0, 2, 1]), attention_score), axis=-1)
    layer_output = tf.nn.dropout(attention_output, dropout_keep_prob)

    # Feed-Forward Layer
    fc = tf.contrib.layers.fully_connected(inputs=layer_output,
                                           num_outputs=512,
                                           activation_fn=tf.nn.relu,
                                           weights_initializer=tf.contrib.layers.xavier_initializer(),
                                           biases_initializer=tf.zeros_initializer())
    features_drop = tf.nn.dropout(fc, dropout_keep_prob)
    logits = tf.contrib.layers.fully_connected(inputs=features_drop,
                                               num_outputs=num_labels,
                                               activation_fn=None,
                                               weights_initializer=tf.contrib.layers.xavier_initializer(),
                                               biases_initializer=tf.zeros_initializer())
    if tune:
        # Loss Layer
        CE = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_placeholder, logits=logits)
        loss = tf.reduce_mean(CE)
        return ids_placeholder, labels_placeholder, dropout_keep_prob, logits, loss
    else:
        # Probability Layer
        probs = tf.nn.softmax(logits, axis=-1, name='probs')
        return ids_placeholder, elmo_embeddings, probs

pretrained - elmo 불러오기 

In [None]:
# Load the ELMo v3 module from TensorFlow Hub by URL.
# NOTE(review): presumably needs network access (or a populated TF Hub cache) -- confirm.
elmo = hub.load("https://tfhub.dev/google/elmo/3")

* elmo
* 훈련 txt
* 테스트 txt
* elmo-vocab.txt
* elmo.model
* options.json

model_save_path 
