In [1]:
import codecs
import boto3
import numpy as np
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, Dropout, Dense, Bidirectional, LSTM
from konlpy.tag import Okt
import pandas as pd
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from io import StringIO

# Maps a news category name to the S3 object key of its CSV file.
# string_from_AWS looks up a key here and passes it as the S3 `Key`.
OBJECT_KEY_DIC = {"all": "SNU_All_b.csv", "politics": "SNU_정치_b.csv", "economics": "SNU_경제_b.csv",
                  "society": "SNU_사회_b.csv", "etc": "SNU_기타_b.csv"}

# When running on AWS, use the path below as the file argument of the open
# call in loadWikiModel.
WIKI_VEC_DIR = "/home/ubuntu/FastText/wiki.ko.vec"

# Relative path to the same vector file name; not referenced by the visible
# code (presumably for local runs -- confirm).
LOCAL_WIKI_VEC_DIR = "./wiki.ko.vec"

# training params
# NOTE(review): BATCH_SIZE is not referenced by the visible code.
BATCH_SIZE = 256
NUM_EPOCHS = 40  # passed as `epochs` to model.fit in learning()

# model parameters
MAX_NB_WORDS = 100000  # Tokenizer `num_words` and upper bound on embedding rows
NUM_FILTERS = 64  # not referenced by the visible code
EMBED_DIM = 300  # width of each row of the embedding matrix
WEIGHT_DECAY = 1e-4  # not referenced by the visible code
# word -> float32 vector; filled by loadWikiModel, read by makingModel.
EMBEDDING_INDEX = {}


# Stands in for a vectorizer's `tokenizer` argument, using KoNLPy's pos() function.
class MyTokenizer:
    """Callable tokenizer that filters a tagger's part-of-speech output.

    The wrapped ``tagger`` only needs a ``pos(sentence)`` method returning a
    sequence of items where index 0 is the surface form and index 1 is the
    part-of-speech tag.
    """

    def __init__(self, tagger):
        self.tagger = tagger

    def __call__(self, sent):
        """Return the surface forms of ``sent`` that survive filtering.

        An item is dropped when its tag is one of Josa (particle),
        Punctuation, Suffix or Foreign (this also removes '\\n'), or when its
        surface form is shorter than two characters, since single-character
        words are often meaningless.
        """
        excluded_tags = ('Josa', 'Punctuation', 'Suffix', 'Foreign')
        return [
            tagged[0]
            for tagged in self.tagger.pos(sent)
            if tagged[1] not in excluded_tags and len(tagged[0]) >= 2
        ]


def string_from_AWS(category: str = "politics") -> str:
    """Download one category's CSV from the ``snucsv`` S3 bucket as text.

    Credentials are read from the ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` environment variables rather than being
    hard-coded. If a variable is unset or empty, ``None`` is passed so that
    boto3 can use its own credential lookup.

    Args:
        category: Key into ``OBJECT_KEY_DIC`` selecting which CSV to fetch.
            Defaults to ``"politics"``, the value previously hard-coded.

    Returns:
        The object's body decoded as cp949 text.

    Raises:
        KeyError: If ``category`` is not a key of ``OBJECT_KEY_DIC``.
    """
    import os

    # Empty strings would be sent to AWS as literal (invalid) credentials, so
    # normalise "unset" and "empty" to None.
    aws_id = os.environ.get('AWS_ACCESS_KEY_ID') or None
    aws_secret = os.environ.get('AWS_SECRET_ACCESS_KEY') or None

    client = boto3.client('s3', aws_access_key_id=aws_id,
                          aws_secret_access_key=aws_secret)
    bucket_name = 'snucsv'

    # See OBJECT_KEY_DIC for the available categories.
    object_key = OBJECT_KEY_DIC[category]
    csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)

    # The stored CSV files are decoded as cp949 (Korean Windows code page).
    return csv_obj['Body'].read().decode('cp949')


# load data
def stringToTrainDf(csv_string_: str) -> tuple:
    """Parse CSV text into a DataFrame and derive a sequence length from it.

    A ``doc_len`` column is added holding, for each row, the number of pieces
    obtained by splitting the ``document`` column on single spaces.

    Args:
        csv_string_: CSV content; must contain a ``document`` column.

    Returns:
        ``(train_df, max_seq_len)`` where ``max_seq_len`` is the mean of
        ``doc_len`` plus its standard deviation, rounded and cast to int.
        It is later used as a model/padding parameter.
    """
    def count_pieces(text):
        return len(text.split(" "))

    frame = pd.read_csv(StringIO(csv_string_))
    frame['doc_len'] = frame['document'].apply(count_pieces)

    lengths = frame['doc_len']
    cutoff = lengths.mean() + lengths.std()
    return frame, np.round(cutoff).astype(int)


def loadWikiModel(vec_path=None):
    """Load word vectors from a text ``.vec`` file into ``EMBEDDING_INDEX``.

    Each line is split on single spaces: the first field is the word and the
    remaining fields are parsed as float32 coefficients.

    Args:
        vec_path: Path of the vector file. Defaults to ``WIKI_VEC_DIR``.

    Raises:
        ValueError: If a line's coefficient fields are not numeric.
    """
    path = WIKI_VEC_DIR if vec_path is None else vec_path

    # `with` guarantees the file is closed even if parsing raises midway.
    with codecs.open(path, encoding='utf-8') as f:
        for line in tqdm(f):
            values = line.rstrip().rsplit(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            # Skip rows that are not real embeddings (e.g. a leading
            # "<count> <dim>" header row or a blank line). A wrong-sized
            # vector would otherwise be stored and could later be broadcast
            # into a full embedding row.
            if len(coefs) != EMBED_DIM:
                continue
            EMBEDDING_INDEX[word] = coefs


def word_indexing(train_df_, max_seq_len_) -> tuple:
    """Tokenize the training documents and convert them to padded id sequences.

    Every entry of ``train_df_['document']`` is tokenized with ``MyTokenizer``
    wrapping ``Okt``. A Keras ``Tokenizer`` (limited to ``MAX_NB_WORDS``) is
    fitted on the resulting token lists only, which are then mapped to integer
    sequences and padded to ``max_seq_len_``.

    Args:
        train_df_: DataFrame with a ``document`` column.
        max_seq_len_: ``maxlen`` passed to ``pad_sequences``.

    Returns:
        ``(word_index, word_seq_train)``: the fitted tokenizer's word-to-id
        mapping and the padded sequences.
    """
    okt_tokenize = MyTokenizer(Okt())

    # Tokenize each document, showing a progress bar.
    documents = train_df_['document'].tolist()
    token_lists = [okt_tokenize(text) for text in tqdm(documents)]

    keras_tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    keras_tokenizer.fit_on_texts(token_lists)

    padded = sequence.pad_sequences(
        keras_tokenizer.texts_to_sequences(token_lists),
        maxlen=max_seq_len_,
    )

    vocab = keras_tokenizer.word_index
    print("dictionary size : ", len(vocab))
    return vocab, padded


def makingModel(word_index_, max_seq_len_):
    """Build the (uncompiled) bidirectional-LSTM binary classifier.

    An embedding matrix with ``min(MAX_NB_WORDS, len(word_index_) + 1)`` rows
    and ``EMBED_DIM`` columns is filled from ``EMBEDDING_INDEX``; rows for
    words without a usable vector stay zero. The matrix initialises a frozen
    ``Embedding`` layer, followed by dropout, a dense layer, three
    bidirectional LSTM layers, a dense layer, dropout and a single sigmoid
    output unit.

    Args:
        word_index_: Mapping from word to integer id.
        max_seq_len_: Input sequence length for the embedding layer.

    Returns:
        The assembled ``tf.keras.Sequential`` model.
    """
    # embedding matrix
    words_not_found = []
    nb_words = min(MAX_NB_WORDS, len(word_index_) + 1)
    embedding_matrix = np.zeros((nb_words, EMBED_DIM))

    for word, i in word_index_.items():
        if i >= nb_words:
            continue
        embedding_vector = EMBEDDING_INDEX.get(word)
        # Require the exact width: a vector of another length would either
        # raise on assignment or (length 1) be broadcast across the row.
        if (embedding_vector is not None) and len(embedding_vector) == EMBED_DIM:
            embedding_matrix[i] = embedding_vector
        else:
            # Words with no usable vector in the loaded embedding index.
            words_not_found.append(word)
    print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    # np.random.choice raises ValueError on an empty list, so only sample
    # when at least one word is missing.
    if words_not_found:
        print("sample words not found: ", np.random.choice(words_not_found, 10))
    else:
        print("sample words not found: ", [])

    import tensorflow as tf

    model = tf.keras.Sequential()

    # Pre-trained vectors are kept fixed during training (trainable=False).
    model.add(Embedding(nb_words, EMBED_DIM, input_length=max_seq_len_, weights=[embedding_matrix], trainable=False))

    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    # The first two recurrent layers return full sequences so the next
    # recurrent layer receives one vector per time step.
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(32)))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(1, activation='sigmoid'))
    return model


def learning(model_, word_seq_train_, train_df_, batch_size=32):
    """Compile and fit ``model_`` on the padded sequences and labels.

    The model is compiled with binary cross-entropy, the Adam optimizer and
    an accuracy metric, then trained for up to ``NUM_EPOCHS`` epochs with
    early stopping on ``val_loss`` (patience 3).

    Args:
        model_: Model exposing Keras-style ``compile`` and ``fit``.
        word_seq_train_: Padded input sequences.
        train_df_: DataFrame whose ``label`` column holds the targets.
        batch_size: Mini-batch size passed to ``fit``. Defaults to 32, the
            value previously hard-coded.

    Returns:
        The object returned by ``model_.fit`` (a Keras ``History``), so the
        caller can inspect per-epoch metrics instead of only seeing its repr.
    """
    y_train = train_df_['label'].values
    model_.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    es_callback = EarlyStopping(monitor='val_loss', patience=3)

    # NOTE(review): with shuffle=False and validation_split=0.2 the rows are
    # used in their given order -- confirm the CSV is not sorted by label.
    history = model_.fit(word_seq_train_, y_train, batch_size=batch_size,
                         epochs=NUM_EPOCHS,
                         validation_split=0.2,
                         callbacks=[es_callback],
                         shuffle=False)
    print(history)
    return history


# Script entry point: runs the pipeline end to end
# (download CSV -> build DataFrame -> load word vectors -> index words ->
# build model -> train).
if __name__ == '__main__':
    # Fetch the CSV text from S3.
    csv_string = string_from_AWS()

    # Parse it and derive the sequence length used for padding / model input.
    train_df, max_seq_len = stringToTrainDf(csv_string)

    # Populate the module-level EMBEDDING_INDEX; must run before makingModel,
    # which reads it.
    loadWikiModel()

    # Tokenize documents and convert them to padded integer sequences.
    word_index, word_seq_train = word_indexing(train_df, max_seq_len)

    # Build the network with an embedding layer initialised from EMBEDDING_INDEX.
    model = makingModel(word_index, max_seq_len)

    # Compile and fit the model.
    learning(model, word_seq_train, train_df)


879131it [03:27, 4235.34it/s]
100%|██████████| 509/509 [00:12<00:00, 40.08it/s] 


dictionary size :  2095
number of null word embeddings: 167
sample words not found:  ['60%' '2.1%' '30일' '2%' '스쿨존' '1987년' '메가시티' '199' '2007년' '못연']
Extension horovod.torch has not been built: /home/ubuntu/anaconda3/envs/tensorflow2_latest_p37/lib/python3.7/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-37m-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.
[2021-10-12 08:55:59.012 ip-172-31-32-76:3836 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-10-12 08:55:59.822 ip-172-31-32-76:3836 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
<tensorflow.python.keras.callbacks.History object at 0x7f2b8c611fd0>
