In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training split of the "nlp-getting-started" competition data.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
# Preview only the first rows instead of rendering the whole frame as cell output.
train.head()

In [None]:
# Load the test split of the "nlp-getting-started" competition data.
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
# Preview only the first rows instead of rendering the whole frame as cell output.
test.head()

In [None]:
import tensorflow_hub as hub
# TF-Hub handle of the pretrained encoder (bert_en_uncased, L-24 / H-1024 / A-16, version 1).
bert_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# trainable=True: the encoder weights are updated during training, not frozen.
bert_layer = hub.KerasLayer(bert_handle, trainable=True)
# Path of the vocabulary file bundled with the hub module; used to build the tokenizer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()

In [None]:
# Download the BERT `tokenization.py` helper into the working directory so it can be imported.
# NOTE(review): this fetches from the moving `master` branch — consider pinning a tag or commit
# so the path and contents stay stable; TODO confirm the file still exists at this location.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

# BERT 모델의 장점
1. 사전에 없는 단어나 오탈자도 서브워드 단위로 나누어 처리하므로 올바르게 학습할 수 있음 (예: happppppy)
2. 문맥을 반영하여 단어를 표현하므로 동음이의어도 올바르게 구분하여 학습할 수 있음


In [None]:
import tokenization
# Tokenizer for BERT, built from the vocabulary file shipped with the hub module.
# NOTE(review): the original note said it splits words into syllable units; the upstream
# tokenizer presumably produces sub-word (WordPiece) tokens — confirm against tokenization.py.
tk = tokenization.FullTokenizer(vocab_file)

In [None]:
# Data preprocessing function
def bert_encode(texts, tk, max_len=512):
    """Convert raw texts into the three fixed-length integer inputs fed to BERT.

    Each text is tokenized with ``tk``, truncated so the two special tokens
    still fit, wrapped as ``[CLS] ... [SEP]`` and right-padded with id 0 up to
    ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tk: Tokenizer object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Total length of every encoded row, including the two special
            tokens. Must be at least 2.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number_of_texts, max_len)``:

        * ``token_ids`` - vocabulary ids, 0 on padded positions
          (assumes id 0 is the vocabulary's padding token);
        * ``masks`` - 1 on real tokens (special tokens included), 0 on padding;
        * ``segment_ids`` - all zeros, since every text is a single segment.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because ``[CLS]`` and
            ``[SEP]`` alone would no longer fit in a row.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold the [CLS] and [SEP] tokens, got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []
    for text in texts:
        # Keep at most max_len - 2 pieces: two slots are reserved for the
        # special tokens added below.
        pieces = tk.tokenize(text)[:max_len - 2]
        # [CLS] is the special token used for classification; [SEP] marks the
        # end of a sentence / separates sentences.
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        # Token ids, right-padded with 0.
        all_tokens.append(list(tk.convert_tokens_to_ids(input_sequence)) + [0] * pad_len)
        # The mask tells real tokens (1) apart from meaningless padding (0).
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        # Segment ids encode which sentence a token belongs to; all 0 here.
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # Force a 2-D integer result so that empty input yields shape
        # (0, max_len) instead of a 1-D float array.
        return np.array(rows, dtype=int).reshape(len(rows), max_len)

    # The model consumes arrays, not Python lists.
    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

# Encode both splits with the same sequence length (max_len can be tuned, but it
# has to stay equal to the max_len the model is built with).
train_input, test_input = (
    bert_encode(frame['text'].values, tk, max_len=50)
    for frame in (train, test)
)

In [None]:
# Modeling (the encoded input has three components, so the model has three Input layers)
from tensorflow.keras import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam

def build_model(bert_layer, max_len=512):
    """Build and compile a two-class classifier on top of a BERT hub layer.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[word_ids, mask, segment_ids]`` and returns two outputs, the
            second of which is indexed here as ``[batch, position, features]``
            (presumably ``(pooled_output, sequence_output)`` - TODO confirm for
            the hub module version in use).
        max_len: Length of each input sequence; must equal the ``max_len`` the
            inputs were encoded with.

    Returns:
        A compiled Keras ``Model`` with three int32 inputs of shape
        ``(max_len,)`` and a 2-unit softmax output, using categorical
        cross-entropy loss, the ``acc`` metric and Adam with a 2e-5 learning
        rate.
    """
    # shape must be a tuple; the dtype is given as a string so this function
    # does not depend on `tf` having been imported by some other cell.
    # The layer names are fixed (original author's note) - do not rename them.
    input_word_ids = Input(shape=(max_len,), dtype="int32", name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype="int32", name="input_mask")
    input_segment_ids = Input(shape=(max_len,), dtype="int32", name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, input_segment_ids])
    # Axis 0: every example in the batch; axis 1: token position 0 (where
    # [CLS] is placed by the encoder function); axis 2: the full embedding.
    clf_output = sequence_output[:, 0, :]
    output = Dense(2, activation='softmax')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, input_segment_ids], outputs=output)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(
        loss='categorical_crossentropy',
        metrics=['acc'],
        optimizer=Adam(learning_rate=0.00002),
    )
    return model

In [None]:
import tensorflow as tf
# Build the classifier. max_len must equal the value used when encoding the
# inputs with bert_encode (50), because it fixes the shape of the Input layers.
model = build_model(bert_layer,max_len=50)
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train on the encoded training split. The target column is one-hot encoded to
# match the model's 2-unit softmax output and categorical cross-entropy loss.
# The explicit float32 dtype keeps the label dtype independent of the installed
# pandas version (get_dummies otherwise yields uint8 or bool depending on it).
model.fit(train_input, pd.get_dummies(train['target'], dtype='float32'), epochs=2, batch_size=32)

In [None]:
# Predict on the encoded test split; each row of `result` holds the two
# softmax outputs produced by the model's final Dense layer.
result = model.predict(test_input)

In [None]:
# Fill the sample submission with the predicted class (index of the larger
# softmax output per row) and write it to the working directory.
submission_path = 'result.csv'
sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
sub = sub.assign(target=result.argmax(axis=1))
sub.to_csv(submission_path, index=False)