In [1]:
# Mount Google Drive into the Colab filesystem; the CSV paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import time
!pip install contractions
import contractions
import re
import string
import operator
import nltk
from nltk.corpus import stopwords

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 5.2 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 69.8 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.24


In [27]:
# Load the train/test CSVs from Drive; the first CSV column is used as the DataFrame index.
train = pd.read_csv('/content/drive/MyDrive/projects/kaggle-disaster/train.csv', index_col = 0)
test = pd.read_csv('/content/drive/MyDrive/projects/kaggle-disaster/test.csv', index_col = 0)

In [28]:
def col_concat(df):
    """Prefix each row's ``text`` with its ``keyword``, modifying ``df`` in place.

    Missing keywords are replaced by an empty string first, so rows without a
    keyword end up with a single leading space followed by the original text.

    Args:
        df: DataFrame with ``keyword`` and ``text`` columns.

    Returns:
        None. ``df`` is mutated: ``keyword`` has its NaNs filled with ``''`` and
        ``text`` becomes ``"<keyword> <text>"``.

    Note:
        Not idempotent: calling it twice on the same frame prepends the keyword twice.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `df.keyword`: the chained in-place form operates on a temporary Series under
    # pandas Copy-on-Write and leaves NaN in place, which `astype(str)` would then
    # turn into the literal prefix "nan".
    df["keyword"] = df["keyword"].fillna('')
    df["text"] = df["keyword"].astype(str) + ' ' + df["text"]
    
# Fold the keyword into the text column of both splits (mutates the frames in place).
for frame in (train, test):
    col_concat(frame)

# keep=False discards every row whose text occurs more than once,
# i.e. no representative of a duplicated text is retained.
train = train.drop_duplicates(keep=False, subset=['text'])

In [19]:
!pip install bert-for-tf2
from bert import bert_tokenization
import tensorflow
import tensorflow as tf
import tensorflow_hub as hub
from sklearn import preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [29]:
def get_bert_layer():
    """Load the trainable BERT encoder from TF Hub and build its matching tokenizer.

    Returns:
        tuple: ``(bert_layer, tokenizer)`` where ``bert_layer`` is a
        ``hub.KerasLayer`` created with ``trainable=True`` and ``tokenizer`` is a
        ``bert_tokenization.FullTokenizer`` configured from the vocabulary file
        and lower-casing flag exposed by the loaded layer.
    """
    hub_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
    encoder = hub.KerasLayer(hub_url, trainable=True)

    # The tokenizer settings are read from the loaded model itself so that the
    # vocabulary and casing always agree with the encoder weights.
    resolved = encoder.resolved_object
    tokenizer = bert_tokenization.FullTokenizer(
        resolved.vocab_file.asset_path.numpy(),
        resolved.do_lower_case.numpy(),
    )

    return encoder, tokenizer

def bert_encode(texts, tokenzier, max_len=512):
    """Encode texts into fixed-length token-id, mask and segment-id arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenzier: Tokenizer object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``. The misspelled parameter name is
            kept so existing keyword callers keep working.
        max_len: Length of every encoded row, including ``[CLS]`` and ``[SEP]``.

    Returns:
        tuple of three ``np.ndarray``: token ids, padding mask (1 for real
        tokens, 0 for padding) and segment ids (all zeros).

    Raises:
        ValueError: If ``max_len`` is smaller than 2, which leaves no room for
            the ``[CLS]`` and ``[SEP]`` markers.
    """
    if max_len < 2:
        raise ValueError(f"max_len must be at least 2 to fit [CLS] and [SEP], got {max_len}")

    # Use the tokenizer that was passed in. The previous body referenced a global
    # named `tokenizer`, so the argument was silently ignored and the function
    # failed with NameError whenever that global did not exist.
    tokenizer = tokenzier

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        # Pad with id 0 (presumably the [PAD] entry of the BERT vocabulary - verify
        # against the vocab file if a different tokenizer is used).
        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence)) + [0] * pad_len
        pad_mask = [1] * len(input_sequence) + [0] * pad_len
        # Single-sentence input: every position belongs to segment 0.
        segment_ids = [0] * max_len

        all_tokens.append(token_ids)
        all_masks.append(pad_mask)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier on top of a BERT encoder layer.

    The model takes three int32 inputs of shape ``(max_len,)`` (word ids, input
    mask, segment ids), feeds them through ``bert_layer``, takes the encoder
    output at sequence position 0 and passes it through
    Dense(64, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        bert_layer: Callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns a
            ``(pooled_output, sequence_output)`` pair.
        max_len: Sequence length of the three inputs.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 1e-5),
        binary cross-entropy loss and the accuracy metric.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token sequence output is used; the pooled output is discarded.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 is where the [CLS] marker sits in the encoded inputs.
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(net)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which triggers a
    # warning on current Keras and is rejected by newer optimizer classes.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [30]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter; shuffling is seeded so the folds are reproducible.
kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [31]:
import gc

results = []   # best validation accuracy reached in each fold
models = []    # one fitted model per fold, reused later for test-time voting
max_len = 64

fold_splits = kf.split(train.text.values, train.target.values)

# Fold numbers start at 1 and are used in the checkpoint file name.
for train_num, (train_index, val_index) in enumerate(fold_splits, start=1):
    train_df = train.iloc[train_index]
    val_df = train.iloc[val_index]

    # A fresh encoder is loaded for every fold so folds do not share fine-tuned weights.
    bert_layer, tokenizer = get_bert_layer()
    x_train = bert_encode(train_df.text.values, tokenizer, max_len=max_len)
    x_valid = bert_encode(val_df.text.values, tokenizer, max_len=max_len)
    train_labels = train_df.target.values
    valid_labels = val_df.target.values

    model = build_model(bert_layer, max_len=max_len)

    weights_path = f'/kaggle/working/model_ver{train_num}.h5'
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        weights_path,
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1,
    )
    earlystopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=2,
        verbose=1,
    )

    train_history = model.fit(
        x_train,
        train_labels,
        validation_data=(x_valid, valid_labels),
        epochs=30,
        callbacks=[checkpoint, earlystopping],
        batch_size=16,
        verbose=1,
    )

    best_val_accuracy = max(train_history.history['val_accuracy'])
    results.append(best_val_accuracy)

    # Training may have continued past the best epoch; restore the checkpointed weights.
    model.load_weights(weights_path)
    models.append(model)

    # Drop the loop-local name and reset Keras/graph state before the next fold.
    # The model object itself stays alive through the `models` list.
    del model
    gc.collect()
    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()

print(np.mean(results))

Epoch 1/30


  super(Adam, self).__init__(name, **kwargs)


Epoch 1: val_accuracy improved from -inf to 0.82754, saving model to /kaggle/working/model_ver1.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.82754 to 0.82888, saving model to /kaggle/working/model_ver1.h5
Epoch 3/30
Epoch 3: val_accuracy did not improve from 0.82888
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.82888
Epoch 4: early stopping
Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.81952, saving model to /kaggle/working/model_ver2.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.81952 to 0.82152, saving model to /kaggle/working/model_ver2.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.82152 to 0.82286, saving model to /kaggle/working/model_ver2.h5
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.82286
Epoch 5/30
Epoch 5: val_accuracy did not improve from 0.82286
Epoch 5: early stopping
Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.85896, saving model to /kaggle/working/model_ver3.h5
Epoch 2/30
Epoch 2: val_accuracy improved from

- contractions - 0.8429144382476806 평균이 나왔으나 제출 점수 0.83787
- 마지막 레이어만 학습 - 0.8417112350463867 평균이 나왔으나 제출 점수 동일 0.84094

In [32]:
# Display the best validation accuracy recorded for each fold.
results

[0.8288770318031311,
 0.8228609561920166,
 0.8629679083824158,
 0.8495989441871643,
 0.8442513346672058]

In [33]:
# Encode the test set once; `tokenizer` and `max_len` come from the training cell.
x_test = bert_encode(test.text.values, tokenizer, max_len=max_len)

# One column of predicted probabilities per fold model, keyed by fold position.
soft_voting = pd.DataFrame()
for fold_idx, fold_model in enumerate(models):
    fold_probs = fold_model.predict(x_test)
    soft_voting[fold_idx] = fold_probs.flatten()

In [34]:
# Column 5 holds the soft vote: the plain average of the five per-fold probability columns.
fold_columns = [soft_voting[i] for i in range(5)]
soft_voting[5] = sum(fold_columns) / 5

In [36]:
# Round the averaged probabilities to hard 0/1 labels and write them into the sample submission.
voted_labels = np.round(soft_voting[5].values).astype('int32')
sub = pd.read_csv('/content/drive/MyDrive/projects/kaggle-disaster/sample_submission.csv').assign(
    target=voted_labels
)
sub.to_csv('/content/drive/MyDrive/projects/kaggle-disaster/soft_voted_last_layer.csv', index=False)