In [None]:
# Mount Google Drive at /content/drive; the CSV files and the model checkpoint
# used below all live under this mount point. Requires the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import time

In [None]:
# Load the competition files from the mounted Drive folder.
# train and test use the first CSV column as the index (index_col=0).
train = pd.read_csv('/content/drive/MyDrive/projects/kaggle-disaster/train.csv', index_col = 0)
test = pd.read_csv('/content/drive/MyDrive/projects/kaggle-disaster/test.csv', index_col = 0)
# The sample submission is read without index_col; its `target` column is
# overwritten with predictions and written back out at the end of the notebook.
sub = pd.read_csv('/content/drive/MyDrive/projects/kaggle-disaster/sample_submission.csv')

In [None]:
# Report (rows, columns) for each split, one tuple per line.
for frame in (train, test):
    print(frame.shape)

(7613, 4)
(3263, 3)


# Pre-processing

best score : 
- col_concat 적용
- rm_stopwords 미적용
- rp_pattern 미적용
- 중복 제거 적용

## keyword & location

In [None]:
def col_concat(df):
    """Prefix each row's ``text`` with its ``keyword``, modifying ``df`` in place.

    Missing keywords are replaced with an empty string first, so those rows
    receive only a leading space rather than the literal string ``nan``.

    Args:
        df: DataFrame with ``keyword`` and ``text`` columns. Both columns are
            overwritten on the passed-in frame.

    Returns:
        None.

    Note:
        Not idempotent: every call prepends the keyword again, so run it once
        per freshly loaded frame.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the attribute accessor: that chained form works on an intermediate object
    # and leaves df untouched when pandas Copy-on-Write is active.
    df["keyword"] = df["keyword"].fillna('')
    df["text"] = df["keyword"].astype(str) + ' ' + df["text"]

    # Disabled experiment: also prefix a location marker.
    # df['location'] = df.location.where(~df.location.notna(), 'LOCATION')
    # df.location.fillna('', inplace=True)
    # df["text"] = df["location"].astype(str) + ' ' + df["text"]

In [None]:
# Prefix the keyword onto the text of both splits so train and test share one
# input format. col_concat mutates each frame in place.
for frame in (train, test):
    col_concat(frame)

## lowercase + remove stopwords

In [None]:
import nltk
# Fetch the NLTK stopword corpus (downloads into the runtime's nltk_data folder).
nltk.download('stopwords')
# English stopword list, passed to rm_stopwords below.
stopwords = nltk.corpus.stopwords.words('english')
# Preview the first ten entries.
stopwords[:10]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
def rm_stopwords(df, stopwords):
    """Lower-case ``df['text']`` and drop stopwords, modifying ``df`` in place.

    Each text is lower-cased, split on whitespace, filtered against
    ``stopwords`` and re-joined with single spaces (so runs of whitespace are
    collapsed as a side effect).

    Args:
        df: DataFrame with a string ``text`` column; the column is overwritten.
        stopwords: Iterable of lower-case words to remove.

    Returns:
        None.
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # the original list rescans it for every word of every row.
    stopword_set = frozenset(stopwords)

    def _strip(text):
        return ' '.join(word for word in text.split() if word not in stopword_set)

    df['text'] = df['text'].str.lower().apply(_strip)

In [None]:
# Lower-case and strip English stopwords from both splits (in place).
# NOTE(review): the "best score" notes at the top of the pre-processing section
# list rm_stopwords as not applied, yet this cell applies it - confirm intent.
for frame in (train, test):
    rm_stopwords(frame, stopwords)

## masking/replacing using regex

In [None]:
def rp_pattern(df):
    """Mask URLs and clock times and strip noise characters from ``df['text']``.

    Applied in order, on the ``text`` column only:
      1. URLs (http/https or bare ``www.``) are replaced with ``URL``.
      2. Clock times such as ``10:30`` or ``10:30:15`` are replaced with ``TIME``.
      3. HTML entities such as ``&amp;`` are replaced with a single space.
      4. Every ``%`` and newline character is replaced with a single space.

    Args:
        df: DataFrame with a string ``text`` column; the column is overwritten
            on the passed-in frame.

    Returns:
        None.
    """
    # Raw strings keep the regex escapes (\/ \. \s) from being parsed as invalid
    # Python string escapes; the patterns themselves are unchanged.
    url_ptrn = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    time_ptrn = r'[0-9]{2}:[0-9]{2}(:[0-9]{2})*'
    web_ptrn = r'&[a-zA-Z]*;'
    # The previous df.replace('%', ' ') / df.replace('\n', ' ') calls ran without
    # regex=True, which only replaces cells that are exactly '%' or '\n'; a
    # character class replaces the characters wherever they occur in the text.
    noise_ptrn = r'[%\n]'

    substitutions = (
        (url_ptrn, 'URL'),
        (time_ptrn, 'TIME'),
        (web_ptrn, ' '),
        (noise_ptrn, ' '),
    )

    # Work on the Series and assign it back rather than using inplace=True on
    # df.text, which does not propagate to df under pandas Copy-on-Write.
    cleaned = df['text']
    for pattern, token in substitutions:
        cleaned = cleaned.replace(to_replace=pattern, value=token, regex=True)
    df['text'] = cleaned

In [None]:
# Mask URLs / times and strip noise characters in both splits (in place).
# NOTE(review): the "best score" notes at the top of the pre-processing section
# list rp_pattern as not applied, yet this cell applies it - confirm intent.
for frame in (train, test):
    rp_pattern(frame)

## remove duplicates / mislabelled

In [None]:
# keep=False drops every row whose text occurs more than once (not only the
# extra copies), so no copy of a duplicated text remains in train.
# Presumably this is how duplicated tweets with conflicting labels are
# removed - confirm against the data.
train = train.drop_duplicates(subset=['text'], keep=False)

# split

stratify 옵션, test size 작게 (train 데이터 크기가 작은 관계로)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation, stratified on the label so
# both parts keep the same class balance.
# random_state pins the shuffle: without it every run produces a different
# split, so val_accuracy and the saved "best" checkpoint are not comparable
# between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    train['text'],
    train['target'],
    stratify=train['target'],
    test_size=0.1,
    random_state=42,
)

# Text Classification using BERT

In [None]:
!pip install bert-for-tf2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 147 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30535 sha256=586cdb65c8357e3d2e2ed30e497b191a3f53031118fef6ba5309d281bf2a8979
  Stored in directory: /root/.cache/pip/wheels/47/b6/e5/8c76ec779f54bc5c2f1b57d2200bb9c77616da83873e8acb53
  Building wheel for params-flow (setup.py) ... [?25l[?25hdone
  Created wheel for params-flow: filename=params_flow-0.8.2-py3-none-any.whl size=19472 sha256=07d6c552543e78753198a95529893a2ad1176c

In [None]:
from bert import bert_tokenization
import tensorflow
import tensorflow as tf
import tensorflow_hub as hub
from sklearn import preprocessing

In [None]:
# TF Hub handle of the pretrained English uncased BERT encoder (version 2).
m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
# trainable=True marks the encoder weights as trainable, so they are updated
# together with the classifier head during model.fit.
bert_layer = hub.KerasLayer(m_url, trainable=True)

모델 구조 좀더 단순하게 조정

In [None]:
# Build the tokenizer from the vocabulary file and lower-casing flag exposed by
# the loaded hub layer, so tokenization matches the pretrained encoder.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into fixed-length BERT input arrays.

    Every text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` (returning a list of
            tokens) and ``convert_tokens_to_ids(tokens)`` (returning a list of
            ids).
        max_len: Length of every encoded row, special tokens included.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays with one row
        per text. ``masks`` holds 1 for real tokens and 0 for padding;
        ``segment_ids`` is all zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    # Two positions are reserved for the [CLS] and [SEP] markers.
    body_budget = max_len - 2

    for raw in texts:
        pieces = ["[CLS]", *tokenizer.tokenize(raw)[:body_budget], "[SEP]"]
        n_real = len(pieces)
        n_pad = max_len - n_real

        token_rows.append(tokenizer.convert_tokens_to_ids(pieces) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

def build_model(bert_layer, max_len=512, learning_rate=1e-5):
    """Build and compile a binary classifier on top of a BERT hub layer.

    The model takes the three arrays produced by ``bert_encode`` (token ids,
    input mask, segment ids), runs them through ``bert_layer``, and feeds the
    sequence output at position 0 (where ``bert_encode`` places ``[CLS]``)
    through Dense(64, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        bert_layer: Callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: Sequence length of each input; must match the ``max_len`` used
            when encoding the texts.
        learning_rate: Adam learning rate. Defaults to the previously
            hard-coded ``1e-5``.

    Returns:
        A compiled ``tf.keras.Model`` with binary cross-entropy loss and an
        accuracy metric.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The pooled output is not used; only the per-token sequence output is.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Representation of the first token of every sequence.
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    # Second hidden stage intentionally left disabled:
    # net = tf.keras.layers.Dense(32, activation='relu')(net)
    # net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(net)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # (it emitted a warning here) and is not accepted by newer Keras optimizers.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

max_len 을 120에서 80으로 조정

In [None]:
# Sequence length used for both encoding and the model inputs.
max_len = 80
# NOTE(review): x_train / x_valid are rebound from pandas Series to the tuple of
# arrays returned by bert_encode, so re-running this cell without re-running the
# split cell fails on `.values`.
x_train = bert_encode(x_train.values, tokenizer, max_len=max_len)
x_valid = bert_encode(x_valid.values, tokenizer, max_len=max_len)
# NOTE(review): train_labels / valid_labels are not referenced by the later
# cells shown in this notebook; model.fit receives y_train / y_valid directly.
train_labels = y_train.values
valid_labels = y_valid.values

In [None]:
# Build the classifier on the shared bert_layer with the same max_len that was
# used to encode the inputs, then print the layer/parameter overview.
model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 80)]         0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 80)]         0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 80)]         0           []                               
                                                                                                  
 keras_layer_2 (KerasLayer)     [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 80, 768)]                 'input_mask[0][0]',       

  super(Adam, self).__init__(name, **kwargs)


In [None]:
%%time
# Write the model to Drive whenever val_accuracy improves (save_best_only=True);
# the prediction cell later reloads this file.
checkpoint = tf.keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/projects/kaggle-disaster/model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
# Stop training once val_accuracy has gone 5 epochs without improving.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

# x_train / x_valid are the (token ids, masks, segment ids) tuples from
# bert_encode, matching the model's three inputs in order.
train_history = model.fit(
    x_train, y_train, 
    validation_data=(x_valid,y_valid),
    epochs=30,
    callbacks=[checkpoint, earlystopping],
    batch_size=16,
    verbose=1
)

Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.85829, saving model to /content/drive/MyDrive/projects/kaggle-disaster/model.h5
Epoch 2/30
Epoch 2: val_accuracy did not improve from 0.85829
Epoch 3/30
Epoch 3: val_accuracy did not improve from 0.85829
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.85829
Epoch 5/30
 89/421 [=====>........................] - ETA: 1:33 - loss: 0.1366 - accuracy: 0.9494


KeyboardInterrupt



In [None]:
%%time 
# Restore the checkpoint written by ModelCheckpoint (best val_accuracy) before
# predicting, rather than using the weights from the last epoch that ran.
model.load_weights('/content/drive/MyDrive/projects/kaggle-disaster/model.h5')
test_input = bert_encode(test.text.values, tokenizer, max_len=max_len)
test_pred = model.predict(test_input)
# Round the sigmoid outputs to 0/1 labels.
# NOTE(review): assumes the rows of sub are in the same order as test - confirm.
sub['target'] = test_pred.round().astype(int)

# Timestamp the file name so repeated runs do not overwrite earlier submissions.
timestr = time.strftime("%Y%m%d-%H%M%S")
sub.to_csv(f'/content/drive/MyDrive/projects/kaggle-disaster/submission-{timestr}.csv', index=False)

CPU times: user 20.4 s, sys: 355 ms, total: 20.8 s
Wall time: 32.6 s
