In [None]:
pip install bert-for-tf2



In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# NOTE(review): every path used in the visible cells lives directly under
# /content (train.csv, test.csv, bert_model.h5), not under /content/gdrive —
# confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import pandas as pd
import numpy as np
import re

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow.keras.backend as K

from bert import bert_tokenization
from tqdm import tqdm
from tensorflow.keras.layers import LSTM, Bidirectional, Embedding, Dense, \
    TimeDistributed, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import callbacks
from tensorflow.keras.models import load_model


def get_f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true`` as a backend tensor.

    Both inputs are clipped to [0, 1] and rounded before counting, so
    predictions are effectively thresholded at 0.5. ``K.epsilon()`` is added
    to every denominator to avoid division by zero.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor ``2 * P * R / (P + R + eps)`` computed over the tensors
        that were passed in.
    """
    eps = K.epsilon()

    # Counts of hits, actual positives and predicted positives.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2*(precision*recall)/(precision+recall+eps)


def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw strings into fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: total length of every encoded row, special tokens included.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of numpy arrays. ``masks``
        holds 1 for real tokens and 0 for padding; ``segment_ids`` is all
        zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_budget = max_len - 2  # room left once [CLS] and [SEP] are added

    for raw in texts:
        pieces = tokenizer.tokenize(raw)[:body_budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = [0] * (max_len - used)

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += padding

        token_rows.append(ids)
        mask_rows.append([1] * used + padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
# Path of the training CSV; the next cell loads it with pd.read_csv(file).
file = '/content/train.csv'

In [None]:
# ---- Load the training data -------------------------------------------------
# Only the tweet text and its label are used; the other columns are dropped.
df = pd.read_csv(file).drop(columns=['id', 'keyword', 'location'])

# Explicit copies: assigning into a bare .iloc slice of `df` is what raised
# pandas' SettingWithCopyWarning in the original version of this cell.
df_attributes = df.iloc[:, :-1].copy()
df_label = df.iloc[:, -1:].copy()

df_attributes['text'] = df_attributes['text'].str.lower()

texts = df_attributes['text'].to_list()
training_labels = df_label['target'].to_list()

# ---- Clean the tweets -------------------------------------------------------
# Compiled once instead of on every loop iteration.
# NOTE(review): the last range spans U+24C2..U+1F251, which is far wider than
# the emoji blocks listed above it — confirm that stripping everything in that
# range is intended.
emojis = re.compile("["
                    u"\U0001F600-\U0001F64F"
                    u"\U0001F300-\U0001F5FF"
                    u"\U0001F680-\U0001F6FF"
                    u"\U0001F1E0-\U0001F1FF"
                    u"\U00002702-\U000027B0"
                    u"\U000024C2-\U0001F251]+", flags=re.UNICODE)

cleaned_texts = []

for text in texts:
    # Raw strings keep `\s` / `\W` from being read as (invalid) string escapes.
    cleaned = re.sub(r'https?://\S+', '', text)      # strip URLs
    cleaned = re.sub(r'\n', ' ', cleaned)            # newlines -> spaces
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()   # collapse whitespace
    cleaned = re.sub(r'[\W]+', ' ', cleaned)         # non-word runs -> space
    cleaned = emojis.sub(r'', cleaned)

    # Replace each word that has an entry in `abbreviations` by its expansion.
    # NOTE(review): `abbreviations` is not defined in the visible part of this
    # notebook; it must be a mapping created before this cell runs — confirm.
    tokens = [abbreviations.get(word, word) for word in cleaned.split()]

    # Join once per tweet (the original re-joined inside the word loop and
    # skipped the join entirely when a tweet had no tokens).
    cleaned_texts.append(' '.join(tokens))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Pre-trained BERT encoder loaded from TF Hub and fine-tuned end to end
# (trainable=True). This is the whole-word-masking checkpoint; the variant
# without whole-word masking that was tried earlier is
# 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/2'.
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/2', trainable=True)

# Every input is truncated / padded to this many tokens, [CLS] and [SEP] included.
max_seq_length = 25

# Build the tokenizer from the vocabulary and casing settings shipped with the
# hub module so that tokenization matches the pre-trained weights.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

# The three integer inputs, in the order they are passed to the BERT layer.
input_word_ids = Input(shape=(max_seq_length,), dtype=tf.int32,
                       name="input_word_ids")
input_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

# The second output is used as the per-token sequence; position 0 (where
# bert_encode places the [CLS] token) feeds a single sigmoid unit.
_, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
clf_output = sequence_output[:, 0, :]
out = Dense(1, activation='sigmoid')(clf_output)

model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=2e-6), metrics=[
    'accuracy', get_f1])

# Save the model with the highest F1 score on the held-out validation split.
# The original monitored the *training* metric ('get_f1'), which rewards
# over-fitting even though a validation split is available.
callbacks_list = [
    callbacks.ModelCheckpoint(
        filepath='/content/bert_model.h5',
        monitor='val_get_f1',
        mode='max',
        save_best_only=True,
    )
]

# Encode the cleaned tweets into (token ids, masks, segment ids).
cleaned_np = np.asarray(cleaned_texts)
train_input = bert_encode(cleaned_np, tokenizer, max_len=max_seq_length)
train_labels_np = np.asarray(training_labels)

model.fit(
    train_input,
    train_labels_np,
    validation_split=0.10,
    epochs=8,
    batch_size=64,
    callbacks=callbacks_list
)



Epoch 1/8

KeyboardInterrupt: ignored

In [None]:
# ---- Load the test data -----------------------------------------------------
file_test = '/content/test.csv'
df_test = pd.read_csv(file_test)

# Keep the ids for the submission file before dropping the unused columns.
id_list = df_test['id'].to_list()
df_test = df_test.drop(columns=['id', 'keyword', 'location'])

df_test['text'] = df_test['text'].str.lower()

texts_test = df_test['text'].to_list()

# ---- Clean the tweets (same steps as the training cell) ---------------------
# Compiled once instead of on every loop iteration.
emojis = re.compile("["
                    u"\U0001F600-\U0001F64F"
                    u"\U0001F300-\U0001F5FF"
                    u"\U0001F680-\U0001F6FF"
                    u"\U0001F1E0-\U0001F1FF"
                    u"\U00002702-\U000027B0"
                    u"\U000024C2-\U0001F251]+", flags=re.UNICODE)

cleaned_texts_test = []

for text in texts_test:
    # Raw strings keep `\s` / `\W` from being read as (invalid) string escapes.
    cleaned = re.sub(r'https?://\S+', '', text)
    cleaned = re.sub(r'\n', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = re.sub(r'[\W]+', ' ', cleaned)
    cleaned = emojis.sub(r'', cleaned)

    # Expand abbreviations exactly as the training cell does; the original
    # skipped this step here, so the model saw differently-preprocessed text
    # at prediction time than it was trained on.
    # NOTE(review): `abbreviations` is not defined in the visible part of this
    # notebook; it is the same mapping the training cell relies on — confirm.
    tokens = [abbreviations.get(word, word) for word in cleaned.split()]
    cleaned_texts_test.append(' '.join(tokens))

cleaned_texts_test_np = np.asarray(cleaned_texts_test)
test_input = bert_encode(cleaned_texts_test_np, tokenizer, max_len=max_seq_length)

# ---------------- Predicting -----------------------
# Reload the checkpoint written by ModelCheckpoint during training.
model = load_model('/content/bert_model.h5', custom_objects={'get_f1': get_f1, 'KerasLayer': hub.KerasLayer})

preds = model.predict(test_input)

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 labels.
output_preds = (np.asarray(preds).ravel() >= 0.50).astype(int).tolist()
assert len(output_preds) == len(id_list), "one prediction expected per test id"

# `with` guarantees the file is closed even if a write fails.
with open('DanielChen_bert_submission.csv', 'w') as f:
    f.write("id,target\n")
    for tweet_id, label in zip(id_list, output_preds):
        f.write("{},{}\n".format(tweet_id, label))