# Import libraries

In [1]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
import tensorflow_text as texts
import pandas as pd

# Load data

In [25]:
# Location of the tweets CSV inside the project's GitHub repository.
project_url = 'https://raw.githubusercontent.com/BrianTshatiwa/99-ML-Learning-Projects/'
data_path = '/bert-pretrained/bert-text-classification/data/'

try:
    train = pd.read_csv(project_url + data_path + 'tweets.csv')
except Exception as ex:
    # Report the problem, then stop: every cell below needs `train`, so
    # continuing would only surface later as a confusing NameError.
    print(f"ERROR: {ex}")
    raise

# print first five rows
train.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [26]:
# Keep only the tweet text and its target label
train = train.loc[:, ["text", "target"]]

# Preview the first five rows
train.head()

Unnamed: 0,text,target
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,Telangana: Section 144 has been imposed in Bha...,1
2,Arsonist sets cars ablaze at dealership https:...,1
3,Arsonist sets cars ablaze at dealership https:...,1
4,"""Lord Jesus, your love brings freedom and pard...",0


# Check class imbalance

In [27]:
# Number of rows per target label
train["target"].value_counts()

0    9256
1    2114
Name: target, dtype: int64

# Undersample both classes to 500 samples each

In [28]:
# Split the frame by label so each class can be sampled independently.
df_class_0 = train[train["target"] == 0]
df_class_1 = train[train["target"] == 1]

# Row counts per label. Taken from the per-label frames rather than from
# value_counts(), whose order follows frequency and not the label value.
class_count_0, class_count_1 = len(df_class_0), len(df_class_1)

# Randomly undersample both classes to the same size (at most 500 each).
# Sampling WITHOUT replacement avoids drawing the same row more than once,
# and the fixed random_state makes the selection repeatable across runs.
samples_per_class = min(500, class_count_0, class_count_1)
df_class_0_under = df_class_0.sample(samples_per_class, replace=False, random_state=42)
df_class_1_under = df_class_1.sample(samples_per_class, replace=False, random_state=42)

train = pd.concat([df_class_0_under, df_class_1_under], axis=0)
train.target.value_counts()

0    500
1    500
Name: target, dtype: int64

# Split data into train and test - stratified sampling

In [29]:
import sklearn
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows, stratified on the label so both splits keep the
# same class proportions. random_state is fixed so the split is reproducible.
# NOTE: this cell rebinds `train`; re-running it without re-running the cells
# above splits the already-reduced training set again.
train, test = train_test_split(
    train,
    test_size=0.1,
    stratify=train.target,
    random_state=42,
)

# Download the BERT preprocessor and encoder

In [30]:
# TF Hub handles for the two layers applied to the text input below:
# first the preprocessing layer, then the encoder.
PREPROCESS_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
ENCODER_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(PREPROCESS_HANDLE)
bert_encoder = hub.KerasLayer(ENCODER_HANDLE)

# Build a model

In [32]:
# BERT part: string input -> preprocessing layer -> encoder
text_input = tf.keras.layers.Input(name='text', shape=(), dtype=tf.string)
encoder_outputs = bert_encoder(bert_preprocess(text_input))

# Classification head: dropout over the encoder's pooled output, followed by
# a single sigmoid unit for the binary target
pooled_output = encoder_outputs['pooled_output']
dropped = tf.keras.layers.Dropout(rate=0.1, name="dropout")(pooled_output)
probability = tf.keras.layers.Dense(units=1, activation='sigmoid', name="output")(dropped)

# Wire the input and the head output into the final model
model = tf.keras.Model(inputs=[text_input], outputs=[probability])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [33]:
# Train for 10 epochs in batches of 64; the held-out `test` split is used as
# validation data during training.
model.fit(
    x=train["text"],
    y=train["target"],
    validation_data=(test["text"], test["target"]),
    epochs=10,
    batch_size=64,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21098c0d460>

# Save Model

In [34]:
# Write the trained model to an HDF5 file in the working directory
MODEL_PATH = "BERTDisasterTweetClassifier.h5"
model.save(MODEL_PATH)