In [None]:
!pip install -q tensorflow-text
!pip install -q tf-models-official

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
import csv
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization

import matplotlib.pyplot as plt

In [None]:
# Seed the NumPy and TensorFlow global random generators with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Last expression of the cell: shows the GPU device name reported by TensorFlow.
tf.test.gpu_device_name()

In [None]:
# TF Hub handles for the text preprocessor and the encoder used throughout the
# notebook. The commented URLs are alternatives; change the assigned string to
# switch models.
# NOTE(review): when switching encoders, confirm on the encoder's TF Hub page
# which preprocessor it expects.

# BERT preprocessor: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
# ALBERT preprocessor: https://tfhub.dev/tensorflow/albert_en_preprocess/3
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

# BERT small: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2
# BERT base : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
# BERT large: https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4
# BERT pubmed: https://tfhub.dev/google/experts/bert/pubmed/2
# ELECTRA base: https://tfhub.dev/google/electra_base
# ELECTRA large: https://tfhub.dev/google/electra_large/2
# ALBERT base: https://tfhub.dev/tensorflow/albert_en_base/2
# ALBERT xxlarge: https://tfhub.dev/tensorflow/albert_en_xxlarge/3
tfhub_handle_encoder = "https://tfhub.dev/google/electra_large/2"

In [None]:
%%time
# Number of label columns to create: one 0/1 indicator per class id 0..NUM_CLASSES-1.
NUM_CLASSES = 11

records = []
# newline='' lets the csv module itself handle line breaks inside quoted fields
# (as the csv documentation requires); the explicit encoding avoids depending on
# the platform's default text encoding.
with open("/kaggle/input/sentiment-analysis-of-covid-19-related-tweets/training.csv",
          newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0

    # The first row holds the column names; it is reported but not stored.
    header = next(csv_reader, None)
    if header is not None:
        print(f'Column names are {", ".join(header)}')
        line_count += 1

    for row in csv_reader:
        # Third column: whitespace-separated class ids. split() without an
        # argument tolerates repeated spaces and an empty field.
        labels = {int(token) for token in row[2].split()}
        one_hot = [1 if class_id in labels else 0 for class_id in range(NUM_CLASSES)]
        records.append((row[0], row[1], *one_hot))
        line_count += 1

    # The reported count includes the header row.
    print(f'Processed {line_count} lines.')

# Columns: ID, Tweet, then one integer-named indicator column per class id.
df = pd.DataFrame.from_records(records, columns=["ID", "Tweet"] + list(range(NUM_CLASSES)))
df.info()

In [None]:
# Preview the first rows of the parsed training frame.
df.head()

In [None]:
# Features: the second column (tweet text). Targets: every column from the third
# onwards (the 0/1 class indicator columns).
X = df.iloc[:, 1].to_numpy()
y = df.iloc[:, 2:].to_numpy()

# Earlier fixed train/test/validation split, kept for reference only:
#X_train, y_train = X[:4000], y[:4000]
#X_test, y_test   = X[4000:4700], y[4000:4700]
#X_val, y_val     = X[4700:], y[4700:]

In [None]:
%%time
# Load the preprocessing model from TF Hub as a Keras layer. This standalone copy
# is only used to inspect the preprocessor's output on a sample tweet.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
# Push the first tweet through the preprocessor and show what comes out.
text_test = [X[0]]
text_preprocessed = bert_preprocess_model(text_test)

print(text_test)
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')

# For each preprocessor output, show the first 12 positions of the first example.
for preview_label, preview_key in (
    ("Word Ids   ", "input_word_ids"),
    ("Input Mask ", "input_mask"),
    ("Type Ids   ", "input_type_ids"),
):
    print(f'{preview_label}: {text_preprocessed[preview_key][0, :12]}')

In [None]:
%%time
# Load the encoder from TF Hub as a Keras layer for a standalone inspection run.
# The variable is named `bert_model`, but the model loaded is whichever handle
# `tfhub_handle_encoder` points to.
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
# Run the encoder on the preprocessed sample and inspect both of its outputs.
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
for output_title, output_key in (("Pooled", "pooled_output"), ("Sequence", "sequence_output")):
    output_tensor = bert_results[output_key]
    print(f'{output_title} Outputs Shape:{output_tensor.shape}')
    # First example only, sliced to at most 12 entries along the next axis.
    print(f'{output_title} Outputs Values:{output_tensor[0, :12]}')

In [None]:
def build_classifier_model(num_classes=11, dropout_rate=0.1):
  """Build the end-to-end text classifier: raw strings in, one logit per class out.

  The model chains the TF Hub preprocessing layer, the TF Hub encoder (set to
  trainable so it is fine-tuned), dropout on the encoder's pooled output, and a
  dense head without activation.

  Args:
    num_classes: Number of output units of the classification head.
      Defaults to 11, the number of label columns built from the training CSV.
    dropout_rate: Dropout rate applied to the pooled encoder output.

  Returns:
    An uncompiled `tf.keras.Model` mapping a batch of strings to raw logits of
    shape (batch, num_classes). Apply a sigmoid to obtain per-class scores.
  """
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  # Tokenisation happens inside the model, so callers pass plain strings.
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)

  # trainable=True: the encoder weights are updated during training.
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)

  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(dropout_rate)(net)
  # No activation: the head emits logits, to be paired with a from_logits loss.
  net = tf.keras.layers.Dense(num_classes, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [None]:
# Build the classifier and run it once on the sample tweet as a smoke test.
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
# The classification head has no activation, so a sigmoid is applied here to view
# the raw outputs as per-class scores.
print(tf.sigmoid(bert_raw_result))

In [None]:
# Print the layer-by-layer summary of the classifier.
classifier_model.summary()

In [None]:
# Render a diagram of the model graph.
tf.keras.utils.plot_model(classifier_model)

In [None]:
# Training hyperparameters: maximum number of epochs (the early-stopping callback
# passed to fit() may end training sooner) and the mini-batch size.
epochs = 40
batch_size = 32

In [None]:
# The classifier emits raw logits (its final Dense layer has no activation), so
# the loss applies the sigmoid itself via from_logits=True.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# The metrics receive the same raw logits. Their default decision threshold of 0.5
# is meant for probabilities; on logits the equivalent cut-off is 0.0
# (sigmoid(0) == 0.5), so the thresholds are set to 0 explicitly.
metrics = [
    tf.keras.metrics.BinaryAccuracy(threshold=0.0),
    tf.keras.metrics.Precision(thresholds=0.0),
    tf.keras.metrics.Recall(thresholds=0.0)
]

# The learning-rate schedule counts optimizer steps (batches), not samples.
# Only the rows left after the validation split are trained on, and each step
# consumes `batch_size` of them.
validation_split = 0.11  # keep equal to the validation_split passed to classifier_model.fit
num_train_examples = int(X.shape[0] * (1 - validation_split))
steps_per_epoch = (num_train_examples + batch_size - 1) // batch_size  # ceiling division
num_train_steps = steps_per_epoch * epochs
# Linear warm-up over the first 10% of the planned steps.
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
# Attach the optimizer, loss and metrics defined above to the classifier.
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Stop training once the validation loss has not improved for 3 consecutive epochs.
# NOTE(review): restore_best_weights is left at its default, so the weights kept
# after stopping are presumably those of the last epoch run rather than the best
# one. Confirm this is intended.
earlystopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [None]:
%%time
# Fine-tune on the full training set, holding out 11% of the rows for validation
# (validation_split) and stopping early via the callback defined above.
print(f'Training model with {tfhub_handle_encoder}')
# Earlier variant using the explicit train/validation arrays, kept for reference:
#history = classifier_model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=batch_size, epochs=epochs)
history = classifier_model.fit(X, y, validation_split=0.11, batch_size=batch_size, epochs=epochs, callbacks=[earlystopping_callback])

In [None]:
# Export the trained model without the optimizer state.
# NOTE(review): the output name says "bert-large" while the configured encoder
# handle is electra_large. Rename if the artefact name should reflect the model.
classifier_model.save("bert-large1", include_optimizer=False)

In [None]:
#loss, accuracy = classifier_model.evaluate(X_test, y_test)
#print(f'Loss: {loss}')
#print(f'Accuracy: {accuracy}')

In [None]:
# Plot the per-epoch training and validation curves recorded by fit().
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
# Deliberately NOT named `loss` / `epochs`: those names hold the loss object and
# the epoch count used by the optimizer/compile/fit cells, and rebinding them here
# would break re-running those cells.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

# One x position per completed epoch (may be fewer than requested because of
# early stopping).
epoch_range = range(1, len(acc) + 1)

fig, (ax_loss, ax_acc) = plt.subplots(2, 1, figsize=(10, 6))

# Red = training, blue = validation.
ax_loss.plot(epoch_range, train_loss, 'r', label='Training loss')
ax_loss.plot(epoch_range, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

ax_acc.plot(epoch_range, acc, 'r', label='Training acc')
ax_acc.plot(epoch_range, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend(loc='lower right')

# Called after drawing so the spacing accounts for the titles and labels.
fig.tight_layout()

## Evaluation and Predictions

In [None]:
# Load the tweets to predict on; the "ID" and "Tweet" columns are used below.
submission_df = pd.read_csv("/kaggle/input/sentiment-analysis-of-covid-19-related-tweets/validation.csv")

In [None]:
%%time
# Earlier variant that called the model directly on all tweets at once:
#y_pred = tf.sigmoid(classifier_model(submission_df["Tweet"].values))
# predict() runs the model in batches; the sigmoid turns the raw logits into
# per-class scores in (0, 1).
y_pred = tf.sigmoid(classifier_model.predict(submission_df["Tweet"].values, batch_size=batch_size))

In [None]:
def process_cols(columns):
    """Turn one row of per-class flags into a space-separated string of class ids.

    Every position whose value equals 1 contributes its index. If no position
    equals 1, the index of the largest value (`np.argmax`) is used instead; for
    an all-zero row that index is 0.

    Args:
        columns: Iterable of per-class values for a single row.

    Returns:
        The selected class indices joined by single spaces, e.g. "1 3".
    """
    selected = [str(position) for position, flag in enumerate(columns) if flag == 1]
    if not selected:
        # Nothing flagged: fall back to the position of the maximum value.
        selected = [str(np.argmax(columns))]
    return " ".join(selected)

In [None]:
# Build the submission file: one row per tweet with its predicted class ids.
df_indexes = submission_df[["ID"]].reset_index(drop=True)

# Threshold the per-class scores at 0.5 to get 0/1 predictions.
probabilities = np.asarray(y_pred)
binary_predictions = np.where(probabilities >= 0.5, 1, 0)

# Rows where no class reaches the threshold would otherwise all fall back to
# class 0 (argmax of an all-zero row). Flag the most probable class instead, so
# every tweet gets the label the model actually favours.
rows_without_label = np.flatnonzero(binary_predictions.sum(axis=1) == 0)
binary_predictions[rows_without_label, probabilities[rows_without_label].argmax(axis=1)] = 1

df_predictions = pd.DataFrame(binary_predictions)

df_result = pd.concat([df_indexes, df_predictions], axis=1)
# Columns 1..11 are the 0/1 class indicators (column 0 is "ID").
df_result["Labels"] = df_result.iloc[:, 1:12].apply(process_cols, axis=1)
# NOTE(review): the file name mentions bert_en_uncased_L-4_H-512_A-8 although the
# configured encoder handle is electra_large; kept unchanged here.
df_result[["ID", "Labels"]].to_csv("challenge1-bert_en_uncased_L-4_H-512_A-8-1.csv", index=False)

In [None]:
# Fraction of tweets predicted positive for each class. The mean of a 0/1 column
# is its positive rate, so no hard-coded row count is needed.
df_result.iloc[:, 1:12].mean()

In [None]:
# Display the assembled result frame (IDs, per-class flags and label strings).
df_result