In [None]:
!pip install tf-nightly tensorflow-text tensorflow-addons tf-models-official scikit-learn tensorflow-gpu

In [13]:
import re
import pandas as pd
from google.colab import drive 
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa

from official.nlp import optimization 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [14]:
drive.mount('/content/gdrive')

data_path = "/content/gdrive/MyDrive/Colab Notebooks/sentimaster_data/"
train_file = data_path + "train_complete.csv"
test_file = data_path + "test_text.txt"

random_state = 42
df = pd.read_csv(train_file)
df.head()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Unnamed: 0,tweet,label
0,"""QT @user In the original draft of the 7th boo...",2
1,"""Ben Smith / Smith (concussion) remains out of...",1
2,Sorry bout the stream last night I crashed out...,1
3,Chase Headley's RBI double in the 8th inning o...,1
4,@user Alciato: Bee will invest 150 million in ...,2


In [26]:
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"


In [4]:
def text_preprocessing_bert(text):

    # Remove @mentions
    text = re.sub(r'(@.*?)[\s]', ' ', text)
    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)
    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def get_bert_preprocessed_dataset(df):
    df['tweet'] = df['tweet'].map(text_preprocessing_bert)
    return df


def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(3, activation='softmax', name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [30]:
classifier_model = build_classifier_model()
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = [tfa.metrics.F1Score(num_classes=3, average='macro'),
            tf.keras.metrics.CategoricalAccuracy()]


bert_df = get_bert_preprocessed_dataset(df)


label_encoder = LabelBinarizer()
label_encoder.fit(df['label'].values)

X = bert_df['tweet'].values
y = label_encoder.transform(bert_df['label'].values)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, 
                                                    random_state=random_state)

train_tf_df = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)
val_tf_df = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(32)



bert_model = hub.KerasLayer(tfhub_handle_encoder)
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)


epochs = 4
steps_per_epoch = tf.data.experimental.cardinality(train_tf_df).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')


classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [31]:
history = classifier_model.fit(x=train_tf_df,
                               validation_data=val_tf_df,
                               epochs=epochs)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [32]:
classifier_model.save(data_path + "classifier_L12_H768_A12.h5")

In [5]:
classifier_model = tf.keras.models.load_model(
      data_path + "classifier_L12_H768_A12.h5",
      custom_objects={'KerasLayer':hub.KerasLayer,
                      'AdamWeightDecay': optimization.AdamWeightDecay,
                      'WarmUp': optimization.WarmUp})

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [6]:
import numpy as np
with open(test_file) as f:
    challange_test_tweets = [
            text_preprocessing_bert(line.rstrip()) for line in f
        ]

In [8]:
from tqdm import tqdm

predictions = []
for tweet in tqdm(challange_test_tweets):
  predictions.append(classifier_model(tf.constant([tweet])))

100%|██████████| 12284/12284 [1:28:02<00:00,  2.33it/s]


In [18]:
np.squeeze(np.array(predictions), axis=1)

array([[9.8609239e-01, 1.3829094e-02, 7.8557052e-05],
       [1.0449306e-02, 6.3339937e-01, 3.5615137e-01],
       [4.3494666e-01, 5.6160021e-01, 3.4531113e-03],
       ...,
       [1.2228541e-02, 9.8630446e-01, 1.4669833e-03],
       [9.9906904e-01, 8.7922724e-04, 5.1832892e-05],
       [2.1158939e-03, 2.4772488e-02, 9.7311169e-01]], dtype=float32)

In [19]:
with open(test_file) as f:
    original_tweets = [line.rstrip() for line in f]

csv_result = pd.DataFrame()
csv_result['tweet'] = original_tweets
csv_result['label'] = label_encoder.inverse_transform(np.squeeze(np.array(predictions), axis=1))

csv_result.head()

Unnamed: 0,tweet,label
0,@user @user what do these '1/2 naked pics' hav...,0
1,OH: “I had a blue penis while I was this” [pla...,1
2,"@user @user That's coming, but I think the vic...",1
3,I think I may be finally in with the in crowd ...,2
4,"@user Wow,first Hugo Chavez and now Fidel Cast...",2


In [20]:
csv_result.to_csv(data_path + 'model_predictions2.csv', index=False)