## BERT - TensorFlow 2 & Hugging Face Transformers Library

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers==2.3.0 -q

[K     |████████████████████████████████| 447 kB 35.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 58.6 MB/s 
[K     |████████████████████████████████| 880 kB 58.9 MB/s 
[K     |████████████████████████████████| 132 kB 64.9 MB/s 
[K     |████████████████████████████████| 79 kB 9.9 MB/s 
[K     |████████████████████████████████| 9.0 MB 56.6 MB/s 
[K     |████████████████████████████████| 139 kB 68.8 MB/s 
[K     |████████████████████████████████| 127 kB 75.2 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m


In [3]:
dataset_directory = '/content/drive/MyDrive/temp/train.csv'

In [5]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf

## 1. Data Pipeline
- Loading the datasets from CSVs
- Preprocessing (Tokenization, Truncation & Padding)
- Creating efficient data pipelines using tf.data

In [5]:
# !unzip /content/drive-download-20220717T191232Z-001.zip

In [6]:
train_path = '/content/drive/MyDrive/temp/train.csv'
test_path = '/content/drive/MyDrive/temp/train.csv'
#test_labels_path = 'data/test_labels.csv'
#subm_path = 'data/sample_submission.csv'

In [9]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']
       
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
# df_test_labels = pd.read_csv(test_labels_path)
# df_test_labels = df_test_labels.set_index('id')
# df_train.drop(["original","genre","split"], axis=1, inplace=True)
# df_test.drop(["original","genre","split"], axis=1, inplace=True)
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
df_train.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [10]:
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

bert_model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
MAX_LEN = 128

def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
    tokenized_sentences = []

    for sentence in tqdm(sentences):
        tokenized_sentence = tokenizer.encode(
                            sentence,                  # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_seq_len,  # Truncate all sentences.
                    )
        
        tokenized_sentences.append(tokenized_sentence)

    return tokenized_sentences

def create_attention_masks(tokenized_and_padded_sentences):
    attention_masks = []

    for sentence in tokenized_and_padded_sentences:
        att_mask = [int(token_id > 0) for token_id in sentence]
        attention_masks.append(att_mask)

    return np.asarray(attention_masks)

input_ids = tokenize_sentences(df_train['comment_text'], tokenizer, MAX_LEN)
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
attention_masks = create_attention_masks(input_ids)

  0%|          | 0/159571 [00:00<?, ?it/s]

In [11]:
from sklearn.model_selection import train_test_split

labels =  df_train[label_cols].values

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=0, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=0, test_size=0.1)

train_size = len(train_inputs)
validation_size = len(validation_inputs)

In [12]:
BATCH_SIZE = 32
NR_EPOCHS = 1

def create_dataset(data_tuple, epochs=1, batch_size=32, buffer_size=10000, train=True):
    dataset = tf.data.Dataset.from_tensor_slices(data_tuple)
    if train:
        dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.repeat(epochs)
    dataset = dataset.batch(batch_size)
    if train:
        dataset = dataset.prefetch(1)
    
    return dataset

train_dataset = create_dataset((train_inputs, train_masks, train_labels), epochs=NR_EPOCHS, batch_size=BATCH_SIZE)
validation_dataset = create_dataset((validation_inputs, validation_masks, validation_labels), epochs=NR_EPOCHS, batch_size=BATCH_SIZE)

## 2. BERT Model
- Load the pretrained BERT base-model from Transformers library
- Take the first hidden-state from BERT output (corresponding to CLS token) and feed it into a Dense layer with 6 neurons and sigmoid activation (Classifier). The outputs of this layer can be interpreted as probabilities for each of the 6 classes.

In [None]:
from transformers import TFBertModel
from tensorflow.keras.layers import Dense, Flatten

class BertClassifier(tf.keras.Model):    
    def __init__(self, bert: TFBertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = Dense(num_classes, activation='sigmoid')
        
    @tf.function
    def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        outputs = self.bert(input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids,
                               position_ids=position_ids,
                               head_mask=head_mask)
        cls_output = outputs[1]
        cls_output = self.classifier(cls_output)
                
        return cls_output

model = BertClassifier(TFBertModel.from_pretrained(bert_model_name), len(label_cols))

## 3. Training Loop
- Use BinaryCrossentropy as loss function (is calculated for each of the output 6 output neurons ...that's like training 6 binary classification tasks at the same time) 
- Use the AdamW optimizer with 1-cycle-policy from the Transformers library
- AUC evaluation metrics

In [14]:
import time
from transformers import create_optimizer

steps_per_epoch = train_size // BATCH_SIZE
validation_steps = validation_size // BATCH_SIZE

# | Loss Function
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False)
train_loss = tf.keras.metrics.Mean(name='train_loss')
validation_loss = tf.keras.metrics.Mean(name='test_loss')

# | Optimizer (with 1-cycle-policy)
warmup_steps = steps_per_epoch // 3
total_steps = steps_per_epoch * NR_EPOCHS - warmup_steps
optimizer = create_optimizer(init_lr=2e-5, num_train_steps=total_steps, num_warmup_steps=warmup_steps)

# | Metrics
train_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(label_cols))]
validation_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(label_cols))]

@tf.function
def train_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)

    with tf.GradientTape() as tape:
        predictions = model(token_ids, attention_mask=masks)
        loss = loss_object(labels, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables), 1.0)

    train_loss(loss)

    for i, auc in enumerate(train_auc_metrics):
        auc.update_state(labels[:,i], predictions[:,i])
        
@tf.function
def validation_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)

    predictions = model(token_ids, attention_mask=masks, training=False)
    v_loss = loss_object(labels, predictions)

    validation_loss(v_loss)
    for i, auc in enumerate(validation_auc_metrics):
        auc.update_state(labels[:,i], predictions[:,i])
                                              
def train(model, train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch, epochs):
    for epoch in range(epochs):
        print('=' * 50, f"EPOCH {epoch}", '=' * 50)

        start = time.time()

        for i, (token_ids, masks, labels) in enumerate(tqdm(train_dataset, total=train_steps_per_epoch)):
            train_step(model, token_ids, masks, labels)
            if i % 1000 == 0:
                print(f'\nTrain Step: {i}, Loss: {train_loss.result()}')
                for i, label_name in enumerate(label_cols):
                    print(f"{label_name} roc_auc {train_auc_metrics[i].result()}")
                    train_auc_metrics[i].reset_states()
        
        for i, (token_ids, masks, labels) in enumerate(tqdm(val_dataset, total=val_steps_per_epoch)):
            validation_step(model, token_ids, masks, labels)

        print(f'\nEpoch {epoch+1}, Validation Loss: {validation_loss.result()}, Time: {time.time()-start}\n')

        for i, label_name in enumerate(label_cols):
            print(f"{label_name} roc_auc {validation_auc_metrics[i].result()}")
            validation_auc_metrics[i].reset_states()

        print('\n')

        
train(model, train_dataset, validation_dataset, train_steps_per_epoch=steps_per_epoch, val_steps_per_epoch=validation_steps, epochs=NR_EPOCHS)



  0%|          | 0/4487 [00:00<?, ?it/s]


Train Step: 0, Loss: 0.8150482177734375
toxic roc_auc 0.5848214626312256
severe_toxic roc_auc 0.0
obscene roc_auc 0.6774193644523621
threat roc_auc 0.0
insult roc_auc 0.16129031777381897
identity_hate roc_auc 0.0

Train Step: 1000, Loss: 0.1602817326784134
toxic roc_auc 0.911155641078949
severe_toxic roc_auc 0.8838526606559753
obscene roc_auc 0.8998681902885437
threat roc_auc 0.7615116834640503
insult roc_auc 0.9084868431091309
identity_hate roc_auc 0.7844926118850708

Train Step: 2000, Loss: 0.1009809821844101
toxic roc_auc 0.980647623538971
severe_toxic roc_auc 0.9884923100471497
obscene roc_auc 0.9879279136657715
threat roc_auc 0.9603656530380249
insult roc_auc 0.9844100475311279
identity_hate roc_auc 0.9683650135993958

Train Step: 3000, Loss: 0.08058936893939972
toxic roc_auc 0.9842858910560608
severe_toxic roc_auc 0.986116349697113
obscene roc_auc 0.9906655550003052
threat roc_auc 0.935275137424469
insult roc_auc 0.9860646724700928
identity_hate roc_auc 0.9839714765548706

Train

  0%|          | 0/498 [00:00<?, ?it/s]


Epoch 1, Validation Loss: 0.0371781624853611, Time: 3639.825960636139

toxic roc_auc 0.9884118437767029
severe_toxic roc_auc 0.9878920316696167
obscene roc_auc 0.9908899068832397
threat roc_auc 0.9723553657531738
insult roc_auc 0.9855911135673523
identity_hate roc_auc 0.9765434861183167




## 4. Run predictions on test-set

In [15]:
import torch
torch.save(model, '/content/drive/MyDrive/temp/bert_toxic')



INFO:tensorflow:Assets written to: ram://3e03a399-bfba-403f-8781-b181a174e5cf/assets


INFO:tensorflow:Assets written to: ram://3e03a399-bfba-403f-8781-b181a174e5cf/assets


In [16]:
model.save("/content/drive/MyDrive/temp/bert_toxic_2")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/temp/bert_toxic_2/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/temp/bert_toxic_2/assets


In [None]:
test = pd.read_csv("/content/disaster_response_messages_validation.csv")

In [11]:
import torch
DEVICE = "cuda"
bert_model_name = 'bert-base-uncased'

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']
       

MODEL = BertClassifier(TFBertModel.from_pretrained(bert_model_name), len(label_cols))

In [12]:
MODEL.load_weights("/content/drive/MyDrive/temp/bert_toxic_2")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbd3254d250>

In [13]:
test_input_ids = tokenize_sentences("hello", tokenizer, MAX_LEN)
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_attention_masks = create_attention_masks(test_input_ids)

NameError: ignored

In [None]:
TEST_BATCH_SIZE = 32
#test_steps = len(df_test) // TEST_BATCH_SIZE

test_dataset = create_dataset((test_input_ids, test_attention_masks), batch_size=TEST_BATCH_SIZE, train=False, epochs=1)

df_submission = pd.DataFrame(columns=label_cols)



for i, (token_ids, masks) in enumerate(tqdm(test_dataset, total=80)):
    #sample_ids = df_test.iloc[i*TEST_BATCH_SIZE:(i+1)*TEST_BATCH_SIZE]['id']
    predictions = MODEL(token_ids, attention_mask=masks).numpy()
    print(df_train['message'][i])
    print(predictions)

    #df_submission.loc[sample_ids, label_cols] = predictions

  0%|          | 0/80 [00:00<?, ?it/s]

Weather update - a cold front from Cuba that could pass over Haiti
[[0.60245955 0.02606841 0.08452496 ... 0.05844284 0.03321033 0.13639358]
 [0.55025965 0.02695139 0.07751189 ... 0.06023452 0.0310071  0.13594578]
 [0.5498777  0.02588744 0.08446865 ... 0.05637839 0.03414922 0.13074762]
 ...
 [0.59339905 0.02780957 0.07931907 ... 0.05667381 0.03295945 0.13070114]
 [0.6033715  0.05274021 0.13855931 ... 0.08763091 0.04146462 0.156612  ]
 [0.46919376 0.02845895 0.07920785 ... 0.04989734 0.03116344 0.10990162]]
Is the Hurricane over or is it not over
[[0.50015867 0.02571551 0.06839665 0.04365779 0.130823   0.07788306
  0.04477542 0.04272451 0.04810173 0.0840669  0.07280004 0.04997395
  0.07203158 0.06421284 0.04008645 0.06491669 0.07688624 0.11956803
  0.041901   0.13152003 0.07268601 0.09905725 0.07666575 0.11034013
  0.0979768  0.07685266 0.04003416 0.03078483 0.11986863 0.18407114
  0.07111117 0.06981549 0.03162016 0.08904915 0.05339475 0.03129373
  0.10600613]
 [0.5933988  0.02780954 0.0

In [None]:
!cp -r /content/model2 /content/drive/MyDrive/Colab%Notebooks/BERT/2/

## Inference

In [None]:
!pip install transformers keras_preprocessing tensorflow pandas numpy

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import TFBertModel
from transformers import BertTokenizer
from tensorflow.keras.layers import Dense, Flatten
from keras_preprocessing.sequence import pad_sequences

In [None]:
modelpath = "add model path here" #downloaded model path

In [None]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat',
              'insult', 'identity_hate']

In [None]:
predictionArrray = []

class BertClassifier(tf.keras.Model):    
    def __init__(self, bert: TFBertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = Dense(num_classes, activation='sigmoid')
        
    @tf.function
    def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        outputs = self.bert(input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids,
                               position_ids=position_ids,
                               head_mask=head_mask)
        cls_output = outputs[1]
        cls_output = self.classifier(cls_output)
                
        return cls_output

MODEL_PATH_BERT2 = modelpath + '/'

In [None]:
MODEL_BERT2 = BertClassifier(TFBertModel.from_pretrained('bert-base-uncased'), len(label_cols))
MODEL_BERT2.load_weights(MODEL_PATH_BERT2)
tokenizer_BERT2 = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
def tokenize_sentences_multiclass(sentences, tokenizer, max_seq_len = 128):
    tokenized_sentences = []

    for sentence in sentences:
        tokenized_sentence = tokenizer.encode(
                            sentence,                  # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_seq_len,  # Truncate all sentences.
                    )
        
        tokenized_sentences.append(tokenized_sentence)

    return tokenized_sentences


def create_dataset_multiclass(data_tuple, epochs=1, batch_size=32, buffer_size=10000, train=True):
    dataset = tf.data.Dataset.from_tensor_slices(data_tuple)
    if train:
        dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.repeat(epochs)
    dataset = dataset.batch(batch_size)
    if train:
        dataset = dataset.prefetch(1)
    
    return dataset

def create_attention_masks(tokenized_and_padded_sentences):
    attention_masks = []

    for sentence in tokenized_and_padded_sentences:
        att_mask = [int(token_id > 0) for token_id in sentence]
        attention_masks.append(att_mask)

    return np.asarray(attention_masks)

def predict_text_class(text,MAX_LEN=64):
  test_input_ids = tokenize_sentences_multiclass(text, tokenizer_BERT2, MAX_LEN)
  test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
  test_attention_masks = create_attention_masks(test_input_ids)
  test_dataset = create_dataset_multiclass((test_input_ids, test_attention_masks), batch_size=1, train=False, epochs=1)
  for i, (token_ids, masks) in enumerate(test_dataset):
    predictions = MODEL_BERT2(token_ids, attention_mask=masks).numpy()
    predictionDict = dict(zip(label_cols,predictions[0]))
    return predictionDict

### Scrappe twitter data

In [None]:
!pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint

In [None]:
import twint,json

tempath = ""

def top_tweets(username): 
    for user in username:
        c = twint.Config()
        c.Limit = 4
        c.Min_likes = 1
        c.Output = os.path.join(tempath, f"tweets_{user}.json")
        c.Store_json = True
        c.Username = user
        c.Filter_retweets = True
        a = twint.run.Search(c)

    # scrapping done & preprocessing start
    totalTweets = {}
    for user in username:
        tweets = []
        for line in open(os.path.join(os.path.join(tempath, f"tweets_{user}.json")), 'r'):
            tweets.append(json.loads(line))
            totalTweets[user] = tweets
    return totalTweets

In [None]:
tweets = top_tweets(["maestroxv_"])

In [None]:
for tweet in tweets['maestroxv_']:
    print(tweet['tweet'])
    print(predict_text_class([tweet['tweet']]))