In [24]:
import numpy as np 
import pandas as pd 

from transformers import TFDistilBertForSequenceClassification, DistilBertConfig

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input

import pickle

### Checks

In [6]:
tf.test.is_built_with_cuda(), tf.config.list_physical_devices('GPU')

(True, [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')])

### Load Data

In [7]:
f = open('../data/preprocessed.pkl','rb')
train, valid = pickle.load(f)
labels = train.columns[2:]
ys_train = train[labels]
ys_valid = valid[labels]

## COMBINE TOXIC CATEGORIES
y_train = ys_train.sum(axis=1)
y_valid = ys_valid.sum(axis=1)
y_train.loc[y_train>1] = 1
y_valid.loc[y_valid>1] = 1

### Build tokenizer

In [8]:
## Load pretrained Distil Bert Tokenizer
from transformers import DistilBertTokenizer
distil_bert = 'distilbert-base-uncased' # pre-trained model
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert, do_lower_case=True, add_special_tokens=True,
                                                max_length=128, pad_to_max_length=True)

In [9]:
## Tokenizer function
def tokenize(sentences, tokenizer):
    input_ids, input_masks, input_segments = [],[],[]
    for sentence in sentences:
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=128, pad_to_max_length=True, 
                                             return_attention_mask=True, return_token_type_ids=True, truncation=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])        
        
    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')

In [10]:
## Tokenize all
train_ids, train_masks, train_segs = tokenize(train['comment_text'], tokenizer)
valid_ids, valid_masks, valid_segs = tokenize(valid['comment_text'], tokenizer)

In [11]:
train_ids.shape, train_masks.shape, train_segs.shape

((106912, 128), (106912, 128), (106912, 128))

### Build transformer model

In [16]:
config = DistilBertConfig(num_labels=1)
config.output_hidden_states = False
transformer_model = TFDistilBertForSequenceClassification.from_pretrained(distil_bert, config = config)

input_ids = Input(shape=(128,), name='input_token', dtype='int32')
input_masks_ids = Input(shape=(128,), name='masked_token', dtype='int32')
X = transformer_model(input_ids, input_masks_ids)
model = tf.keras.Model(inputs=[input_ids, input_masks_ids], outputs = X)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_39', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [17]:
print(model.summary())

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_for_sequence_cla ((None, 1),)         66954241    input_token[0][0]                
                                                                 masked_token[0][0]               
Total params: 66,954,241
Trainable params: 66,954,241
Non-trainable params: 0
__________________________________________________________________________________________________
None


In [21]:
## Custom metrics
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [32]:
model.compile(optimizer='adam', loss= 'binary_crossentropy', metrics=['acc'])

In [31]:
model.fit(x = [train_ids, train_masks],
          y = y_train,
          batch_size = 32,
          epochs = 2,
          validation_data=([valid_ids, valid_masks], y_valid))

Epoch 1/2


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (functional_3/tf_distil_bert_for_sequence_classification_1/classifier/BiasAdd:0) = ] [[-3.01141644][-3.02550435][-3.17375875]...] [y (Cast_8/x:0) = ] [0]
	 [[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
	 [[assert_less_equal/Assert/AssertGuard/pivot_f/_13/_45]]
  (1) Invalid argument:  assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (functional_3/tf_distil_bert_for_sequence_classification_1/classifier/BiasAdd:0) = ] [[-3.01141644][-3.02550435][-3.17375875]...] [y (Cast_8/x:0) = ] [0]
	 [[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_63011]

Function call stack:
train_function -> train_function
