In [1]:
import numpy as np 
import pandas as pd 

from transformers import TFDistilBertModel, DistilBertConfig

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, BatchNormalization, GlobalMaxPooling1D

import pickle

### Checks

In [2]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all at
# start-up. NOTE(review): the training cell further down fails with
# "Failed to get convolution algorithm ... cuDNN failed to initialize"; that
# error is commonly caused by the GPU memory being fully reserved -- confirm
# this resolves it on the target machine.
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth can only be set before the GPUs are initialised; when this
    # cell is re-run in a live kernel just report it and carry on.
    print(err)

# Display whether this TensorFlow build has CUDA support and which GPUs it sees.
tf.test.is_built_with_cuda(), gpus

(True, [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')])

### Load Data

In [3]:
# Seed for the subsampling below so that a Restart & Run All picks the same rows.
SEED = 42

# Load the (train, valid) pair of DataFrames. The context manager closes the
# file handle even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code -- only load a
# preprocessed.pkl that was produced by this project.
with open('../data/preprocessed.pkl', 'rb') as f:
    train, valid = pickle.load(f)

## Sample subset of train for testing
train = train.sample(frac = 0.2, random_state = SEED)
valid = valid.sample(frac = 0.2, random_state = SEED)

# Every column from the third one onwards is treated as a label column.
# NOTE(review): assumes the first two columns are non-label columns -- confirm
# against the preprocessing step.
labels = train.columns[2:]
ys_train = train[labels]
ys_valid = valid[labels]

## COMBINE TOXIC CATEGORIES
# Sum the label columns per row and cap the result at 1, giving a single
# target per comment (presumably 0/1 flags, so 1 == "any category set").
y_train = ys_train.sum(axis=1).clip(upper=1)
y_valid = ys_valid.sum(axis=1).clip(upper=1)

### Build tokenizer

In [4]:
## Load the pretrained DistilBERT tokenizer
from transformers import DistilBertTokenizer

# Checkpoint name; the same name is used again when the model is loaded.
distil_bert = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(
    distil_bert,
    do_lower_case=True,
    add_special_tokens=True,
    max_length=128,
    pad_to_max_length=True,
)

In [5]:
## Tokenizer function
def tokenize(sentences, tokenizer, max_length = 128):
    """Encode sentences into fixed-length id, mask and segment arrays.

    Parameters
    ----------
    sentences : iterable of str
        Texts to encode; each one is passed to ``tokenizer.encode_plus``.
    tokenizer : object
        Tokenizer exposing ``encode_plus`` and returning a mapping with the
        keys ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``.
    max_length : int, default 128
        Length every sequence is truncated / padded to.

    Returns
    -------
    tuple of numpy.ndarray
        ``(input_ids, attention_masks, token_type_ids)``, each of dtype int32
        with one row per sentence. For an empty ``sentences`` each array has
        shape ``(0, max_length)``.
    """
    def _to_matrix(rows):
        # np.asarray([]) would collapse to shape (0,); keep a 2-D shape so
        # downstream code can always rely on (n_sentences, max_length).
        if not rows:
            return np.empty((0, max_length), dtype='int32')
        return np.asarray(rows, dtype='int32')

    input_ids, input_masks, input_segments = [], [], []
    for sentence in sentences:
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       padding='max_length',
                                       truncation=True,
                                       return_attention_mask=True,
                                       return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return _to_matrix(input_ids), _to_matrix(input_masks), _to_matrix(input_segments)

In [6]:
## Encode the comment text of both splits (ids, attention masks, segment ids)
train_ids, train_masks, train_segs = tokenize(sentences=train['comment_text'],
                                              tokenizer=tokenizer)
valid_ids, valid_masks, valid_segs = tokenize(sentences=valid['comment_text'],
                                              tokenizer=tokenizer)

In [7]:
# Sanity check: the three encoded training arrays should share one shape.
tuple(encoded.shape for encoded in (train_ids, train_masks, train_segs))

((21382, 128), (21382, 128), (21382, 128))

### Build transformer model

In [27]:
# Configure transformer model
config = DistilBertConfig(dropout = 0.2, attention_dropout = 0.2)
config.output_hidden_states = False
transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config = config)

# Freeze the pretrained transformer so only the head below is trained.
# Freezing the object directly does not depend on where Keras places it in
# model.layers (the previous model.layers[:3] slice did).
transformer_model.trainable = False

# Inputs: token ids and attention mask. The sequence length is taken from the
# tokenized data so the model cannot drift from the tokenizer's max_length.
seq_len = train_ids.shape[1]
input_ids = Input(shape=(seq_len,), name='input_token', dtype='int32')
input_masks_ids = Input(shape=(seq_len,), name='masked_token', dtype='int32')

# Transformer layer: element [0] of the output is the per-token hidden state
# sequence that the convolution below slides over.
X = transformer_model(input_ids, attention_mask=input_masks_ids)[0]

# Convolutional classification head: Conv1D over adjacent token pairs,
# batch norm, max-pool over the sequence, then a single sigmoid unit.
X = Conv1D(filters = 32, kernel_size = 2, padding = 'valid', activation = 'relu')(X)
X = BatchNormalization()(X)
X = GlobalMaxPooling1D()(X)
X = Dense(1, activation = 'sigmoid')(X)
model = tf.keras.Model(inputs=[input_ids, input_masks_ids], outputs = X)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [28]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_4 (TFDisti ((None, 128, 768),)  66362880    input_token[0][0]                
                                                                 masked_token[0][0]               
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 127, 32)      49184       tf_distil_bert_model_4

In [29]:
# Binary target -> binary cross-entropy loss; accuracy is tracked as 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [31]:
# Train on the token ids and attention masks; validation_split = 0.1 asks
# Keras to hold out 10% of the training data for validation.
model.fit(
    x=[train_ids, train_masks],
    y=y_train,
    epochs=10,
    batch_size=5,
    validation_split=0.1,
)

Epoch 1/10


UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node functional_9/conv1d_9/conv1d (defined at <ipython-input-30-4c0d4494d99c>:5) ]] [Op:__inference_train_function_114092]

Function call stack:
train_function


In [None]:
## Get predictions
# Raw sigmoid outputs for the validation set, saved as-is.
preds = model.predict([valid_ids, valid_masks], batch_size=5)
preds_df = pd.DataFrame(data=preds, columns=['distilbert'])
preds_df.to_csv('../artifacts/simple/preds/distilbert.csv')

# Threshold at 0.5 into hard 0/1 labels in a NEW array. The previous
# `preds_t = preds` only aliased the same buffer, so thresholding in place
# also overwrote the raw probabilities held in `preds`.
# Shape and dtype match `preds`; values are 1 where the score is > 0.5, else 0.
preds_t = (preds > 0.5).astype(preds.dtype)

In [None]:
## Print results
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

i_preds = preds_t
i_true = y_valid

# Evaluate predictions
acc, prec, recall, f1 = (accuracy_score(i_true, i_preds),
                        precision_score(i_true, i_preds),
                        recall_score(i_true, i_preds),
                        f1_score(i_true, i_preds))

# Save results to dataframe. Built in one step from a list of records:
# DataFrame.append is deprecated (pandas 1.4) and removed (pandas 2.0).
results = pd.DataFrame(
    [{'Label': 'Toxic_Combined',
      'Accuracy': acc,
      'Recall': recall,
      'Precision': prec,
      'F1': f1,
      'Vectorizer': 'N/A',
      'model': 'distilbert'}],
    columns=['Label','Accuracy', 'Recall', 'Precision', 'F1', 'Vectorizer', 'model'])

# print results
print('Results for {0} comments: Accuracy - {1:.2f}; Precision - {2:.2f}; Recall - {3:.2f}; F1 - {4:.2f}'.format(
                                'Toxic_Combined',
                                acc,
                                prec,
                                recall,
                                f1))