In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow_addons.losses import SigmoidFocalCrossEntropy
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
from math import ceil

 The versions of TensorFlow you are currently using is 2.8.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
# I'm working with a very small dataset, and so I acknowledge that this isn't the right model for the job. But this is also some practice with transformers & Tensorflow

In [3]:
df = pd.read_csv('final-text-2.csv')

In [4]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [5]:
df = df.dropna()

In [6]:
X = df.drop('is_ass', axis=1)
y = df['is_ass']

In [7]:
X['body'] = X['title'] + X['body']
X = X.drop('title', axis=1)

In [8]:
# 80-10-10 ratio- this is a small dataset
X_train, X_split, y_train, y_split = train_test_split(X, y, test_size=0.8, random_state=42)

In [9]:
X_val, X_test, y_val, y_test = train_test_split(X_split, y_split, test_size=0.5, random_state=42)

In [10]:
# Define the maximum number of words to tokenize (DistilBERT can tokenize up to 512)
MAX_LENGTH = 512


# Define function to encode text data in batches
def batch_encode(tokenizer, texts, batch_size=128, max_length=MAX_LENGTH):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed 
    into a pre-trained transformer model.
    
    Input:
        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""
    
    input_ids = []
    attention_mask = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer.batch_encode_plus(batch,
                                             max_length=max_length,
                                             padding='longest', #implements dynamic padding
                                             truncation=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False,
                                             is_split_into_words=True
                                             )
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])
    
    
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)
    
    
# Encode X_train
X_train_ids, X_train_attention = batch_encode(tokenizer, X_train.to_numpy().tolist())

# Encode X_valid
X_val_ids, X_val_attention = batch_encode(tokenizer, X_val.to_numpy().tolist())

# Encode X_test
X_test_ids, X_test_attention = batch_encode(tokenizer, X_test.to_numpy().tolist())

In [11]:
from transformers import TFDistilBertModel, DistilBertConfig

DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2
 
# Configure DistilBERT's initialization
config = DistilBertConfig(dropout=DISTILBERT_DROPOUT, 
                          attention_dropout=DISTILBERT_ATT_DROPOUT, 
                          output_hidden_states=True)
                          
# The bare, pre-trained DistilBERT transformer model outputting raw hidden-states 
# and without any specific head on top.
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [12]:
MAX_LENGTH = 512
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42

def build_model(transformer, max_length=MAX_LENGTH):
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, 
    # which is located at index 0 of every encoded sequence.  
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(cls_token)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), 
                  loss=SigmoidFocalCrossEntropy(),
                  metrics=['accuracy'])
    
    return model

In [13]:
model = build_model(distilBERT)

In [14]:
EPOCHS = 10
BATCH_SIZE = 128
NUM_STEPS = len(X_train.index) // BATCH_SIZE

# Train the model
train_history1 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_val_ids, X_val_attention], y_val.to_numpy()),
    verbose=2
)

Epoch 1/10
3/3 - 171s - loss: 0.1817 - accuracy: 0.5729 - val_loss: 0.1169 - val_accuracy: 0.7674 - 171s/epoch - 57s/step
Epoch 2/10
3/3 - 122s - loss: 0.0988 - accuracy: 0.7854 - val_loss: 0.0892 - val_accuracy: 0.7674 - 122s/epoch - 41s/step
Epoch 3/10
3/3 - 126s - loss: 0.0730 - accuracy: 0.7586 - val_loss: 0.0546 - val_accuracy: 0.7674 - 126s/epoch - 42s/step
Epoch 4/10
3/3 - 149s - loss: 0.0493 - accuracy: 0.8123 - val_loss: 0.0605 - val_accuracy: 0.7674 - 149s/epoch - 50s/step
Epoch 5/10
3/3 - 176s - loss: 0.0596 - accuracy: 0.7839 - val_loss: 0.0537 - val_accuracy: 0.7674 - 176s/epoch - 59s/step
Epoch 6/10
3/3 - 125s - loss: 0.0444 - accuracy: 0.8161 - val_loss: 0.0559 - val_accuracy: 0.7674 - 125s/epoch - 42s/step
Epoch 7/10
3/3 - 120s - loss: 0.0542 - accuracy: 0.7701 - val_loss: 0.0588 - val_accuracy: 0.7674 - 120s/epoch - 40s/step
Epoch 8/10
3/3 - 128s - loss: 0.0561 - accuracy: 0.7739 - val_loss: 0.0549 - val_accuracy: 0.7674 - 128s/epoch - 43s/step
Epoch 9/10
3/3 - 164s - 

In [15]:
y_eval = model.evaluate(
    x = [X_test_ids, X_test_attention],
    y = y_test.to_numpy(),
    verbose=2
)

25/25 - 65s - loss: 0.0535 - accuracy: 0.7609 - 65s/epoch - 3s/step


In [1]:
# This data looks like it's just saying marking all rows as "not ass"