In [10]:
 !pip install datasets
 !pip install transformers
 !pip install pandas

from datasets import load_dataset
import matplotlib.pyplot as plt
import tensorflow.keras as keras
import pandas as pd

try: # this is only working on the 2nd try in colab :)
  from transformers import DistilBertTokenizer, TFDistilBertModel
except Exception as err: # so we catch the error and import it again
  from transformers import DistilBertTokenizer, TFDistilBertModel

import numpy as np
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, Input, Dropout
from pandas_profiling import ProfileReport

# Pretrained 'distilbert-base-uncased' tokenizer, shared module-wide; encode_text calls it.
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')




# Data Preparation

## Clean the text and your targets
Hints: 
1. Use the exploration cell to explore the data and identify cleaning steps
2. Inspect the tokenized sentences and ensure they make sense and can leverage already trained word embeddings
3. These resources will help you understand what type of cleaning will be required and how you can encode your text for the network:
    - a) Preprocessing: https://huggingface.co/transformers/preprocessing.html
    - b) Summary of tokenizers (DistilBERT uses WordPiece): https://huggingface.co/transformers/tokenizer_summary.html#wordpiece
4. Consider the text length: is it too long or too short for DistilBERT? What impact would padding/truncation have?
5. In load data you generated a profiling report of this dataset; it might be helpful to review that as well.

In [11]:
def prepare_raw_data(df):
  """Select the id, statement and label columns and make label categorical.

  Args:
    df: DataFrame containing at least "id", "statement" and "label" columns.

  Returns:
    A new DataFrame with exactly those three columns, in that order, where
    "label" has been cast to the pandas 'category' dtype.
  """
  wanted_columns = ["id", "statement", "label"]
  subset = df.loc[:, wanted_columns]
  # assign() replaces the existing "label" column in place (same position).
  return subset.assign(label=subset["label"].astype('category'))

def load_data(save_dir="./"):
  """Load the "liar" dataset and return its three splits as prepared frames.

  Each of the "train", "validation" and "test" splits is wrapped in a
  DataFrame and passed through prepare_raw_data.

  Args:
    save_dir: accepted for interface compatibility; not used by this function.

  Returns:
    Tuple (train, val, test) of DataFrames.
  """
  dataset = load_dataset("liar")
  train, val, test = (
      prepare_raw_data(pd.DataFrame(dataset[split_name]))
      for split_name in ("train", "validation", "test")
  )
  return train, val, test
         
def clean_data(raw_data):
  """Remove rows whose "statement" repeats that of an earlier row.

  Only the first row for each distinct statement is kept; the original index
  labels of the surviving rows are preserved.

  Args:
    raw_data: DataFrame with a "statement" column.

  Returns:
    A DataFrame without repeated statements.
  """
  # TODO: What data cleaning/filtering should you consider?
  # Hint: check for duplicates or contradictions
  # Hint: What is the minimum and maximum lengths of the statements?
  # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION
  is_repeat = raw_data.duplicated(subset=["statement"])
  deduplicated = raw_data[~is_repeat]
  return deduplicated

def extract_raw_text_and_y(clean_data):
  """Split a frame into its statement texts and its label values.

  Args:
    clean_data: DataFrame with "statement" and "label" columns.

  Returns:
    Tuple (raw_text, raw_y): raw_text is a plain Python list of the
    statements; raw_y is the `.values` of the "label" column.
  """
  statements = clean_data["statement"].values.tolist()
  labels = clean_data["label"].values
  return statements, labels

# Number of tokens each statement is padded/truncated to in encode_text;
# build_model also uses it as the width of its two input layers.
max_length=40

def encode_text(text):
    """Tokenize `text` with the module-level dbert_tokenizer.

    Sequences are padded to, and truncated at, the module-level `max_length`.

    Args:
        text: the text (or list of texts) handed to the tokenizer.

    Returns:
        Tuple (input_ids, attention_mask) read from the tokenizer's output.
    """
    # TODO: encode text using dbert_tokenizer
    # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION
    encoded = dbert_tokenizer(
        text=text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    return encoded["input_ids"], encoded["attention_mask"]


def convert_labels(label, column):
  """Return 1 if `label` is contained in `column`, otherwise 0."""
  return 1 if label in column else 0


def prepare_target(raw_y):
    """Convert raw label codes into a three-column 0/1 target frame.

    Args:
        raw_y: iterable of label codes.

    Returns:
        DataFrame with one row per label and three 0/1 columns:
        column 0 is 1 for codes 0 or 5, column 1 is 1 for code 3,
        column 2 is 1 for code 5.
    """
    # TODO: convert labels to 0/1
    # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION
    # NOTE: labels map as follows: ['false', 'half-true', 'mostly-true', 'true', 'barely-true', 'pants-fire']
    label_groups = (
        [0, 5],  # column 0 = "pants-fire" or "false" posts
        [3],     # column 1 = "true" posts
        [5],     # column 2 = "pants-fire"
    )
    # One indicator row per target column; transposed below so that each
    # target becomes a column.
    indicator_rows = [
        [convert_labels(code, group) for code in raw_y]
        for group in label_groups
    ]
    y = pd.DataFrame(np.array(indicator_rows)).T
    return y


# Modelling

## Build and Train Model

Resources:
- DistilBERT paper: https://arxiv.org/abs/1910.01108
- DistilBERT Tensorflow Documentation: https://huggingface.co/transformers/model_doc/distilbert.html#tfdistilbertmodel

In [12]:
def build_model(base_model, trainable=False, params={}):
    """Build a multi-label classifier head on top of a DistilBERT encoder.

    The head reads the encoder's last hidden state at sequence position 0
    (the CLS token) and passes it through a stack of Dense + Dropout layers
    followed by a 3-unit sigmoid output (one independent probability per
    target column).

    Args:
        base_model: the DistilBERT Keras model used as the encoder.
        trainable: whether the encoder's weights are updated during training.
        params: optional dict of hyper-parameters. Recognised keys:
            "hidden_units"      - iterable of layer widths (default (512, 64))
            "dropout_rate"      - dropout after each hidden layer (default 0.2)
            "hidden_activation" - activation of the hidden layers
                                  (default "relu")

    Returns:
        The uncompiled keras.Model taking [inputs, masks].
    """
    # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION
    # Hint 1: the cls token (token for classification in bert / distilBERT)  corresponds to the first element in the sequence in DistilBERT
    # Hint 2: this guide may be helpful for parameter freezing: https://keras.io/guides/transfer_learning/
    # Hint 3: double check your number of parameters make sense
    # Hint 4: carefully consider your final layer activation and loss function

    # `params` is only read, never mutated, so the shared default dict is safe;
    # `or {}` additionally tolerates params=None.
    hyper = params or {}
    hidden_units = hyper.get("hidden_units", (512, 64))
    dropout_rate = hyper.get("dropout_rate", 0.2)
    # Without a non-linearity, stacked Dense layers collapse into a single
    # linear map, so the hidden layers need an activation to add capacity.
    hidden_activation = hyper.get("hidden_activation", "relu")

    max_seq_len = max_length

    # Refer to https://keras.io/api/layers/core_layers/input/
    inputs = Input(shape=(max_seq_len,), dtype='int64', name='inputs')
    masks = Input(shape=(max_seq_len,), dtype='int64', name='masks')

    base_model.trainable = trainable

    dbert_output = base_model(inputs, attention_mask=masks)
    dbert_last_hidden_state = dbert_output.last_hidden_state

    # Position 0 along the sequence axis is the CLS token representation.
    my_outputs = dbert_last_hidden_state[:, 0, :]
    for units in hidden_units:
        my_outputs = keras.layers.Dense(units, activation=hidden_activation)(my_outputs)
        my_outputs = keras.layers.Dropout(dropout_rate)(my_outputs)

    # Sigmoid (not softmax): the three targets are not mutually exclusive.
    probs = keras.layers.Dense(3, activation="sigmoid")(my_outputs)

    model = keras.Model(inputs=[inputs, masks], outputs=probs)
    model.summary()
    return model



In [13]:
def compile_model(model):
    """Compile `model` for multi-label training with AUC/precision/recall metrics.

    Uses the Adam optimizer and binary cross-entropy loss. Both AUC metrics
    (ROC and PR) are created with multi_label=True so that an AUC is computed
    for each output column separately and then averaged, instead of pooling
    all columns into a single flattened binary problem.

    Args:
        model: a Keras model with a single (batch, num_labels) output.

    Returns:
        The same model instance, compiled.
    """
    # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION
    # Hint: you may want to read up on the "multi_label" parameter in the keras AUC metrics

    # Passing num_labels up front lets the metric create its state variables
    # at construction time rather than on the first update.
    num_labels = model.output_shape[-1]

    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=[
            keras.metrics.AUC(multi_label=True, num_labels=num_labels),
            keras.metrics.AUC(curve="PR", multi_label=True, num_labels=num_labels),
            keras.metrics.Recall(),
            keras.metrics.Precision(),
        ],
    )

    return model

In [14]:
# Training hyper-parameters; the execution cell passes both to train_model.
batch_size = 64
num_epochs = 3

def train_model(model, model_inputs_and_masks_train, model_inputs_and_masks_val,
    y_train, y_val, batch_size, num_epochs):
    """Fit `model` on the training split and validate on the validation split.

    Args:
        model: compiled model exposing a Keras-style `fit` method.
        model_inputs_and_masks_train: dict with 'inputs' and 'masks' entries
            for the training split.
        model_inputs_and_masks_val: dict with 'inputs' and 'masks' entries
            for the validation split.
        y_train: training targets.
        y_val: validation targets.
        batch_size: batch size forwarded to `fit`.
        num_epochs: number of epochs forwarded to `fit`.

    Returns:
        Tuple (model, history) where history is whatever `model.fit` returns.
    """
    # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION

    # Use the function's own argument here, not the notebook-level global of a
    # similar name, so the function trains on exactly what the caller passed.
    train_features = [
        np.array(model_inputs_and_masks_train['inputs']),
        np.array(model_inputs_and_masks_train['masks']),
    ]
    val_features = [
        np.array(model_inputs_and_masks_val['inputs']),
        np.array(model_inputs_and_masks_val['masks']),
    ]

    # Targets are converted to arrays, matching what evaluate_model does.
    history = model.fit(
        train_features,
        np.array(y_train),
        batch_size=batch_size,
        epochs=num_epochs,
        validation_data=(val_features, np.array(y_val)),
    )

    return model, history

In [15]:
def evaluate_model(model, model_inputs_and_masks_test, y_test):
    """Evaluate `model` on the test split.

    Args:
        model: model exposing a Keras-style `evaluate` method.
        model_inputs_and_masks_test: dict with 'inputs' and 'masks' entries
            for the test split.
        y_test: test targets.

    Returns:
        Whatever `model.evaluate` returns, unchanged. Note that despite the
        local name `eval_dict`, no per-label dictionary is built here yet
        (see the template below).
    """
    # TODO: evaluate the model
    # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION
    # HINT: for pr_auc: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html

    test_ids = np.array(model_inputs_and_masks_test['inputs'])
    test_masks = np.array(model_inputs_and_masks_test['masks'])
    targets = np.array(y_test)

    eval_dict = model.evaluate([test_ids, test_masks], targets, verbose=2)

    # Intended (not yet implemented) per-label result structure:
    # eval_dict = {
    #     "false": {
    #         "pr_auc": ???, "pr_auc_random_guess": ???,
    #         "roc_auc": ???, "roc_auc_random_guess": ???,
    #         "precision": ???, "recall": ???
    #     },
    #     "true": {
    #         "pr_auc": ???, "pr_auc_random_guess": ???,
    #         "roc_auc": ???, "roc_auc_random_guess": ???,
    #         "precision": ???, "recall": ???
    #     },
    #     "pants": {
    #         "pr_auc": ???, "pr_auc_random_guess": ???,
    #         "roc_auc": ???, "roc_auc_random_guess": ???,
    #         "precision": ???, "recall": ???
    #     }
    # }
    return eval_dict

# Execution



In [16]:
## DO NOT Change
# End-to-end data preparation: load the three splits, de-duplicate each one,
# then split it into statement texts and raw labels.
train, val, test = load_data()
train_raw_x, train_raw_y = extract_raw_text_and_y(clean_data(train))
val_raw_x, val_raw_y = extract_raw_text_and_y(clean_data(val))
test_raw_x, test_raw_y = extract_raw_text_and_y(clean_data(test))

# Tokenize the texts and build the 0/1 target frame, one split at a time.
train_input, train_mask = encode_text(train_raw_x)
train_y = prepare_target(train_raw_y)

val_input, val_mask = encode_text(val_raw_x)
val_y = prepare_target(val_raw_y)

test_input, test_mask = encode_text(test_raw_x)
test_y = prepare_target(test_raw_y)

# Bundle ids and masks per split under the 'inputs' / 'masks' keys that
# train_model and evaluate_model read.
train_model_inputs_and_masks = {
    'inputs' : train_input,
    'masks' : train_mask
}

val_model_inputs_and_masks = {
    'inputs' : val_input,
    'masks' : val_mask
}

test_model_inputs_and_masks = {
    'inputs' : test_input,
    'masks' : test_mask
}


Using custom data configuration default
Reusing dataset liar (/root/.cache/huggingface/datasets/liar/default/1.0.0/1a6abd9863f27194da30fcb66988477abfa3780df3b0ad1d0032979c48ec7918)



Use the cell below to execute and experiment with your model

In [17]:
# Full experiment: load the pretrained encoder, build the classifier on top of
# it (build_model defaults to trainable=False, i.e. a frozen encoder), compile,
# train on the train/val splits, then evaluate on the test split.
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
model = build_model(dbert_model, params={})
model = compile_model(model)
model, history = train_model(model, train_model_inputs_and_masks, val_model_inputs_and_masks, train_y, val_y, batch_size, num_epochs)
eval_dict = evaluate_model(model, test_model_inputs_and_masks, test_y)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, 40)]         0                                            
__________________________________________________________________________________________________
masks (InputLayer)              [(None, 40)]         0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_1 (TFDisti TFBaseModelOutput(la 66362880    inputs[0][0]                     
                                                                 masks[0][0]                      
__________________________________________________________________________________________________
tf.__operators__.getitem_1 (Sli (None, 768)          0           tf_distil_bert_model_1[0][0

## Conclusions (TODO)
TODO: Make Your Final Conclusions About Your Model (Answer questions below, answer in this cell)
- a) What is driving your model's decisions?
- b) Is your model biased in some ways? If so how? 
- c) Does your model accomplish the objectives? If not, is your model useful and how can you justify this?