## **ICAAD DATA - Last n tokens, some manipulation of data**

> Indented block



In [1]:
#!pip install transformers

In [None]:
#Imports
from transformers import BertTokenizer, BertConfig, TFBertModel, TFBertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertConfig, TFDistilBertModel, TFDistilBertForSequenceClassification

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import precision_recall_fscore_support, classification_report

In [None]:
# Configs
# Everything a reader might tune lives in this cell: input files, preprocessing sizes,
# which pretrained checkpoint to fine-tune, and the training hyper-parameters.

# CSV files for the train / test / validation splits (read by get_data)
train_file = "train_80_10_10.csv"
test_file = "test_80_10_10.csv"
val_file = "val_80_10_10.csv"

# Number of trailing lines removed from every document before tokenizing
skip_lines = 6
# Content tokens kept per document; prepare_df adds one token at each end and pads to split_length + 2
split_length = 510 # The max as we add two tokens

# BERT CONFIG
# Exactly one of the two alternatives below should be active; both define
# bert_file, tokenizer and bert_model.

# BERT BASE
#bert_file = 'bert-base-uncased'
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
#bert_model = TFBertForSequenceClassification.from_pretrained(bert_file)

# DistilBert
bert_file = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(bert_file, do_lower_case=True)
bert_model = TFDistilBertForSequenceClassification.from_pretrained(bert_file)

# NOTE(review): max_length is not read by any code cell visible in this notebook;
# the padded length actually used is split_length + 2 (the same value, 512).
max_length = 512

# Model Training
batch_size = 8        # examples per batch when the tf.data datasets are batched
epochs = 3            # passed to model.fit
learning_rate = 2e-5  # passed to the Adam optimizer


In [None]:
# Ask TensorFlow which GPU device (if any) it can see.
device_name = tf.test.gpu_device_name()

# Anything other than the expected first-GPU name is treated as "no GPU" and stops the run.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

In [None]:
# Function to load one split from CSV with standardized column names
def get_data(fname):
    """Read ``fname`` and return a frame with columns ``docid``, ``text``, ``label``.

    Only the ``docid``, ``cleaned_contents`` and ``Discrimination_Label`` columns
    of the CSV are kept; the last two are renamed to ``text`` and ``label``.
    The returned frame has a fresh 0..n-1 index.
    """
    column_names = {'cleaned_contents': 'text', 'Discrimination_Label': 'label'}
    selected = pd.read_csv(fname)[['docid', *column_names]]
    return selected.rename(columns=column_names).reset_index(drop=True)

In [None]:
#Function to get the last n tokens only (510 by default)
def end_tokens(df, n_tokens=510):
  """Return the trailing ``n_tokens`` token ids of every row of ``df['tokens']``.

  Args:
    df: DataFrame with a ``tokens`` column holding one sequence of token ids per row.
    n_tokens: maximum number of tokens to keep from the end of each row.
      Defaults to 510, the value this function previously hard-coded.

  Returns:
    A list with one new list per row. Rows shorter than ``n_tokens`` are
    returned whole. The lists are copies, so changing them does not alter
    ``df['tokens']``.
  """
  if n_tokens <= 0:
    # row[-0:] would return the whole row, so "keep nothing" is handled explicitly
    return [[] for _ in df['tokens']]
  return [list(row[-n_tokens:]) for row in df['tokens']]

In [None]:
#Function to pair the named model inputs of one example with its label (used via Dataset.map)
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Return ``(features, label)`` where ``features`` maps input names to values.

  The ``attention_masks`` argument is stored under the key ``attention_mask``;
  the other two inputs keep their argument names as keys.
  """
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, label

In [None]:
# Function to wrap each token series in special tokens, pad to a fixed length and return as inputs to model
def prepare_df(df, cls_id=101, sep_id=102, pad_id=0):
  """Build the model-input ``tf.data.Dataset`` from ``df['end_tokens']`` and ``df['label']``.

  Args:
    df: DataFrame with an ``end_tokens`` column (one list of token ids per row,
      at most ``split_length`` long) and a ``label`` column.
    cls_id: id placed at the start of every sequence. The default 101 is the
      value previously hard-coded here, which assumed a BERT tokenizer.
    sep_id: id placed at the end of every sequence (default 102, same assumption).
    pad_id: id used to pad every sequence up to ``split_length + 2``.

  Returns:
    An unbatched dataset of ``(features, label)`` pairs as produced by
    ``map_example_to_dict``.

  Raises:
    ValueError: if a row is longer than ``split_length`` tokens and therefore
      cannot fit the fixed sequence length.
  """
  seq_len = split_length + 2

  # Build new lists instead of inserting into df['end_tokens'] in place: the
  # in-place version added another pair of special tokens every time the cell
  # was re-run, which eventually produced rows of unequal length.
  sequences = [[cls_id] + list(tokens) + [sep_id] for tokens in df['end_tokens']]

  longest = max((len(seq) for seq in sequences), default=0)
  if longest > seq_len:
    raise ValueError(
        'end_tokens rows may hold at most {} tokens, found one with {}'.format(split_length, longest - 2))

  # create our input lists
  input_ids = np.array([seq + [pad_id] * (seq_len - len(seq)) for seq in sequences])
  # positions holding the pad id are masked out, everything else is attended to
  attention_mask = np.where(input_ids != pad_id, 1, 0)
  # single-segment input: every position belongs to segment 0
  token_type_ids = np.zeros_like(input_ids)
  labels = df['label'].tolist()

  # convert to tensorflow dataset object and return
  return tf.data.Dataset.from_tensor_slices((input_ids, attention_mask, token_type_ids, labels)).map(map_example_to_dict)

In [None]:
# GET THE DATA - one DataFrame per split, loaded in train / test / val order
df_train, df_test, df_val = [
    get_data(csv_path) for csv_path in (train_file, test_file, val_file)
]


In [None]:
# Remove double new lines: a newline, optional whitespace and another newline collapse into one newline.
# The pattern is a raw string so that \s reaches the regex engine unchanged; in a
# normal string literal "\s" is an invalid escape sequence that Python warns about.
blank_line_pattern = r'\n\s*\n'
for frame in (df_train, df_test, df_val):
    frame['text'] = frame['text'].replace(blank_line_pattern, '\n', regex=True)

In [None]:
# Strip the last `skip_lines` lines of every document.
# rsplit splits at most `skip_lines` times starting from the right, so element [0]
# is everything before those trailing lines. A document with fewer than
# `skip_lines` newlines is therefore reduced to its first line.
# Series.map reads the text column directly instead of building a row object per
# document with DataFrame.apply(axis=1), and still returns a Series for an empty frame.
for frame in (df_train, df_test, df_val):
    frame['text'] = frame['text'].map(lambda text: text.rsplit("\n", skip_lines)[0])

In [None]:
# Tokenize data - capture all tokens. %%capture supresses message about length being too long
%%capture 
df_train['tokens'] = df_train['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=False,)))
df_val['tokens'] = df_val['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=False,)))
df_test['tokens'] = df_test['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=False,)))

In [None]:
# Keep only the tail of every tokenized document (that is where the magic happens!)
for frame in (df_train, df_val, df_test):
    frame['end_tokens'] = end_tokens(frame)

In [None]:
# Turn each split into a batched tf.data dataset of model inputs (built in val / train / test order)
# NOTE(review): no shuffling is applied to the training set here - confirm that is intended.
ds_encode_val, ds_encode_train, ds_encode_test = [
    prepare_df(frame).batch(batch_size) for frame in (df_val, df_train, df_test)
]

In [None]:
# THE MODEL
# Labels are integer class ids rather than one-hot vectors, so the sparse
# categorical loss and accuracy are used; the loss is told it receives raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# Fine-tune the pretrained classifier loaded in the config cell
model = bert_model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)


## ALTERNATIVE MODEL - FOR ILLUSTRATION PURPOSES ONLY
###  NOT ACTIVELY USED
#### NOTE: HAS DIFFERENT INPUT - EXPECTS INPUT_IDS AND ATTENTION_MASK SEPARATELY INSTEAD OF IN A DATASET

```
config = DistilBertConfig.from_pretrained("distilbert-base-cased", 
                                          dropout=0.2, 
                                          attention_dropout=0.2)

input_ids_in = tf.keras.layers.Input(shape=(max_len,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(max_len,), name='masked_token', dtype='int32') 

transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-cased', config = config)
embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
cls_token = embedding_layer[:,0,:]

X = tf.keras.layers.BatchNormalization()(cls_token)
X = tf.keras.layers.Dense(64, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
outputs = tf.keras.layers.Dense(1, activation='softmax')(X)

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = outputs)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
```


In [None]:
# Show a summary of the model being fine-tuned
model.summary()

In [None]:
# Fine-tune the model on the training batches, with the validation batches passed as
# validation data; the returned history object is kept in bert_history
bert_history = model.fit(
    ds_encode_train,
    validation_data=ds_encode_val,
    epochs=epochs,
)

In [None]:
# EVALUATE THE MODEL
# Runs the compiled loss and accuracy metric over the test batches; being the last
# expression of the cell, the result is displayed as the cell output.
model.evaluate(ds_encode_test)

In [None]:
# Get Predictions
log_pred = model.predict(ds_encode_test)

# The first element of the prediction output is used as the per-class scores;
# the predicted label is the index of the highest score in each row.
class_scores = log_pred[0]
y_pred = np.argmax(class_scores, axis=1)

In [None]:
# Show classification report
#print("Bert base, standard inputs, chunk size = ", split_length)
#print(classification_report(df_test['label'], y_pred))

In [None]:
# Show classification report for the test split
# NOTE(review): the model name in the header is hard-coded - update it if bert_file is switched.
print("DistilBert, standard inputs, chunk size = ", split_length)
test_report = classification_report(y_true=df_test['label'], y_pred=y_pred)
print(test_report)