# ICAAD TEST USING TRUNCATED INPUT OF 512 TOKENS

# DistilBERT - First 512 tokens, TensorFlow


In [None]:
#Imports
from transformers import BertTokenizer, BertConfig, TFBertModel, TFBertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertConfig, TFDistilBertModel, TFDistilBertForSequenceClassification

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import precision_recall_fscore_support, classification_report

In [None]:
# Configs
# File names, checkpoint name and training settings read by the cells below.

# CSV inputs; both are loaded through get_data().
train_file = "train.csv"
test_file = "test.csv"

# Full-size BERT tokenizer alternative, left disabled:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# One checkpoint name is shared by the tokenizer here and the model loaded later.
bert_file = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(bert_file, do_lower_case=True)

# can be up to 512 for BERT
# tokenize_plus() truncates/pads every document to this many token ids.
max_length = 512
# Batch size applied to the train, validation and test datasets.
batch_size = 8
# Number of passes over the training data given to model.fit().
epochs = 3


In [None]:
# Ask TensorFlow which GPU device (if any) it can see.
device_name = tf.test.gpu_device_name()

# Stop the notebook right away unless the expected first-GPU name came back.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# TOKENIZE 
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Repack one positional example into a (features, label) pair.

  The features dict uses the keys "input_ids", "token_type_ids" and
  "attention_mask"; `label` is passed through untouched.
  """
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, label

def tokenize_plus(df):
    """Tokenize the 'text' column of `df` and wrap it with the labels in a tf.data.Dataset.

    Every text is encoded with the module-level `tokenizer`, truncated and
    padded to exactly `max_length` ids.

    Args:
        df: DataFrame with a 'text' column (documents) and a 'label' column.

    Returns:
        An unbatched tf.data.Dataset whose elements are
        ({"input_ids", "token_type_ids", "attention_mask"}, label), in the
        row order of `df`.
    """
    input_ids_list = []
    attention_mask_list = []
    token_type_ids_list = []

    # Encode one document at a time and collect the three id lists.
    for sent in df['text'].values:
        inputs = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            # `pad_to_max_length=True` is deprecated; this is the equivalent
            # spelling and pads every sequence to `max_length`.
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        input_ids_list.append(inputs['input_ids'])
        attention_mask_list.append(inputs['attention_mask'])
        token_type_ids_list.append(inputs['token_type_ids'])

    label_list = df['label'].tolist()

    # All lists have one entry per row, so slicing yields one example per document.
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    ).map(map_example_to_dict)

In [None]:
# Function to get data - GET THE DATA
def get_data(fname):
    """Read the CSV at `fname` and return it with the notebook's column names.

    Only 'docid', 'cleaned_contents' and 'Discrimination_Label' are kept; the
    last two are renamed to 'text' and 'label'. The result has a fresh 0..n-1
    index.
    """
    return (
        pd.read_csv(fname)[['docid', 'cleaned_contents', 'Discrimination_Label']]
        .rename(columns={'cleaned_contents': 'text', 'Discrimination_Label': 'label'})
        .reset_index(drop=True)
    )

train = get_data(train_file)
df_test = get_data(test_file)

# Hold out roughly 10% of the training file for validation. The mask comes
# from a seeded generator so the partition is identical on every
# Restart & Run All; an unseeded np.random.rand call splits the data
# differently each run, which makes the reported scores irreproducible.
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(train)) < 0.9
df_train = train[msk]
df_val = train[~msk]

In [None]:
# TOKENIZE AND PUT INTO TENSORFLOW DATASET
# Only the training pipeline is shuffled. The buffer spans the whole training
# frame so any example can land anywhere in an epoch; a 100-element buffer
# only mixes neighbouring rows and largely keeps the CSV's row order.
ds_train_encoded = tokenize_plus(df_train).shuffle(len(df_train)).batch(batch_size)
# Validation and test keep frame order so predictions line up with their rows.
ds_val_encoded = tokenize_plus(df_val).batch(batch_size)
ds_test_encoded = tokenize_plus(df_test).batch(batch_size)

In [None]:
# THE MODEL
# Load the pretrained DistilBERT encoder with a sequence-classification head
# and compile it for fine-tuning.

learning_rate = 2e-5
# NOTE(review): number_of_epochs is not used by the fit cell below, which passes `epochs` directly.
number_of_epochs = epochs
model = TFDistilBertForSequenceClassification.from_pretrained(bert_file)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# Labels are integer class ids rather than one-hot vectors, so use sparse
# categorical cross-entropy and sparse categorical accuracy.
# from_logits=True: the loss is given unnormalised scores, not probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
# Print the layers of the compiled classifier with their parameter counts.
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  1538      
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train (fine tune) the model
# Runs `epochs` passes over the shuffled training batches and scores the
# held-out validation batches during training; the returned History object
# is kept in bert_history.
bert_history = model.fit(ds_train_encoded, epochs=epochs, validation_data=ds_val_encoded)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# EVALUATE THE MODEL
# Scores the test batches with the compiled loss and the 'accuracy' metric;
# the cell output is the two-element list of those values.
model.evaluate(ds_test_encoded)



[0.6383978724479675, 0.6728395223617554]

In [None]:
# Get Predictions
# Run the model over the (unshuffled) test batches.
log_pred = model.predict(ds_test_encoded)
# Element 0 of the prediction output is presumably the per-class logits
# (TODO confirm for the installed transformers version); argmax over axis 1
# picks the highest-scoring class for each test row.
y_pred = np.argmax(log_pred[0], axis=1)

In [None]:
# Show classification report
# Compares the true labels in df_test with y_pred row by row; this relies on
# ds_test_encoded having been built from df_test without shuffling.
print("DistilBert")
print(classification_report(df_test['label'], y_pred))

DistilBert
              precision    recall  f1-score   support

           0       0.65      0.40      0.50        65
           1       0.68      0.86      0.76        97

    accuracy                           0.67       162
   macro avg       0.67      0.63      0.63       162
weighted avg       0.67      0.67      0.65       162



## **ICAAD TEST USING STACKED RESULTS WITH MEAN OUTPUT**

DistilBert. 
- Split documents into fixed-size token chunks (the size is arbitrary, e.g. 100 tokens; it is set by `split_length` in the config cell). 
- Apply label to new chunks. 
- Run Model. 
- Take mean output for each document. 


In [None]:
#!pip install transformers

In [None]:
from transformers import BertTokenizer, BertConfig, TFBertModel, TFBertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertConfig, TFDistilBertModel, TFDistilBertForSequenceClassification

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import precision_recall_fscore_support, classification_report

In [None]:
# Configs
# File names, chunk size, checkpoint name and training settings for the chunked experiment.

# CSV inputs; both are loaded through get_data().
train_file = "train.csv"
test_file = "test.csv"

# Number of document tokens per chunk, used by split_tokens(); prepare_df()
# pads each chunk to split_length + 2 ids once the two special tokens are added.
#split_length = 100
split_length = 510 # The max as we add two tokens

# Full-size BERT tokenizer alternative, left disabled:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# One checkpoint name is shared by the tokenizer here and the model loaded later.
bert_file = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(bert_file, do_lower_case=True)

# can be up to 512 for BERT
# NOTE(review): max_length is not read by split_tokens()/prepare_df() in this
# section; chunk width is derived from split_length instead.
max_length = 512
# Batch size applied to the train, validation and test datasets.
batch_size = 8
# Number of passes over the training data given to model.fit().
epochs = 3

In [None]:
# Function to get data - GET THE DATA
def get_data(fname):
    """Load the CSV at `fname`, keeping and renaming the columns the notebook uses.

    Returns a DataFrame with columns 'docid', 'text' (from 'cleaned_contents')
    and 'label' (from 'Discrimination_Label') on a fresh 0..n-1 index.
    """
    column_aliases = {'cleaned_contents': 'text', 'Discrimination_Label': 'label'}
    raw = pd.read_csv(fname)
    selected = raw.loc[:, ['docid', *column_aliases]]
    return selected.rename(columns=column_aliases).reset_index(drop=True)

In [None]:
# Function to split tokens into arbitrary-length token chunks
def split_tokens(df, chunk_size=None):
  """Cut each row's token list into consecutive chunks.

  Args:
    df: DataFrame with a 'tokens' column holding one sequence of token ids
      per row.
    chunk_size: maximum number of tokens per chunk. Defaults to the
      module-level `split_length` when omitted.

  Returns:
    A list with one entry per row of `df`; each entry is the list of chunks
    for that row, in order. The last chunk may be shorter than `chunk_size`,
    and an empty token list yields an empty list of chunks.

  Raises:
    ValueError: if the chunk size is smaller than 1.
  """
  if chunk_size is None:
    chunk_size = split_length
  if chunk_size < 1:
    raise ValueError("chunk_size must be a positive integer")

  return [
      [row[start:start + chunk_size] for start in range(0, len(row), chunk_size)]
      for row in df['tokens']
  ]


In [None]:
#Function to create dictionary from lists
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Turn one positional example into the (features dict, label) pair fed to the model."""
  model_inputs = {}
  model_inputs["input_ids"] = input_ids
  model_inputs["token_type_ids"] = token_type_ids
  model_inputs["attention_mask"] = attention_masks
  return model_inputs, label

In [None]:
# Function to turn the exploded token series (one pre-defined-length chunk per row) into padded inputs for the model
def prepare_df(df, cls_id=101, sep_id=102, pad_id=0):
  """Build a tf.data.Dataset of padded model inputs from a chunk-per-row frame.

  Each chunk in df['split_tokens'] is wrapped as [cls_id] + chunk + [sep_id]
  and right-padded with `pad_id` to split_length + 2 ids. `df` itself is left
  unmodified, so calling this twice on the same frame gives the same result.

  Args:
    df: DataFrame with a 'split_tokens' column (one list of token ids per
      row, at most `split_length` long) and a 'label' column.
    cls_id: id placed before every chunk (default 101, assuming a BERT vocab).
    sep_id: id placed after every chunk (default 102, assuming a BERT vocab).
    pad_id: id used for padding and treated as "not attended" (default 0).

  Returns:
    An unbatched tf.data.Dataset whose elements are
    ({"input_ids", "token_type_ids", "attention_mask"}, label), in row order.
  """
  chunk_width = split_length + 2

  # Build new lists instead of insert()/append() on the lists stored in df:
  # in-place edits would add a second pair of special tokens on a re-run and
  # push chunks past chunk_width.
  framed_chunks = [[cls_id] + list(chunk) + [sep_id] for chunk in df['split_tokens']]

  # create our input lists
  input_ids = np.array([ids + [pad_id] * (chunk_width - len(ids)) for ids in framed_chunks])
  attention_mask = np.where(input_ids != pad_id, 1, 0)
  # Single-segment input: every position gets segment id 0.
  token_type_ids = np.zeros_like(input_ids)
  labels = df['label'].tolist()

  # convert to tensorflow dataset object and return
  return tf.data.Dataset.from_tensor_slices(
      (input_ids, attention_mask, token_type_ids, labels)
  ).map(map_example_to_dict)


In [None]:
train = get_data(train_file)
df_test = get_data(test_file)

# Hold out roughly 10% of the training file for validation. The mask comes
# from a seeded generator so the partition is identical on every
# Restart & Run All; an unseeded np.random.rand call splits the data
# differently each run.
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(train)) < 0.9

# .copy() makes both frames independent of `train`, so the column assignments
# in the following cells do not raise SettingWithCopyWarning.
df_train = train[msk].copy()
df_val = train[~msk].copy()

In [None]:
%%capture 
# Encode every document to token ids without special tokens (prepare_df adds
# them per chunk later) and store the ids in a 'tokens' column on each frame.
for frame in (df_train, df_val, df_test):
    frame['tokens'] = frame['text'].apply(
        lambda text: tokenizer.encode(text, add_special_tokens=False)
    )

In [None]:
# Explode the dataframes so each row holds one chunk of at most `split_length`
# tokens; docid, text, tokens and label are repeated for every chunk.
# Run this cell once per fresh load: it rebinds df_train/df_val/df_test.
def _explode_into_chunks(frame):
    """Return a new frame with one row per token chunk of each document in `frame`."""
    # A Series keyed by frame.index keeps each document's chunk list in a single cell.
    chunk_lists = pd.Series(split_tokens(frame), index=frame.index, dtype=object)
    # assign() works on a copy, so frames sliced from `train` are not written
    # to in place (the cause of the SettingWithCopyWarning).
    return (
        frame.assign(split_tokens=chunk_lists)
        .explode('split_tokens')
        .reset_index(drop=True)
    )

df_train = _explode_into_chunks(df_train)

df_val = _explode_into_chunks(df_val)

df_test = _explode_into_chunks(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
# Row counts after exploding (train, validation, test), one per line.
print(len(df_train), len(df_val), len(df_test), sep="\n")

2405
335
647


In [None]:
# Validation chunks as padded, batched model inputs; kept in frame order (no shuffle).
ds_encode_val =  prepare_df(df_val).batch(batch_size)

In [None]:
# Training chunks as padded, batched model inputs. Rows sit in document order
# (consecutive chunks share a document and label), so the shuffle buffer spans
# the whole frame; a 100-element buffer would only mix neighbouring chunks.
ds_encode_train =  prepare_df(df_train).shuffle(len(df_train)).batch(batch_size)

In [None]:
# Test chunks as padded, batched model inputs; kept in frame order so
# predictions can be matched back to df_test rows.
ds_encode_test =  prepare_df(df_test).batch(batch_size)

In [None]:
# THE MODEL
# Load a fresh pretrained DistilBERT encoder with a sequence-classification
# head and compile it for fine-tuning on the chunked data.

learning_rate = 2e-5
# NOTE(review): number_of_epochs is not used by the fit cell below, which passes `epochs` directly.
number_of_epochs = epochs
model = TFDistilBertForSequenceClassification.from_pretrained(bert_file)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# Labels are integer class ids rather than one-hot vectors, so use sparse
# categorical cross-entropy and sparse categorical accuracy.
# from_logits=True: the loss is given unnormalised scores, not probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
# Print the layers of the compiled classifier with their parameter counts.
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  1538      
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train (fine tune) the model
# Runs `epochs` passes over the chunk-level training batches and scores the
# validation chunks during training; the returned History object is kept in
# bert_history.
bert_history = model.fit(ds_encode_train, epochs=epochs, validation_data=ds_encode_val)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# EVALUATE THE MODEL
# Scores the test chunks with the compiled loss and the 'accuracy' metric.
# These numbers are per chunk, not per document.
model.evaluate(ds_encode_test)



[0.6635349988937378, 0.6105100512504578]

In [None]:
# Get Predictions
# Run the model over the (unshuffled) test chunks.
log_pred = model.predict(ds_encode_test)
# Element 0 of the prediction output is presumably the per-class logits
# (TODO confirm for the installed transformers version); argmax over axis 1
# gives one predicted class per chunk.
y_pred = np.argmax(log_pred[0], axis=1)

In [None]:
# Show classification report
# df_test holds one row per chunk at this point, so this report is chunk-level:
# every chunk is scored against the label of the document it came from.
print("DistilBert, chunk size = ", split_length)
print(classification_report(df_test['label'], y_pred))

DistilBert, chunk size =  510
              precision    recall  f1-score   support

           0       0.58      0.83      0.68       326
           1       0.69      0.38      0.49       321

    accuracy                           0.61       647
   macro avg       0.64      0.61      0.59       647
weighted avg       0.64      0.61      0.59       647



In [None]:
# Number of chunks predicted as class 0, then as class 1.
for predicted_class in (0, 1):
    print(np.count_nonzero(y_pred == predicted_class))

470
177


In [None]:
# Attach each chunk's predicted class to its row. This relies on y_pred having
# one entry per df_test row in the same order (the test dataset is not shuffled).
df_test['y_pred'] = y_pred

In [None]:
# 0/1 indicator columns: y_pos marks chunks predicted as class 1, y_neg marks
# chunks predicted as class 0.
for indicator_column, target_class in (('y_pos', 1), ('y_neg', 0)):
    df_test[indicator_column] = np.where(df_test['y_pred'] == target_class, 1, 0)

In [None]:
# Preview the first chunk rows with their prediction and indicator columns.
df_test.head()

Unnamed: 0,docid,text,label,tokens,split_tokens,y_pred,y_pos,y_neg
0,80646,SENTENCE\n\n[Name of the victim is suppressed....,1,"[6251, 1031, 2171, 1997, 1996, 6778, 2003, 137...","[101, 6251, 1031, 2171, 1997, 1996, 6778, 2003...",1,1,0
1,80646,SENTENCE\n\n[Name of the victim is suppressed....,1,"[6251, 1031, 2171, 1997, 1996, 6778, 2003, 137...","[101, 2308, 2013, 2107, 16627, 1998, 12603, 10...",0,0,1
2,80646,SENTENCE\n\n[Name of the victim is suppressed....,1,"[6251, 1031, 2171, 1997, 1996, 6778, 2003, 137...","[101, 3755, 2030, 19601, 1025, 1019, 1012, 200...",1,1,0
3,80646,SENTENCE\n\n[Name of the victim is suppressed....,1,"[6251, 1031, 2171, 1997, 1996, 6778, 2003, 137...","[101, 1013, 2286, 14397, 3669, 2072, 1024, 938...",1,1,0
4,81372,JUDGMENT\n\nThis is an appeal against convicti...,1,"[8689, 2023, 2003, 2019, 5574, 2114, 10652, 19...","[101, 8689, 2023, 2003, 2019, 5574, 2114, 1065...",0,0,1


In [None]:
# One grouping of the chunk rows by document id, reused for every statistic below.
chunks_by_doc = df_test.groupby(['docid'], as_index=False)

# Per document: number and share of chunks predicted as class 1 ...
y_pos_sum = chunks_by_doc['y_pos'].sum().rename(columns={'y_pos': 'y_pos_sum'})
y_pos_mean = chunks_by_doc['y_pos'].mean().rename(columns={'y_pos': 'y_pos_mean'})
# ... and number and share of chunks predicted as class 0.
y_neg_sum = chunks_by_doc['y_neg'].sum().rename(columns={'y_neg': 'y_neg_sum'})
y_neg_mean = chunks_by_doc['y_neg'].mean().rename(columns={'y_neg': 'y_neg_mean'})

# Total number of chunks per document.
chunks = chunks_by_doc['y_pos'].count().rename(columns={'y_pos': 'chunk_count'})

In [None]:
# One label per document. Chunk rows repeat their document's label, so min()
# presumably just collapses identical values -- verify docids are unique in the test file.
lab = df_test.groupby(['docid'], as_index=False)['label'].min()

In [None]:
# Put the per-document statistics side by side, one row per docid. The frames
# are combined by row position, which relies on each of them coming from the
# same docid grouping of df_test and so listing documents in the same order.
results = (
    y_pos_sum
    .join(y_pos_mean['y_pos_mean'])
    .join(y_neg_sum['y_neg_sum'])
    .join(y_neg_mean['y_neg_mean'])
    .join(chunks['chunk_count'])
    .join(lab['label'])
)

In [None]:
# Preview the first per-document rows of the aggregated results.
results.head()

Unnamed: 0,docid,y_pos_sum,y_pos_mean,y_neg_sum,y_neg_mean,chunk_count,label
0,70139,2,0.5,2,0.5,4,1
1,70164,0,0.0,5,1.0,5,1
2,70302,1,0.333333,2,0.666667,3,0
3,70320,3,0.75,1,0.25,4,0
4,70405,2,0.333333,4,0.666667,6,1


In [None]:
# Save the per-document aggregates as 'bert_chunk_results.csv' (relative path).
results.to_csv('bert_chunk_results.csv')