In [1]:
%matplotlib inline
import torch
import pandas as pd
import numpy as np
import datetime
import pickle

import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import Dataset, random_split
from torch.utils.data.sampler import SubsetRandomSampler

import transformers as ppb
from transformers import LongformerTokenizer, LongformerModel
from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [2]:
def get_data(fname):
    df = pd.read_csv(fname)
    df = df[['docid', 'cleaned_contents', 'Discrimination_Label']]
    df = df.rename(columns = {'cleaned_contents':'text', 'Discrimination_Label':'label'})
    df.reset_index(inplace=True, drop=True)
    return df

In [3]:
# Initialize Variables
chunk_len = 512
overlap_len = 0 #50

train_raw = get_data("data/train.csv")
test_raw = get_data("data/test.csv")
train_raw['doc_use'] = 'train'
test_raw['doc_use'] = 'test'

BERT_tokenizer_class = ppb.LongformerTokenizer
BERT_pre_trained_weights = 'allenai/longformer-base-4096'
tokenizer = BERT_tokenizer_class.from_pretrained(BERT_pre_trained_weights)
model1 = ppb.LongformerModel.from_pretrained(BERT_pre_trained_weights)


#distilBert
#fname = 'data/embeddings/distilBert_stack_lcase_' + str(chunk_len) + '.pkl'
#BERT_tokenizer_class = ppb.DistilBertTokenizer
#BERT_pre_trained_weights = 'distilbert-base-cased'

df_raw = pd.concat([train_raw, test_raw])
df_raw.reset_index(inplace=True, drop=True)

# Display 10 random rows from the data.
df_raw.sample(10)

Unnamed: 0,docid,text,label,doc_use
383,264935,SENTENCE\n \n \n[1] Mr. Fuatia Monise (Accused...,1,train
157,264885,S E N T E N C E\n \n \nIntroduction\n \n \n•...,0,train
649,73669,"JUDGMENT\n\n1. On 13 May 2008, the Appellant, ...",1,test
30,281338,"SENTENCE\n \n1. Imanueli Senikuba, you have be...",0,train
787,256798,SENTENCE\n \n \n[1] The accused has been convi...,0,test
499,79711,JUDGMENT\n\n[1] The Appellant was sentenced to...,1,train
211,82658,"SENTENCE\n\n\t1.\tOn 5th November, 2013, in th...",1,train
271,247031,"SENTENCE\n \n \n• On 16 July 2018, the court ...",1,train
465,71173,SENTENCE\n\n1. To prevent the identity of the ...,1,train
167,264151,"SENTENCE\n \n \n• Mr. SAMUELA TAWANANUMI, aft...",0,train


In [4]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [5]:
# Function to Extract all the tokenize elements out of data_tokenize from the above tokenizer.encode_plus
# gives us input ids, attention mask and critically overflow tokens

def extract_tokens(data_tokenize, targets):

    previous_input_ids = data_tokenize["input_ids"].reshape(-1) # a tensor of the input IDs )
    previous_attention_mask = data_tokenize["attention_mask"].reshape(-1) # a tensor of the attention mask (200 * 1)
    previous_token_type_ids = data_tokenize["token_type_ids"].reshape(-1) # a tensor of the attention mask (200 * 0)
    remain = data_tokenize.get("overflowing_tokens") # list of the overflow tokens
    # remain = data_tokenize.get("overflowing_tokens").numpy()[0] # list of the overflow tokens for google cloud
    targets = torch.tensor(targets, dtype=torch.int) # a tensor of current target (1)

    return previous_input_ids, previous_attention_mask, previous_token_type_ids, remain, targets

In [6]:
# Do the tokenization
# This returns a transformers object with 5 elements
# We only really need the input_ids and attention mask for modelling
# We will use these IDS to get out embeddings

# overflowing_tokens (list) - all the elements after our 200 word split
# num_truncated_tokens (integer) - how many overflow tokens we have, for text[0] it is 1822
# input_ids (tensor) - the first 200 tokens, with special token 101 at the beginning and 102 at end
# token_type_ids (tensor) - the token types for the input - there are 200, ours are all zero's (why?)
# attention_mask (tensor) - attention mask in case our text < 200 tokens

start_time = datetime.datetime.now()

input_ids_list_head = []
attention_mask_list_head = []
token_type_ids_list_head = []
targets_list_head = []

input_ids_list_olap = []
attention_mask_list_olap = []
token_type_ids_list_olap= []
targets_list_olap= []

input_ids_list_tail = []
attention_mask_list_tail = []
token_type_ids_list_tail = []
targets_list_tail= []

for idx in range(len(df_raw)): 
    
    long_terms_token = []
    input_ids_list = []
    attention_mask_list = []
    token_type_ids_list = []
    targets_list = []
        
    # tokenize for this row in train_raw
    data = tokenizer.encode_plus(
        df_raw['text'][idx],
        max_length=chunk_len,
        pad_to_max_length=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_overflowing_tokens=True,
        return_tensors='pt')

    # grab the targets for this row in train_raw
    targets = int(df_raw['label'][idx])
    
    # extract the tokens
    input_ids, attention_mask, token_type_ids, remain, targets = extract_tokens(data, targets)
    remain = [] if remain is None else remain # For cases where there is no overflow
    
    # CREATE LISTS FOR THE HEAD
    input_ids_list_head.append(input_ids)
    attention_mask_list_head.append(attention_mask)
    token_type_ids_list_head.append(token_type_ids)
    targets_list_head.append(targets)
    
    # GET OVERLAPPING TOKEN LISTS *****************************
    remain = torch.tensor(remain, dtype=torch.long)
    idxs = range(len(remain)+ chunk_len)
    idxs = idxs[(chunk_len-overlap_len-2)::(chunk_len-overlap_len-2)]
    input_ids_first_overlap = input_ids[-(overlap_len+1):-1]
    start_token = torch.tensor([101], dtype=torch.long)
    end_token = torch.tensor([102], dtype=torch.long)
    
    # Get the initial 200 word tensors (same as head)
    input_ids_list.append(input_ids)
    attention_mask_list.append(attention_mask)
    token_type_ids_list.append(token_type_ids)
    targets_list.append(targets)
    
    # For each overlapping section create a tensor of input_ids, attention_masks, token_type_ids and targets (labels)
    # add to a list
    for i, idx in enumerate(idxs):
        if i == 0:
            input_ids = torch.cat((input_ids_first_overlap, remain[:idx]))
        elif i == len(idxs):
            input_ids = remain[idx:]
        elif previous_idx >= len(remain):
            break
        else:
            input_ids = remain[(previous_idx-overlap_len):idx]

        previous_idx = idx

        nb_token = len(input_ids)+2
        attention_mask = torch.ones(chunk_len, dtype=torch.long)
        attention_mask[nb_token:chunk_len] = 0
        token_type_ids = torch.zeros(chunk_len, dtype=torch.long)
        input_ids = torch.cat((start_token, input_ids, end_token))
        if chunk_len-nb_token > 0:
            padding = torch.zeros(chunk_len-nb_token, dtype=torch.long)
            input_ids = torch.cat((input_ids, padding))

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        token_type_ids_list.append(token_type_ids)
        targets_list.append(targets)
    
    # Add to the overlap list
    input_ids_list_olap.append([input_ids_list])
    attention_mask_list_olap.append([attention_mask_list])
    token_type_ids_list_olap.append([token_type_ids_list])
    targets_list_olap.append([targets_list])      
  

    # GET LISTS FOR THE HEAD
    input_ids_list_tail.append([input_ids_list[-1]])
    attention_mask_list_tail.append([attention_mask_list[-1]])
    token_type_ids_list_tail.append([token_type_ids_list[-1]])
    targets_list_tail.append([targets_list[-1]])
    
print(f'BERT Tokenize Runtime: {datetime.datetime.now() - start_time}')


BERT Tokenize Runtime: 0:00:18.159872


In [7]:
# check we have the correct sizes of things (803)
print("Train length:", len(df_raw))
print("")
print("Head input_ids length:", len(input_ids_list_head))
print("Head attention mask length:", len(attention_mask_list_head))
print("Head target length:", len(targets_list_head))
print("")
print("Tail input_ids length:", len(input_ids_list_tail))
print("Tail attention mask length:", len(attention_mask_list_tail))
print("Tail target length:", len(targets_list_tail))
print("")
print("Overlap input_ids length:", len(input_ids_list_olap))
print("Overlap attention mask length:", len(attention_mask_list_olap))
print("Overlap target length:", len(targets_list_olap))

Train length: 809

Head input_ids length: 809
Head attention mask length: 809
Head target length: 809

Tail input_ids length: 809
Tail attention mask length: 809
Tail target length: 809

Overlap input_ids length: 809
Overlap attention mask length: 809
Overlap target length: 809


# Overlap STUFF

In [8]:
# Convert to lists so we can smash to a dataframe and save for reuse
input_ids_l2_olap = []
attention_mask_l2_olap = []

for i in range(len(input_ids_list_olap)):
    input_ids_l2_olap.append(input_ids_list_olap[i][0])
    attention_mask_l2_olap.append(attention_mask_list_olap[i][0])

In [9]:
# convert to dataframe for saving later
df_olap = pd.DataFrame()
df_olap['docid'] = df_raw['docid']
df_olap['label'] = df_raw['label']
df_olap['doc_use'] = df_raw['doc_use']
df_olap['input_ids'] = input_ids_l2_olap
df_olap['attention_mask'] = attention_mask_l2_olap
df_olap

Unnamed: 0,docid,label,doc_use,input_ids,attention_mask
0,73277,0,train,"[[tensor(0), tensor(208), tensor(5382), tensor...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
1,79776,1,train,"[[tensor(0), tensor(208), tensor(5382), tensor...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
2,75870,1,train,"[[tensor(0), tensor(208), tensor(5382), tensor...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
3,79299,1,train,"[[tensor(0), tensor(208), tensor(5382), tensor...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
4,80603,0,train,"[[tensor(0), tensor(344), tensor(13083), tenso...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
...,...,...,...,...,...
804,74009,0,test,"[[tensor(0), tensor(344), tensor(13083), tenso...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
805,79379,1,test,"[[tensor(0), tensor(208), tensor(5382), tensor...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
806,251317,1,test,"[[tensor(0), tensor(208), tensor(5382), tensor...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
807,79318,1,test,"[[tensor(0), tensor(208), tensor(5382), tensor...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."


In [10]:
# Create two new dataframes with exploded input_ids and attention_masks

df_olap_explode = df_olap.explode('input_ids')
df_olap_explode.reset_index(inplace=True, drop=True)

df_olap_explode_attention_mask = df_olap.explode('attention_mask')
df_olap_explode_attention_mask.reset_index(inplace=True, drop=True)

In [11]:
# Get out two lists of numpy arrays so we can create models
arr_input_ids = df_olap_explode['input_ids'].to_numpy()
arr_attention_mask = df_olap_explode_attention_mask['attention_mask'].to_numpy()

input_ids_np_olap = []
attention_mask_np_olap = []

for i in range(len(arr_input_ids)):
    input_ids_np_olap.append(arr_input_ids[i].numpy())
    attention_mask_np_olap.append(arr_attention_mask[i].numpy())

In [12]:
# grab the exploded tokens and add back to the dataframe in case we want them later
df_olap_explode['input_ids_np_olap'] = input_ids_np_olap
df_olap_explode['attention_mask_np_olap'] = attention_mask_np_olap

In [13]:
# Set up some start and end chunk sizes to do our BERT embeddings as we don't have enough memory to do at once

interval = 100
end_max_range = len(df_olap_explode['input_ids_np_olap']) - len(df_olap_explode['input_ids_np_olap']) % -interval
start_max_range = end_max_range - interval

l_start  = list(range(0, start_max_range + interval, interval))
l_end  = list(range(interval, end_max_range + interval, interval ))

In [14]:
# Create Bert embeddings on the exploded documents and save to a list
l_olap = []

for i in range(len(l_end)):
    
    print("start = ", l_start[i], " end = ", l_end[i])
    
    input_ids_np_olap_run = input_ids_np_olap[l_start[i]:l_end[i]]
    attention_mask_np_olap_run =  attention_mask_np_olap[l_start[i]:l_end[i]]
    
    # Now we can run the model to get the Bert embedding
    input_ids = torch.tensor(input_ids_np_olap_run)
    attention_mask = torch.tensor(attention_mask_np_olap_run)

    start_time = datetime.datetime.now()
    with torch.no_grad(): #deactivates autograd engine
        last_hidden_states_olap = model1(input_ids, attention_mask=attention_mask)
    print(f'BERT Model Runtime: {datetime.datetime.now() - start_time}')
    
    olap_features = last_hidden_states_olap[0][:,0,:].numpy()
    l_olap.append(olap_features)
    

start =  0  end =  100
BERT Model Runtime: 0:01:53.046787
start =  100  end =  200
BERT Model Runtime: 0:01:51.227889
start =  200  end =  300
BERT Model Runtime: 0:01:51.479214
start =  300  end =  400
BERT Model Runtime: 0:01:51.501168
start =  400  end =  500
BERT Model Runtime: 0:01:51.260638
start =  500  end =  600
BERT Model Runtime: 0:35:22.192533
start =  600  end =  700
BERT Model Runtime: 0:01:57.908175
start =  700  end =  800
BERT Model Runtime: 0:01:52.196988
start =  800  end =  900
BERT Model Runtime: 0:01:52.474836
start =  900  end =  1000
BERT Model Runtime: 0:01:52.453293
start =  1000  end =  1100
BERT Model Runtime: 0:01:52.532297
start =  1100  end =  1200
BERT Model Runtime: 0:01:54.086203
start =  1200  end =  1300
BERT Model Runtime: 0:01:52.904174
start =  1300  end =  1400
BERT Model Runtime: 0:01:52.896357
start =  1400  end =  1500
BERT Model Runtime: 0:01:52.944171
start =  1500  end =  1600
BERT Model Runtime: 0:01:52.862902
start =  1600  end =  1700
BE

In [15]:
# Flatten out the ebeddngs - list. Now it is the same size as the overflow dimension
l_olap_flat = [item for sublist in l_olap for item in sublist]

In [16]:
# Add embeddings back to our dataframe
df_olap_explode['embeddings'] = l_olap_flat

In [17]:
df_olap_explode1 = df_olap_explode[['docid', 'label', 'doc_use','input_ids_np_olap', 'embeddings']]

In [18]:
#Save to disk as pickle file
fname = 'data/embeddings/longformer_stack_lcase_' + str(chunk_len) + '.pkl'
with open(fname, 'wb') as fp:
    pickle.dump(df_olap_explode1, fp)

In [19]:
df_olap_explode1


Unnamed: 0,docid,label,doc_use,input_ids_np_olap,embeddings
0,73277,0,train,"[0, 208, 5382, 13471, 50140, 50117, 134, 4, 50...","[-0.03490465, 0.064865395, -0.017987669, -0.03..."
1,73277,0,train,"[101, 5505, 4726, 9304, 8, 172, 24557, 39, 235...","[-0.100065574, 0.022632826, 0.027329236, -0.07..."
2,73277,0,train,"[101, 4, 50140, 50117, 306, 4, 50117, 1121, 10...","[-0.041281126, 0.08823079, -0.007453665, -0.07..."
3,73277,0,train,"[101, 50117, 41562, 5, 1065, 6, 38, 19403, 110...","[-0.07864474, 0.06377331, -0.023088403, -0.045..."
4,73277,0,train,"[101, 746, 95, 7, 192, 549, 24, 1326, 1593, 23...","[-0.06478642, 0.09257552, -0.008796655, -0.075..."
...,...,...,...,...,...
3639,79318,1,test,"[0, 208, 5382, 13471, 50118, 50118, 10975, 134...","[-0.10326244, 0.022347506, 0.035206728, -0.061..."
3640,79318,1,test,"[101, 16, 276, 25, 11, 5, 29643, 31486, 8302, ...","[-0.07819697, 0.04468355, -0.004176976, -0.078..."
3641,79318,1,test,"[101, 14804, 7, 4227, 5, 31904, 3724, 8, 5, 63...","[-0.07878052, 0.07486821, 0.0101483315, -0.024..."
3642,255439,1,test,"[0, 208, 5382, 13471, 50118, 1437, 50118, 1437...","[-0.0740111, 0.0101187825, 0.015575906, -0.059..."
