In [1]:
%matplotlib inline
import torch
import pandas as pd
import numpy as np
import datetime
import pickle

import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import Dataset, random_split
from torch.utils.data.sampler import SubsetRandomSampler

import transformers as ppb
from transformers import RobertaTokenizer, DistilBertTokenizer, BertTokenizer, RobertaModel, BertModel
from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [2]:
# Notebook-wide configuration and pretrained BERT objects.

# Checkpoint that both the tokenizer and the encoder are loaded from.
# NOTE(review): this is a cased checkpoint while the input files are named
# "*_lcase" -- confirm the pairing is intentional.
BERT_pre_trained_weights = 'bert-base-cased'
BERT_tokenizer_class = ppb.BertTokenizer

# Length of every token chunk, and the number of tokens shared between
# consecutive chunks. (The trailing "#200" / "#50" notes that used to sit on
# these lines were presumably earlier settings.)
chunk_len = 512
overlap_len = 128

# Encoder used to produce the embeddings, plus its matching tokenizer.
model1 = ppb.BertModel.from_pretrained(BERT_pre_trained_weights)
tokenizer = BERT_tokenizer_class.from_pretrained(BERT_pre_trained_weights)

In [3]:
# Pick the torch device once: the GPU when CUDA is usable, the CPU otherwise.
gpu_ready = torch.cuda.is_available()
device = torch.device("cuda" if gpu_ready else "cpu")

if not gpu_ready:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [4]:
# Read the train and test splits and stack them into one dataframe so that
# embeddings for both can be produced in a single pass.
# The doc_use column records which split each row came from.
train_raw = pd.read_csv("../../w266_project/data/train_lcase.csv").assign(doc_use='train')
test_raw = pd.read_csv("../../w266_project/data/test_lcase.csv").assign(doc_use='test')

# Stack the two splits and renumber the rows from 0.
df_raw = pd.concat([train_raw, test_raw]).reset_index(drop=True)

# Display 10 random rows from the data.
df_raw.sample(10)

Unnamed: 0,docid,text,label,len_txt,doc_use
770,79466,judgment the appellant was convicted in the la...,0,476,test
198,71087,judgment on appeal the appellant was convicted...,0,299,train
704,75461,sentence 1 you josateki taliga are here today ...,1,869,test
68,82484,sentence 1 the accused tomasi tiko bulivou is ...,1,2132,train
35,73984,sentence the accused has been convicted by thi...,1,1649,train
787,73541,judgment 1 the appellant was charged before th...,0,1724,test
86,71842,decision 1 on DATE the appellant was convicted...,1,841,train
427,281131,sentence 1 on DATE in the presence of your cou...,1,907,train
515,284308,sentence 1 the accused is before the court for...,0,1114,train
407,80433,j u d g m e n t gamalath ja 1 the appellant wa...,1,1513,train


In [5]:
# Helper that unpacks the result of tokenizer.encode_plus (called in the tokenization cell below).
# It returns the input ids, attention mask, token type ids and, critically, the overflowing tokens.

def extract_tokens(data_tokenize, targets):
    """Unpack one tokenizer encoding into flat tensors plus a label tensor.

    Args:
        data_tokenize: mapping with "input_ids", "attention_mask" and
            "token_type_ids" entries that support ``reshape(-1)``; it may
            also carry an "overflowing_tokens" entry.
        targets: label value accepted by ``torch.tensor(..., dtype=torch.int)``.

    Returns:
        A 5-tuple ``(input_ids, attention_mask, token_type_ids, remain, targets)``:
        the three encodings flattened to 1-D, the "overflowing_tokens" entry
        (``None`` when the key is absent) and the label as an int tensor.
    """
    # Flatten each encoding to one dimension, in the order callers unpack them.
    flattened = tuple(
        data_tokenize[field].reshape(-1)
        for field in ("input_ids", "attention_mask", "token_type_ids")
    )

    # Tokens that did not fit into the first chunk (None if the key is missing).
    overflow = data_tokenize.get("overflowing_tokens")

    label_tensor = torch.tensor(targets, dtype=torch.int)

    return (*flattened, overflow, label_tensor)

In [6]:
# Tokenize every document and split it into chunks of chunk_len token ids.
#
# tokenizer.encode_plus is asked for the fields below; extract_tokens unpacks them:
#   input_ids          (tensor) - the first chunk_len token ids, starting with special token 101
#   attention_mask     (tensor) - attention mask in case the text is shorter than chunk_len
#   token_type_ids     (tensor) - token types for the input
#   overflowing_tokens (list)   - every token that did not fit into the first chunk
#
# Three views of each document are collected (one entry per row of df_raw):
#   head    - the first chunk exactly as returned by the tokenizer
#   overlap - the head followed by chunks rebuilt from the overflowing tokens;
#             each rebuilt chunk starts with overlap_len tokens taken from the
#             end of the previous chunk
#   tail    - the last chunk of the overlap list

start_time = datetime.datetime.now()

# Result lists, one entry per document
input_ids_list_head = []
attention_mask_list_head = []
token_type_ids_list_head = []
targets_list_head = []

input_ids_list_olap = []
attention_mask_list_olap = []
token_type_ids_list_olap = []
targets_list_olap = []

input_ids_list_tail = []
attention_mask_list_tail = []
token_type_ids_list_tail = []
targets_list_tail = []

# Special tokens wrapped around every rebuilt chunk (101 first, 102 last)
start_token = torch.tensor([101], dtype=torch.long)
end_token = torch.tensor([102], dtype=torch.long)

for idx in range(len(df_raw)):

    # Chunk lists for the current document only
    input_ids_list = []
    attention_mask_list = []
    token_type_ids_list = []
    targets_list = []

    # tokenize the text of this row of df_raw
    data = tokenizer.encode_plus(
        df_raw['text'][idx],
        max_length=chunk_len,
        pad_to_max_length=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_overflowing_tokens=True,
        return_tensors='pt')

    # grab the target (label) for this row of df_raw
    targets = int(df_raw['label'][idx])

    # extract the tokens
    input_ids, attention_mask, token_type_ids, remain, targets = extract_tokens(data, targets)
    remain = [] if remain is None else remain # For cases where there is no overflow

    # CREATE LISTS FOR THE HEAD
    input_ids_list_head.append(input_ids)
    attention_mask_list_head.append(attention_mask)
    token_type_ids_list_head.append(token_type_ids)
    targets_list_head.append(targets)

    # GET OVERLAPPING TOKEN LISTS *****************************
    remain = torch.tensor(remain, dtype=torch.long)
    # End positions (within remain) of the rebuilt chunks. Each rebuilt chunk holds
    # chunk_len-2 tokens (two slots go to the special tokens), overlap_len of which
    # are repeated from the previous chunk. The range deliberately runs past
    # len(remain); the loop below breaks once remain is exhausted.
    idxs = range(len(remain)+ chunk_len)
    idxs = idxs[(chunk_len-overlap_len-2)::(chunk_len-overlap_len-2)]
    # Last overlap_len positions of the head, leaving out its final position
    input_ids_first_overlap = input_ids[-(overlap_len+1):-1]

    # The first chunk of the overlap list is the head itself
    input_ids_list.append(input_ids)
    attention_mask_list.append(attention_mask)
    token_type_ids_list.append(token_type_ids)
    targets_list.append(targets)

    # NOTE(review): idxs is never empty for the configured chunk_len/overlap_len, so a
    # document without overflowing tokens still gets one rebuilt chunk made only of
    # the last overlap_len positions of its head (presumably padding ids for short
    # texts), and the attention mask below marks them as real tokens. That chunk also
    # becomes the document's tail. Confirm this is intended.

    # For each overlapping section create a tensor of input_ids, attention_masks,
    # token_type_ids and targets (labels) and add it to the per-document lists.
    # The loop variable is named chunk_end so that it does not shadow the document
    # index idx of the outer loop.
    for i, chunk_end in enumerate(idxs):
        if i == 0:
            # First rebuilt chunk: overlap comes from the head
            input_ids = torch.cat((input_ids_first_overlap, remain[:chunk_end]))
        elif previous_idx >= len(remain):
            # Every overflowing token is already covered by an earlier chunk
            break
        else:
            # Later chunks: overlap comes from the previous slice of remain
            input_ids = remain[(previous_idx-overlap_len):chunk_end]

        previous_idx = chunk_end

        # Real tokens in this chunk, counting the two special tokens
        nb_token = len(input_ids)+2
        attention_mask = torch.ones(chunk_len, dtype=torch.long)
        attention_mask[nb_token:chunk_len] = 0
        token_type_ids = torch.zeros(chunk_len, dtype=torch.long)
        input_ids = torch.cat((start_token, input_ids, end_token))
        # Pad with zeros up to chunk_len
        if chunk_len-nb_token > 0:
            padding = torch.zeros(chunk_len-nb_token, dtype=torch.long)
            input_ids = torch.cat((input_ids, padding))

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        token_type_ids_list.append(token_type_ids)
        targets_list.append(targets)

    # Add to the overlap list (each entry is a one-element list holding the chunk list)
    input_ids_list_olap.append([input_ids_list])
    attention_mask_list_olap.append([attention_mask_list])
    token_type_ids_list_olap.append([token_type_ids_list])
    targets_list_olap.append([targets_list])

    # GET LISTS FOR THE TAIL (the last chunk of this document)
    input_ids_list_tail.append([input_ids_list[-1]])
    attention_mask_list_tail.append([attention_mask_list[-1]])
    token_type_ids_list_tail.append([token_type_ids_list[-1]])
    targets_list_tail.append([targets_list[-1]])

print(f'BERT Tokenize Runtime: {datetime.datetime.now() - start_time}')


BERT Tokenize Runtime: 0:00:23.339582


In [7]:
# Sanity check: every head / tail / overlap list should hold one entry per row of df_raw.
print("Train length:", len(df_raw))

# One group per document view; each group is printed after a blank line.
size_report = (
    (
        ("Head input_ids length:", input_ids_list_head),
        ("Head attention mask length:", attention_mask_list_head),
        ("Head target length:", targets_list_head),
    ),
    (
        ("Tail input_ids length:", input_ids_list_tail),
        ("Tail attention mask length:", attention_mask_list_tail),
        ("Tail target length:", targets_list_tail),
    ),
    (
        ("Overlap input_ids length:", input_ids_list_olap),
        ("Overlap attention mask length:", attention_mask_list_olap),
        ("Overlap target length:", targets_list_olap),
    ),
)

for report_group in size_report:
    print("")
    for caption, collected in report_group:
        print(caption, len(collected))

Train length: 803

Head input_ids length: 803
Head attention mask length: 803
Head target length: 803

Tail input_ids length: 803
Tail attention mask length: 803
Tail target length: 803

Overlap input_ids length: 803
Overlap attention mask length: 803
Overlap target length: 803


In [8]:
# Convert the head and tail tensors of every document to numpy arrays so they
# can be stored in the dataframe and reused.
# Each tail entry is a one-element list, hence the [0].
n_docs = len(input_ids_list_head)

input_ids_np_head = [input_ids_list_head[doc].numpy() for doc in range(n_docs)]
attention_mask_np_head = [attention_mask_list_head[doc].numpy() for doc in range(n_docs)]

input_ids_np_tail = [input_ids_list_tail[doc][0].numpy() for doc in range(n_docs)]
attention_mask_np_tail = [attention_mask_list_tail[doc][0].numpy() for doc in range(n_docs)]

In [9]:
# Store the head and tail token-id arrays on df_raw for future use.
for column_name, token_arrays in (
    ('input_ids_np_head', input_ids_np_head),
    ('input_ids_np_tail', input_ids_np_tail),
):
    df_raw[column_name] = token_arrays

In [10]:
# Run the head chunk of every document through BERT and keep, for each one, the
# last-layer hidden state at sequence position 0 (the leading special token).
# The documents go through the model in batches because running them all at once
# crashes (per the original note in this cell).

head_batch_size = 100

# Batch windows are derived from the number of documents instead of being
# hard-coded, so they stay correct when the dataset size changes. The last
# window may extend past the data; slicing simply returns the rows that exist.
l_start = list(range(0, len(input_ids_np_head), head_batch_size))
l_end = [batch_start + head_batch_size for batch_start in l_start]
l_id = list(range(len(l_start)))

l_head = []

for i in range(len(l_start)):

    print("start = ", l_start[i], " end = ", l_end[i], " id = ", l_id[i])

    input_ids_np_run = input_ids_np_head[l_start[i]:l_end[i]]
    attention_mask_np_run = attention_mask_np_head[l_start[i]:l_end[i]]

    # Stack the per-document arrays into one 2-D array first; building a tensor
    # straight from a list of numpy arrays is much slower.
    input_ids = torch.tensor(np.stack(input_ids_np_run))
    attention_mask = torch.tensor(np.stack(attention_mask_np_run))

    start_time = datetime.datetime.now()
    with torch.no_grad(): # no gradients needed, we only read the activations
        last_hidden_states = model1(input_ids, attention_mask=attention_mask)
    print(f'BERT Model Runtime: {datetime.datetime.now() - start_time}')

    # First model output, position 0 of every sequence
    features = last_hidden_states[0][:,0,:].numpy()
    l_head.append(features)

# Flatten the per-batch arrays into one embedding per document and store them
l_head_flat = [item for sublist in l_head for item in sublist]
df_raw['head_embeddings'] = l_head_flat

start =  0  end =  100  id =  0
BERT Model Runtime: 0:01:14.097850
start =  100  end =  200  id =  1
BERT Model Runtime: 0:01:10.939066
start =  200  end =  300  id =  2
BERT Model Runtime: 0:01:10.843425
start =  300  end =  400  id =  3
BERT Model Runtime: 0:01:10.951512
start =  400  end =  500  id =  4
BERT Model Runtime: 0:01:11.660280
start =  500  end =  600  id =  5
BERT Model Runtime: 0:01:12.639403
start =  600  end =  700  id =  6
BERT Model Runtime: 0:01:11.528518
start =  700  end =  800  id =  7
BERT Model Runtime: 0:01:12.460135
start =  800  end =  900  id =  8
BERT Model Runtime: 0:00:01.502017


In [11]:
# Run the tail chunk of every document through BERT and keep, for each one, the
# last-layer hidden state at sequence position 0 (the leading special token).
# The documents go through the model in batches because running them all at once
# crashes (per the original note in this cell).

tail_batch_size = 100

# Batch windows are derived from the number of documents instead of being
# hard-coded, so they stay correct when the dataset size changes. The last
# window may extend past the data; slicing simply returns the rows that exist.
l_start = list(range(0, len(input_ids_np_tail), tail_batch_size))
l_end = [batch_start + tail_batch_size for batch_start in l_start]
l_id = list(range(len(l_start)))

l_tail = []

for i in range(len(l_start)):

    print("start = ", l_start[i], " end = ", l_end[i], " id = ", l_id[i])

    input_ids_np_run = input_ids_np_tail[l_start[i]:l_end[i]]
    attention_mask_np_run = attention_mask_np_tail[l_start[i]:l_end[i]]

    # Stack the per-document arrays into one 2-D array first; building a tensor
    # straight from a list of numpy arrays is much slower.
    input_ids = torch.tensor(np.stack(input_ids_np_run))
    attention_mask = torch.tensor(np.stack(attention_mask_np_run))

    start_time = datetime.datetime.now()
    with torch.no_grad(): # no gradients needed, we only read the activations
        last_hidden_states = model1(input_ids, attention_mask=attention_mask)
    print(f'BERT Model Runtime: {datetime.datetime.now() - start_time}')

    # First model output, position 0 of every sequence
    features = last_hidden_states[0][:,0,:].numpy()
    l_tail.append(features)

# Flatten the per-batch arrays into one embedding per document and store them
l_tail_flat = [item for sublist in l_tail for item in sublist]
df_raw['tail_embeddings'] = l_tail_flat

start =  0  end =  100  id =  0
BERT Model Runtime: 0:01:08.344804
start =  100  end =  200  id =  1
BERT Model Runtime: 0:25:39.807769
start =  200  end =  300  id =  2
BERT Model Runtime: 0:01:08.830317
start =  300  end =  400  id =  3
BERT Model Runtime: 0:01:07.750817
start =  400  end =  500  id =  4
BERT Model Runtime: 0:01:07.766886
start =  500  end =  600  id =  5
BERT Model Runtime: 0:01:07.825672
start =  600  end =  700  id =  6
BERT Model Runtime: 0:01:08.403189
start =  700  end =  800  id =  7
BERT Model Runtime: 0:01:07.986288
start =  800  end =  900  id =  8
BERT Model Runtime: 0:00:01.406201


In [12]:
# Persist df_raw, which by now carries the token ids and the head/tail
# embeddings, as a pickle file. The file name includes the chunk length.
fname = ''.join(['../data/baseBert_embeddings_headtail_', str(chunk_len), '.pkl'])

with open(fname, 'wb') as pickle_out:
    pickle.dump(df_raw, pickle_out)

# Overlap chunks: embeddings for every overlapping chunk of each document

In [13]:
# Each overlap entry is a one-element list wrapping the document's chunk list;
# unwrap it so that every document maps directly to its list of chunks.
n_olap_docs = len(input_ids_list_olap)

input_ids_l2_olap = [input_ids_list_olap[doc][0] for doc in range(n_olap_docs)]
attention_mask_l2_olap = [attention_mask_list_olap[doc][0] for doc in range(n_olap_docs)]

In [14]:
# Build the per-document overlap dataframe: the identifying columns are copied
# from df_raw, then each row gets its list of chunk tensors.
df_olap = df_raw[['docid', 'label', 'doc_use']].copy()
df_olap['input_ids'] = input_ids_l2_olap
df_olap['attention_mask'] = attention_mask_l2_olap
df_olap

Unnamed: 0,docid,label,doc_use,input_ids,attention_mask
0,73277,0,train,"[[tensor(101), tensor(5650), tensor(122), tens...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
1,79776,1,train,"[[tensor(101), tensor(5650), tensor(122), tens...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
2,75870,1,train,"[[tensor(101), tensor(5650), tensor(122), tens...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
3,79299,1,train,"[[tensor(101), tensor(5650), tensor(122), tens...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
4,80603,0,train,"[[tensor(101), tensor(9228), tensor(1104), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
...,...,...,...,...,...
798,74009,0,test,"[[tensor(101), tensor(9228), tensor(1103), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
799,79379,1,test,"[[tensor(101), tensor(5650), tensor(2666), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
800,251317,1,test,"[[tensor(101), tensor(5650), tensor(122), tens...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
801,79318,1,test,"[[tensor(101), tensor(5650), tensor(122), tens...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."


In [15]:
# Turn the per-document chunk lists into one row per chunk.
# input_ids and attention_mask are exploded separately into two dataframes,
# each renumbered from 0.
df_olap_explode = df_olap.explode('input_ids').reset_index(drop=True)

df_olap_explode_attention_mask = df_olap.explode('attention_mask').reset_index(drop=True)

In [16]:
# Pull the exploded chunk tensors out of the two dataframes and convert each
# one to a numpy array, ready to be batched into the model.
arr_input_ids = df_olap_explode['input_ids'].to_numpy()
arr_attention_mask = df_olap_explode_attention_mask['attention_mask'].to_numpy()

n_chunks = len(arr_input_ids)

input_ids_np_olap = [arr_input_ids[chunk].numpy() for chunk in range(n_chunks)]
attention_mask_np_olap = [arr_attention_mask[chunk].numpy() for chunk in range(n_chunks)]

In [17]:
# Keep the per-chunk numpy arrays on the exploded dataframe in case they are
# wanted later.
for column_name, chunk_arrays in (
    ('input_ids_np_olap', input_ids_np_olap),
    ('attention_mask_np_olap', attention_mask_np_olap),
):
    df_olap_explode[column_name] = chunk_arrays

In [18]:
# Batch windows for the overlap chunks, derived from the number of chunks rather
# than a hard-coded upper bound so they always cover exactly the data present.
# NOTE(review): the following cell assigns l_start / l_end again; one of the two
# cells is redundant.
olap_batch_size = 100
l_start = list(range(0, len(input_ids_np_olap), olap_batch_size))
l_end = [batch_start + olap_batch_size for batch_start in l_start]


In [19]:
# Start and end offsets of the batches used for the BERT embeddings, since there
# is not enough memory to embed every chunk at once.
# The windows are derived from the number of chunks rather than a hard-coded
# upper bound, so they always cover exactly the data present. The last window
# may extend past the data; slicing simply returns the chunks that exist.
olap_batch_size = 100
l_start = list(range(0, len(input_ids_np_olap), olap_batch_size))
l_end = [batch_start + olap_batch_size for batch_start in l_start]

In [20]:
# Create BERT embeddings for every exploded chunk, one batch window at a time,
# and collect the per-batch arrays in l_olap. For each chunk the last-layer
# hidden state at sequence position 0 (the leading special token) is kept.
l_olap = []

for i in range(len(l_start)):

    print("start = ", l_start[i], " end = ", l_end[i])

    input_ids_np_olap_run = input_ids_np_olap[l_start[i]:l_end[i]]
    attention_mask_np_olap_run = attention_mask_np_olap[l_start[i]:l_end[i]]

    # A window that lies entirely past the data yields an empty slice; the model
    # cannot be called on it, so skip it instead of crashing.
    if len(input_ids_np_olap_run) == 0:
        continue

    # Stack the per-chunk arrays into one 2-D array first; building a tensor
    # straight from a list of numpy arrays is much slower.
    input_ids = torch.tensor(np.stack(input_ids_np_olap_run))
    attention_mask = torch.tensor(np.stack(attention_mask_np_olap_run))

    start_time = datetime.datetime.now()
    with torch.no_grad(): # no gradients needed, we only read the activations
        last_hidden_states_olap = model1(input_ids, attention_mask=attention_mask)
    print(f'BERT Model Runtime: {datetime.datetime.now() - start_time}')

    # First model output, position 0 of every sequence
    olap_features = last_hidden_states_olap[0][:,0,:].numpy()
    l_olap.append(olap_features)
    

start =  0  end =  100
BERT Model Runtime: 0:01:08.010337
start =  100  end =  200
BERT Model Runtime: 0:01:11.065932
start =  200  end =  300
BERT Model Runtime: 0:01:08.695092
start =  300  end =  400
BERT Model Runtime: 0:01:08.444483
start =  400  end =  500
BERT Model Runtime: 0:01:08.260005
start =  500  end =  600
BERT Model Runtime: 0:01:08.360355
start =  600  end =  700
BERT Model Runtime: 0:01:08.527455
start =  700  end =  800
BERT Model Runtime: 0:01:08.333234
start =  800  end =  900
BERT Model Runtime: 0:01:08.500196
start =  900  end =  1000
BERT Model Runtime: 0:01:08.333071
start =  1000  end =  1100
BERT Model Runtime: 0:01:08.313147
start =  1100  end =  1200
BERT Model Runtime: 0:17:07.013473
start =  1200  end =  1300
BERT Model Runtime: 0:01:08.412440
start =  1300  end =  1400
BERT Model Runtime: 0:01:09.504785
start =  1400  end =  1500
BERT Model Runtime: 0:01:10.748542
start =  1500  end =  1600
BERT Model Runtime: 0:01:11.496825
start =  1600  end =  1700
BE

In [21]:
# Flatten the per-batch embedding arrays into a single list holding one
# embedding per chunk.
l_olap_flat = []
for batch_embeddings in l_olap:
    # Iterating a batch array yields its rows, i.e. one embedding at a time.
    l_olap_flat.extend(batch_embeddings)

In [22]:
# Add the embeddings back to the exploded dataframe, one per chunk row.
# NOTE(review): this relies on l_olap_flat holding exactly one entry per row of
# df_olap_explode, in the same order -- confirm the batches covered every chunk.
df_olap_explode['embeddings'] = l_olap_flat

In [23]:
# Keep only the columns that are written to disk: document identifiers, the
# per-chunk token ids and the per-chunk embeddings.
olap_export_columns = ['docid', 'label', 'doc_use', 'input_ids_np_olap', 'embeddings']
df_olap_explode1 = df_olap_explode[olap_export_columns]

In [24]:
# Persist the per-chunk overlap embeddings as a pickle file. The file name
# includes the chunk length.
fname = ''.join(['../data/baseBert_embeddings_olap_', str(chunk_len), '.pkl'])

with open(fname, 'wb') as pickle_out:
    pickle.dump(df_olap_explode1, pickle_out)