In [1]:
import numpy as np
import pandas as pd
import datetime

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

# PyTorch Imports
import torch # a tensor library
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

#Huggingface Transformers
import transformers
from transformers import LongformerModel, LongformerTokenizer, AdamW

## Model Set Up
Set up the model and the configuration we need

- max_len - how many tokens will be used from the document.
- batch_size - reduce this if you run into memory issues. The paper recommends 16-32.
- num epochs

- tokenizer - this should be a pretrained tokenizer, e.g. DistilBERT.
- model - make sure it uses the same tokenizer for generating the weights

#### input files
- train.csv: heading removed, dates and URLs replaced, un-cased, sentence breaks and punctuation included  
- train_lcase.csv: heading removed, dates and URLs replaced, lower-cased, sentence breaks and punctuation removed
  

In [2]:
# Input data: one CSV per split.
train_file = "data/train.csv"
test_file = "data/test.csv"

# Number of tokens kept per document (every document is truncated/padded to this length).
max_len = 4096 #128 #4096
# Number of documents pushed through the model per forward pass.
batch_size = 8

# The tokenizer and the model are loaded from the same pretrained checkpoint
# so that token ids match the model's embedding table.
checkpoint = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(checkpoint)
model = LongformerModel.from_pretrained(checkpoint)

## Helper Functions

In [3]:
# Function to get data
def get_data(fname):
    """Load a document CSV and return only the columns used in this notebook.

    Parameters
    ----------
    fname : path or file-like object accepted by ``pd.read_csv``.
        The CSV must contain the columns ``docid``, ``cleaned_contents``
        and ``Discrimination_Label``; any other columns are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``docid``, ``text`` (from ``cleaned_contents``) and
        ``label`` (from ``Discrimination_Label``), with a fresh 0..n-1 index.
    """
    renames = {'cleaned_contents': 'text', 'Discrimination_Label': 'label'}
    wanted = ['docid', *renames]
    return (
        pd.read_csv(fname)[wanted]
        .rename(columns=renames)
        .reset_index(drop=True)
    )

# Get Embeddings

In [4]:
# Load both splits and tag every row with the split it came from, so the
# documents can be embedded in one pass and separated again afterwards.
df_train = get_data(train_file).assign(doc_use='train')
df_test = get_data(test_file).assign(doc_use='test')

# Stack train on top of test with a single continuous 0..n-1 index.
df = pd.concat([df_train, df_test], ignore_index=True)

df.head()

Unnamed: 0,docid,text,label,doc_use
0,73277,SENTENCE\n\n\t1.\tYou are charged as follows:\...,0,train
1,79776,"SENTENCE\n\n\t1.\tJOSEFA KOTOBALAVU, you were ...",1,train
2,75870,SENTENCE\n\n1. The Director of Public Prosecut...,1,train
3,79299,"SENTENCE\n\n\t1.\tMOHOMMED NABI UD- DEAN, you ...",1,train
4,80603,JUDGMENT OF THE COURT\n\nBackground\n\n[1] The...,0,train


In [5]:
# Tokenize every document, truncating to max_len tokens (special tokens included).
tokenized = df['text'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True)
)

# Right-pad every sequence to max_len with the tokenizer's own pad id.
# A literal 0 must not be used as the pad value: the model's embedding table
# uses padding_idx=1, and id 0 is the <s> token that the tokenizer places at
# position 0 of every sequence.
pad_id = tokenizer.pad_token_id
padded = np.array([ids + [pad_id] * (max_len - len(ids)) for ids in tokenized.values])

# Attention mask: 1 for real tokens, 0 for padding. It is derived from the true
# sequence lengths rather than by comparing token ids with the pad value, so a
# real token (in particular the leading <s>) can never be mistaken for padding.
seq_lengths = np.array([len(ids) for ids in tokenized.values])
attention_mask = (np.arange(max_len)[None, :] < seq_lengths[:, None]).astype(int)

# Global attention on the first element (<s> / CLS) only, per the Longformer
# documentation; every other position, padding included, stays local (0).
global_attention_mask = np.zeros_like(attention_mask)
global_attention_mask[:, 0] = 1

In [6]:
#padded[1]
#tokenizer.convert_ids_to_tokens(padded[1])

In [7]:
# Sanity check: length of the first padded sequence.
# The padding step pads every row up to max_len, so this should equal max_len.
len(padded[0])

4096

In [8]:
# Convert the three numpy arrays (token ids, attention mask, global attention
# mask) into torch tensors. The mask names are rebound from arrays to tensors.
input_ids, attention_mask, global_attention_mask = (
    torch.tensor(array) for array in (padded, attention_mask, global_attention_mask)
)

In [9]:
# Bundle the three tensors so that each dataset item is the triple
# (input_ids, attention_mask, global_attention_mask) for one document.
dataset = TensorDataset(input_ids, attention_mask, global_attention_mask)
# Variant without the global attention mask, kept for reference:
#dataset = TensorDataset(input_ids, attention_mask)

In [10]:
# Create the DataLoader used to extract embeddings.
# The batches MUST be served in dataset order: the resulting embeddings are
# written back to df positionally (df['embeddings'] = embeddings.tolist()),
# so a shuffling sampler would pair each embedding with the wrong document,
# label and train/test flag. No training happens in this loop, so random
# ordering would bring no benefit anyway.
dataloader = DataLoader(
            dataset,  # All documents (train followed by test).
            sampler = SequentialSampler(dataset), # Preserve row order
            batch_size = batch_size # Documents per forward pass.
        )

In [11]:
# Move the Longformer's parameters onto the GPU. The call is the last
# expression in the cell, so the notebook displays the model it returns.
model.cuda()

LongformerModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(4098, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): LongformerEncoder(
    (layer): ModuleList(
      (0): LongformerLayer(
        (attention): LongformerAttention(
          (self): LongformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (query_global): Linear(in_features=768, out_features=768, bias=True)
            (key_global): Linear(in_features=768, out_features=768, bias=True)
            (value_global): Linear(in_features=768, out_features=768, bias=True)
          )
          (outp

In [None]:
# Run every batch through the Longformer and keep, for each document, the
# hidden state of the first token as its embedding.
device = torch.device("cuda")
print("GPU memory in use: ", str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')

start_time = datetime.datetime.now()
output = []

for step, batch in enumerate(dataloader):

    # Progress report every 32 batches (the very first batch is skipped).
    if step != 0 and step % 32 == 0:
        elapsed = datetime.datetime.now() - start_time
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(dataloader), elapsed))

    # Move the batch's three tensors (ids, attention mask, global attention mask) to the GPU.
    b_input_ids, b_attention_mask, b_global_attention_mask = (t.to(device) for t in batch)

    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        last_hidden_states = model(
            b_input_ids,
            attention_mask=b_attention_mask,
            global_attention_mask=b_global_attention_mask,
        )

    # First model output, position 0 of every sequence, copied back to host memory.
    features = last_hidden_states[0][:, 0, :].cpu().numpy()
    output.append(features)

# Stack the per-batch arrays into one (documents x hidden size) array.
embeddings = np.concatenate(output)

print("")
print(f'Total Runtime: {datetime.datetime.now() - start_time}')

GPU memory in use:  595.412992M


In [None]:
# Number of embedded documents, then the size of each embedding vector.
print(*embeddings.shape, sep="\n")

In [None]:
# Store one embedding (as a plain Python list) per row of df.
# The assignment is positional, so `embeddings` must be in the same row
# order as df for each vector to line up with its label and doc_use flag.
df['embeddings'] = embeddings.tolist()

# Now train a model on top of the embeddings, a simple NN or linear classifier

In [None]:
from keras.models import Sequential
from keras.layers import Activation, Dense
from sklearn.metrics import classification_report

In [None]:
# Split the embedded documents back into train and test using the doc_use tag.
is_train = df['doc_use'] == 'train'
is_test = df['doc_use'] == 'test'

# Features: one row per document, one column per embedding dimension.
X_train = np.array(df.loc[is_train, 'embeddings'].to_list())
X_test = np.array(df.loc[is_test, 'embeddings'].to_list())

# Targets: the label column of the same rows.
y_train = np.array(df.loc[is_train, 'label'].to_list())
y_test = np.array(df.loc[is_test, 'label'].to_list())

In [None]:
# Shapes of the train/test feature and label arrays, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Small feed-forward classifier on top of the document embeddings:
# 768-d input -> 384 ReLU units -> 1 sigmoid unit (binary label).
# NOTE: this rebinds `model`, which previously held the Longformer. Re-running
# the embedding cells after this one requires reloading the Longformer first.
embedding_dim = 768

model = Sequential()
# Integer division: a layer's unit count must be an integer, and 768/2 is the
# float 384.0 in Python 3.
model.add(Dense(embedding_dim // 2, input_dim=embedding_dim, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Training hyper-parameters for the classifier head.
epochs, batch_size = 6, 8

# Fit on the training embeddings; 10% of the training data is held out by
# Keras for validation and reported after every epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
# Predicted class (0/1) for every test document.
# `Sequential.predict_classes` no longer exists in recent TensorFlow/Keras
# releases; thresholding the sigmoid output at 0.5 is the equivalent for a
# single-unit binary classifier. The result is flattened from (n, 1) to (n,)
# so it has the same shape as y_test in the comparisons that follow.
predictions = (model.predict(X_test) > 0.5).astype("int32").ravel()

In [None]:
# Test accuracy: fraction of documents whose predicted class equals the label.
# Both sides are flattened first: comparing an (n, 1) prediction array with an
# (n,) label array would broadcast to an (n, n) matrix and the mean of that
# matrix is not the accuracy.
np.mean(np.ravel(predictions) == np.ravel(y_test))

In [None]:
# Print scikit-learn's classification report for the test set.
# Argument order is (true labels, predicted labels).
print(classification_report(y_test, predictions))