# LSTM model trained on frozed DistilBert embeddings

Includes embedding function

In [3]:
#!pip install transformers

In [1]:
#Imports
from transformers import DistilBertTokenizer, DistilBertConfig, TFDistilBertModel, TFDistilBertForSequenceClassification

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import textwrap
import pickle
from sklearn.metrics import precision_recall_fscore_support, classification_report

import keras
from keras import Sequential
from keras.utils import Sequence
from keras.layers import LSTM, Dense, Masking
from keras.utils import np_utils
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import Embedding, Dense, Input, concatenate, Layer, Lambda, Dropout, Activation
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback, TensorBoard
from keras import layers
from keras.layers import Input

# Helper Functions

In [9]:
train_file = "../data/train_80_10_10.csv"
test_file = "../data/test_80_10_10.csv"
val_file = "../data/val_80_10_10.csv"

skip_lines = 6
max_length = 200
split_length = max_length - 2


# DistilBert
bert_file = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(bert_file, do_lower_case=True)
bert_model = TFDistilBertForSequenceClassification.from_pretrained(bert_file)

# Model Training
batch_size = 8
epochs = 3
learning_rate = 2e-5

In [None]:
# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [10]:
# Function to get data
def get_data(fname):
    df = pd.read_csv(fname)
    df = df[['docid', 'cleaned_contents', 'Discrimination_Label']]
    df = df.rename(columns = {'cleaned_contents':'text', 'Discrimination_Label':'label'})
    df.reset_index(inplace=True, drop=True)
    return df

In [11]:
# Function to tokenize data and return tensors for input ids, attention mask and labels
def tokenize_plus(df):

    # Tokenize all of the sentences and map the tokens to thier word IDs.
    sentences = df['text'].values
    labels = df['label'].values

    input_ids = []
    input_masks = []
    input_segments = []

    # For every sentence...
    for sent in sentences:
        inputs = tokenizer.encode_plus(sent, 
                                       add_special_tokens=True, 
                                       max_length=max_length, 
                                       truncation = True,
                                       pad_to_max_length=True, 
                                       return_attention_mask=True,
                                       return_token_type_ids=True)

        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])   
    
    labels = df['label']

    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'), np.asarray(labels, dtype='int32')

In [12]:
# function to split text into smaller chunks of 200 words, overlapping by 50 words
def get_split(text1):
  l_total = []
  l_parcial = []
  if len(text1.split())//150 >0:
    n = len(text1.split())//150
  else: 
    n = 1
  for w in range(n):
    if w == 0:
      l_parcial = text1.split()[:200]
      l_total.append(" ".join(l_parcial))
    else:
      l_parcial = text1.split()[w*150:w*150 + 200]
      l_total.append(" ".join(l_parcial))
  return l_total

# Data Prep


In [13]:
# GET THE DATA
df_train = get_data(train_file)
df_test = get_data(test_file)
df_val = get_data(val_file)

In [14]:
#remove double new lines
df_train['text'] = df_train['text'].replace('\n\s*\n', '\n',regex=True)
df_test['text'] = df_test['text'].replace('\n\s*\n', '\n',regex=True)
df_val['text'] = df_val['text'].replace('\n\s*\n', '\n',regex=True)

# strip last n lines
df_train['text'] = df_train.apply(lambda L: L.text.rsplit("\n",skip_lines)[0], axis=1)
df_test['text'] = df_test.apply(lambda L: L.text.rsplit("\n",skip_lines)[0], axis=1)
df_val['text'] = df_val.apply(lambda L: L.text.rsplit("\n",skip_lines)[0], axis=1)

In [15]:
# split the texts into  chunks & # explode the dataframe
df_train['text_split'] = df_train['text'].apply(get_split)
df_train = df_train.explode('text_split')

# split the texts into   chunks & # explode the dataframe
df_val['text_split'] = df_val['text'].apply(get_split)
df_val = df_val.explode('text_split')

# split the texts into   chunks & # explode the dataframe
df_test['text_split'] = df_test['text'].apply(get_split)
df_test = df_test.explode('text_split')

In [16]:
df_train.head()

Unnamed: 0,docid,text,label,text_split
0,255849,SENTENCE\n• In a judgment delivered on 9 May 2...,0,SENTENCE • In a judgment delivered on 9 May 20...
0,255849,SENTENCE\n• In a judgment delivered on 9 May 2...,0,that she blacked out. • She came to in a frien...
0,255849,SENTENCE\n• In a judgment delivered on 9 May 2...,0,quality life. Those who find themselves on the...
0,255849,SENTENCE\n• In a judgment delivered on 9 May 2...,0,with you as she was asleep. You then raped her...
1,288617,"SENTENCE\n• ELIZABETH GOLMAN, you were charged...",1,"SENTENCE • ELIZABETH GOLMAN, you were charged ..."


In [17]:
len(df_train)

6004

In [20]:
# get the tokens
# Get tokenized labels
input_ids, attention_masks, labels = tokenize_plus(df_train)
val_input_ids, val_attention_masks, val_labels = tokenize_plus(df_val)
test_input_ids, test_attention_masks, test_labels = tokenize_plus(df_test)

In [21]:
#That last step took longer than it should, lets save the train output just in case...
with open('test_last_hidden_states.pkl', 'wb') as f: 
  pickle.dump(input_ids, f)
with open('attention_masks.pkl', 'wb') as f: 
  pickle.dump(attention_masks, f)
with open('labels.pkl', 'wb') as f: 
  pickle.dump(labels, f)

# Model to get token embeddings

In [22]:
config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False
transformer_model = TFDistilBertModel.from_pretrained(bert_file, config = config)

In [23]:
#embedding mode
input_ids_in = tf.keras.layers.Input(shape=(200,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(200,), name='masked_token', dtype='int32') 
cls_token = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = cls_token)


In [24]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 200)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 200)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB ((None, 200, 768),)  66362880    input_token[0][0]                
Total params: 66,362,880
Trainable params: 66,362,880
Non-trainable params: 0
__________________________________________________________________________________________________


## Run Model on train, test and val to get feature embeddings

In [25]:
val_last_hidden_states = model.predict([val_input_ids, val_attention_masks])

In [27]:
from sklearn.externals import joblib
filename = 'val_last_hidden_states.pkl.sav'
joblib.dump(val_last_hidden_states, filename)  

['val_last_hidden_states.pkl.sav']

In [28]:
test_last_hidden_states = model.predict([test_input_ids, test_attention_masks])

In [29]:
from sklearn.externals import joblib
filename = 'test_last_hidden_states.pkl.sav'
joblib.dump(test_last_hidden_states, filename)  

['test_last_hidden_states.pkl.sav']

In [None]:
last_hidden_states = model.predict([input_ids, attention_masks])

In [None]:
from sklearn.externals import joblib
filename = 'train_last_hidden_states.pkl.sav'
joblib.dump(last_hidden_states, filename)  

In [23]:
with open('test_last_hidden_states.pkl', 'wb') as f: pickle.dump(test_last_hidden_states, f)
with open('val_last_hidden_states.pkl', 'wb') as f: pickle.dump(val_last_hidden_states, f)


In [None]:
with open('train_last_hidden_states.pkl', 'wb') as f: pickle.dump(last_hidden_states, f)

## Flatten the dataframes so that each row is a doc, and each feature is a list of embeddings

In [None]:
#get the pickle file saved earlier
with open('train_last_hidden_states.pkl', 'rb') as f: train_last_hidden_states = pickle.load(f)
with open('val_last_hidden_states.pkl', 'rb') as f: val_last_hidden_states = pickle.load(f)
with open('test_last_hidden_states.pkl', 'rb') as f: test_last_hidden_states = pickle.load(f)

In [None]:
df_train['feature_split'] = last_hidden_states[:,0,:].tolist()

In [26]:
# Put the data back together again - not the feature weights represent
df_train = df_train.groupby(['docid', 'text', 'label']).agg(sum).reset_index()

In [115]:
# Divide up the feature_split column into equal 768 chunks
df_train['features'] = df_train['feature_split'].apply((lambda x: [x[i:i + 768] for i in range(0, len(x), 768)]))

In [121]:
df_test['feature_split'] = test_last_hidden_states[:,0,:].tolist()
df_test = df_test.groupby(['docid', 'text', 'label']).agg(sum).reset_index()
df_test['features'] = df_test['feature_split'].apply((lambda x: [x[i:i + 768] for i in range(0, len(x), 768)]))

In [122]:
df_val['feature_split'] = val_last_hidden_states[:,0,:].tolist()
df_val = df_val.groupby(['docid', 'text', 'label']).agg(sum).reset_index()
df_val['features'] = df_val['feature_split'].apply((lambda x: [x[i:i + 768] for i in range(0, len(x), 768)]))

In [144]:
df_lstm_train = df_train[['features','label']].copy()
df_lstm_test = df_test[['features','label']].copy()
df_lstm_val = df_val[['features','label']].copy()

In [145]:
#check shapes of our new inputs
df_lstm_train.shape, df_lstm_test.shape, df_lstm_val.shape

((647, 2), (81, 2), (81, 2))

In [None]:
import pickle
with open('lstm_train.pkl', 'wb') as f: pickle.dump(df_lstm_train, f)
with open('lstm_test.pkl', 'wb') as f: pickle.dump(df_lstm_test, f)
with open('lstm_val.pkl', 'wb') as f: pickle.dump(df_lstm_val, f)

# LSTM Model on outputs


In [133]:
label_list = [x for x in np.unique(df_lstm_train['label'])]
label_list

[0, 1]

In [136]:
text_input = Input(shape=(None,768,), dtype='float32', name='text')

l_mask = layers.Masking(mask_value=-99.)(text_input)
# Which we encoded in a single vector via a LSTM
encoded_text = layers.LSTM(100,)(l_mask)
out_dense = layers.Dense(30, activation='relu')(encoded_text)
out = layers.Dense(len(label_list), activation='softmax')(out_dense)
model = Model(text_input, out)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text (InputLayer)            (None, None, 768)         0         
_________________________________________________________________
masking_3 (Masking)          (None, None, 768)         0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               347600    
_________________________________________________________________
dense_4 (Dense)              (None, 30)                3030      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 62        
Total params: 350,692
Trainable params: 350,692
Non-trainable params: 0
_________________________________________________________________


Thanks to Armand Olivares for his post which helped on teh generator function and mask
https://medium.com/@armandj.olivares/using-bert-for-classifying-documents-with-long-texts-5c3e7b04573d

Because chunk lengths can be different, we pad the shorter chunks with a special value, -99, which is masked and therfefore skipped for the network

In [221]:


#The generator functions (get the max length 3 batches at a time)
num_sequences = len(df_train['features'].to_list())
batch_size = 1
batches_per_epoch =  647
assert batch_size * batches_per_epoch == num_sequences
num_features= 768
def train_generator(df):
    x_list= df['features'].to_list()
    y_list =  df.label.to_list()
    # Generate batches
    while True:
        for b in range(batches_per_epoch):
            longest_index = (b + 1) * batch_size - 1
            timesteps = len(max(df['features'].to_list()[:(b + 1) * batch_size][-batch_size:], key=len))
            x_train = np.full((batch_size, timesteps, num_features), -99.)
            y_train = np.zeros((batch_size,  1))
            for i in range(batch_size):
                li = b * batch_size + i
                x_train[i, 0:len(x_list[li]), :] = x_list[li]
                y_train[i] = y_list[li]
            yield x_train, y_train

In [222]:
num_sequences_val = len(df_val['features'].to_list())
batch_size_val = 1
batches_per_epoch_val = 81
assert batch_size_val * batches_per_epoch_val == num_sequences_val
num_features= 768
def val_generator(df):
    x_list= df['features'].to_list()
    y_list =  df.label.to_list()
    # Generate batches
    while True:
        for b in range(batches_per_epoch_val):
            longest_index = (b + 1) * batch_size_val - 1
            timesteps = len(max(df['features'].to_list()[:(b + 1) * batch_size_val][-31:], key=len))
            # print(len(df_train['emb'].to_list()[:b+batch_size][-7:]))
            x_train = np.full((batch_size_val, timesteps, num_features), -99.)
            y_train = np.zeros((batch_size_val,  1))
            for i in range(batch_size_val):
                li = b * batch_size_val + i
                # print("li", li)
                # print(x_train[i, 0:len(x_list[li]), :].shape, len(x_list[li]))
                x_train[i, 0:len(x_list[li]), :] = x_list[li]
                y_train[i] = y_list[li]
            yield x_train, y_train

In [217]:
from keras.callbacks import ReduceLROnPlateau
call_reduce = ReduceLROnPlateau(monitor='val_acc', factor=0.95, patience=3, verbose=2,
                                mode='auto', min_delta=0.01, cooldown=0, min_lr=0)

In [223]:
model.fit_generator(train_generator(df_lstm_train), 
                    steps_per_epoch=len(df_lstm_train), 
                    epochs=10,
                    validation_data=val_generator(df_lstm_val), 
                    validation_steps=len(df_lstm_val), 
                    callbacks =[call_reduce] )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0009500000451225787.
Epoch 7/10
Epoch 8/10
Epoch 9/10

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0009025000152178108.
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f236874b048>

# LSTM Model: Evaluation

In [224]:
num_sequences_val = len(df_test['features'].to_list())
batch_size_val = 1
batches_per_epoch_val = 81
assert batch_size_val * batches_per_epoch_val == num_sequences_val
num_features= 768
model.evaluate_generator(val_generator(df_test), steps= batches_per_epoch_val)

[0.7586062550544739, 0.604938268661499]

In [225]:
y_log = model.predict_generator(val_generator(df_test), steps= batches_per_epoch_val)

In [226]:
y_pred = np.argmax(y_log, axis=1)

In [227]:
# Show classification report
from sklearn.metrics import precision_recall_fscore_support, classification_report
print("LSTM split chunks")
print(classification_report(df_test['label'], y_pred))

LSTM split chunks
              precision    recall  f1-score   support

           0       0.54      0.95      0.69        37
           1       0.88      0.32      0.47        44

    accuracy                           0.60        81
   macro avg       0.71      0.63      0.58        81
weighted avg       0.72      0.60      0.57        81

