# Get DistilBert Embeddings to use as features in downstream models

In [None]:
#!pip install transformers

In [1]:
#Imports
# Standard library
import gc
import pickle
import textwrap

# Third-party
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.metrics import precision_recall_fscore_support, classification_report
from transformers import DistilBertTokenizer, DistilBertConfig, TFDistilBertModel, TFDistilBertForSequenceClassification

# `sklearn.externals.joblib` was removed in scikit-learn 0.23; prefer the standalone
# package and fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

import keras
from keras import Sequential
from keras.utils import Sequence
from keras.layers import Dense, Masking
from keras.utils import np_utils
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import Embedding, Dense, Input, concatenate, Layer, Lambda, Dropout, Activation
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback, TensorBoard
from keras import layers
from keras.layers import Input

Using TensorFlow backend.


# Helper Functions

In [2]:
# --- Input data: the train / test / validation CSV files read by get_data ---
train_file = "train_80_10_10.csv"
test_file = "test_80_10_10.csv"
val_file = "val_80_10_10.csv"

# --- Text preparation ---
skip_lines = 6                 # maxsplit passed to rsplit("\n", ...) when trimming document tails
max_length = 200               # length the tokenizer pads / truncates every input to
split_length = max_length - 2  # presumably leaves room for two special tokens — TODO confirm (unused below)

# --- Model training ---
batch_size = 8
epochs = 3
learning_rate = 2e-5

# --- DistilBert ---
bert_file = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(bert_file, do_lower_case=True)
# NOTE(review): bert_model is not referenced by any later cell visible in this notebook;
# confirm nothing else needs it before paying for this load.
bert_model = TFDistilBertForSequenceClassification.from_pretrained(bert_file)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [3]:
# Ask TensorFlow which GPU device (if any) it can see.
device_name = tf.test.gpu_device_name()

# Stop right away unless the reported name is exactly the first GPU.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
# Function to get data
def get_data(fname):
    """Read one CSV split and return it as a `docid` / `text` / `label` frame.

    Only the `docid`, `cleaned_contents` and `Discrimination_Label` columns are
    kept; the latter two are renamed to `text` and `label`. The returned frame
    has a fresh 0..n-1 index.
    """
    wanted = ['docid', 'cleaned_contents', 'Discrimination_Label']
    new_names = {'cleaned_contents': 'text', 'Discrimination_Label': 'label'}
    return (
        pd.read_csv(fname)[wanted]
        .rename(columns=new_names)
        .reset_index(drop=True)
    )

In [5]:
# Function to tokenize data and return tensors for input ids, attention mask and labels
def tokenize_plus(df):
    """Tokenize one text per row and return padded id / mask / label arrays.

    When the frame has a `text_split` column (added by the split + explode cell)
    that column is tokenized, so every chunk row gets its own tokens. Previously
    the full-document `text` column was always used, which gave every chunk of a
    document the same input. Frames without `text_split` still fall back to `text`.

    Relies on the module-level `tokenizer` and `max_length`.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a `label` column and a `text_split` or `text` column.

    Returns
    -------
    tuple of three int32 numpy arrays
        (input_ids, attention_masks, labels), one row per row of `df`.
    """
    text_column = 'text_split' if 'text_split' in df.columns else 'text'

    input_ids = []
    input_masks = []

    # Tokenize all of the sentences and map the tokens to their word IDs.
    for sent in df[text_column].values:
        inputs = tokenizer.encode_plus(sent,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       truncation=True,
                                       # replaces the deprecated pad_to_max_length=True
                                       padding='max_length',
                                       return_attention_mask=True)

        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return (np.asarray(input_ids, dtype='int32'),
            np.asarray(input_masks, dtype='int32'),
            np.asarray(df['label'], dtype='int32'))

In [6]:
# function to split text into smaller chunks of 200 words, overlapping by 50 words
def get_split(text1, chunk_size=200, stride=150, keep_tail=False):
    """Split whitespace-separated text into overlapping word chunks.

    Chunk `i` holds the words `[i * stride, i * stride + chunk_size)`, so with
    the defaults consecutive chunks share 50 words.

    Parameters
    ----------
    text1 : str
        Text to split; words are whatever `str.split()` yields.
    chunk_size : int
        Maximum number of words per chunk.
    stride : int
        Distance in words between the starts of consecutive chunks.
    keep_tail : bool
        The default chunk count is `len(words) // stride` (at least 1), which
        can leave trailing words outside every chunk (e.g. a 299-word text
        yields only its first 200 words). Pass True to append chunks until the
        last word is covered. The default stays False so that previously cached
        outputs keep lining up row for row.

    Returns
    -------
    list of str
        The chunks, each re-joined with single spaces. Never empty: an empty
        text yields `[""]`.

    Raises
    ------
    ValueError
        If `chunk_size` or `stride` is not positive.
    """
    if chunk_size <= 0 or stride <= 0:
        raise ValueError("chunk_size and stride must be positive")

    # Split once instead of once per chunk.
    words = text1.split()
    n_chunks = max(len(words) // stride, 1)

    if keep_tail:
        # Extend until the end of the last chunk reaches the end of the text.
        while (n_chunks - 1) * stride + chunk_size < len(words):
            n_chunks += 1

    return [" ".join(words[i * stride:i * stride + chunk_size])
            for i in range(n_chunks)]

# Data Prep


In [7]:
# Read the three CSV splits (train, then test, then validation) through get_data
df_train, df_test, df_val = (
    get_data(split_file) for split_file in (train_file, test_file, val_file)
)

In [8]:
# Clean the `text` column of every split in the same way.
for frame in (df_train, df_test, df_val):
    # remove double new lines: newline + optional whitespace + newline becomes one newline.
    # The pattern is a raw string so `\s` reaches the regex engine instead of being an
    # invalid Python string escape.
    frame['text'] = frame['text'].replace(r'\n\s*\n', '\n', regex=True)

    # strip last n lines: split at most `skip_lines` times from the right and keep the
    # left-most piece.
    # NOTE(review): a text with fewer than `skip_lines` newlines is reduced to its first
    # line only — confirm that is acceptable for short documents.
    frame['text'] = frame['text'].str.rsplit("\n", n=skip_lines).str[0]

In [9]:
# For each split: chunk every document with get_split into a `text_split` column,
# then explode that column so each chunk sits on its own row.
df_train = df_train.assign(text_split=df_train['text'].apply(get_split)).explode('text_split')

df_val = df_val.assign(text_split=df_val['text'].apply(get_split)).explode('text_split')

df_test = df_test.assign(text_split=df_test['text'].apply(get_split)).explode('text_split')

In [10]:
# Preview the exploded training frame: each document now spans several rows, one per chunk
df_train.head()

Unnamed: 0,docid,text,label,text_split
0,255849,SENTENCE\n• In a judgment delivered on 9 May 2...,0,SENTENCE • In a judgment delivered on 9 May 20...
0,255849,SENTENCE\n• In a judgment delivered on 9 May 2...,0,that she blacked out. • She came to in a frien...
0,255849,SENTENCE\n• In a judgment delivered on 9 May 2...,0,quality life. Those who find themselves on the...
0,255849,SENTENCE\n• In a judgment delivered on 9 May 2...,0,with you as she was asleep. You then raped her...
1,288617,"SENTENCE\n• ELIZABETH GOLMAN, you were charged...",1,"SENTENCE • ELIZABETH GOLMAN, you were charged ..."


In [11]:
# Tokenize the validation and test frames into (input ids, attention masks, labels).
# The training frame is not tokenized here; the call is kept commented out for reference
# because its results are reloaded from pickle files in a later cell.
# input_ids, attention_masks, labels = tokenize_plus(df_train)
val_input_ids, val_attention_masks, val_labels = tokenize_plus(df_val)
test_input_ids, test_attention_masks, test_labels = tokenize_plus(df_test)

In [12]:
#Reload the ones we saved earlier for train; build and save them when the files are
# missing so the notebook also runs on a fresh checkout.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced
# yourself. Delete the three .pkl files whenever get_split or tokenize_plus changes,
# otherwise stale inputs will be reused.
try:
    with open('input_ids.pkl', 'rb') as f: input_ids = pickle.load(f)
    with open('attention_masks.pkl', 'rb') as f: attention_masks = pickle.load(f)
    with open('labels.pkl', 'rb') as f: labels = pickle.load(f)
except FileNotFoundError:
    input_ids, attention_masks, labels = tokenize_plus(df_train)
    for cache_name, cache_value in (('input_ids.pkl', input_ids),
                                    ('attention_masks.pkl', attention_masks),
                                    ('labels.pkl', labels)):
        with open(cache_name, 'wb') as f:
            pickle.dump(cache_value, f)

In [31]:
#That last step took longer than it should, let's save the train output just in case...
#with open('input_ids.pkl', 'wb') as f: pickle.dump(input_ids, f)
#with open('attention_masks.pkl', 'wb') as f: pickle.dump(attention_masks, f)
#with open('labels.pkl', 'wb') as f: pickle.dump(labels, f)

# Model to get token embeddings

In [13]:
# Build the DistilBert configuration in one call (dropout settings plus
# output_hidden_states switched off), then load the bare encoder with it.
config = DistilBertConfig(dropout=0.2, attention_dropout=0.2, output_hidden_states=False)
transformer_model = TFDistilBertModel.from_pretrained(bert_file, config=config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [14]:
#embedding mode
# Two int32 inputs per example, both `max_length` long: token ids and attention mask.
input_ids_in = tf.keras.layers.Input(shape=(max_length,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(max_length,), name='masked_token', dtype='int32')
# Element [0] of the transformer output is what the model exposes; later cells slice
# position 0 out of it with [:,0,:].
embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
# we cannot do mean pooling here on the embedding token because our data is vertical.
# We would have to apply the latter functions to smash back together...
# probably doable
# The output used to be `cls_token`, a name that is never defined in this notebook and
# raises NameError on a fresh kernel; `embedding_layer` is the tensor the later cells expect.
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=embedding_layer)


In [15]:
# Print the layer table of the embedding model built in the previous cell
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 200)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 200)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB ((None, 200, 768),)  66362880    input_token[0][0]                
Total params: 66,362,880
Trainable params: 66,362,880
Non-trainable params: 0
__________________________________________________________________________________________________


## Run Model on train, test and val to get feature embeddings

In [17]:
# Run the embedding model over the training inputs reloaded from the pickle files
last_hidden_states = model.predict([input_ids, attention_masks])

In [18]:
# Save the training-set model output to disk. `joblib` comes from the imports cell;
# the per-cell `from sklearn.externals import joblib` no longer works on scikit-learn >= 0.23.
filename = 'train_last_hidden_states.pkl.sav'
joblib.dump(last_hidden_states, filename)

['train_last_hidden_states.pkl.sav']

In [20]:
# Run the embedding model over the tokenized test inputs
test_last_hidden_states = model.predict([test_input_ids, test_attention_masks])

In [21]:
# Save the test-set model output to disk. `joblib` comes from the imports cell;
# the per-cell `from sklearn.externals import joblib` no longer works on scikit-learn >= 0.23.
filename = 'test_last_hidden_states.pkl.sav'
joblib.dump(test_last_hidden_states, filename)

['test_last_hidden_states.pkl.sav']

In [22]:
# Run the embedding model over the tokenized validation inputs
val_last_hidden_states = model.predict([val_input_ids, val_attention_masks])

In [23]:
# Save the validation-set model output to disk. `joblib` comes from the imports cell;
# the per-cell `from sklearn.externals import joblib` no longer works on scikit-learn >= 0.23.
filename = 'val_last_hidden_states.pkl.sav'
joblib.dump(val_last_hidden_states, filename)

['val_last_hidden_states.pkl.sav']

In [24]:
# Trigger a garbage-collection pass (gc is already imported in the imports cell);
# the cell displays the value gc.collect() returns.
gc.collect()

1405

## Flatten the dataframes so that each row is a doc, and each feature is a list of embeddings

In [26]:
# Row counts of the exploded frames, printed in train / test / val order
for exploded_frame in (df_train, df_test, df_val):
    print(len(exploded_frame))

6004
768
761


In [27]:
# Keep only sequence position 0 of the model output for every chunk row and store it as
# a plain Python list (presumably the [CLS] token, since special tokens are added — TODO confirm)
df_train['feature_split'] = last_hidden_states[:,0,:].tolist()

In [28]:
# Put the data back together again - not the feature weights represent
df_train = df_train.groupby(['docid', 'text', 'label']).agg(sum).reset_index()

In [29]:
# Divide up the feature_split column into equal 768 chunks
df_train['features'] = df_train['feature_split'].apply((lambda x: [x[i:i + 768] for i in range(0, len(x), 768)]))

In [30]:
# Test split: attach position 0 of the model output to every chunk row, collapse the
# chunk rows per document, then cut the combined list into slices of 768 values.
test_first_position = test_last_hidden_states[:, 0, :]
df_test['feature_split'] = test_first_position.tolist()
df_test = df_test.groupby(['docid', 'text', 'label']).agg(sum).reset_index()
df_test['features'] = df_test['feature_split'].map(
    lambda flat: [flat[start:start + 768] for start in range(0, len(flat), 768)]
)

In [31]:
# Validation split: attach position 0 of the model output to every chunk row, collapse
# the chunk rows per document, then cut the combined list into slices of 768 values.
val_first_position = val_last_hidden_states[:, 0, :]
df_val['feature_split'] = val_first_position.tolist()
df_val = df_val.groupby(['docid', 'text', 'label']).agg(sum).reset_index()
df_val['features'] = df_val['feature_split'].map(
    lambda flat: [flat[start:start + 768] for start in range(0, len(flat), 768)]
)

In [32]:
# Independent copies holding just the per-document features and the label
df_lstm_train = df_train.loc[:, ['features', 'label']].copy()
df_lstm_test = df_test.loc[:, ['features', 'label']].copy()
df_lstm_val = df_val.loc[:, ['features', 'label']].copy()

In [33]:
#check shapes of our new inputs
df_lstm_train.shape, df_lstm_test.shape, df_lstm_val.shape

((647, 2), (81, 2), (81, 2))

In [161]:
# Element-wise mean over each document's list of chunk vectors (axis 0)
df_lstm_train['mean_features'] = df_lstm_train['features'].apply(lambda chunks: np.mean(chunks, axis=0))
df_lstm_test['mean_features'] = df_lstm_test['features'].apply(lambda chunks: np.mean(chunks, axis=0))
df_lstm_val['mean_features'] = df_lstm_val['features'].apply(lambda chunks: np.mean(chunks, axis=0))

In [162]:
# Element-wise maximum over each document's list of chunk vectors (axis 0)
df_lstm_train['max_features'] = df_lstm_train['features'].apply(lambda chunks: np.max(chunks, axis=0))
df_lstm_test['max_features'] = df_lstm_test['features'].apply(lambda chunks: np.max(chunks, axis=0))
df_lstm_val['max_features'] = df_lstm_val['features'].apply(lambda chunks: np.max(chunks, axis=0))

In [163]:
# Write the per-document training frame (features, label, mean/max features) to disk
joblib.dump(df_lstm_train, 'lstm_train.sav') 

['lstm_train.sav']

In [164]:
# Write the per-document test frame (features, label, mean/max features) to disk
joblib.dump(df_lstm_test, 'lstm_test.sav') 

['lstm_test.sav']

In [165]:
# Write the per-document validation frame (features, label, mean/max features) to disk
joblib.dump(df_lstm_val, 'lstm_val.sav') 

['lstm_val.sav']