# Intro to Transformers

Transformers are the current cutting edge in NLP. 

This is part one of a four part in depth discussion of what they are and how they work:
https://towardsdatascience.com/transformers-explained-visually-part-1-overview-of-functionality-95a6dd460452

Here's a good walkthrough for implementing it:
http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

Below is a simple example of prepping an input data set for later task learning. 

In [48]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split


import torch
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [49]:
# Dataset from: https://www.kaggle.com/team-ai/spam-text-message-classification
dataset_df = pd.read_csv('spam_n_ham.csv')
dataset_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [50]:
max_len=64
num_spams = 250

In [51]:

mini_df = dataset_df.copy()

In [52]:
mini_df['is_spam'] = np.where(mini_df.Category == 'spam', 1, 0)

In [53]:
mini_df.head()

Unnamed: 0,Category,Message,is_spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [54]:
spam_df = mini_df[mini_df.is_spam==1]
ham_df = mini_df[mini_df.is_spam==0]

In [55]:
train_df = pd.concat([spam_df[0:num_spams], ham_df[0:num_spams]], axis='rows')
train_df = train_df.sample(frac=1.0)
train_df.shape


(500, 3)

In [56]:
train_df.is_spam.sum()


250

In [57]:
train_df.head()

Unnamed: 0,Category,Message,is_spam
274,ham,"Usf I guess, might as well take 1 car",0
41,ham,"Did I forget to tell you ? I want you , I need...",0
205,ham,U call me alter at 11 ok.,0
244,ham,Although i told u dat i'm into baig face watch...,0
1623,spam,U have a secret admirer who is looking 2 make ...,1


In [58]:
X = train_df.Message.values.reshape(-1,1)
y = train_df.is_spam.values


In [59]:
X.shape

(500, 1)

In [60]:
y.shape

(500,)

In [61]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower=True)


In [62]:
X_encoded = []
Y_encoded = []
idx = 0
for obs in X:
    X_encoded.append(tokenizer.encode(obs[0], add_special_tokens=True, max_length=max_len, truncation=True))
    Y_encoded.append([y[idx]])    
    idx += 1

In [63]:
X[0]

array(['Usf I guess, might as well take 1 car'], dtype=object)

In [64]:
X_encoded[0]

[101, 2149, 2546, 1045, 3984, 1010, 2453, 2004, 2092, 2202, 1015, 2482, 102]

In [65]:
padded = []
padded_y = []
a = []
idx = 0
for obs in X_encoded:
    a = obs + ([0] * (max_len - len(obs)))
    #print(len(a))
    if(len(a)<=512):
        padded.append(a)
        padded_y.append(Y_encoded[idx][0])
    idx += 1
    if((idx % 1000 == 0) and (idx>10)):
        print("*", end='', flush=True)

padded = np.array(padded)
padded_y = np.array(padded_y)


In [66]:
padded.shape

(500, 64)

In [67]:
padded_y.shape

(500,)

In [68]:
X_train, X_test, y_train, y_test = train_test_split(padded, padded_y, random_state=78)


In [69]:
X_train.shape

(375, 64)

In [70]:
attention_mask = np.where(X_train != 0, 1, 0)


In [71]:
attention_mask[0]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [72]:
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [73]:
slice_begin = 0
slice_end = 275

batch_ids = torch.tensor(X_train[slice_begin:slice_end])
        # related attention mask will map which entries are words and which are
        # just filler
batch_attn = torch.tensor(attention_mask[slice_begin:slice_end])


In [74]:
batch_output = model(batch_ids, attention_mask = batch_attn)


In [75]:
batch_output[0]

tensor([[[-2.9603e-01, -1.4850e-01, -2.6975e-02,  ..., -1.3846e-01,
           2.0698e-01,  5.4352e-01],
         [ 4.7866e-01, -6.6179e-02,  3.6651e-01,  ...,  9.4846e-02,
           1.4528e-01, -1.9697e-01],
         [-5.5606e-02, -2.2741e-01,  4.9460e-01,  ...,  2.8372e-01,
           2.0324e-01,  4.4331e-01],
         ...,
         [ 9.6166e-02,  9.9748e-03,  3.3189e-01,  ..., -5.0470e-04,
          -6.4877e-02,  3.4909e-01],
         [ 1.9312e-02, -3.5956e-03,  3.5326e-01,  ...,  2.5807e-02,
           1.1696e-02,  3.6736e-01],
         [-1.6542e-01, -1.7842e-01,  3.1661e-01,  ...,  6.0977e-02,
          -1.8500e-01,  2.5941e-01]],

        [[-1.9049e-01, -2.0384e-01, -8.3739e-02,  ..., -2.0450e-01,
           2.7827e-01,  1.0849e-01],
         [ 1.1173e-01, -3.2774e-01,  1.2047e-02,  ..., -1.1130e-01,
           5.3895e-01,  2.3375e-02],
         [ 5.0480e-01, -4.4049e-02,  1.5837e-01,  ..., -7.0441e-01,
          -1.6148e-01, -2.4256e-01],
         ...,
         [ 6.0653e-02,  1

In [76]:
batch_output[0].shape

torch.Size([275, 64, 768])

In [77]:
batch_output[0][0].shape

torch.Size([64, 768])

In [78]:
batch_output[0][0]

tensor([[-2.9603e-01, -1.4850e-01, -2.6975e-02,  ..., -1.3846e-01,
          2.0698e-01,  5.4352e-01],
        [ 4.7866e-01, -6.6179e-02,  3.6651e-01,  ...,  9.4846e-02,
          1.4528e-01, -1.9697e-01],
        [-5.5606e-02, -2.2741e-01,  4.9460e-01,  ...,  2.8372e-01,
          2.0324e-01,  4.4331e-01],
        ...,
        [ 9.6166e-02,  9.9748e-03,  3.3189e-01,  ..., -5.0470e-04,
         -6.4877e-02,  3.4909e-01],
        [ 1.9312e-02, -3.5956e-03,  3.5326e-01,  ...,  2.5807e-02,
          1.1696e-02,  3.6736e-01],
        [-1.6542e-01, -1.7842e-01,  3.1661e-01,  ...,  6.0977e-02,
         -1.8500e-01,  2.5941e-01]], grad_fn=<SelectBackward>)

In [79]:
new_model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(768,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='softmax')
    ])
new_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    
new_model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 768)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               98432     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 106,753
Trainable params: 106,753
Non-trainable params: 0
_________________________________________________________________


In [80]:
# If we pull out the first sequence, it should have the information of the sentence mostly encoded in it
train_features = batch_output[0][:,0,:].detach().numpy()

# If you want to try including all features from all words, you can use this:

#train_features = batch_output[0].detach().numpy()

# But be sure to change the first layer of the NN above to a flatten (max_len, 768)

In [81]:
train_features.shape

(275, 768)

In [82]:
train_hist = new_model.fit(train_features, padded_y[:275], epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [83]:
padded_y[:250].sum()

128

In [84]:
from sklearn.metrics import classification_report, confusion_matrix
attention_mask = np.where(X_test != 0, 1, 0)
batch_ids = torch.tensor(X_test)
        # related attention mask will map which entries are words and which are
        # just filler
batch_attn = torch.tensor(attention_mask)
batch_output = model(batch_ids, attention_mask = batch_attn)
test_features = batch_output[0][:,0,:].detach().numpy()



In [85]:
test_features[0]

array([ 8.70237499e-02, -1.37071207e-01,  1.55908063e-01, -3.21425319e-01,
        9.16671678e-02, -1.65757820e-01,  2.56267041e-01,  4.19194609e-01,
       -2.21988097e-01, -2.74156034e-01, -1.63831159e-01, -5.89925572e-02,
       -2.33077392e-01,  3.65397543e-01,  2.45086282e-01,  2.50755370e-01,
       -1.62729979e-01,  2.59829074e-01,  8.61519799e-02, -1.42538294e-01,
        4.20841910e-02, -1.32251576e-01,  1.22908771e-03, -1.03047214e-01,
       -1.30589053e-01, -6.58464059e-02, -4.97956201e-02, -1.41188607e-01,
        6.22740947e-02, -9.80156660e-02,  2.67064981e-02,  1.60294443e-01,
       -9.13608670e-02, -1.54184829e-02, -1.58305429e-02,  1.73502136e-02,
        5.16983261e-03, -1.81295589e-01,  1.29697457e-01,  3.68250720e-02,
       -3.06743151e-03, -4.53605056e-02,  1.91636488e-01, -3.10448706e-02,
       -1.69620216e-01, -2.13097021e-01, -2.39384508e+00, -1.95176333e-01,
       -1.32394210e-01, -1.06793784e-01,  3.16430330e-01,  8.45410898e-02,
        1.20825648e-01,  

In [86]:
pred = new_model.predict(test_features)

In [87]:
pred.shape

(125, 1)

In [88]:
pred.sum()

125.0

In [89]:
test_features.shape

(125, 768)

In [90]:
y_test.sum()

67

In [91]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        58
           1       0.54      1.00      0.70        67

    accuracy                           0.54       125
   macro avg       0.27      0.50      0.35       125
weighted avg       0.29      0.54      0.37       125

