# Intro to Transformers

Transformers are the current cutting edge in NLP. 

This is part one of a four part in depth discussion of what they are and how they work:
https://towardsdatascience.com/transformers-explained-visually-part-1-overview-of-functionality-95a6dd460452

Here's a good walkthrough for implementing it:
http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

Below is a simple example of prepping an input data set for later task learning. 

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split


import torch
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dataset from: https://www.kaggle.com/team-ai/spam-text-message-classification
dataset_df = pd.read_csv('spam_n_ham.csv')
dataset_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
max_len=64
num_spams = 250

In [4]:

mini_df = dataset_df.copy()

In [5]:
mini_df['is_spam'] = np.where(mini_df.Category == 'spam', 1, 0)

In [6]:
mini_df.head()

Unnamed: 0,Category,Message,is_spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
spam_df = mini_df[mini_df.is_spam==1]
ham_df = mini_df[mini_df.is_spam==0]

In [8]:
train_df = pd.concat([spam_df[0:num_spams], ham_df[0:num_spams]], axis='rows')
train_df = train_df.sample(frac=1.0)
train_df.shape


(500, 3)

In [9]:
train_df.is_spam.sum()


250

In [10]:
train_df.head()

Unnamed: 0,Category,Message,is_spam
292,ham,Haf u found him? I feel so stupid da v cam was...,0
52,ham,K fyi x has a ride early tomorrow morning but ...,0
44,ham,Great! I hope you like your man well endowed. ...,0
939,spam,Urgent! call 09061749602 from Landline. Your c...,1
290,ham,"Dear,shall mail tonite.busy in the street,shal...",0


In [11]:
X = train_df.Message.values.reshape(-1,1)
y = train_df.is_spam.values


In [12]:
X.shape

(500, 1)

In [13]:
y.shape

(500,)

In [14]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower=True)


In [15]:
X_encoded = []
Y_encoded = []
idx = 0
for obs in X:
    X_encoded.append(tokenizer.encode(obs[0], add_special_tokens=True, max_length=max_len, truncation=True))
    Y_encoded.append([y[idx]])    
    idx += 1

In [16]:
X[0]

array(['Haf u found him? I feel so stupid da v cam was working.'],
      dtype=object)

In [17]:
X_encoded[0]

[101,
 5292,
 2546,
 1057,
 2179,
 2032,
 1029,
 1045,
 2514,
 2061,
 5236,
 4830,
 1058,
 11503,
 2001,
 2551,
 1012,
 102]

In [18]:
padded = []
padded_y = []
a = []
idx = 0
for obs in X_encoded:
    a = obs + ([0] * (max_len - len(obs)))
    #print(len(a))
    if(len(a)<=512):
        padded.append(a)
        padded_y.append(Y_encoded[idx][0])
    idx += 1
    if((idx % 1000 == 0) and (idx>10)):
        print("*", end='', flush=True)

padded = np.array(padded)
padded_y = np.array(padded_y)


In [19]:
padded.shape

(500, 64)

In [20]:
padded_y.shape

(500,)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(padded, padded_y, random_state=78)


In [22]:
X_train.shape

(375, 64)

In [23]:
attention_mask = np.where(X_train != 0, 1, 0)


In [24]:
attention_mask[0]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
slice_begin = 0
slice_end = 275

batch_ids = torch.tensor(X_train[slice_begin:slice_end])
        # related attention mask will map which entries are words and which are
        # just filler
batch_attn = torch.tensor(attention_mask[slice_begin:slice_end])


In [27]:
batch_output = model(batch_ids, attention_mask = batch_attn)


In [28]:
batch_output[0]

tensor([[[-1.5536e-02, -1.8336e-01,  7.0141e-02,  ..., -2.4035e-01,
           3.6489e-01,  5.2714e-01],
         [ 4.4695e-01, -1.2391e-01,  1.7030e-01,  ..., -4.4407e-01,
           5.5579e-01,  1.1659e-01],
         [-1.0639e-01, -5.1619e-01,  6.2160e-01,  ..., -1.7987e-01,
           2.9445e-01, -7.1178e-02],
         ...,
         [ 3.2208e-01, -1.0387e-01,  3.9363e-01,  ...,  1.4287e-01,
          -8.2037e-02,  1.4473e-01],
         [ 3.8674e-02, -2.2694e-01,  3.2238e-01,  ..., -2.0021e-02,
          -1.0960e-02,  2.6896e-01],
         [-2.9200e-01, -2.6949e-01,  3.5144e-01,  ...,  4.2624e-02,
           2.6940e-01,  3.1066e-01]],

        [[ 2.5257e-02, -9.4357e-02, -4.2888e-02,  ..., -1.0324e-01,
           4.0492e-01,  1.6112e-01],
         [-1.4277e-01, -2.3902e-02,  3.7000e-01,  ...,  1.0147e-01,
           6.6091e-01, -3.6024e-02],
         [ 3.2458e-01,  3.3677e-01,  1.0551e-01,  ..., -1.4481e-01,
           3.1618e-01,  2.0532e-01],
         ...,
         [ 6.4967e-01, -6

In [29]:
batch_output[0].shape

torch.Size([275, 64, 768])

In [30]:
batch_output[0][0].shape

torch.Size([64, 768])

In [31]:
batch_output[0][0]

tensor([[-0.0155, -0.1834,  0.0701,  ..., -0.2403,  0.3649,  0.5271],
        [ 0.4470, -0.1239,  0.1703,  ..., -0.4441,  0.5558,  0.1166],
        [-0.1064, -0.5162,  0.6216,  ..., -0.1799,  0.2945, -0.0712],
        ...,
        [ 0.3221, -0.1039,  0.3936,  ...,  0.1429, -0.0820,  0.1447],
        [ 0.0387, -0.2269,  0.3224,  ..., -0.0200, -0.0110,  0.2690],
        [-0.2920, -0.2695,  0.3514,  ...,  0.0426,  0.2694,  0.3107]],
       grad_fn=<SelectBackward>)

In [32]:
new_model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(768,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='softmax')
    ])
new_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    
new_model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               98432     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 106,753
Trainable params: 106,753
Non-trainable params: 0
_________________________________________________________________


In [33]:
# If we pull out the first sequence, it should have the information of the sentence mostly encoded in it
train_features = batch_output[0][:,0,:].detach().numpy()

# If you want to try including all features from all words, you can use this:

#train_features = batch_output[0].detach().numpy()

# But be sure to change the first layer of the NN above to a flatten (max_len, 768)

In [34]:
train_features.shape

(275, 768)

In [35]:
train_hist = new_model.fit(train_features, padded_y[:275], epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
padded_y[:250].sum()

136

In [40]:
from sklearn.metrics import classification_report, confusion_matrix
attention_mask = np.where(X_test != 0, 1, 0)
batch_ids = torch.tensor(X_test)
        # related attention mask will map which entries are words and which are
        # just filler
batch_attn = torch.tensor(attention_mask)
batch_output = model(batch_ids, attention_mask = batch_attn)
test_features = batch_output[0][:,0,:].detach().numpy()



In [41]:
test_features[0]

array([-4.55131531e-02, -7.02124089e-02,  1.58276558e-01, -2.24439919e-01,
       -1.34822533e-01, -1.87547103e-01,  3.70394021e-01,  4.06291723e-01,
       -1.62735865e-01, -1.94066331e-01, -5.11843078e-02,  1.38014015e-02,
       -1.18565790e-01,  2.07531929e-01,  7.12348819e-02,  2.68518090e-01,
       -1.32928863e-01,  2.37400368e-01,  1.72064994e-02, -3.03676520e-02,
        2.09883273e-01, -2.37330839e-01,  7.06808046e-02, -7.65044168e-02,
        2.68070512e-02, -2.38155186e-01, -4.42097709e-02, -1.32161155e-01,
       -9.18791667e-02, -6.36347160e-02, -1.52286198e-02,  1.36362389e-01,
        1.27418162e-02, -1.17925115e-01,  1.87667400e-01,  8.32006484e-02,
        1.87538922e-01, -1.44431636e-01, -3.25445225e-03,  1.63205549e-01,
       -2.12099180e-01,  9.20263082e-02,  1.00042626e-01, -1.18296921e-01,
        8.01707432e-03, -1.00520730e-01, -2.38266921e+00, -2.75874436e-01,
       -1.67176604e-01, -4.42195475e-01,  1.56307742e-01,  7.93172717e-02,
       -1.36062965e-01,  

In [42]:
pred = new_model.predict(test_features)

In [43]:
pred.shape

(125, 1)

In [44]:
pred.sum()

125.0

In [45]:
test_features.shape

(125, 768)

In [46]:
y_test.sum()

63

In [47]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        62
           1       0.50      1.00      0.67        63

    accuracy                           0.50       125
   macro avg       0.25      0.50      0.34       125
weighted avg       0.25      0.50      0.34       125

