# Text classification with Transformer

**Author:** [Divyanshu Raghuwanshi](https://www.linkedin.com/in/divyanshu-raghuwanshi-85037b160/)<br>
**Date created:** 2021/03/22<br>
**Last modified:** 2021/03/31<br>

## Setup

In [81]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from numpy import mean
from numpy import std
from numpy import dstack
from pandas import read_csv
from sklearn.model_selection import train_test_split
import pandas as pd
from google.colab import drive

In [82]:
import sys
!{sys.executable} -m pip install xlsxwriter

Collecting xlsxwriter
[?25l  Downloading https://files.pythonhosted.org/packages/34/eb/40aaf7a73fd158aea04ad8812b97fd3049929276c9ea652d8a995cd18425/XlsxWriter-1.3.8-py2.py3-none-any.whl (145kB)
[K     |██▎                             | 10kB 19.3MB/s eta 0:00:01[K     |████▌                           | 20kB 11.2MB/s eta 0:00:01[K     |██████▊                         | 30kB 14.0MB/s eta 0:00:01[K     |█████████                       | 40kB 16.0MB/s eta 0:00:01[K     |███████████▎                    | 51kB 16.6MB/s eta 0:00:01[K     |█████████████▌                  | 61kB 14.9MB/s eta 0:00:01[K     |███████████████▉                | 71kB 14.0MB/s eta 0:00:01[K     |██████████████████              | 81kB 12.1MB/s eta 0:00:01[K     |████████████████████▎           | 92kB 12.8MB/s eta 0:00:01[K     |██████████████████████▌         | 102kB 12.2MB/s eta 0:00:01[K     |████████████████████████▉       | 112kB 12.2MB/s eta 0:00:01[K     |███████████████████████████     | 

In [83]:
import xlsxwriter

## Implement a Transformer block as a layer

In [45]:

class TransformerBlock(layers.Layer):
    """Single Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization. The output has the same last dimension as the input
    (``embed_dim``), because the feed-forward net projects back to ``embed_dim``
    before the second residual addition.

    Args:
        embed_dim: Size of the last axis of the input; also used as the
            per-head ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward net.
        rate: Dropout rate applied after attention and after the feed-forward net.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the layer can also be called without the argument; Keras
                supplies the value when the layer runs inside ``fit``/``evaluate``.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence attends to itself (query == value).
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


## Implement embedding layer

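A learned position embedding: each time-step index (position) is mapped to a vector that is added to the input features. The separate token embedding is not used here, because the inputs are already numeric feature vectors.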

In [46]:
class PositionEmbedding(layers.Layer):
    """Add a learned per-position vector to every step of the input.

    Args:
        maxlen: Number of distinct positions the embedding table holds.
        embed_dim: Size of each position vector.
    """

    def __init__(self, maxlen, embed_dim):
        super().__init__()
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return ``x`` plus the embeddings of positions ``0 .. steps-1``.

        The number of steps is read from the second-to-last axis of ``x``.
        """
        step_count = tf.shape(x)[-2]
        position_ids = tf.range(start=0, limit=step_count, delta=1)
        return x + self.pos_emb(position_ids)


## Download and prepare dataset

In [48]:
# Mount Google Drive at /content/drive/ so files stored there can be read.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [49]:
# Load the summary CSV from the mounted Drive into a DataFrame.
df= pd.read_csv('/content/drive/MyDrive/Research data/MeggitSummaryCSV.csv')

In [50]:
# Features are columns 2..17 (16 columns); the label is the last column.
Xdf = df.iloc[:, 2:18].values
ydf = df.iloc[:, -1].values

In [51]:
def prepareData(X,y,W,s_W):
    """Cut a sequence into fixed-length sliding windows that carry a single label.

    A window starting at row ``L`` covers rows ``L .. L+W-1``. It is kept only
    when every label inside it is identical; windows that straddle a label
    change are dropped. Windows that would run past the end of ``X`` are never
    generated.

    Args:
        X: Array of per-row features, indexed along axis 0.
        y: 1-D array of per-row labels aligned with ``X``.
        W: Window length in rows.
        s_W: Stride in rows between consecutive window starts. Must be positive.

    Returns:
        Tuple ``(X_windows, y_windows)`` of NumPy arrays: the kept windows
        stacked along a new leading axis, and one label per kept window.
        Both are empty arrays when no window qualifies.

    Raises:
        ValueError: If ``s_W`` is not positive (the scan would never advance).
    """
    if s_W <= 0:
        raise ValueError("s_W (window stride) must be positive, got %r" % (s_W,))

    X_data = []
    y_data = []

    # Last valid start is X.shape[0] - W, so that the window end stays in range.
    for start in range(0, X.shape[0] - W + 1, s_W):
        stop = start + W
        label = y[start:stop]
        # The length check guards against y being shorter than X.
        if len(label) == W and len(set(label)) == 1:
            X_data.append(X[start:stop])
            y_data.append(label[0])

    return np.array(X_data), np.array(y_data)


In [52]:
import numpy as np
# Window length of 120 rows with a stride of 60 rows (consecutive windows overlap by half).
W, s_W = 120, 60
dataX, datay = prepareData(Xdf, ydf, W, s_W)

In [53]:
# Show the shapes of the windowed feature array and the per-window label array.
dataX.shape,datay.shape

((6567, 120, 16), (6567,))

In [66]:
def load_dataset():
    """Split the module-level ``dataX``/``datay`` into train, test and validation sets.

    20% of the windows are held out for validation first; 10% of the
    remainder is then held out for testing. Both splits use ``random_state=1``.

    Returns:
        ``(X_train, X_test, X_val, y_train, y_test, y_val)``.
    """
    X_rest, X_val, y_rest, y_val = train_test_split(
        dataX[:, :, :], datay[:], test_size=0.20, random_state=1
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X_rest[:, :, :], y_rest[:], test_size=0.10, random_state=1
    )
    return X_train, X_test, X_val, y_train, y_test, y_val

## Create classifier model using transformer layer

The Transformer layer outputs one vector for each time step of the input sequence.
Here, we take the mean across all time steps and
use a feed-forward network on top of it to classify the sequence.

## Train and Evaluate

In [76]:
def evaluate_model(
    X_train,
    y_train,
    X_val,
    y_val,
    X_test,
    y_test,
    *,
    maxlen=120,
    embed_dim=16,
    num_heads=2,
    ff_dim=32,
    num_classes=6,
    epochs=1,
    batch_size=84,
    verbose=0,
):
    """Build, train and evaluate one Transformer classifier.

    The model is: position embedding -> one Transformer block -> mean over
    time steps -> dropout -> Dense(20, relu) -> dropout -> softmax.

    Args:
        X_train, y_train: Training inputs and integer class labels.
        X_val, y_val: Validation data passed to ``fit``.
        X_test, y_test: Data used for the final evaluation.
        maxlen: Number of time steps per input sample.
        embed_dim: Number of features per time step. No token embedding is
            applied, so this is the size of the last input axis.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward net inside the Transformer block.
        num_classes: Number of output classes (units of the softmax layer).
        epochs: Number of training epochs.
        batch_size: Batch size for both training and evaluation.
        verbose: Keras verbosity level for ``fit`` and ``evaluate``.

    Returns:
        Accuracy on the test data, as returned by ``model.evaluate``.
    """
    inputs = layers.Input(shape=(maxlen, embed_dim))
    x = PositionEmbedding(maxlen, embed_dim)(inputs)
    x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
    # Collapse the time axis: one vector per sample.
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)

    model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val, y_val),
        verbose=verbose,
    )

    # evaluate() returns [loss, accuracy]; only the accuracy is reported.
    _, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=verbose)

    return accuracy

In [77]:
def summarize_results(scores, output_path='/Result.xlsx'):
    """Print the scores with their mean and standard deviation, then save them to Excel.

    Args:
        scores: Sequence of per-run accuracy values.
        output_path: Destination of the Excel workbook. The scores are written
            to a sheet named ``Full Raw``.
    """
    print(scores)
    m, s = mean(scores), std(scores)
    print('Accuracy: %.3f%% (+/-%.3f)' % (m, s))

    # The context manager saves and closes the workbook exactly once; calling
    # save() and then close() closed it twice and triggered a warning.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        pd.DataFrame(scores).to_excel(
            writer, sheet_name='Full Raw', startrow=0, startcol=0
        )

In [84]:
def run_experiment(repeats=5):
    """Train and evaluate the model ``repeats`` times on one fixed data split.

    The data is split once; each repeat builds and trains a fresh model.
    Every accuracy is converted to a percentage and printed, and the full
    list is then passed to ``summarize_results``.
    """
    X_train, X_test, X_val, y_train, y_test, y_val = load_dataset()

    scores = []
    for run_number in range(1, repeats + 1):
        accuracy = evaluate_model(X_train, y_train, X_val, y_val, X_test, y_test)
        percent = accuracy * 100.0
        print('>#%d: %.3f' % (run_number, percent))
        scores.append(percent)

    summarize_results(scores)


In [85]:
# Run the experiment with the default number of repeats.
run_experiment()

>#1: 51.331
>#2: 71.863
>#3: 67.871
>#4: 57.795
>#5: 58.365
[51.330798864364624, 71.86312079429626, 67.87072420120239, 57.79467821121216, 58.36501717567444]
Accuracy: 61.445% (+/-7.415)


  warn("Calling close() on already closed file.")
