# Basic tutorial: text data
#### Author: Matteo Caorsi

This short tutorial walks you through the basic functionality of the *giotto-deep* API.

The main steps of the tutorial are the following:
 1. creation of a dataset
 2. creation of a model
 3. definition of metrics and losses
 4. running of benchmarks
 5. interactive visualisation of results

In [31]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np

import torch
from torch import nn

from gdeep.models import FFNet

from gdeep.visualisation import  persistence_diagrams_of_activations

from torch.utils.tensorboard import SummaryWriter
from gdeep.data import TorchDataLoader

from gtda.diagrams import BettiCurve

from gtda.plotting import plot_betti_surfaces

# Initialize the tensorboard writer

In order to analyse the results of your models, you need to start TensorBoard.
In a terminal, move into the `/example` folder and run the following command:

```
tensorboard --logdir=runs
```

After the training, open [http://localhost:6006/](http://localhost:6006/) in your browser to see all the visualisation results.

In [32]:
# Create the TensorBoard writer that is later handed to the Pipeline.
# With no arguments SummaryWriter logs under ./runs/, which is the directory
# the `tensorboard --logdir=runs` command reads from.
writer = SummaryWriter()

# Create your dataset

In [33]:
# Select the dataset by name and build two dataloaders from it
# (presumably train and test, judging by the `_tr` / `_ts` suffixes).
dl = TorchDataLoader(name="AG_NEWS")
dl_tr, dl_ts = dl.build_dataloader()

# Debugging aid (disabled): print the first text of every training batch.
# for (_label, _text) in dl_tr:
#     print(_text[0])

In [34]:
from gdeep.data import PreprocessText

# Wrap the raw dataloaders in the text preprocessor; later cells read
# `prec.vocabulary` and `prec.tokenizer` from this object.
prec = PreprocessText((dl_tr, dl_ts))

# NOTE: this rebinds dl_tr / dl_ts to the dataloaders returned by the
# preprocessor, so the cell is not idempotent: re-run the dataset-creation
# cell before re-running this one.
(dl_tr, dl_ts) = prec.build_new_dataloaders()


# Define and train your model

In [35]:
from gdeep.pipeline import Pipeline
from torch.optim import Adam, SparseAdam, SGD
from captum.attr import Occlusion, IntegratedGradients, visualization
from torchvision import transforms


class TextClassificationModel(nn.Module):
    """Bag-of-embeddings text classifier.

    Every token index is looked up in an embedding table, the embeddings are
    averaged over dimension 1 and a linear layer maps the mean vector to
    class scores.

    Args:
        vocab_size: number of rows of the embedding table; every token index
            passed to ``forward`` must be strictly smaller than this value.
        embed_dim: size of each embedding vector.
        num_class: number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # sparse=True makes the gradient w.r.t. the embedding weights a sparse
        # tensor, which only some optimisers (e.g. SGD, SparseAdam) support.
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and linear weights from U(-0.5, 0.5); zero the bias."""
        initrange = 0.5
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        nn.init.uniform_(self.fc.weight, -initrange, initrange)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text):
        """Return the class scores for a batch of token-index sequences.

        Args:
            text: integer tensor of token indices, expected to be shaped
                (batch, sequence_length) since the mean is taken over dim 1.

        Returns:
            Tensor of shape (batch, num_class) with the unnormalised scores.
        """
        # Follow the device the model actually lives on instead of assuming
        # CUDA, so the model also runs on CPU-only machines.
        text = text.to(self.embedding.weight.device)
        embedded = self.embedding(text)
        return self.fc(embedded.mean(dim=1))

In [36]:
# Size reported by the preprocessor's vocabulary; printed for reference only,
# because the value is overridden two lines below.
vocab_size = len(prec.vocabulary)
print("len", vocab_size)
# FIXME: hard-coded workaround. The embedding table must be larger than the
# largest token index, and (per the inline note) indices exceed
# len(prec.vocabulary). Replace with a value derived from the vocabulary
# once the mismatch is understood.
vocab_size = 225981 # to be discussed, since len(prec.vocabulary) is returning 95810 whereas the index of '.' is 225971
# Embedding dimension passed to the model.
emsize = 64
# print(vocab_size, emsize)
# The third argument is `num_class`: the model outputs 4 class scores.
model = TextClassificationModel(vocab_size, emsize, 4)

len 95810


In [37]:
# Show the layer structure of the model.
print(model)
loss_fn = nn.CrossEntropyLoss()

# Bundle the model, the dataloaders, the loss and the TensorBoard writer.
pipe = Pipeline(model, (dl_tr, dl_ts), loss_fn, writer)

# train the model
# SGD is passed as the optimiser class. The second argument is presumably the
# number of epochs (the captured log reports a single epoch) — TODO confirm.
pipe.train(SGD, 1, batch_size=1, lr=0.01)



TextClassificationModel(
  (embedding): Embedding(225981, 64, sparse=True)
  (fc): Linear(in_features=64, out_features=4, bias=True)
)
<gdeep.data.preprocessing.TextDataset object at 0x0000016F8085FD60>
TOTAL EPOCHS  1
Epoch 1
-------------------------------
Training loss: 1.310343  [96000/96000]
Time taken for this epoch: 143s
Validation results: 
 Accuracy: 6.1%,                 Avg loss: 0.275866 

Test results: 
 Accuracy: 30.8%,                 Avg loss: 1.377037 

Done!


# Simply use interpretability tools

In [38]:
from gdeep.analysis.interpretability import Interpreter

# Build an interpreter for the trained model using the
# "LayerIntegratedGradients" attribution method.
inter = Interpreter(model, method="LayerIntegratedGradients")

# Attribute the prediction for one sentence with respect to the layer named
# "embedding", using the preprocessor's vocabulary and tokenizer.
# The second argument (0) is presumably the label of the sentence; the
# results table further down reports a true label of 0.0 — TODO confirm.
inter.interpret_text("I am writing about money and business", 0, prec.vocabulary, prec.tokenizer, layer="embedding")



pred:  2 ( 0.37 ) , delta:  6.441707833815258e-09


# Extract inner data from your models

In [39]:
from gdeep.models import ModelExtractor

# The extractor is built from the model and the loss; the following cells use
# it to obtain parameters, activations, gradients and decision boundaries.
me = ModelExtractor(model, loss_fn)

# Mapping from parameter name to parameter tensor (iterated with .items()).
lista = me.get_layers_param()

# Print every parameter name together with the shape of its tensor.
for k, item in lista.items():
    print(k,item.shape)


embedding.weight torch.Size([225981, 64])
fc.weight torch.Size([4, 64])
fc.bias torch.Size([4])


In [40]:
# First element of the inputs of the first training batch (a batch unpacks
# as (inputs, target), see the gradients cell).
x = next(iter(dl_tr))[0][0]
# The decision boundary is only computed when the input is not int64
# (presumably because token indices are not continuous — TODO confirm).
if x.dtype is not torch.int64:
    res = me.get_decision_boundary(x, n_epochs=1)
    # NOTE: a bare expression nested inside an `if` is not echoed by the
    # notebook; only the last top-level expression of a cell is displayed.
    res.shape

In [41]:
# Inputs of the first training batch.
x = next(iter(dl_tr))[0]
# Collect the activations produced by the model for this batch.
list_activations = me.get_activations(x)
# Displayed as the cell output: the number of collected activations.
len(list_activations)


3

In [42]:
# First training batch, unpacked into inputs and targets. `x` stays bound
# and is reused by the visualisation cells below.
x, target = next(iter(dl_tr))
# Gradients are only requested for floating-point inputs.
if x.dtype is torch.float:
    # Element [1] of the returned value is iterated as a sequence of gradient
    # tensors (exact meaning of the tuple entries — TODO confirm).
    for gradient in me.get_gradients(x, target=target)[1]:
        print(gradient.shape)

# Visualise activations and other topological aspects of your model

In [43]:
from gdeep.visualisation import Visualiser

# The visualiser is built from the trained pipeline.
vs = Visualiser(pipe)

# `x` is the batch of inputs bound in the gradients cell above.
# The captured output reports that the plots are sent to TensorBoard.
vs.plot_data_model()
vs.plot_activations(x)
vs.plot_persistence_diagrams(x)


Sending the plots to tensorboard: 
Step 3/3

In [44]:
# Betti plots for the batch `x`; the tuple (0, 1) presumably selects the
# homology dimensions to plot — TODO confirm against the Visualiser API.
vs.betti_plot_layers((0, 1), x)

In [45]:
# Render the text attribution computed by `inter`; the trailing `;`
# suppresses the echo of the call's return value.
vs.plot_interpreter_text(inter);

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,2 (0.37),1.0,0.63,i am writing about money and business
,,,,
