# Basic tutorial: text data
#### Author: Matteo Caorsi

This short tutorial introduces the basic functionality of the *giotto-deep* API.

The main steps of the tutorial are the following:
 1. create a dataset
 2. create a model
 3. define metrics and losses
 4. run benchmarks
 5. visualise the results interactively

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np

import torch
from torch import nn

from gdeep.models import FFNet

from gdeep.visualisation import  persistence_diagrams_of_activations

from torch.utils.tensorboard import SummaryWriter
from gdeep.data import TorchDataLoader

from gtda.diagrams import BettiCurve

from gtda.plotting import plot_betti_surfaces

# Initialize the tensorboard writer

In order to analyse the results of your models, you need to start tensorboard.
In a terminal, move into the `/example` folder and run the following command:

```
tensorboard --logdir=runs
```

Then, after the training, go [here](http://localhost:6006/) to see all the visualisation results.

In [2]:
# Tensorboard writer shared by the rest of the notebook (it is passed to the
# Pipeline further down). Called without arguments, SummaryWriter logs under
# a `runs/` directory per the PyTorch docs, which matches the
# `tensorboard --logdir=runs` command shown above.
writer = SummaryWriter()

# Create your dataset

In [3]:
# Select the built-in "AG_NEWS" dataset by name.
dl = TorchDataLoader(name="AG_NEWS")
# build_dataloader() returns a pair, unpacked here as the training and test
# dataloaders (naming suggests train/test — confirm against gdeep.data docs).
dl_tr, dl_ts = dl.build_dataloader()

In [5]:
from gdeep.data import PreprocessText

# Wrap both dataloaders in the text preprocessor. Its `vocabulary` and
# `tokenizer` attributes are reused later in this notebook (model sizing and
# text interpretation).
prec = PreprocessText((dl_tr, dl_ts))

# Replace the raw dataloaders with the preprocessed ones; every later cell
# works on these rebound `dl_tr` / `dl_ts` names.
# NOTE(review): the saved output of this cell shows
# "RuntimeError: stack expects a non-empty TensorList" — confirm the cell runs
# cleanly before relying on the cells below.
(dl_tr, dl_ts) = prec.build_new_dataloaders()


RuntimeError: stack expects a non-empty TensorList

## Define and train your model

In [None]:
from gdeep.pipeline import Pipeline
from torch.optim import Adam, SparseAdam, SGD
from captum.attr import Occlusion, IntegratedGradients, visualization
from torchvision import transforms


class TextClassificationModel(nn.Module):
    """Text classifier built from an embedding table and one linear layer.

    Token indices are looked up in the embedding table, the resulting
    vectors are averaged along dimension 1, and the average is mapped to
    ``num_class`` scores by a single fully connected layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: size of each embedding vector.
        num_class: number of output scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # The embedding is created with sparse=True, exactly as before.
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        # Overwrite the default initialisation of both layers.
        self.init_weights()

    def init_weights(self):
        """Re-initialise weights uniformly in [-0.5, 0.5] and zero the bias."""
        bound = 0.5
        # Same order as the layers were created: embedding first, then fc.
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text):
        """Embed ``text``, average over dimension 1 and apply the linear layer."""
        pooled = self.embedding(text).mean(dim=1)
        return self.fc(pooled)

In [None]:

# One embedding row per entry of the vocabulary held by the preprocessor.
vocab_size = len(prec.vocabulary)
# Dimension of each embedding vector.
emsize = 64
# The last argument is the number of output classes
# (4 — presumably the AG_NEWS categories; confirm against the dataset).
model = TextClassificationModel(vocab_size, emsize, 4)

In [None]:
# Show the layer structure of the model.
print(model)
loss_fn = nn.CrossEntropyLoss()

# The pipeline receives the model, the (train, test) dataloaders, the loss
# and the tensorboard writer created earlier.
pipe = Pipeline(model, (dl_tr, dl_ts), loss_fn, writer)

# train the model
# The optimizer is passed as a class (SGD), not an instance; `lr` is given as
# a keyword. The positional `1` is presumably the number of epochs — TODO
# confirm against the Pipeline.train signature.
pipe.train(SGD, 1, batch_size = 64, lr=0.01)



# Simply use interpretability tools

In [None]:
from gdeep.analysis.interpretability import Interpreter

# Attribution method is selected by name.
inter = Interpreter(model, method="LayerIntegratedGradients")

# Interpret one sentence using the vocabulary and tokenizer of the
# preprocessor; attributions are taken at the layer named "embedding", which
# is the attribute name of the embedding layer in TextClassificationModel.
# The positional `0` is presumably the target class index — TODO confirm.
inter.interpret_text("I am writing about money and business", 0, prec.vocabulary, prec.tokenizer, layer="embedding")



# Extract inner data from your models

In [None]:
from gdeep.models import ModelExtractor

# The extractor is built from the trained model and the loss; it is reused
# by the following cells (decision boundary, activations, gradients).
me = ModelExtractor(model, loss_fn)

# get_layers_param() returns a mapping (it is iterated with .items() below).
lista = me.get_layers_param()

# Print each key together with the shape of its value.
for k, item in lista.items():
    print(k,item.shape)


In [None]:
# Take the first element of the first training batch, then its first entry
# along the leading axis (a single input sample).
x = next(iter(dl_tr))[0][0]
# The decision boundary is only computed when the input is not made of
# int64 values; int64 inputs (e.g. token indices) skip this cell.
if x.dtype != torch.int64:
    res = me.get_decision_boundary(x, n_epochs=1)
    # A bare expression inside an `if` body is not echoed by the notebook
    # (only the last top-level expression is), so print the shape explicitly.
    print(res.shape)

In [None]:
# First element of the first training batch (the inputs; the second element
# is unpacked as `target` in a later cell).
x = next(iter(dl_tr))[0]
# Collect the activations produced by the model for this batch.
list_activations = me.get_activations(x)
# Last top-level expression: the notebook echoes the number of entries.
len(list_activations)


In [None]:
# Unpack the first training batch into inputs and targets. `x` is reused by
# the visualisation cells further down.
x, target = next(iter(dl_tr))
# Gradients are only requested for float inputs; any other dtype skips the
# loop entirely.
if x.dtype is torch.float:
    # Only element [1] of the value returned by get_gradients is iterated;
    # what element [0] holds is not visible here — see ModelExtractor docs.
    for gradient in me.get_gradients(x, target=target)[1]:
        print(gradient.shape)

# Visualise activations and other topological aspects of your model

In [None]:
from gdeep.visualisation import Visualiser

# The visualiser is built from the pipeline created earlier.
vs = Visualiser(pipe)

# The three calls below use the batch `x` taken from the training dataloader
# in a previous cell. Their output is presumably written through the
# tensorboard writer held by the pipeline — TODO confirm.
vs.plot_data_model()
vs.plot_activations(x)
vs.plot_persistence_diagrams(x)


In [None]:
# Betti plots for the batch `x`; the tuple (0, 1) is presumably the homology
# dimensions to plot — TODO confirm against Visualiser.betti_plot_layers.
vs.betti_plot_layers((0, 1), x)

In [None]:
# Plot the text attributions held by the Interpreter created earlier.
# The trailing semicolon keeps the notebook from echoing the return value.
vs.plot_interpreter_text(inter);