# Tutorial 5: Document Embeddings


### Pooling: calculates a pooling operation over all word embeddings in a document



In [1]:
# Sentence is defined in flair.data; import it from there instead of relying
# on flair.embeddings happening to re-export it.
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings

# initialize the word embeddings that will be pooled into one document vector
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# initialize the document embeddings; the pooling mode is one of
# 'mean' (default), 'min' or 'max'
document_embeddings = DocumentPoolEmbeddings([glove_embedding,
                                              flair_embedding_backward,
                                              flair_embedding_forward],
                                              mode='mean')
print('Done')

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Done


### Pooling: mean, min, max

In [4]:
# build the example document from its raw text
example_text = 'The grass is green . And the sky is blue .'
sentence = Sentence(example_text)

# compute the pooled document embedding; it is stored on the sentence itself
document_embeddings.embed(sentence)

# inspect the resulting vector and its dimensionality
embedding = sentence.get_embedding()
print(embedding)
print(len(embedding))

tensor([-0.3197,  0.2621,  0.4037,  ..., -0.0013, -0.0026,  0.0170])
4196


In [5]:
# a document that is close in wording to the first example sentence
sentence2 = Sentence("The grass is blue. And the sky too")
document_embeddings.embed(sentence2)

# an unrelated document, used as a contrast in the similarity comparison
sentence3 = Sentence("And now for something completely different")
document_embeddings.embed(sentence3)


### Document similarity with the torch library

In [6]:
from torch.nn.modules import distance

# the document embeddings are 1-D vectors, so compare along dim 0
cos = distance.CosineSimilarity(dim=0)

# pairs of documents to compare, in the order their similarities are printed
document_pairs = [
    (sentence, sentence),
    (sentence, sentence2),
    (sentence, sentence3),
    (sentence2, sentence3),
]
for left, right in document_pairs:
    print(cos(left.embedding, right.embedding))

tensor(1.0000)
tensor(0.9510)
tensor(0.7985)
tensor(0.8132)


In [7]:
# Convert the document embedding to a NumPy array.
# Tensor.numpy() raises if the tensor requires grad or lives on a GPU, so
# detach it and move it to the CPU first; the values are unchanged.
print(sentence.embedding.detach().cpu().numpy())

[-0.31969544  0.26205996  0.4037069  ... -0.00134025 -0.00258876
  0.01702889]


## RNN


In [8]:
from flair.embeddings import DocumentRNNEmbeddings, WordEmbeddings

# word-level embeddings that are fed into the RNN
glove_embedding = WordEmbeddings('glove')

# document embeddings produced by an LSTM run over the word embeddings
word_embeddings = [glove_embedding]
document_embeddings = DocumentRNNEmbeddings(word_embeddings, rnn_type='LSTM')

In [None]:
### rnn_type options: GRU (default) or LSTM

In [9]:
# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())

tensor([-0.3425, -0.3932, -0.0000,  0.0343,  0.1381,  0.6642,  0.0630, -0.1023,
        -0.0000,  0.0000,  0.1843, -0.0000, -0.1117, -0.1805,  0.2339, -0.0000,
        -0.1395, -0.0000, -0.2321,  0.0000, -0.0000,  0.0000,  0.0000,  0.0000,
        -0.0885,  0.4109, -0.0000, -0.1071,  0.0000,  0.0000,  0.0664,  0.0000,
         0.0000, -0.0082,  0.0192,  0.0000, -0.0000,  0.0000,  0.3879,  0.0872,
         0.5555, -0.0792, -0.0000, -0.1602, -0.2439, -0.4266,  0.4662, -0.0000,
         0.1605,  0.0000, -0.4670, -0.3228, -0.0000,  0.0669, -0.1754, -0.0000,
        -0.3848,  0.1116, -0.0000,  0.0000, -0.3589, -0.3977, -0.0000, -0.0000,
         0.2012, -0.4043,  0.0000,  0.1022, -0.2845, -0.0000,  0.0000, -0.2577,
        -0.0000,  0.0000,  0.0000,  0.0663,  0.1658, -0.6247,  0.0000, -0.0000,
         0.0000,  0.3869,  0.1537,  0.0000,  0.0361, -0.0000,  0.0000,  0.2609,
        -0.0635,  0.3628, -0.0000, -0.0000,  0.0000, -0.4442,  0.0021,  0.4604,
         0.2165,  0.0000, -0.0000, -0.09

### The RNN needs to be trained on a downstream task -- see Tutorial 7