# Tutorial 5: Document Embeddings


### Pooling: calculates a pooling operation over all word embeddings in a document



In [1]:
# Sentence is defined in flair.data; import it from there rather than relying on
# it being re-exported by flair.embeddings.
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings

# initialize the word embeddings that will be pooled into one document vector
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# initialize the document embeddings; `mode` selects the pooling operation
# applied over all word embeddings of a document
document_embeddings = DocumentPoolEmbeddings([glove_embedding,
                                              flair_embedding_backward,
                                              flair_embedding_forward],
                                              mode='mean') # mean (default), min, max
print('Done')

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Done


### Pooling: mean, min, max

In [2]:
# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence: first the vector itself,
# then its length (the dimensionality of the document embedding)
print(sentence.get_embedding())
print(len(sentence.get_embedding()))

tensor([-0.3197,  0.2621,  0.4037,  ..., -0.0013, -0.0026,  0.0170])
4196


In [4]:
# a sentence that shares most of its words with the first example
sentence2 = Sentence("The grass is blue. And the sky too")
document_embeddings.embed(sentence2)

# an unrelated sentence, used as a contrast in the similarity comparison
sentence3 = Sentence("And now for something completely different")
document_embeddings.embed(sentence3)


In [14]:
from torch.nn.modules import distance

# cosine similarity taken along dim 0, i.e. between two 1-D embedding vectors
cos = distance.CosineSimilarity(dim=0)

# pairs to compare, in display order: the first sentence with itself,
# with each of the other two, and finally the other two with each other
sentence_pairs = (
    (sentence, sentence),
    (sentence, sentence2),
    (sentence, sentence3),
    (sentence2, sentence3),
)
for first, second in sentence_pairs:
    print(cos(first.embedding, second.embedding))

tensor(1.0000)
tensor(0.9510)
tensor(0.7985)
tensor(0.8132)


In [16]:
# Convert the document embedding to a NumPy array.
# Tensor.numpy() only works on CPU tensors that do not require grad, so detach
# from the autograd graph and copy to host memory first; on a CPU-only setup
# the printed values are unchanged.
print(sentence.embedding.detach().cpu().numpy())

[-0.31969544  0.26205996  0.4037069  ... -0.00134025 -0.00258876
  0.01702889]


## RNN


In [18]:
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

# GloVe word vectors serve as the token-level input to the RNN
glove_embedding = WordEmbeddings('glove')

# document embedding produced by an RNN over the word embeddings;
# an LSTM is selected explicitly via rnn_type
document_embeddings = DocumentRNNEmbeddings(
    [glove_embedding],
    rnn_type='LSTM',
)

In [None]:
### rnn_type: default is GRU; options are GRU or LSTM

In [15]:
# build the same example sentence as in the pooling section
sentence = Sentence('The grass is green . And the sky is blue .')

# run the current document embedding over it; the vector is stored on the sentence
document_embeddings.embed(sentence)

# fetch the resulting document vector and display it
rnn_document_vector = sentence.get_embedding()
print(rnn_document_vector)

tensor([ 0.0000, -0.0000,  0.0809, -0.0000, -0.6245,  0.0000,  0.1986,  0.0000,
         0.0000,  0.0000,  0.5986, -0.0000,  0.0000, -0.0000,  0.0000,  0.0409,
         0.0000,  0.0149, -0.0000, -0.0495,  0.0000,  0.0000,  0.0000, -0.0000,
         0.2511, -0.1134,  0.5852, -0.0000, -0.5777, -0.0122,  0.0000, -0.0000,
         0.4019,  0.2169,  0.6653, -0.6016,  0.0000, -0.3695,  0.7491, -0.0472,
         0.0000,  0.0000, -0.4309,  0.0000,  0.0578,  0.0000, -0.4173,  0.0687,
        -0.0000,  0.0000, -0.0248,  0.0000,  0.3273, -0.5386, -0.7355,  0.0818,
        -0.0000,  0.3960, -0.5707,  0.4357,  0.0000, -0.0000, -0.0000, -0.0000,
         0.0000,  0.8578,  0.0000, -0.9574, -0.7359, -0.0000,  0.8351,  0.1681,
         0.0000,  0.0000,  0.0000,  0.7432, -0.0000,  0.0000,  0.2202,  0.0000,
         0.1871,  0.0809,  0.0000,  0.1320,  0.1852,  0.4779, -0.0000, -0.0000,
        -0.7601, -0.4718,  0.0000,  0.0000, -0.0977, -0.0000,  0.0649,  0.5710,
        -0.2964,  0.0000,  0.0789, -0.00

### The RNN needs to be trained on a downstream task -- see Tutorial 7