In [1]:
# Let's play around with the flair NLP library, https://github.com/flairNLP/flair
# Contains lots of embeddings that we can use!
# (Although, I wonder how much we'll need to pre-process/post-process...)
# First going through some of the tutorials here: https://github.com/flairNLP/flair/tree/master/resources/docs


In [2]:
from flair.data import Sentence

In [4]:
phrase = "Squalane is a naturally occuring lipid in both plants and animals. This ingredient is listed in the PETA's Caring Consumer guide as primarily derived from animals, especially shark liver oil."


In [5]:
# Let's use the built in tokenizer, segtok

sentence = Sentence(phrase, use_tokenizer=True)
print(sentence)

Sentence: "Squalane is a naturally occuring lipid in both plants and animals . This ingredient is listed in the PETA 's Caring Consumer guide as primarily derived from animals , especially shark liver oil ." - 34 Tokens


In [7]:
for token in sentence:
    print(token)

Token: 1 Squalane
Token: 2 is
Token: 3 a
Token: 4 naturally
Token: 5 occuring
Token: 6 lipid
Token: 7 in
Token: 8 both
Token: 9 plants
Token: 10 and
Token: 11 animals
Token: 12 .
Token: 13 This
Token: 14 ingredient
Token: 15 is
Token: 16 listed
Token: 17 in
Token: 18 the
Token: 19 PETA
Token: 20 's
Token: 21 Caring
Token: 22 Consumer
Token: 23 guide
Token: 24 as
Token: 25 primarily
Token: 26 derived
Token: 27 from
Token: 28 animals
Token: 29 ,
Token: 30 especially
Token: 31 shark
Token: 32 liver
Token: 33 oil
Token: 34 .


In [16]:
# Very nice! 
# But note, this won't stem the words ('derived' did not become 'derive')
# Don't think this should make a huge difference in this case.

In [17]:
# HMM, I wonder what the sentiment classifier would day about this 

# # But, it's like a 1.5 GB download and it's not needed:
# from flair.models import TextClassifier
# classifier = TextClassifier.load('en-sentiment')
# 2020-01-11 20:06:05,681 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/classy-imdb-en-rnn-cuda%3A0/imdb-v0.4.pt not found in cache, downloading to /var/folders/gh/zjmh_y9d3z5glc4fkrh1nv5c0000gn/T/tmpq0etywpm
#   0%|          | 265216/1501979561 [00:25<50:24:15, 8275.96B/s] 


There are a BUNCH of different embeddings we can use.
I'm most interested in BertEmbeddings, ELMoEmbeddings, OpenAIGPTEmbeddings...
You can also combine the embeddings, using a StackedEmbedding.
Let's start simple.


In [10]:
from flair.embeddings import FlairEmbeddings
# , BertEmbeddings, ELMoEmbeddings, OpenAIGPT2Embeddings

In [None]:
# Let's explore the flairNLP packackge.
# Looks like they have lots of embeddings available.
# https://github.com/flairNLP/flair

In [None]:
# Particularly nice, the ability to stack embeddings. Let's explore that a little.

In [9]:
example_phrase = "Squalane is a naturally occuring lipid in both plants and animals. This ingredient is listed in the PETA's Caring Consumer guide as primarily derived from animals, especially shark liver oil."

In [2]:
example_phrase

"Squalane is a naturally occuring lipid in both plants and animals. This ingredient is listed in the PETA's Caring Consumer guide as primarily derived from animals, especially shark liver oil."

In [11]:
from flair.embeddings import FlairEmbeddings, BertEmbeddings

In [None]:
# from flair.embeddings import StackedEmbeddings

In [12]:
# init Flair embeddings
# uggghhhh, this download is taking for-ev-er (dl speeds of ~100 bytes/sec?!?). I'm going to not bother with this for now.
# flair_fw_embedding = FlairEmbeddings('mix-forward')
# flair_bw_embedding = FlairEmbeddings('mix-backward')


2020-01-11 22:29:12,086 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt not found in cache, downloading to /var/folders/gh/zjmh_y9d3z5glc4fkrh1nv5c0000gn/T/tmp3v12078c


  5%|▍         | 3637248/72818995 [10:40<190:45:08, 100.74B/s]

KeyboardInterrupt: 

In [None]:
# now create the StackedEmbedding object 
# stacked_embeddings = StackedEmbeddings(embeddings=[flair_fw_embedding, flair_bw_embedding, bert_embedding])

  6%|▌         | 4099072/72818995 [06:20<6:26:17, 2964.98B/s]

In [13]:
bert_embedding = BertEmbeddings("bert-base-uncased")

In [20]:
sentence = Sentence(example_phrase.lower(), use_tokenizer=True) # need to lower, since we're using the uncased version

In [21]:
sentence

Sentence: "squalane is a naturally occuring lipid in both plants and animals . this ingredient is listed in the peta 's caring consumer guide as primarily derived from animals , especially shark liver oil ." - 34 Tokens

In [22]:
bert_embedding.embed(sentence)

[Sentence: "squalane is a naturally occuring lipid in both plants and animals . this ingredient is listed in the peta 's caring consumer guide as primarily derived from animals , especially shark liver oil ." - 34 Tokens]

In [29]:
sentence[0] # interesting, token numbering starts at 1, but index starts at 0.

Token: 1 squalane

In [32]:
sentence[0].embedding  # this returns a 1-D pytorch tensor, which we can make into a numpy array.

tensor([-0.2042, -0.3191,  0.2454,  ..., -0.0045,  1.2040,  1.7651])

In [33]:
sentence[0].embedding.shape

torch.Size([3072])

In [34]:
import numpy as np

In [37]:
np.array(sentence[0].embedding)

array([-0.204167  , -0.3191198 ,  0.24537297, ..., -0.0044656 ,
        1.2040308 ,  1.7650682 ], dtype=float32)

In [38]:
from scipy.spatial.distance import cosine

In [51]:
animal_token = Sentence("animal", use_tokenizer=True)
plant_token = Sentence("plant", use_tokenizer=True)
mineral_token= Sentence("mineral", use_tokenizer=True)

squalane_token = Sentence('squalane', use_tokenizer=True)

In [57]:
bert_embedding.embed([animal_token, plant_token, mineral_token, squalane_token])

[Sentence: "animal" - 1 Tokens,
 Sentence: "plant" - 1 Tokens,
 Sentence: "mineral" - 1 Tokens,
 Sentence: "squalane" - 1 Tokens]

In [61]:
animal_token[0].embedding

tensor([-0.4161,  0.0013, -0.3841,  ..., -0.4598, -0.1303, -0.8475])

In [63]:
print(cosine(animal_token[0].embedding, squalane_token[0].embedding))
print(cosine(plant_token[0].embedding, squalane_token[0].embedding))
print(cosine(mineral_token[0].embedding, squalane_token[0].embedding))


0.6683108806610107
0.6890329718589783
0.6874582171440125


In [64]:
# Above are all very similar.
squalane_phrase = Sentence(example_phrase.lower(), use_tokenizer=True)
bert_embedding.embed(squalane_phrase)  # this embeds all the tokens in the sentence, individually.

[Sentence: "squalane is a naturally occuring lipid in both plants and animals . this ingredient is listed in the peta 's caring consumer guide as primarily derived from animals , especially shark liver oil ." - 34 Tokens]

In [65]:
mean_phrase_vector = np.mean([np.array(token.embedding) for token in squalane_phrase], axis=0) # same as pooling approach

In [66]:
print(cosine(animal_token[0].embedding, mean_phrase_vector))  
print(cosine(plant_token[0].embedding, mean_phrase_vector))
print(cosine(mineral_token[0].embedding, mean_phrase_vector))

0.5326693058013916
0.6081691384315491
0.5858257710933685


In [75]:
# Quick sanity checking
print(cosine(np.array([1,1]), np.array([1,1])))  
print(cosine(np.array([.2,0]), np.array([.3,.1])))
print(cosine(np.array([1,1]), np.array([-1,1]))) # 90 deg

0.0
0.05131670194948634
1.0


In [None]:
# And we can play around with other embedding types.

In [None]:
# So let's look into document embeddings.


In [76]:
from flair.embeddings import DocumentRNNEmbeddings, DocumentPoolEmbeddings

In [141]:
DocumentPoolEmbeddings?

[0;31mInit signature:[0m
[0mDocumentPoolEmbeddings[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0membeddings[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mflair[0m[0;34m.[0m[0membeddings[0m[0;34m.[0m[0mTokenEmbeddings[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfine_tune_mode[0m[0;34m=[0m[0;34m'linear'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpooling[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'mean'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      Abstract base class for all document-level embeddings. Ever new type of document embedding must implement these methods.
[0;31mInit docstring:[0m
The constructor takes a list of embeddings to be combined.
:param embeddings: a list of token embeddings
:param pooling: a string which can any value from ['mean', 'max', 'min']
[0;31mFile:[0m           ~/.local/share/virtualenvs/vegan_parser-ujjCMVkc/lib/python3.7/site-packages/flair/embeddings.py
[0;31m

In [137]:
document_embeddings = DocumentPoolEmbeddings([bert_embedding], fine_tune_mode='nonlinear')

In [138]:
document_embeddings.embed(squalane_phrase)
document_embeddings.embed(animal_token)
document_embeddings.embed(plant_token)
document_embeddings.embed(mineral_token)


In [128]:
doc_pool_squalane = np.array(squalane_phrase.get_embedding().data)
doc_pool_animal = np.array(animal_token.get_embedding().data)
doc_pool_plant = np.array(plant_token.get_embedding().data)
doc_pool_mineral = np.array(mineral_token.get_embedding().data)


In [139]:
doc_pool_animal.shape

(3072,)

In [140]:
doc_pool_squalane.shape

(3200,)

### Oh. THat was...unexpected. Was thinking I'd get the same dimensions.
From the documentation:
"Since the document embedding is derived from word embeddings, its dimensionality depends on the dimensionality of word embeddings you are using."

... does that mean we need to first put the words through the embedding then through the document embedding?
I guess it's experiment time.

In [142]:
bert_embedding = BertEmbeddings("bert-base-uncased")

In [143]:
squalene_phrase = "Squalane is a naturally occuring lipid in both plants and animals. This ingredient is listed in the PETA's Caring Consumer guide as primarily derived from animals, especially shark liver oil."

In [144]:
sentence = Sentence(squalene_phrase.lower(), use_tokenizer=True) # need to lower, since we're using the uncased version

In [145]:
bert_embedding.embed(sentence)

[Sentence: "squalane is a naturally occuring lipid in both plants and animals . this ingredient is listed in the peta 's caring consumer guide as primarily derived from animals , especially shark liver oil ." - 34 Tokens]

In [146]:
sentence[0].embedding

tensor([-0.2042, -0.3191,  0.2454,  ..., -0.0045,  1.2040,  1.7651])

In [147]:
sentence[0].embedding.shape

torch.Size([3072])

In [149]:
squalene_term = 'squalene'
term = Sentence(squalene_term.lower(), use_tokenizer=True)
bert_embedding.embed(term)
print(term[0].embedding)
term[0].embedding.shape


tensor([-0.3603, -0.7189,  0.7562,  ..., -0.3645,  0.0680,  0.3883])


torch.Size([3072])

In [None]:
# okay, this is correct.

In [150]:
squalene_term = 'plant'
term = Sentence(squalene_term.lower(), use_tokenizer=True)
bert_embedding.embed(term)
print(term[0].embedding)
term[0].embedding.shape

tensor([-0.2855, -0.1755, -0.2342,  ..., -0.3336, -0.6618, -0.8346])


torch.Size([3072])

In [None]:
# So... these are the same length. Let's try embedding into document now.

In [151]:
document_embeddings = DocumentPoolEmbeddings([bert_embedding], fine_tune_mode='nonlinear')

In [152]:
document_embeddings.embed(sentence)
document_embeddings.embed(term)

In [153]:
sentence.get_embedding()

tensor([ 0.0656,  0.1824,  0.0009,  ...,  0.4025, -0.2712, -0.2329],
       grad_fn=<CatBackward>)

In [159]:
sentence.get_embedding().data.shape

torch.Size([3072])

In [160]:
term.get_embedding().data.shape

torch.Size([3072])

In [None]:
# okay... so now these work fine.
# Maybe you do need to embed the vocab and then embed the phrasehr


In [161]:
cosine(sentence.get_embedding().data,term.get_embedding().data)  # okay...and now it works okay...

0.5007305145263672

# SEE THE FEW CELLS ABOVE FOR WORKING WITH BERT DOCUMENT EMBEDDINGS


In [131]:
doc_pool_animal.shape

(3072,)

In [132]:
doc_pool_squalane.shape

(3200,)

In [119]:
Sentence('animal').get_embedding()

tensor([])

In [108]:
doc_pool_squalane

array([-0.52538466, -0.83625376, -0.42670548, ..., -0.04420931,
        0.05851426,  0.20426503], dtype=float32)

In [109]:
document_embeddings_rnn = DocumentRNNEmbeddings([bert_embedding])

In [110]:
document_embeddings_rnn.embed(squalane_phrase)

In [111]:
print(squalane_phrase.get_embedding())

tensor([ 0.2631, -0.3717, -0.4200,  ..., -0.0442,  0.0585,  0.2043],
       grad_fn=<CatBackward>)


In [112]:
doc_rnn_squalane = np.array(squalane_phrase.get_embedding().data)

In [113]:
doc_rnn_squalane

array([ 0.26310676, -0.37168577, -0.42003363, ..., -0.04420931,
        0.05851426,  0.20426503], dtype=float32)

In [116]:
# BAAAHH these are different sizes.

print(cosine(doc_pool_animal, doc_rnn_squalane))  
# print(cosine(plant_token[0].embedding, doc_pool_squalane))
# print(cosine(mineral_token[0].embedding, doc_rnn_squalane))

ValueError: operands could not be broadcast together with shapes (0,) (3200,) 

In [117]:
doc_pool_animal

array([], dtype=float32)

## General Notion
Read in the definitions of animal, plant, mineral from dictionaries / wikipedia into embeddings.
Then form embeddings of the ingrdient name / synonyms / definition/ dictionary or wikipedia

Need to make sure I'm calculating document vectors appropriately...because sizes need to be the same to compare.
Need to pull in a dictionary / wikipedia source, and figure out how to do that programmatically.


In [None]:
# Let's see about making sure our vectors make sense.

In [135]:
# Let's try using just the BERT without another pooling, etc.