In [1]:
# helpful reference: https://medium.com/@h_bushroh/text-similarity-with-fasttext-word-embeddings-c765d97df682
# a nice writeup about different similarity measures: https://medium.com/@adriensieg/text-similarities-da019229c894

In [5]:
import pandas as pd
import numpy as np
import fasttext

In [45]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

In [50]:
from string import punctuation

In [13]:
from scipy.spatial.distance import cosine

In [6]:
from datetime import datetime

In [7]:
# Input locations: the ingredient CSV and the fastText binary model file.
# NOTE(review): both are absolute paths on a mounted volume (/Volumes/ja2), so the
# notebook only runs where that volume is available — consider a configurable base dir.
ingredient_data_fp = '/Volumes/ja2/vegan/vegan_parser/data_source/dataframe_with_syns.csv'
embedding_model_fp = '/Volumes/ja2/vegan/vegan_parser/fasttext_wiki/wiki.en/wiki.en.bin'

In [8]:
# Load the fastText model from disk and report how long the load took.
t1 = datetime.now()  # start timestamp for the load timing
fasttext_model = fasttext.load_model(embedding_model_fp)

print(f'It took {datetime.now() - t1} to load the model')




It took 0:04:16.532097 to load the model


In [9]:
# Whoa, it took over 4 min to load the model...to be fair, the model is like, 7 or 8 GB.

In [11]:
# Embedding vector for the word 'mineral'.
mineral_emb = fasttext_model.get_word_vector('mineral')  # equivalent to fasttext_model['mineral']

In [18]:
# Embedding vector for the word 'salt'.
salt_emb = fasttext_model.get_word_vector('salt')

In [30]:
# Cosine distance between the 'mineral' and 'salt' vectors.
# `cosine` comes from scipy.spatial.distance, so smaller values mean "closer".
cosine(mineral_emb, salt_emb)

0.5615120530128479

In [31]:
# Cosine distance: 'animal' vs 'dog'.
print(cosine(fasttext_model['animal'], fasttext_model['dog']))


0.4364222288131714


In [32]:
# Cosine distance: 'animal' vs 'pig'.
print(cosine(fasttext_model['animal'], fasttext_model['pig']))


0.4459826946258545


In [33]:
# Cosine distance: 'animal' vs 'plant'.
cosine(fasttext_model['animal'], fasttext_model['plant'])


0.6285239160060883

In [34]:
# Cosine distance between the antonyms 'hot' and 'cold'.
print(cosine(fasttext_model['hot'], fasttext_model['cold']))


0.4628000855445862


In [35]:
# Sanity check: a vector compared with itself gives 0.0, i.e. `cosine` here is a
# distance (0 = identical direction), not a similarity.
print(cosine(fasttext_model['hot'], fasttext_model['hot']))

0.0


In [39]:
# Boo... glycerin, from wiki: "Glycerol is generally obtained from plant and animal sources..."
# (values below are cosine distances, so the smallest one is the closest match)
print(cosine(fasttext_model['glycerin'], fasttext_model['plant']))
print(cosine(fasttext_model['glycerin'], fasttext_model['mineral']))
print(cosine(fasttext_model['glycerin'], fasttext_model['animal']))

0.6746916174888611
0.6110290884971619
0.754726842045784


In [40]:
# What happens if all the synonyms for glycerin are combined instead?
# Note: the `1,2,` in front of the `3` in the chemical names is a concern, but we'll see.
# The names are stored lower-cased, ready for embedding lookup.
syns = [
    name.lower()
    for name in (
        '3-PROPANETRIOL',
        '3-TRIHYDROXYPROPANE',
        '3PROPANETRIOL',
        'CONCENTRATED GLYCERIN',
        'GLYCERIN',
        'GLYCERINE',
        'GLYCEROL',
        'GLYCYL ALCOHOL',
        'VEGETABLE GLYCERIN',
    )
]

In [41]:
# Display the lower-cased synonym list.
syns

['3-propanetriol',
 '3-trihydroxypropane',
 '3propanetriol',
 'concentrated glycerin',
 'glycerin',
 'glycerine',
 'glycerol',
 'glycyl alcohol',
 'vegetable glycerin']

In [43]:
# Okay, so this one works with this method (fasttext cosine sim):
# 'vegetable' gets the smallest cosine distance of the three reference words.
print(cosine(fasttext_model['green tea extract'], fasttext_model['vegetable']))
print(cosine(fasttext_model['green tea extract'], fasttext_model['animal']))
print(cosine(fasttext_model['green tea extract'], fasttext_model['mineral']))


0.5996586084365845
0.7843036502599716
0.6257796585559845


In [56]:
# Load NLTK's English stop-word list into `stopwords`.
# The corpus reader is re-imported under an alias because this cell rebinds the name
# `stopwords` (originally the NLTK corpus reader) to a plain list. Without the alias,
# running the cell a second time fails with
# AttributeError: 'list' object has no attribute 'words'.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('english')


In [119]:
# Display the stop-word list (at this point `stopwords` is a plain list of strings).
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# We'll need to lowercase the ingredient names before searching.

In [104]:
def compare_direct(phrase: str):
    """
    Print how close a phrase's embedding is to 'plant', 'animal' and 'mineral'.

    The lower-cased phrase is looked up in `fasttext_model` as one key (it is not
    tokenized), and that vector is compared with each reference word's vector.

    Note: the printed label says "Cosine Sim", but the numbers come from
    scipy.spatial.distance.cosine, i.e. a cosine *distance* (0.0 for identical
    vectors), so smaller values mean "more similar".

    Parameters
    ----------
    phrase : str
        Word or phrase to compare.

    Returns
    -------
    None
        The results are only printed.
    """
    lowered = phrase.lower()
    phrase_vector = fasttext_model[lowered]

    print(f"Comparing directly the words: {lowered}")
    for category in ('plant', 'animal', 'mineral'):
        distance = cosine(phrase_vector, fasttext_model[category])
        print(f"Cosine Sim with {category}: {distance}")
    print()

In [109]:
def compare_phrase(phrase: str, stopwords, punctuation):
    """
    Print how close a free-text description is to 'plant', 'animal' and 'mineral'.

    The phrase is lower-cased and tokenized with `word_tokenize`. Tokens found in
    `punctuation` or `stopwords` are dropped, and the vectors of the remaining tokens
    (looked up in `fasttext_model`) are averaged into a single embedding, which is
    then compared with each reference word's vector.

    Note: the printed label says "Cosine Sim", but the numbers come from
    scipy.spatial.distance.cosine, i.e. a cosine *distance* (0.0 for identical
    vectors), so smaller values mean "more similar".

    Parameters
    ----------
    phrase : str
        Ingredient description to compare.
    stopwords : container of str
        Tokens to ignore (tested with `token in stopwords`).
    punctuation : str or container of str
        Punctuation to ignore (tested with `token in punctuation`; for a string this
        is a substring test).

    Returns
    -------
    None
        The results are only printed.

    Raises
    ------
    ValueError
        If no tokens are left after filtering. Averaging an empty list would
        otherwise yield NaN (with a RuntimeWarning) instead of a usable embedding.
    """
    phrase = phrase.lower()

    words = [
        word
        for word in word_tokenize(phrase)
        if word not in punctuation and word not in stopwords
    ]
    if not words:
        raise ValueError(
            f"No tokens left in {phrase!r} after removing punctuation and stop words"
        )

    # Mean of the per-token vectors = one embedding for the whole phrase.
    test_embedding = np.mean([fasttext_model[word] for word in words], axis=0)

    print(f"Comparing phrase: {phrase}")
    print(f"Cosine Sim with plant: {cosine(test_embedding, fasttext_model['plant'])}")
    print(f"Cosine Sim with animal: {cosine(test_embedding, fasttext_model['animal'])}")
    print(f"Cosine Sim with mineral: {cosine(test_embedding, fasttext_model['mineral'])}")
    print()
    

In [110]:
def compare_syn_list(syns, stopwords, punctuation):
    """
    Print how close the averaged embedding of a synonym list is to 'plant',
    'animal' and 'mineral'.

    Each synonym is lower-cased and looked up in `fasttext_model` as one key
    (multi-word synonyms are not tokenized). The vectors are averaged into a single
    embedding, which is then compared with each reference word's vector.

    Note: the printed label says "Cosine Sim", but the numbers come from
    scipy.spatial.distance.cosine, i.e. a cosine *distance* (0.0 for identical
    vectors), so smaller values mean "more similar".

    Parameters
    ----------
    syns : iterable of str
        Synonym names for one ingredient.
    stopwords
        Unused; kept so the call signature matches compare_phrase.
    punctuation
        Unused; kept so the call signature matches compare_phrase.

    Returns
    -------
    None
        The results are only printed.

    Raises
    ------
    ValueError
        If `syns` is empty. Averaging an empty list would otherwise yield NaN
        (with a RuntimeWarning) instead of a usable embedding.
    """
    syns = [word.lower() for word in syns]
    if not syns:
        raise ValueError("Cannot compare an empty synonym list")

    # Mean of the per-synonym vectors = one embedding for the ingredient.
    test_embedding = np.mean([fasttext_model[word] for word in syns], axis=0)

    print(f"Comparing synonym list: {syns}")
    print(f"Cosine Sim with plant: {cosine(test_embedding, fasttext_model['plant'])}")
    print(f"Cosine Sim with animal: {cosine(test_embedding, fasttext_model['animal'])}")
    print(f"Cosine Sim with mineral: {cosine(test_embedding, fasttext_model['mineral'])}")
    print()

In [44]:
"phrase".split()

['phrase']

In [75]:
# Free-text description of glycerin; passed to compare_phrase further down.
# NOTE(review): `string` is a very generic name for this — consider something more specific.
string = "Glycerin (also called glycerol) is a naturally occurring alcohol compound and a component of many lipids. Glycerin may be of animal or vegetable origin. This ingredient is listed in the PETA's Caring Consumer guide as a byproduct of soap manufacture which typically uses animal fat."


In [111]:
# EXAMPLE: glycerin. Three ways to compare: 1. the chemical_about description string, 2. the word/phrase looked up directly, 3. the average over its synonyms.

In [112]:
# Use the full glycerin description text stored in `string`.
compare_phrase(string, stopwords, punctuation) # Hmmm...not quite.

Comparing phrase: glycerin (also called glycerol) is a naturally occurring alcohol compound and a component of many lipids. glycerin may be of animal or vegetable origin. this ingredient is listed in the peta's caring consumer guide as a byproduct of soap manufacture which typically uses animal fat.
Cosine Sim with plant: 0.5171853303909302
Cosine Sim with animal: 0.41598761081695557
Cosine Sim with mineral: 0.49857497215270996



In [113]:
# Single-word input: the average over one token is that token's own vector, so the
# numbers match the direct lookup of 'glycerin'.
compare_phrase("glycerin", stopwords, punctuation)

Comparing phrase: glycerin
Cosine Sim with plant: 0.6746916174888611
Cosine Sim with animal: 0.754726842045784
Cosine Sim with mineral: 0.6110290884971619



In [114]:
# Direct lookup of the single word, without tokenizing or filtering.
compare_direct("glycerin")  # Okay, so it's similar to both animal, and plant, but how to determine a cutoff?

Comparing directly the words: glycerin
Cosine Sim with plant: 0.6746916174888611
Cosine Sim with animal: 0.754726842045784
Cosine Sim with mineral: 0.6110290884971619



In [115]:
# Average over the glycerin synonyms (the same lower-cased names built earlier as `syns`).
compare_syn_list(['3-propanetriol',
 '3-trihydroxypropane',
 '3propanetriol',
 'concentrated glycerin',
 'glycerin',
 'glycerine',
 'glycerol',
 'glycyl alcohol',
 'vegetable glycerin'], stopwords, punctuation)

Comparing synonym list: ['3-propanetriol', '3-trihydroxypropane', '3propanetriol', 'concentrated glycerin', 'glycerin', 'glycerine', 'glycerol', 'glycyl alcohol', 'vegetable glycerin']
Cosine Sim with plant: 0.6334622800350189
Cosine Sim with animal: 0.7275842130184174
Cosine Sim with mineral: 0.573100745677948



In [116]:
def extract_synonymns_list(str1):
    """
    Split a free-text synonym string into a list of individual synonym names.

    The string is split on commas, semicolons and the stand-alone word "and"
    (case-sensitive). Each fragment is stripped of surrounding whitespace; empty
    fragments and fragments made only of digits are discarded.

    Known limitation: commas inside chemical names are also treated as separators,
    so '2,6,10,15,19,23-HEXAMETHYLTETRACOSANE' is reduced to
    '23-HEXAMETHYLTETRACOSANE' (the purely numeric pieces are filtered out).

    Parameters
    ----------
    str1 : str or missing value
        Raw synonym text, e.g. 'SQUALANE, TETRACOSANE, and VEGETABLE SQUALANE'.

    Returns
    -------
    list of str
        The synonym names in their original order; [] when `str1` is missing
        (as judged by `pd.isna`) or contains no usable names.
    """
    import re  # local import: `re` is only needed by this helper

    if pd.isna(str1):
        return []

    # `\band\b` only matches "and" as a whole word, so names that merely contain
    # those letters (e.g. "candelilla") are no longer cut in two.
    pieces = re.split(r'\band\b|[,;]', str1)

    # Strip after *all* splitting so fragments following a ';' lose their padding too.
    names = (piece.strip() for piece in pieces)

    # Bare numbers are leftovers from commas inside chemical names, not synonyms.
    return [name for name in names if name and not name.isdigit()]

In [117]:
# Parse the raw synonym string for squalane.
# Note: this rebinds `syns`, which previously held the glycerin synonyms.
syns = extract_synonymns_list('2,6,10,15,19,23-HEXAMETHYLTETRACOSANE, SQUALANE, TETRACOSANE, 2,6,10,15,19,23HEXAMETHYL, and VEGETABLE SQUALANE')

In [118]:
# phrase works for this...
# Three views of squalane: the bare word, its description text, and the averaged synonyms.

compare_phrase("squalane", stopwords, punctuation)
compare_phrase("Squalane is a naturally occuring lipid in both plants and animals. This ingredient is listed in the PETA's Caring Consumer guide as primarily derived from animals, especially shark liver oil.", stopwords, punctuation)
compare_syn_list(syns, stopwords, punctuation)


Comparing phrase: squalane
Cosine Sim with plant: 0.7363308072090149
Cosine Sim with animal: 0.7128980159759521
Cosine Sim with mineral: 0.6274028718471527

Comparing phrase: squalane is a naturally occuring lipid in both plants and animals. this ingredient is listed in the peta's caring consumer guide as primarily derived from animals, especially shark liver oil.
Cosine Sim with plant: 0.4753827452659607
Cosine Sim with animal: 0.32949191331863403
Cosine Sim with mineral: 0.5269950032234192

Comparing synonym list: ['23-hexamethyltetracosane', 'squalane', 'tetracosane', '23hexamethyl', 'vegetable squalane']
Cosine Sim with plant: 0.662769615650177
Cosine Sim with animal: 0.6928196251392365
Cosine Sim with mineral: 0.5563579797744751



In [None]:
# Let's code this up a little better so we can pull random samples and get the phrase.

