## Explore Text Embedding for Produce Item Names

### Import Data

In [1]:
import pandas as pd

In [2]:
# Load data
df = pd.read_csv('../../raw_data/2022-12-07.csv', index_col=0)
df

Unnamed: 0,Name,Price,Organic?,Origin,date,item,price,unit
0,"Aloe-organic $1.94 each Organic, Mexico",$1.94 each,Organic,Mexico,2022-10-30,Aloe-organic,1.94,each
1,Apple- Blue pearmain heirloom ipm $3.34 per p...,$3.34 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Blue pearmain heirloom ipm,3.34,per pound
2,Apple- Calville blanc d'hiver heirloom ipm $3...,$3.77 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Calville blanc d'hiver heirloom ipm,3.77,per pound
3,Apple- Cox's Orange Pippin heirloom label $3....,$3.34 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Cox's Orange Pippin heirloom label,3.34,per pound
4,Apple- Golden delicious ipm $1.50 per pound ...,$1.50 per pound,"Integrated Pest Management, Waxed",Locally grown within 500 miles USA,2022-10-30,Apple- Golden delicious ipm,1.50,per pound
...,...,...,...,...,...,...,...,...
11794,Walnuts in shell organic $4.10 per pound Org...,$4.10 per pound,Organic,USA,2022-12-06,Walnuts in shell organic,4.10,per pound
11795,"Watercress-wonder $1.96 each Conventional, C...",$1.96 each,Conventional,California,2022-12-06,Watercress-wonder,1.96,each
11796,Wheatgrass-1 lb bags organic $16.41 each Org...,$16.41 each,Organic,New York,2022-12-06,Wheatgrass-1 lb bags organic,16.41,each
11797,"Wheatgrass-pots organic $2.96 each Organic, ...",$2.96 each,Organic,New York,2022-12-06,Wheatgrass-pots organic,2.96,each


### Example 1: SentenceTransformer

In [68]:
from sentence_transformers import SentenceTransformer, util
import torch

In [69]:
# create list of cleaned, unique items
df['item_clean']=df['item'].str.lower().str.split('-').str[0] \
                           .str.replace('.', '', regex=False) \
                           .str.strip() \
                           .str.replace(r'\s+', ' ', regex=True)
item_list = df['item_clean'].drop_duplicates().to_list()
item_list

['aloe',
 'apple',
 'artichokes',
 'arugula',
 'arugula bunched',
 'asparagus conventional banded bunched',
 'avocados',
 'bananas',
 'basil',
 'beans',
 'beets',
 'bergamot',
 'blackberries',
 'blueberries',
 'blueberries pint',
 'bok choy',
 'broccoli rabe',
 'broccoli slaw organic',
 'broccoli',
 'broccolini',
 'brussels sprouts loose organic',
 'brussels sprouts stalks organic',
 'cabbage',
 'carrots',
 'cauliflower',
 'celeriac',
 'celery',
 'chard',
 'chestnuts',
 'chicory',
 'cilantro',
 'compost',
 'cranberries',
 'cucumber',
 'curry leaf',
 'daikon',
 'dandelions',
 'dill',
 'eggplant',
 'endive',
 'fennel',
 'flowers',
 'garlic',
 'ginger',
 'gourds',
 'grapefruit',
 'grapes',
 'herbs',
 'horseradish',
 'jerusalem artichokes',
 'jicama',
 'jujubes',
 'kale',
 'kiwi',
 'kohlrabi',
 'komatsuna',
 'leeks',
 'lemongrass loose',
 'lemons',
 'lemons 4958 or blank label',
 'lettuce',
 'lime leaf makrut',
 'limes',
 'mangos',
 'melon',
 'micro greens sampler blue moon pesticide free'

In [70]:
# simple embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(item_list, convert_to_tensor=True)

In [73]:
# test embeddings
queries = ['tomato', 'oranges', 'broccoli']

top_k=min(5,len(item_list))
for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, embeddings)[0]
    filtered_item_list = [(idx, item) for idx, item in enumerate(item_list) if query.lower() not in item]
    filtered_cos_scores = torch.index_select(cos_scores,
                                             0,
                                             torch.tensor([i[0] for i in filtered_item_list]))
    top_results = torch.topk(filtered_cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar items in item list:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(filtered_item_list[idx][1], "(Score: {:.4f})".format(score))





Query: tomato

Top 5 most similar items in item list:
grapes (Score: 0.5621)
plants (Score: 0.5533)
strawberries (Score: 0.5512)
pineapple (Score: 0.5497)
mushroom (Score: 0.5420)




Query: oranges

Top 5 most similar items in item list:
lemons (Score: 0.6273)
limes (Score: 0.6057)
pumpkins (Score: 0.5900)
pears (Score: 0.5457)
grapes (Score: 0.5384)




Query: broccoli

Top 5 most similar items in item list:
cabbage (Score: 0.6182)
cauliflower (Score: 0.5494)
tomatoes (Score: 0.5267)
brussels sprouts (Score: 0.5143)
carrots (Score: 0.5137)


In [None]:
# to productionize:
# pkl the model, embeddings, and item_list
# use flask to create API
# https://www.datacamp.com/tutorial/machine-learning-models-api-python

### Example 2: TextVectorization and SpaCy

In [21]:
# from nltk.corpus import stopwords
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf
import spacy
import numpy as np

In [64]:
# Clean item names
df['item_clean']=df['item'].str.split('-').str[0]

Vectorizer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    split=None
)
Vectorizer.adapt(df['item_clean'].dropna().to_numpy())
vocab = Vectorizer.get_vocabulary()

In [65]:
vocab

['',
 '[UNK]',
 'apple',
 'herbs',
 'mushroom',
 'squash',
 'carrots',
 'salad',
 'peppers',
 'potatoes',
 'sprouts',
 'onion',
 'pears',
 'lettuce',
 'cabbage',
 'micro greens',
 'salad loose',
 'sweet potato',
 'cucumber',
 'beets',
 'spinach',
 'cauliflower',
 'parsley',
 'kale',
 'bok choy',
 'tomatoes',
 'garlic',
 'flowers',
 'beans',
 'lemons',
 'radish',
 'limes',
 'persimmons',
 'tangerine',
 'platanos',
 'pineapple',
 'daikon',
 'avocados',
 'turnips',
 'turmeric',
 'peas',
 'ginger',
 'scallions',
 'grapes',
 'dill',
 'cranberries',
 'broccoli rabe',
 'wheatgrass',
 'broccoli',
 'bananas',
 'pumpkin',
 'radicchio',
 'mangos',
 'kohlrabi',
 'oranges',
 'watercress',
 'broccolini',
 'blueberries',
 'yuca manioc or cassava  ',
 'tomato cherry little guys organic  ',
 'spinach loose baby organic  ',
 'shallots',
 'rutabagas',
 'pomegranates 3127  ',
 'nopales',
 'micro greens sampler blue moon pesticide free  ',
 'lime leaf makrut  ',
 'lemongrass loose  ',
 'leeks',
 'kiwi',
 '

In [66]:
# Create embedding matrix
nlp = spacy.load('en_core_web_md')
num_tokens = len(vocab)
embedding_dim = len(nlp('The').vector)
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for i, word in enumerate(vocab):
    embedding_matrix[i] = nlp(str(word)).vector

In [67]:
# Find similar words
queries = ['tomato', 'oranges', 'broccoli']

top_k=min(5, len(vocab))
for query in queries:
    query_embedding = nlp(query).vector
    cos_scores = tf.keras.losses.cosine_similarity(query_embedding, embedding_matrix.astype('float32'))
    values, indices = tf.math.top_k(-1*cos_scores, k=top_k)
    items = tf.gather(vocab, indices)
    print(items)


tf.Tensor([b'cucumber' b'eggplant' b'tomatillos  ' b'tomatillos' b'tomatoes'], shape=(5,), dtype=string)
tf.Tensor(
[b'oranges' b'tomato cherry red lancaster organic  '
 b'tomato grape red lancaster organic  ' b'lime leaf makrut  '
 b'tomato cherry sungold lancaster clamshell organic  '], shape=(5,), dtype=string)
tf.Tensor([b'sprouts' b'cauliflower' b'kale' b'daikon' b'broccoli rabe'], shape=(5,), dtype=string)


### Appendix

In [28]:
# Load embedding matrix as weights matrix for the embedding layer and set trainable to False
Embedding_layer=Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False)

In [22]:
#build the model. 
model = Sequential()
model.add(Input(shape=(1,), dtype=tf.string))
model.add(Vectorizer)
model.add(Embedding_layer)

In [33]:
model.summary()
model('apple'.to_numpy())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_4 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, None, 300)         2400      
                                                                 
Total params: 2,400
Trainable params: 0
Non-trainable params: 2,400
_________________________________________________________________


AttributeError: 'str' object has no attribute 'to_numpy'

In [4]:
# spacy
spacy_model = spacy.load('en_core_web_md')
spacy_embeddings = spacy_model(item_list)


NameError: name 'item_list' is not defined

In [63]:
# Create new features with normalized names
df['item_clean']=df['item'].str.lower().str.replace('-', ' ') \
                           .str.replace('.', '') \
                           .str.replace('[', '') \
                           .str.replace(']', '')
df


  .str.replace('.', '') \
  .str.replace('[', '') \
  .str.replace(']', '')


Unnamed: 0,Name,Price,Organic?,Origin,date,item,price,unit,item_clean
0,"Aloe-organic $1.94 each Organic, Mexico",$1.94 each,Organic,Mexico,2022-10-30,Aloe-organic,1.94,each,aloe organic
1,Apple- Blue pearmain heirloom ipm $3.34 per p...,$3.34 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Blue pearmain heirloom ipm,3.34,per pound,apple blue pearmain heirloom ipm
2,Apple- Calville blanc d'hiver heirloom ipm $3...,$3.77 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Calville blanc d'hiver heirloom ipm,3.77,per pound,apple calville blanc d'hiver heirloom ipm
3,Apple- Cox's Orange Pippin heirloom label $3....,$3.34 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Cox's Orange Pippin heirloom label,3.34,per pound,apple cox's orange pippin heirloom label
4,Apple- Golden delicious ipm $1.50 per pound ...,$1.50 per pound,"Integrated Pest Management, Waxed",Locally grown within 500 miles USA,2022-10-30,Apple- Golden delicious ipm,1.50,per pound,apple golden delicious ipm
...,...,...,...,...,...,...,...,...,...
11794,Walnuts in shell organic $4.10 per pound Org...,$4.10 per pound,Organic,USA,2022-12-06,Walnuts in shell organic,4.10,per pound,walnuts in shell organic
11795,"Watercress-wonder $1.96 each Conventional, C...",$1.96 each,Conventional,California,2022-12-06,Watercress-wonder,1.96,each,watercress wonder
11796,Wheatgrass-1 lb bags organic $16.41 each Org...,$16.41 each,Organic,New York,2022-12-06,Wheatgrass-1 lb bags organic,16.41,each,wheatgrass 1 lb bags organic
11797,"Wheatgrass-pots organic $2.96 each Organic, ...",$2.96 each,Organic,New York,2022-12-06,Wheatgrass-pots organic,2.96,each,wheatgrass pots organic


In [79]:
nlp('organic').vector

array([-2.1983  , -3.6084  , -2.504   ,  2.0542  ,  1.6154  , -2.9402  ,
        1.3629  ,  1.1536  , -1.5376  , -1.1576  ,  6.2707  , -1.0321  ,
       -4.4132  ,  3.2919  ,  0.29754 ,  0.73407 ,  2.5348  ,  0.65846 ,
       -2.5115  , -2.3373  , -2.5962  ,  1.9925  , -3.7461  ,  1.2572  ,
       -1.4413  , -1.0252  , -0.25379 ,  0.93515 , -4.142   , -1.3918  ,
        1.8279  , -1.8329  ,  0.14835 , -1.765   , -2.3603  , -3.4326  ,
        3.2104  , -0.54911 , -3.6038  ,  0.96807 , -0.76069 ,  1.5069  ,
        0.96726 ,  2.9142  , -2.1434  ,  6.2828  ,  3.8027  , -0.97692 ,
       -1.4082  ,  0.88415 ,  2.2344  , -0.26386 ,  0.92556 , -0.29604 ,
       -1.728   , -0.5394  , -1.49    , -0.66376 ,  1.6845  , -3.5032  ,
        3.7142  ,  0.42245 ,  1.9852  ,  0.43228 ,  0.74296 ,  0.75424 ,
       -3.6196  , -2.9455  ,  2.7441  ,  0.53157 ,  0.48449 ,  0.55528 ,
       -2.7062  ,  1.9797  ,  0.46699 ,  2.8114  , -6.1383  ,  0.52034 ,
       -1.8446  ,  3.4715  , -5.6995  , -0.010747, 

In [None]:
model.fit()

In [None]:



model.add(LSTM(25, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='tanh', 
                kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='tanh', 
                kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4)))    
model.add(Dense(1))

adam = optimizers.Adam(learning_rate=.01, decay=1e-2)
model.compile(optimizer = adam, loss = 'mean_absolute_error', metrics = None)

print(model.summary())

#fit the model
model.fit(text.Text,
          text.Grade,
          batch_size = 10,
          epochs = 50,
          validation_split=.2)

In [None]:
max_features = 5000
max_len = 10

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=max_len
)

vectorize_layer.adapt(item_list)

model = tf.keras.models.Sequential()