## Explore Text Embedding for Produce Item Names

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load data
df = pd.read_csv('../../raw_data/2022-12-07.csv', index_col=0)
df

Unnamed: 0,Name,Price,Organic?,Origin,date,item,price,unit
0,"Aloe-organic $1.94 each Organic, Mexico",$1.94 each,Organic,Mexico,2022-10-30,Aloe-organic,1.94,each
1,Apple- Blue pearmain heirloom ipm $3.34 per p...,$3.34 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Blue pearmain heirloom ipm,3.34,per pound
2,Apple- Calville blanc d'hiver heirloom ipm $3...,$3.77 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Calville blanc d'hiver heirloom ipm,3.77,per pound
3,Apple- Cox's Orange Pippin heirloom label $3....,$3.34 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Cox's Orange Pippin heirloom label,3.34,per pound
4,Apple- Golden delicious ipm $1.50 per pound ...,$1.50 per pound,"Integrated Pest Management, Waxed",Locally grown within 500 miles USA,2022-10-30,Apple- Golden delicious ipm,1.50,per pound
...,...,...,...,...,...,...,...,...
11794,Walnuts in shell organic $4.10 per pound Org...,$4.10 per pound,Organic,USA,2022-12-06,Walnuts in shell organic,4.10,per pound
11795,"Watercress-wonder $1.96 each Conventional, C...",$1.96 each,Conventional,California,2022-12-06,Watercress-wonder,1.96,each
11796,Wheatgrass-1 lb bags organic $16.41 each Org...,$16.41 each,Organic,New York,2022-12-06,Wheatgrass-1 lb bags organic,16.41,each
11797,"Wheatgrass-pots organic $2.96 each Organic, ...",$2.96 each,Organic,New York,2022-12-06,Wheatgrass-pots organic,2.96,each


### Example 1: SentenceTransformer

In [3]:
from sentence_transformers import SentenceTransformer, util
import torch

2023-06-17 15:44:05.971753: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
# create list of cleaned, unique items
df['item_clean']=df['item'].str.lower().str.split('-').str[0] \
                           .str.replace('.', '', regex=False) \
                           .str.strip() \
                           .str.replace(r'\s+', ' ', regex=True)
item_list = df['item_clean'].drop_duplicates().to_list()
item_list

['aloe',
 'apple',
 'artichokes',
 'arugula',
 'arugula bunched',
 'asparagus conventional banded bunched',
 'avocados',
 'bananas',
 'basil',
 'beans',
 'beets',
 'bergamot',
 'blackberries',
 'blueberries',
 'blueberries pint',
 'bok choy',
 'broccoli rabe',
 'broccoli slaw organic',
 'broccoli',
 'broccolini',
 'brussels sprouts loose organic',
 'brussels sprouts stalks organic',
 'cabbage',
 'carrots',
 'cauliflower',
 'celeriac',
 'celery',
 'chard',
 'chestnuts',
 'chicory',
 'cilantro',
 'compost',
 'cranberries',
 'cucumber',
 'curry leaf',
 'daikon',
 'dandelions',
 'dill',
 'eggplant',
 'endive',
 'fennel',
 'flowers',
 'garlic',
 'ginger',
 'gourds',
 'grapefruit',
 'grapes',
 'herbs',
 'horseradish',
 'jerusalem artichokes',
 'jicama',
 'jujubes',
 'kale',
 'kiwi',
 'kohlrabi',
 'komatsuna',
 'leeks',
 'lemongrass loose',
 'lemons',
 'lemons 4958 or blank label',
 'lettuce',
 'lime leaf makrut',
 'limes',
 'mangos',
 'melon',
 'micro greens sampler blue moon pesticide free'

In [28]:
# simple embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(item_list, convert_to_tensor=True)

tensor([[-0.0475,  0.0186,  0.0265,  ...,  0.0013,  0.1006,  0.0396],
        [-0.0061,  0.0310,  0.0648,  ..., -0.0482,  0.0735,  0.0895],
        [-0.0030, -0.0477, -0.0151,  ...,  0.0208,  0.0760,  0.0004],
        ...,
        [-0.0530, -0.0604, -0.0089,  ..., -0.0361,  0.1295,  0.0456],
        [-0.0294,  0.0276, -0.0099,  ...,  0.0216,  0.1260,  0.0360],
        [ 0.0263, -0.0160, -0.0518,  ..., -0.0690,  0.0266,  0.0071]])

In [27]:
# test embeddings
queries = ['tomato']

top_k=min(5,len(item_list))
for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, embeddings)[0]
    filtered_item_list = [(idx, item) for idx, item in enumerate(item_list) if query.lower() not in item]
    filtered_cos_scores = torch.index_select(cos_scores,
                                             0,
                                             torch.tensor([i[0] for i in filtered_item_list]))
    top_results = torch.topk(filtered_cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar items in item list:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(filtered_item_list[idx][1], "(Score: {:.4f})".format(score))





Query: tomato

Top 5 most similar items in item list:
grapes (Score: 0.5621)
plants (Score: 0.5533)
strawberries (Score: 0.5512)
pineapple (Score: 0.5497)
mushroom (Score: 0.5420)


In [None]:
# to productionize:
# pkl the model, embeddings, and item_list
# use flask to create API
# https://www.datacamp.com/tutorial/machine-learning-models-api-python

### Example 2: TextVectorization and SpaCy

In [None]:
# from nltk.corpus import stopwords
import tensorflow as tf
import spacy

In [None]:
# spacy
spacy_model = spacy.load('en_core_web_md')
spacy_embeddings = spacy_model(item_list)

ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'list'>





Query: Apple

Top 5 most similar items in item list:
apple heirloom   all other   (Score: 0.6420)
apple pink lady ipm   (Score: 0.5465)
apple  golden russet ipm   (Score: 0.5024)
apple  golden delicious ipm   (Score: 0.4977)
apple ipm   all other   (Score: 0.4961)
apple gala ipm   (Score: 0.4958)
apple jonaprince ipm   (Score: 0.4957)




Query: basil

Top 5 most similar items in item list:
basil   (Score: 1.0000)
herbs  basil 75oz organic   (Score: 0.6563)
plants soli organic basil   (Score: 0.6041)
herbs  basil square roots pesticide free   (Score: 0.5511)
herbs  basil gotham greens pesticide free   (Score: 0.5168)
herbs tarragon   (Score: 0.4673)
herbs rosemary   (Score: 0.4608)




Query: plants

Top 5 most similar items in item list:
wheatgrass pots organic   (Score: 0.5144)
platanos green (plantains)   (Score: 0.4909)
bananas organic   (Score: 0.4843)
lettuce red leaf organic   (Score: 0.4823)
tomatillos organic   (Score: 0.4786)
nasturtiums edible flowers organic   (Score: 0

In [63]:
# Create new features with normalized names
df['item_clean']=df['item'].str.lower().str.replace('-', ' ') \
                           .str.replace('.', '') \
                           .str.replace('[', '') \
                           .str.replace(']', '')
df


  .str.replace('.', '') \
  .str.replace('[', '') \
  .str.replace(']', '')


Unnamed: 0,Name,Price,Organic?,Origin,date,item,price,unit,item_clean
0,"Aloe-organic $1.94 each Organic, Mexico",$1.94 each,Organic,Mexico,2022-10-30,Aloe-organic,1.94,each,aloe organic
1,Apple- Blue pearmain heirloom ipm $3.34 per p...,$3.34 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Blue pearmain heirloom ipm,3.34,per pound,apple blue pearmain heirloom ipm
2,Apple- Calville blanc d'hiver heirloom ipm $3...,$3.77 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Calville blanc d'hiver heirloom ipm,3.77,per pound,apple calville blanc d'hiver heirloom ipm
3,Apple- Cox's Orange Pippin heirloom label $3....,$3.34 per pound,"Integrated Pest Management, Waxed",Scott Farm Orchard Dummerston VT,2022-10-30,Apple- Cox's Orange Pippin heirloom label,3.34,per pound,apple cox's orange pippin heirloom label
4,Apple- Golden delicious ipm $1.50 per pound ...,$1.50 per pound,"Integrated Pest Management, Waxed",Locally grown within 500 miles USA,2022-10-30,Apple- Golden delicious ipm,1.50,per pound,apple golden delicious ipm
...,...,...,...,...,...,...,...,...,...
11794,Walnuts in shell organic $4.10 per pound Org...,$4.10 per pound,Organic,USA,2022-12-06,Walnuts in shell organic,4.10,per pound,walnuts in shell organic
11795,"Watercress-wonder $1.96 each Conventional, C...",$1.96 each,Conventional,California,2022-12-06,Watercress-wonder,1.96,each,watercress wonder
11796,Wheatgrass-1 lb bags organic $16.41 each Org...,$16.41 each,Organic,New York,2022-12-06,Wheatgrass-1 lb bags organic,16.41,each,wheatgrass 1 lb bags organic
11797,"Wheatgrass-pots organic $2.96 each Organic, ...",$2.96 each,Organic,New York,2022-12-06,Wheatgrass-pots organic,2.96,each,wheatgrass pots organic


In [13]:
nlp = spacy.load('en_core_web_md')

In [67]:
vocab

['',
 '[UNK]',
 'organic',
 'conventional',
 'waxed',
 'pest',
 'management',
 'integrated',
 'hydroponic',
 'nongmo']

In [50]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 6.2900672 , -3.87783337,  7.1935997 , ..., -5.09193373,
         1.85933316,  1.33483326],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [74]:
Vectorizer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation'
)
Vectorizer.adapt(df['Organic?'].dropna().to_numpy())
vocab = Vectorizer.get_vocabulary(
    include_special_tokens=False
)


In [75]:
vocab

['organic',
 'conventional',
 'waxed',
 'pest',
 'management',
 'integrated',
 'hydroponic',
 'nongmo']

In [79]:
nlp('organic').vector

array([-2.1983  , -3.6084  , -2.504   ,  2.0542  ,  1.6154  , -2.9402  ,
        1.3629  ,  1.1536  , -1.5376  , -1.1576  ,  6.2707  , -1.0321  ,
       -4.4132  ,  3.2919  ,  0.29754 ,  0.73407 ,  2.5348  ,  0.65846 ,
       -2.5115  , -2.3373  , -2.5962  ,  1.9925  , -3.7461  ,  1.2572  ,
       -1.4413  , -1.0252  , -0.25379 ,  0.93515 , -4.142   , -1.3918  ,
        1.8279  , -1.8329  ,  0.14835 , -1.765   , -2.3603  , -3.4326  ,
        3.2104  , -0.54911 , -3.6038  ,  0.96807 , -0.76069 ,  1.5069  ,
        0.96726 ,  2.9142  , -2.1434  ,  6.2828  ,  3.8027  , -0.97692 ,
       -1.4082  ,  0.88415 ,  2.2344  , -0.26386 ,  0.92556 , -0.29604 ,
       -1.728   , -0.5394  , -1.49    , -0.66376 ,  1.6845  , -3.5032  ,
        3.7142  ,  0.42245 ,  1.9852  ,  0.43228 ,  0.74296 ,  0.75424 ,
       -3.6196  , -2.9455  ,  2.7441  ,  0.53157 ,  0.48449 ,  0.55528 ,
       -2.7062  ,  1.9797  ,  0.46699 ,  2.8114  , -6.1383  ,  0.52034 ,
       -1.8446  ,  3.4715  , -5.6995  , -0.010747, 

In [88]:
#generate the embedding matrix
num_tokens = len(vocab)
embedding_dim = len(nlp('The').vector)
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for i, word in enumerate(vocab):
    embedding_matrix[i] = nlp(str(word)).vector

organic
conventional
waxed
pest
management
integrated
hydroponic
nongmo


In [89]:
#Load the embedding matrix as the weights matrix for the embedding layer and set trainable to False
Embedding_layer=tf.keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)

In [92]:
#build the model. 
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Input(shape=(1,), dtype=tf.string))
model.add(Vectorizer)
model.add(Embedding_layer)


In [None]:
model.fit()

In [None]:



model.add(LSTM(25, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='tanh', 
                kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='tanh', 
                kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4)))    
model.add(Dense(1))

adam = optimizers.Adam(learning_rate=.01, decay=1e-2)
model.compile(optimizer = adam, loss = 'mean_absolute_error', metrics = None)

print(model.summary())

#fit the model
model.fit(text.Text,
          text.Grade,
          batch_size = 10,
          epochs = 50,
          validation_split=.2)

In [None]:
max_features = 5000
max_len = 10

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=max_len
)

vectorize_layer.adapt(item_list)

model = tf.keras.models.Sequential()