In [6]:
import pandas as pd
import tiktoken
import glob

from openai.embeddings_utils import get_embedding

In [7]:
# Notebook-wide settings for the embedding pipeline.

# Name handed to tiktoken.get_encoding() when counting tokens.
embedding_encoding = "cl100k_base"

# Engine name handed to get_embedding() for every product description.
embedding_model = "text-embedding-ada-002"

# Rows whose combined text exceeds this many tokens are filtered out before embedding.
# NOTE(review): presumably chosen to stay under the model's input limit — confirm.
max_tokens = 8_000

In [8]:
# --- Load the product export and build the text that will be embedded ---------
#
# Locate the source CSV in the working directory.
# * This notebook later writes 'product_embeddings.csv' into the same directory,
#   so that file is excluded here; otherwise a re-run could pick up the
#   notebook's own output as its input.
# * glob returns matches in arbitrary order, so the candidates are sorted to make
#   the choice deterministic when several CSV files are present.
generated_files = {'product_embeddings.csv'}
csv_candidates = sorted(
    path for path in glob.glob('*.csv') if path not in generated_files
)
if not csv_candidates:
    raise FileNotFoundError(
        "No input CSV found in the working directory "
        "(expected a ';'-delimited product export)."
    )
input_datapath = csv_candidates[0]

# The export is ';'-delimited; malformed lines are skipped rather than raising.
df = pd.read_csv(input_datapath, delimiter=';', on_bad_lines='skip')

# Work on an independent copy of the three columns that feed the embedding text.
df = df[['manufacturer_name', 'name', 'flavour']].copy()

# Single descriptive string per product; this is the text sent to the embedding model.
df['combined'] = 'Manufacturer name: ' + df['manufacturer_name'] + '; Name: ' + df['name'] + '; Flavour: ' + df['flavour']

# Drop rows with any missing value. A missing manufacturer, name or flavour also
# makes 'combined' NaN, so such products are excluded entirely.
df = df.dropna()
df = df.head(100) # Take a subset of 100 products for testing purposes

df

Unnamed: 0,manufacturer_name,name,flavour,combined
0,1Up Nutrition,"Vegan Greens & Reds Superfoods, Lemonade - 300g",Lemonade,Manufacturer name: 1Up Nutrition; Name: Vegan ...
1,1Up Nutrition,"Organic Vegan Greens & Reds Superfoods, Pineap...",Pineapple,Manufacturer name: 1Up Nutrition; Name: Organi...
2,1Up Nutrition,"Pure Rebuild, Pineapple - 600g",Pineapple,Manufacturer name: 1Up Nutrition; Name: Pure R...
3,1Up Nutrition,"Vegan Greens & Reds Superfoods, Peach - 300g",Peach,Manufacturer name: 1Up Nutrition; Name: Vegan ...
4,1Up Nutrition,"Vegan Protein, Vanilla - 900g",Vanilla,Manufacturer name: 1Up Nutrition; Name: Vegan ...
...,...,...,...,...
95,5% Nutrition,"AllDayYouMay Caffeinated - Legendary Series, S...",Southern Sweet Tea,Manufacturer name: 5% Nutrition; Name: AllDayY...
96,5% Nutrition,"Crea-TEN - Legendary Series, Blueberry Lemonad...",Blueberry Lemonade (EAN 850041158006),Manufacturer name: 5% Nutrition; Name: Crea-TE...
97,5% Nutrition,"AllDayYouMay - Legendary Series, Lemon Lime - ...",Lemon Lime,Manufacturer name: 5% Nutrition; Name: AllDayY...
98,5% Nutrition,Turkesterone 1200 - 120 caps,,Manufacturer name: 5% Nutrition; Name: Turkest...


In [9]:
top_n = 1000  # keep at most this many rows, taken from the end of the frame
encoding = tiktoken.get_encoding(embedding_encoding)


def count_tokens(text):
    """Return how many tokens `encoding` produces for `text`."""
    return len(encoding.encode(text))


# Token count of each combined product description.
df['n_tokens'] = df['combined'].apply(count_tokens)

# Keep only the rows within the max_tokens budget, then the last top_n of those.
within_budget = df['n_tokens'] <= max_tokens
df = df.loc[within_budget].tail(top_n)

df

Unnamed: 0,manufacturer_name,name,flavour,combined,n_tokens
0,1Up Nutrition,"Vegan Greens & Reds Superfoods, Lemonade - 300g",Lemonade,Manufacturer name: 1Up Nutrition; Name: Vegan ...,29
1,1Up Nutrition,"Organic Vegan Greens & Reds Superfoods, Pineap...",Pineapple,Manufacturer name: 1Up Nutrition; Name: Organi...,30
2,1Up Nutrition,"Pure Rebuild, Pineapple - 600g",Pineapple,Manufacturer name: 1Up Nutrition; Name: Pure R...,26
3,1Up Nutrition,"Vegan Greens & Reds Superfoods, Peach - 300g",Peach,Manufacturer name: 1Up Nutrition; Name: Vegan ...,27
4,1Up Nutrition,"Vegan Protein, Vanilla - 900g",Vanilla,Manufacturer name: 1Up Nutrition; Name: Vegan ...,23
...,...,...,...,...,...
95,5% Nutrition,"AllDayYouMay Caffeinated - Legendary Series, S...",Southern Sweet Tea,Manufacturer name: 5% Nutrition; Name: AllDayY...,35
96,5% Nutrition,"Crea-TEN - Legendary Series, Blueberry Lemonad...",Blueberry Lemonade (EAN 850041158006),Manufacturer name: 5% Nutrition; Name: Crea-TE...,52
97,5% Nutrition,"AllDayYouMay - Legendary Series, Lemon Lime - ...",Lemon Lime,Manufacturer name: 5% Nutrition; Name: AllDayY...,30
98,5% Nutrition,Turkesterone 1200 - 120 caps,,Manufacturer name: 5% Nutrition; Name: Turkest...,26


In [10]:
def embed_text(text):
    """Request the embedding of `text` using the engine named by `embedding_model`."""
    return get_embedding(text, engine=embedding_model)


# One get_embedding call per row of the 'combined' column.
df['embedding'] = df['combined'].apply(embed_text)

# Save the frame, including the new embedding column, to CSV.
df.to_csv('product_embeddings.csv')

df

Unnamed: 0,manufacturer_name,name,flavour,combined,n_tokens,embedding
0,1Up Nutrition,"Vegan Greens & Reds Superfoods, Lemonade - 300g",Lemonade,Manufacturer name: 1Up Nutrition; Name: Vegan ...,29,"[0.022003619000315666, -0.02273578941822052, -..."
1,1Up Nutrition,"Organic Vegan Greens & Reds Superfoods, Pineap...",Pineapple,Manufacturer name: 1Up Nutrition; Name: Organi...,30,"[0.016161002218723297, -0.01461572665721178, 0..."
2,1Up Nutrition,"Pure Rebuild, Pineapple - 600g",Pineapple,Manufacturer name: 1Up Nutrition; Name: Pure R...,26,"[0.008154603652656078, -0.0032534413039684296,..."
3,1Up Nutrition,"Vegan Greens & Reds Superfoods, Peach - 300g",Peach,Manufacturer name: 1Up Nutrition; Name: Vegan ...,27,"[0.00973617285490036, -0.017075950279831886, 0..."
4,1Up Nutrition,"Vegan Protein, Vanilla - 900g",Vanilla,Manufacturer name: 1Up Nutrition; Name: Vegan ...,23,"[-0.008280187845230103, -0.032424286007881165,..."
...,...,...,...,...,...,...
95,5% Nutrition,"AllDayYouMay Caffeinated - Legendary Series, S...",Southern Sweet Tea,Manufacturer name: 5% Nutrition; Name: AllDayY...,35,"[-0.007799999322742224, -0.004943522159010172,..."
96,5% Nutrition,"Crea-TEN - Legendary Series, Blueberry Lemonad...",Blueberry Lemonade (EAN 850041158006),Manufacturer name: 5% Nutrition; Name: Crea-TE...,52,"[0.005526192951947451, -0.017366159707307816, ..."
97,5% Nutrition,"AllDayYouMay - Legendary Series, Lemon Lime - ...",Lemon Lime,Manufacturer name: 5% Nutrition; Name: AllDayY...,30,"[0.005441951099783182, 0.008156273514032364, -..."
98,5% Nutrition,Turkesterone 1200 - 120 caps,,Manufacturer name: 5% Nutrition; Name: Turkest...,26,"[-0.006450008600950241, -0.011722450144588947,..."
