# Product Matching Using Graphs

Install the Neo4j [Graph Data Science Client](https://neo4j.com/docs/graph-data-science-client/current/installation/) and import modules

In [1]:
%%capture
import pandas as pd
import numpy as np
import getpass
import os

try:
    from graphdatascience import GraphDataScience
except:
    %pip install graphdatascience
    from graphdatascience import GraphDataScience

Set connection credentials and OpenAI api key 

In [16]:
connectionUrl = str(input("Neo4j Database Url (press enter to use \"neo4j://localhost:7687\"): ") or "neo4j://localhost:7687")
username = str(input("Username (press enter for \"neo4j\"): ") or "neo4j")
password = input("Password: ")
database = str(input("Database name (press enter for \"neo4j\"): ") or "neo4j")
os.environ["OPENAI_API_KEY"] = getpass.getpass(prompt='OpenAI API key: ')
openai_api_key = os.getenv('OPENAI_API_KEY') # Needed for the last part: Product Matching using OpenAI text embedding

Verify the database connection and return the Graph Data Science library version

In [14]:
gds = GraphDataScience(connectionUrl, auth=(username, password))
gds.set_database(database)
print(gds.version())

2.4.2


## 0. Data
Data has been downloaded from [openfoodfacts.org](https://world.openfoodfacts.org/cgi/search.pl?action=display&search_terms=ice&tagtype_0=countries&tag_contains_0=contains&tag_0=uk&tagtype_1=languages&tag_contains_1=contains&tag_1=en&sort_by=unique_scans_n&page_size=20)

search criteria: 
['bread','ice','peas'] with country contains 'uk' and languages contains "en" 


In [18]:
# load data
bread_df = pd.read_csv('openfoodfacts_export_bread_UK_en.csv', sep='\t')
ice_df = pd.read_csv('openfoodfacts_export_ice_UK_en.csv', sep='\t')
peas_df = pd.read_csv('openfoodfacts_export_peas_UK_en.csv', sep='\t')

In [19]:
all_df = pd.concat([bread_df, ice_df,peas_df], ignore_index=True, axis=0)
all_df


Unnamed: 0,code,lc,product_name_da,product_name_de,product_name_en,product_name_es,product_name_fi,product_name_fr,product_name_it,product_name_nl,...,nutriscore_grade_producer,customer_service_fr,sources_fields:org-gs1:gln,sources_fields:org-gs1:gpcCategoryCode,sources_fields:org-gs1:gpcCategoryName,sources_fields:org-gs1:isAllergenRelevantDataProvided,sources_fields:org-gs1:lastChangeDateTime,sources_fields:org-gs1:partyName,sources_fields:org-gs1:productionVariantDescription,sources_fields:org-gs1:publicationDateTime
0,4088600107646,en,,,Original Rye Crispbread,,,,,,...,,,,,,,,,,
1,5070000210605,en,,,Keto bread,,,,,,...,,,,,,,,,,
2,29296484,en,,,Fruit and treacle bread,,,,,,...,,,,,,,,,,
3,5057967395071,en,,,White Sourdough,,,,,,...,,,,,,,,,,
4,5060235980480,en,,,White Sourdough,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2426,5024121380419,en,,,Organix cheese pea snaps,,,,,,...,,,,,,,,,,
2427,5060413751109,en,,,Yushoi lightly salted pea snacks,,,,,,...,,,,,,,,,,
2428,656320,en,,,Tenderstem Broccoli Carrots & Sugar snap peas,,,,,,...,,,,,,,,,,
2429,116022,en,,,British Giant Marrowfat Peas,,,,,,...,,,,,,,,,,


In [20]:
min_col = ['code', 'product_name_en','generic_name_en','quantity','serving_size',
           'packaging_tags','brands_tags','categories_tags','labels_tags','countries_tags','stores_tags', 
           'ingredients_text_en','allergens_tags','traces_tags','packaging_1_shape',
           'link','off:food_groups','off:food_groups_tags','off:nova_groups_tags','off:nutriscore_grade','off:nutriscore_score']

In [21]:
df = all_df[min_col]

In [22]:
df =df.dropna(subset=['product_name_en'])
df

Unnamed: 0,code,product_name_en,generic_name_en,quantity,serving_size,packaging_tags,brands_tags,categories_tags,labels_tags,countries_tags,...,ingredients_text_en,allergens_tags,traces_tags,packaging_1_shape,link,off:food_groups,off:food_groups_tags,off:nova_groups_tags,off:nutriscore_grade,off:nutriscore_score
0,4088600107646,Original Rye Crispbread,,250 g,10 g,,savour-bakes,"en:plant-based-foods-and-beverages,en:plant-ba...","en:vegetarian,en:source-of-fibre,en:high-fibres",en:united-kingdom,...,"rye flour, salt",en:gluten,"en:gluten,en:sesame-seeds,en:soybeans",,,en:bread,"en:cereals-and-potatoes,en:bread",en:3-processed-foods,a,-4.0
1,5070000210605,Keto bread,,,,,,"en:flax-seed-bread,en:flax-seed-bread",,en:united-kingdom,...,"Water, Sunflower seeds, Flaxseed (13%), Tapioc...",,,,,,,en:3-processed-foods,c,7.0
2,29296484,Fruit and treacle bread,,,,,,"en:plant-based-foods-and-beverages,en:plant-ba...",,en:united-kingdom,...,"wheat flour, calcium carbonate, iron, niacin, ...",en:gluten,,,,en:bread,"en:cereals-and-potatoes,en:bread",en:3-processed-foods,b,2.0
3,5057967395071,White Sourdough,,400g,,"en:card-sleeve,en:ldpe-film","tesco,walkers","en:plant-based-foods-and-beverages,en:plant-ba...",en:vegetarian,en:united-kingdom,...,"wheat flour (wheat flour, calcium carbonate, i...",en:gluten,en:en-eggs-en-gluten-en-milk-en-may-contain-se...,en:sleeve,,en:bread,"en:cereals-and-potatoes,en:bread",en:3-processed-foods,a,-2.0
4,5060235980480,White Sourdough,,1 kg,,,bertinet-bakery,en:sourdough-white-sliced-bread,,en:united-kingdom,...,,,,,,,,unknown,c,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2425,274012,"Pea, mint, feta",,,,,sainsbury-s,,,"en:france,en:united-kingdom",...,"Peas (52%), Water, Spinach, Soured Cream (Cows...",en:milk,,,,,,en:4-ultra-processed-food-and-drink-products,,
2426,5024121380419,Organix cheese pea snaps,,,,,,,,en:united-kingdom,...,,,,,,,,unknown,,
2427,5060413751109,Yushoi lightly salted pea snacks,,,21.0g,,yushoi,,,en:united-kingdom,...,,,,,,,,unknown,,
2428,656320,Tenderstem Broccoli Carrots & Sugar snap peas,,,80g,,m-s,,,en:united-kingdom,...,,,,,,,,unknown,,


### 0.1 Data Loading

In [23]:
gds.run_cypher("create constraint if not exists for (p:Product) require (p.code) is node key")

In [24]:
#load product
gds.run_cypher(
"""
unwind $df as df 
merge (p:Product{code: df['code'], name:df['product_name_en']})
set p.genericName = df['generic_name_en'], 
    p.quantity = df['quantity'], 
    p.servingSize = df['serving_size']
""",
params = {'df':df.to_dict(orient='records')}
)

In [25]:
#load brands
gds.run_cypher(
"""
unwind $df as df
with df, split(df['brands_tags'],",") as brands
unwind brands as brand
match (p:Product {code: df['code']})
merge (b:Brand {name:brand})
merge (p)-[:HAS_BRAND]->(b)
""",
params = {'df':df.dropna(subset=['brands_tags']).to_dict(orient='records')}
)

In [26]:
#load packaging
gds.run_cypher(
"""
unwind $df as df
with df, split(replace(df['packaging_tags'],"en:",""),",") as packagings
unwind packagings as packaging
match (p:Product {code: df['code']})
merge (pa:Packaging {name:packaging})
merge (p)-[:HAS_PACKAGING]->(pa)
""",
params = {'df':df.dropna(subset=['packaging_tags']).to_dict(orient='records')}
)

In [27]:
#load categories
gds.run_cypher(
"""
unwind $df as df
with df, split(replace(df['categories_tags'],"en:",""),",") as categories
unwind categories as category
match (p:Product {code: df['code']})
merge (ca:Category {name:category})
merge (p)-[:HAS_CATEGORY]->(ca)
""",
params = {'df':df.dropna(subset=['categories_tags']).to_dict(orient='records')}
)

In [28]:
#load labels
gds.run_cypher(
"""
unwind $df as df
with df, split(replace(df['labels_tags'],"en:",""),",") as labels
unwind labels as label
match (p:Product {code: df['code']})
merge (l:Label {name:label})
merge (p)-[:HAS_LABEL]->(l)
""",
params = {'df':df.dropna(subset=['labels_tags']).to_dict(orient='records')}
)

In [29]:
#load countries
gds.run_cypher(
"""
unwind $df as df
with df, split(replace(df['countries_tags'],"en:",""),",") as countries
unwind countries as country
match (p:Product {code: df['code']})
merge (c:Country {name:country})
merge (p)-[:HAS_COUNTRY]->(c)
""",
params = {'df':df.dropna(subset=['countries_tags']).to_dict(orient='records')}
)

In [30]:
#load stores
gds.run_cypher(
"""
unwind $df as df
with df, split(df['stores_tags'],",") as stores
unwind stores as store
match (p:Product {code: df['code']})
merge (s:Store {name:store})
merge (p)-[:HAS_STORE]->(s)
""",
params = {'df':df.dropna(subset=['stores_tags']).to_dict(orient='records')}
)

In [31]:
# load allergens
gds.run_cypher(
"""
unwind $df as df
with df, split(replace(df['allergens_tags'],"en:",""),",") as allergens
unwind allergens as allergen
match (p:Product {code: df['code']})
merge (a:Allergen {name:allergen})
merge (p)-[:HAS_ALLERGEN]->(a)
""",
params = {'df':df.dropna(subset=['allergens_tags']).to_dict(orient='records')}
)

In [32]:
# load foodGroup1
gds.run_cypher(
"""
unwind $df as df
with df, split(replace(df['off:food_groups'],"en:",""),",") as foodgroups
unwind foodgroups as foodgroup
match (p:Product {code: df['code']})
merge (c:Category {name:foodgroup})
merge (p)-[:HAS_GROUP1]->(c)
""",
params = {'df':df.dropna(subset=['off:food_groups']).to_dict(orient='records')}
)

In [33]:
# load foodGroup2
gds.run_cypher(
"""
unwind $df as df
with df, split(replace(df['off:food_groups_tags'],"en:",""),",") as foodgroups
unwind foodgroups as foodgroup
match (p:Product {code: df['code']})
merge (c:Category {name:foodgroup})
merge (p)-[:HAS_GROUP2]->(c)
""",
params = {'df':df.dropna(subset=['off:food_groups_tags']).to_dict(orient='records')}
)

In [34]:
# import ingredients as free text
gds.run_cypher(
"""
unwind $df as df
match (p:Product {code: df['code']})
set p.ingredients = df.ingredients_text_en
""",
params = {'df':df.dropna(subset=['ingredients_text_en']).to_dict(orient='records')}
)

In [35]:
# Ugly clean
gds.run_cypher("""MATCH (p:Product ) where toString(p.genericName)="NaN" set p.genericName = NULL""")
gds.run_cypher("""MATCH (p:Product ) where toString(p.quantity)="NaN" set p.genericName = NULL""")
gds.run_cypher("""MATCH (p:Product ) where toString(p.servingSize)="NaN" set p.genericName = NULL""")

## 1. EDA

In [36]:
d = gds.run_cypher("""
call apoc.meta.stats
YIELD labels
""")
pd.DataFrame([d.labels[0]])

Unnamed: 0,Brand,Category,Packaging,Label,Product,Country,Store,Allergen
0,451,448,237,177,2417,28,73,43


In [37]:
# Most represented category
gds.run_cypher("""
MATCH (p:Product)-[:HAS_CATEGORY]->(c:Category)
RETURN c.name as category, count(p) as productCount
ORDER BY productCount DESC
""").head(20)

Unnamed: 0,category,productCount
0,plant-based-foods-and-beverages,800
1,plant-based-foods,770
2,cereals-and-potatoes,664
3,breads,649
4,frozen-foods,242
5,desserts,229
6,frozen-desserts,228
7,ice-creams-and-sorbets,208
8,ice-creams,184
9,white-breads,108


In [38]:
# Most represented group1 category
gds.run_cypher("""
MATCH (p:Product)-[:HAS_GROUP1]->(c:Category)
RETURN c.name as category, count(p) as productCount
ORDER BY productCount DESC
""").head(20)

Unnamed: 0,category,productCount
0,bread,637
1,ice-cream,208
2,vegetables,50
3,appetizers,37
4,one-dish-meals,23
5,sandwiches,22
6,pastries,22
7,legumes,22
8,cereals,7
9,pizza-pies-and-quiches,6


In [39]:
# store - product
gds.run_cypher("""
    MATCH (b:Brand)--(p:Product)--(s:Store)
    RETURN b.name as brand, s.name as store, count(p) as productCount order by productCount desc
""").head(20)

Unnamed: 0,brand,store,productCount
0,tesco,tesco,140
1,morrisons,morrisons,81
2,waitrose,waitrose,72
3,sainsbury-s,sainsbury-s,64
4,asda,asda,62
5,aldi,aldi,39
6,by-sainsbury-s,sainsbury-s,32
7,lidl,lidl,27
8,marks-spencer,marks-spencer,25
9,village-bakery,aldi,15


In [40]:
# Number of product per brand
gds.run_cypher("""
    MATCH (b:Brand)--(p:Product)
    RETURN b.name, count(p) as productCount order by productCount desc
""")

Unnamed: 0,b.name,productCount
0,tesco,144
1,morrisons,101
2,asda,86
3,waitrose,72
4,sainsbury-s,67
...,...,...
446,sweat-pea,1
447,smedley-s,1
448,selection,1
449,sweet-harvest,1


In [41]:
# how many products has labels, per brand
gds.run_cypher("""
    MATCH path=(b:Brand)--(p:Product)--(l:Label)
    RETURN b.name as brand, 
        count(l) as totalLabelCount,  
        count(distinct l) as uniqueLabel, 
        count(distinct p) as uniqueProduct,
        count(l) / count(distinct p) as ratio
    order by ratio desc
""").head(20)

Unnamed: 0,brand,totalLabelCount,uniqueLabel,uniqueProduct,ratio
0,wunda,21,21,1,21
1,good-honest,27,16,2,13
2,slooow,11,11,1,11
3,devils-kitchen,10,10,1,10
4,heart-of-nature,9,9,1,9
5,new-york-bakery-co,32,12,4,8
6,off-the-eaten-path,8,8,1,8
7,one-planet-pizza,8,8,1,8
8,brioche-pasquier,7,7,1,7
9,kleenex,7,7,1,7


## 2. Find Similar Products

### 2.1. based on categories - Cypher

In [43]:
gds.run_cypher("""
    MATCH (p:Product {code:1216486})--(c)
    with p, collect (c.name) as context
    return p.name, p.genericName, p.quantity, p.ingredients, context
""")

Unnamed: 0,p.name,p.genericName,p.quantity,p.ingredients,context
0,Garden Peas,Frozen Organic Peas,750 gr,garden peas,"[fruits-and-vegetables, vegetables, vegetables..."


In [44]:
# similar product to "Garden Peas"
gds.run_cypher("""
MATCH (p:Product {code:1216486})-[r:HAS_ALLERGEN|HAS_GROUP1|HAS_GROUP2|HAS_LABEL*2]-(sim:Product)
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, count(r) as score order by score desc limit 15
""")

Unnamed: 0,sim.code,sim.name,sim.genericName,sim.quantity,sim.servingSize,score
0,5054073002756,chick peas,chick peas,400g (240g drained),60g,5
1,649391,Peas & carrots with mint butter,,300 g,100g,5
2,87445,Peas & carrots,,300 g,150g,5
3,4088600172958,Processed mushy peas with sugar and salt,Mushy Peas,300 g,150 g,5
4,20425708,Garden peas,,265 g,130 g,5
5,5000128731416,Marrowfat Processed Peas,,177 g,88g,5
6,1019735,Processed peas in water with sugar and salt,,"300 g, drained 190 g",95 g,5
7,29086443,British Petits Pois,,1.25 kg,80g,5
8,25238099,Mushy Peas,,300g,2,5
9,5055958700569,Smoky Barbecue Crunch,"Smoky Barbecue flavoured peas, crunchy corn ch...",104 g,26 g,5


In [45]:
# similar product to Magnum batonnet classic
gds.run_cypher("""
MATCH (p:Product {name:"Batonnet Classic"})-[r:HAS_ALLERGEN|HAS_GROUP1|HAS_GROUP2|HAS_LABEL*2]-(sim:Product)
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, count(r) as score order by score desc limit 15
""")

Unnamed: 0,sim.code,sim.name,sim.genericName,sim.quantity,sim.servingSize,score
0,8714100638415,Mini Chocolate & Hazelnut Praliné,Chocolate and hazelnut flavour ice cream coate...,330 ml / 276 g,55 ml / 46 g,11
1,8714100635650,mini Almond,Ice cream with vanilla from Madagascar coated ...,330 ml / 276 g,55 ml / 46 g,10
2,8714100862636,Mini Batonnet Double Chocolat,Chocolate ice cream (with 6% milk chocolate) c...,300 g,60 ml (60 Millil Etat initial),10
3,76840600038,Chocolate Fudge Brownie,,415 g,83 g,9
4,4088600556338,Caramel Millionare,,"210 g, 3 x 70 g, 255 ml, 3 x 85 ml",70g,9
5,8714100240038,Chocolate Fudge Brownie Non-Dairy Ice Cream,Choolate non-dairy ice cream with chocolate br...,395 g,2g,9
6,3415582300924,Belgian chocolate & vanilla crunch,Dairy vanilla ice cream and dairy ice cream wi...,420ml,86g,9
7,8717163691496,White Chocolate & Cookies,Cream ice cream with a cookie flavour chocolat...,222 g,74g,8
8,5010238019328,Oreo ice cream with oreo cookie pieces,Chocolate flavoured sandwich cookies filled wi...,220 ml,33g,8
9,5900130030692,Nuii Salted Caramel & Australian Macadamia Ice...,,204 g,68g,8


In [46]:
# similar product to Magnum batonnet classic that is vegan
gds.run_cypher("""
MATCH (p:Product {name:"Batonnet Classic"})-[r:HAS_ALLERGEN|HAS_GROUP1|HAS_GROUP2|HAS_LABEL*2]-(sim:Product)-[:HAS_LABEL]->(:Label {name:"vegan"})
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, count(r) as Score order by Score desc limit 15
""")

Unnamed: 0,sim.code,sim.name,sim.genericName,sim.quantity,sim.servingSize,Score
0,8714100240038,Chocolate Fudge Brownie Non-Dairy Ice Cream,Choolate non-dairy ice cream with chocolate br...,395 g,2g,9
1,5059512738753,Chocolate and red berry cones,Chocolate flavoured iced dessert made with coc...,4 x 120 ml,67g,7
2,5054775540099,Toffee and Vanilla Cones,Dairy free caramel flavoured and vanilla iced ...,4 x 110ml,70g,7
3,8714100658499,Vegan Almond Ice Cream 3 x,Vanilla vegan ice cream coated with chocolate ...,216 g,90 ml (90 Millil Etat initial),7
4,8711327434981,Vegan Sea Salt Caramel,Vegan sea salt caramel ice cream and chocolate...,213 g,71g,7
5,5411188128762,Velvety Chocolate Swirl,,450 ml,50 ml,7
6,8711327313712,Magnum Vegan Classic,,71 g,71g,7
7,8714100658420,Magnum vegan,Vanilla vegan ice cream coated with chocolate ...,3pcs,71 g,7
8,5054775540044,3 Choc Sticks,Diary free vanilla iced dessert with chocolate...,300 ml (3 * 100 ml),70 g,6
9,4056489238591,Choco Brownie Love,,500ml,80g,6


## How to improve this basic product matching?
- Extract more descriptive labels
    - ingredients, size, weights, nutriscore...
    - entity extraction with NLP libraries or LLMs like OpenAI
- Use Graph Data Science
    - node similarity
    - Graph embeddings + KNN
- Curate, clean and organize labels
    - Taxonomies and ontologies
    - Can be done manually (experts) or using automatic procedures (Ontologies, GML node classification or link prediction)

    

### Import ingredients as nodes

In [47]:
# ingredients
ing_df = df.dropna(subset=['ingredients_text_en'])
ing_df.ingredients_text_en

0                                        rye  flour, salt
1       Water, Sunflower seeds, Flaxseed (13%), Tapioc...
2       wheat flour, calcium carbonate, iron, niacin, ...
3       wheat flour (wheat flour, calcium carbonate, i...
8       _Wheat_ Flour (with added Calcium, Iron, Niaci...
                              ...                        
2392    Rehydrated Processed Peas (95%), Water, Sugar,...
2396    water, peas (25%), petit pois (7,0%), cherrywo...
2410                             Peas, water, sugar,salt.
2413    Marrowfat Peas (72%), Rapeseed Oil, Rice Flour...
2425    Peas (52%), Water, Spinach, Soured Cream (Cows...
Name: ingredients_text_en, Length: 942, dtype: object

In [48]:
ing_df['ing'] = ing_df['ingredients_text_en'].str.replace(r"\(.*\)","") # remove what's inside parentheses
ing_df['ing'] = ing_df['ing'].str.replace(r"\[.*\]","")
ing_df['ing'] = ing_df['ing'].str.replace("_","") # remove "_"
ing_df['ing'] = ing_df['ing'].str.replace("  "," ")
ing_df['ing'] = ing_df['ing'].str.replace(" ,",",").apply(str.lower)
ing_df['ing'] = ing_df['ing'].str.replace(".","")
ing_df['ing'] = ing_df['ing'].str.replace(", ,",",")
ing_df['ing'] = ing_df['ing'].str.replace(",,",",")

  ing_df['ing'] = ing_df['ingredients_text_en'].str.replace(r"\(.*\)","") # remove what's inside parentheses
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ing_df['ing'] = ing_df['ingredients_text_en'].str.replace(r"\(.*\)","") # remove what's inside parentheses
  ing_df['ing'] = ing_df['ing'].str.replace(r"\[.*\]","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ing_df['ing'] = ing_df['ing'].str.replace(r"\[.*\]","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats 

In [49]:
pd.DataFrame(ing_df.ing.str.split(",",expand=True).stack().value_counts()).head(20)

Unnamed: 0,0
salt,259
water,218
yeast,184
sugar,177
wheat flour,153
water,100
rapeseed oil,93
,92
wheat flour,57
dextrose,53


In [50]:
# import ingredients as nodes
gds.run_cypher(
"""
unwind $df as df
with df, split(df['ing'],", ") as ingredients
unwind ingredients as ingredient
match (p:Product {code: df['code']})
merge (i:Ingredient {name:ingredient})
merge (p)-[:HAS_INGREDIENT]->(i)
""",
params = {'df':ing_df.dropna(subset=['ing']).to_dict(orient='records')}
)

### 2.2. Find similar products with ingredients

In [51]:
# Use ingredients
gds.run_cypher("""
MATCH (p:Product {code:1216486})-[:HAS_INGREDIENT]-(i:Ingredient)-[:HAS_INGREDIENT]-(sim:Product)
WITH p, sim, collect(i.name) as ingredients
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, ingredients, size(ingredients) as score order by score desc
""").head(20)

Unnamed: 0,sim.code,sim.name,sim.genericName,sim.quantity,sim.servingSize,ingredients,score
0,5031021922076,Garden peas,,290 g,,[garden peas],1
1,4088600203348,Garden Peas,,,,[garden peas],1
2,5054781025672,Garden peas,,,,[garden peas],1
3,5000116112074,Garden Peas,,,,[garden peas],1
4,5050854213656,Garden peas,,,,[garden peas],1
5,20425708,Garden peas,,265 g,130 g,[garden peas],1
6,281713,British garden peas in water,,185 g,,[garden peas],1
7,5057753155933,Garden peas,,,,[garden peas],1


In [52]:
# similar product to Magnum batonnet classic
gds.run_cypher("""
MATCH (p:Product {name:"Batonnet Classic"})-[:HAS_INGREDIENT]-(i:Ingredient)-[:HAS_INGREDIENT]-(sim:Product)
WITH p, sim, collect(i.name) as ingredients
return sim.code, sim.name, sim.genericName, ingredients, size(ingredients) as score order by score desc limit 15
""")


Unnamed: 0,sim.code,sim.name,sim.genericName,ingredients,score
0,8712100837890,Magnum Almond,,"[glucose syrup, cocoa butter', coconut oil, re...",7
1,8714100658420,Magnum vegan,Vanilla vegan ice cream coated with chocolate ...,"[cocoa mass', glucose syrup, cocoa butter', co...",6
2,8714100661376,Viennetta chocolate & orange flavour,,"[glucose syrup, coconut oil, reconstituted ski...",6
3,8714100390023,Vanilla light ice cream,Vanilla ice cream,"[glucose syrup, coconut oil, reconstituted ski...",5
4,8714100862636,Mini Batonnet Double Chocolat,Chocolate ice cream (with 6% milk chocolate) c...,"[glucose syrup, coconut oil, reconstituted ski...",5
5,8711327313712,Magnum Vegan Classic,,"[glucose syrup, coconut oil, water, glucose-fr...",5
6,8714100590430,Swedish Glace dairy-free smooth Vanilla,,"[glucose syrup, coconut oil, water, glucose-fr...",5
7,8714100658499,Vegan Almond Ice Cream 3 x,Vanilla vegan ice cream coated with chocolate ...,"[glucose syrup, coconut oil, water, glucose-fr...",5
8,8711327434981,Vegan Sea Salt Caramel,Vegan sea salt caramel ice cream and chocolate...,"[glucose syrup, coconut oil, water, glucose-fr...",5
9,8712566291144,Soft scoop,,"[glucose syrup, reconstituted skimmed milk, wa...",5


In [53]:
# similar product to Magnum batonnet classic using all labels
gds.run_cypher("""
MATCH (p:Product {name:"Batonnet Classic"})-[r:HAS_ALLERGEN|HAS_GROUP1|HAS_GROUP2|HAS_LABEL|HAS_INGREDIENT*2]-(sim:Product)
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, count(r) as Score order by Score desc limit 15
""")

Unnamed: 0,sim.code,sim.name,sim.genericName,sim.quantity,sim.servingSize,Score
0,8714100862636,Mini Batonnet Double Chocolat,Chocolate ice cream (with 6% milk chocolate) c...,300 g,60 ml (60 Millil Etat initial),15
1,8712100837890,Magnum Almond,,73 g,73g,14
2,8714100658420,Magnum vegan,Vanilla vegan ice cream coated with chocolate ...,3pcs,71 g,13
3,8711327434981,Vegan Sea Salt Caramel,Vegan sea salt caramel ice cream and chocolate...,213 g,71g,12
4,8714100390023,Vanilla light ice cream,Vanilla ice cream,500g,50g,12
5,8711327313712,Magnum Vegan Classic,,71 g,71g,12
6,8714100240038,Chocolate Fudge Brownie Non-Dairy Ice Cream,Choolate non-dairy ice cream with chocolate br...,395 g,2g,12
7,8714100658499,Vegan Almond Ice Cream 3 x,Vanilla vegan ice cream coated with chocolate ...,216 g,90 ml (90 Millil Etat initial),12
8,8717163691496,White Chocolate & Cookies,Cream ice cream with a cookie flavour chocolat...,222 g,74g,11
9,8714100638415,Mini Chocolate & Hazelnut Praliné,Chocolate and hazelnut flavour ice cream coate...,330 ml / 276 g,55 ml / 46 g,11


## 3. Product Matching using Similarities

### 3.1 fastRP + Knn on allergen, category and label

In [54]:
# For this demo, we will use compute similarities between Bread product only
# create Bread label 
gds.run_cypher("""
    match (p:Product)-[:HAS_GROUP1]->(c:Category {name:"bread"})
    set p:Bread
""")

In [55]:
g, _= gds.graph.project('simcat', 
    ['Bread','Category','Label','Allergen'],
    {'HAS_ALLERGEN':{'orientation':'UNDIRECTED'},
     'HAS_CATEGORY':{'orientation':'UNDIRECTED'},
     'HAS_GROUP2':{'orientation':'UNDIRECTED'},
     'HAS_LABEL':{'orientation':'UNDIRECTED'}
     }) 

Loading:   0%|          | 0/100 [00:00<?, ?%/s]

In [56]:
# It is possible to use nodeSimilarity directly. This is an example of results
gds.nodeSimilarity.stream(g)

Unnamed: 0,node1,node2,similarity
0,0,30,1.000000
1,0,103,1.000000
2,0,46,1.000000
3,0,43,1.000000
4,0,218,1.000000
...,...,...,...
8612,5825,477,0.714286
8613,5825,914,0.692308
8614,5825,437,0.692308
8615,5825,297,0.692308


In [57]:
# But fastRP + KNN will be used in the rest of the demo
# This is an example of fastRP embeddings
gds.fastRP.stream(g,embeddingDimension=128)

Unnamed: 0,nodeId,embedding
0,0,"[-0.14074522256851196, -0.05100741237401962, 0..."
1,1,"[-0.1506577581167221, -0.04532090574502945, 0...."
2,6,"[-0.15859204530715942, -0.026204336434602737, ..."
3,7,"[-0.1366301327943802, -0.05808933451771736, 0...."
4,8,"[-0.12003785371780396, -0.006167154759168625, ..."
...,...,...
1300,3868,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1301,3869,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1302,3870,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1303,3871,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [58]:
# algorithms in Graph Data Science can be chained
gds.fastRP.mutate(g,embeddingDimension=128, mutateProperty='fastRP')

nodePropertiesWritten                                                 1305
mutateMillis                                                             0
nodeCount                                                             1305
preProcessingMillis                                                      0
computeMillis                                                            7
configuration            {'nodeSelfInfluence': 0, 'propertyRatio': 0.0,...
Name: 0, dtype: object

In [59]:
gds.knn.write(g,nodeLabels=['Bread'], topK=10, nodeProperties=['fastRP'],writeRelationshipType='SIMILAR_TO',writeProperty='score')

ranIterations                                                             8
didConverge                                                            True
nodePairsConsidered                                                  241112
preProcessingMillis                                                       0
computeMillis                                                           107
writeMillis                                                              73
postProcessingMillis                                                     -1
nodesCompared                                                           637
relationshipsWritten                                                   6370
similarityDistribution    {'p1': 0.95733642578125, 'max': 1.000003814697...
configuration             {'topK': 10, 'maxIterations': 100, 'writeConcu...
Name: 0, dtype: object

SIMILAR_TO relationships can then be shown in Bloom. Use Louvain in Bloom to display clusters of bread types.

### 3.2 Product reconciliation

In [60]:
# Example of query to find similar products from the same brand. 
gds.run_cypher("""
MATCH (b:Brand)--(b1:Bread)-[r:SIMILAR_TO]->(b2:Bread)--(b)
RETURN b1.name, b2.name, b.name, r.score as score order by score desc
""").head(20)

Unnamed: 0,b1.name,b2.name,b.name,score
0,white bloomer,Tiger baton,tesco,1.0
1,White baton,Tiger baton,tesco,1.0
2,Tesco Finest Super Seeded Farmhouse 800G,Multiseed Batch,tesco,1.0
3,Tesco Sliced Fruit Loaf,Tesco Crusty White Farmhouse Sliced Bread 800G,tesco,1.0
4,tesco white baton,Tesco Big Bite Soft White Rolls 6 Pack,tesco,1.0
5,Tesco Finest Super Grained Farmhouse Bread,roast vegetable hand stretched flatbread,tesco,1.0
6,White Sourdough,roast vegetable hand stretched flatbread,tesco,1.0
7,Multiseed Batch,Tesco Finest Super Seeded Farmhouse 800G,tesco,1.0
8,Soft white rolls,Tesco Stone Baked White Boule,tesco,1.0
9,Garlic baguette,Tesco Stone Baked White Boule,tesco,1.0


### 3.3. Improve Product classification by learning Taxonomy from data

In [61]:
# Compute co-ocurence
gds.run_cypher("""
MATCH (c:Category)
WITH c, count{ (c)<-[:HAS_CATEGORY]-() } as totalCount
MATCH (c)<-[:HAS_CATEGORY]-(p)-[:HAS_CATEGORY]->(relatedCategory)
WITH c, relatedCategory, toFloat(count(p)) as countp, totalCount
CREATE (c)-[:CO_OCCURS {index: countp/ totalCount}]->(relatedCategory)
""")

In [62]:
# Infer same-as relationships 
gds.run_cypher("""
MATCH (c1)-[co1:CO_OCCURS {index:1}]->(c2),
    (c2)-[co2:CO_OCCURS {index:1}]->(c1)
WHERE ID(c1) > ID(c2)
MERGE (c1)-[:SAME_AS]-(c2)
""")

In [63]:
# Infer narrower-than relationships
gds.run_cypher("""
MATCH (c1)-[:CO_OCCURS {index:1}]->(c2),
    (c2)-[co2:CO_OCCURS]->(c1)
WHERE co2.index < 1
MERGE (c1)-[:NARROWER_THAN]->(c2)
""")

In [64]:
# Reduce transitive narrower-than relationships
gds.run_cypher("""
MATCH (c1)-[:NARROWER_THAN*2..]->(c3),
    (c1)-[d:NARROWER_THAN]->(c3)
DELETE d
""") 

Analysis can then be done in Bloom to improve Taxonomy / Categories

### 3.4 Improved Product Matching using FastRP + KNN

In [65]:
g2, _= gds.graph.project('simall3', 
    ['Bread','Category','Label','Allergen','Ingredient'],
    {'HAS_ALLERGEN':{'orientation':'UNDIRECTED'},
     'HAS_CATEGORY':{'orientation':'UNDIRECTED'},
     'HAS_GROUP1':{'orientation':'UNDIRECTED'},
     'HAS_GROUP2':{'orientation':'UNDIRECTED'},
     'HAS_LABEL':{'orientation':'UNDIRECTED'},
     'NARROWER_THAN':{'orientation':'UNDIRECTED'},
     'HAS_INGREDIENT':{'orientation':'UNDIRECTED'},
     'SAME_AS':{'orientation':'UNDIRECTED'}
     }) 

In [66]:
gds.fastRP.mutate(g2,embeddingDimension=1028, mutateProperty='fastRP')

nodePropertiesWritten                                                 3256
mutateMillis                                                             0
nodeCount                                                             3256
preProcessingMillis                                                      0
computeMillis                                                           68
configuration            {'nodeSelfInfluence': 0, 'propertyRatio': 0.0,...
Name: 0, dtype: object

In [67]:
gds.knn.write(g2,nodeLabels=['Bread'], topK=10, nodeProperties=['fastRP'],writeRelationshipType='SIMILAR_TO_ALL',writeProperty='score')

ranIterations                                                             8
didConverge                                                            True
nodePairsConsidered                                                  237640
preProcessingMillis                                                       1
computeMillis                                                           310
writeMillis                                                              66
postProcessingMillis                                                     -1
nodesCompared                                                           637
relationshipsWritten                                                   6370
similarityDistribution    {'p1': 0.8946685791015625, 'max': 1.0000038146...
configuration             {'topK': 10, 'maxIterations': 100, 'writeConcu...
Name: 0, dtype: object

In [68]:
gds.run_cypher("""
    MATCH (p:Product {code:"5060195901334"})-[r:SIMILAR_TO_ALL]-(sim)
    RETURN p.name, sim.name, r.score as score order by score desc
""")

Unnamed: 0,p.name,sim.name,score
0,Gluten Free Soft White Farmhouse,"Gluten free sliced cob made with linseed, mill...",0.969881
1,Gluten Free Soft White Farmhouse,"Gluten free sliced cob made with linseed, mill...",0.969881
2,Gluten Free Soft White Farmhouse,Gluten free bread,0.965503
3,Gluten Free Soft White Farmhouse,Gluten free bread,0.965503
4,Gluten Free Soft White Farmhouse,Tiger rolls,0.964273
5,Gluten Free Soft White Farmhouse,Tiger rolls,0.964273
6,Gluten Free Soft White Farmhouse,Gluten free Bread,0.962275
7,Gluten Free Soft White Farmhouse,Gluten free Bread,0.962275
8,Gluten Free Soft White Farmhouse,Gluten free tiger loaf,0.959957
9,Gluten Free Soft White Farmhouse,Gluten free tiger loaf,0.959957


## 4. Product Matching using free text (product description)

### 4.1 Using Lucene Full-text search index

In [69]:
# lucene analyzers 
gds.run_cypher("""CALL db.index.fulltext.listAvailableAnalyzers""")

Unnamed: 0,analyzer,description,stopwords
0,standard-folding,Analyzer that uses ASCIIFoldingFilter to remov...,"[but, be, with, such, then, for, no, will, not..."
1,lithuanian,Lithuanian analyzer with stemming and stop wor...,"[judviejų, to, jį, jie, tavimi, judviese, tuo,..."
2,simple,A simple analyzer that tokenizes at non-letter...,[]
3,latvian,Latvian analyzer with stemming and stop word f...,"[varēšu, pār, varēja, pret, nebūt, nezin, nevi..."
4,cjk,CJK - Chinese/Japanese/Korean - analyzer. Term...,"[but, be, with, such, if, for, no, will, not, ..."
5,sorani,Sorani Kurdish analyzer with stemming and stop...,"[دەکات, لێ, ئەوەی, لە, بەبێ, لەناو, پێش, لەلای..."
6,stop,Stop analyzer tokenizes at non-letter characte...,"[but, be, with, such, then, for, no, will, not..."
7,indonesian,Indonesian analyzer with stemming and stop wor...,"[entahlah, ataupun, walau, seorang, sewaktu, m..."
8,keyword,"Keyword analyzer ""tokenizes"" the text as a sin...",[]
9,arabic,"Arabic analyzer with light stemming, as specif...","[فان, او, اى, اي, لها, كما, أنت, وإن, انت, حيث..."


In [70]:
# create index on string properties (name, genericName and ingredients as free text)
gds.run_cypher("""
    CREATE FULLTEXT INDEX namesAndIng FOR (n:Product) ON EACH [n.name, n.genericName, n.ingredients]
""")

In [71]:
# Products can be searched using free text queries
gds.run_cypher("""
    CALL db.index.fulltext.queryNodes("namesAndIng","wheat")
    YIELD node, score
    RETURN node.name, node.genericName, node.ingredients, score
""").head(20)

Unnamed: 0,node.name,node.genericName,node.ingredients,score
0,Wheat & corn tortillas,Wheat and corn tortilla wraps,"Fortified Wheat Flour (_Wheat_ Flour, Calcium ...",3.797634
1,Wheat & Rye Sourdough Bread,,"Fortified wheat flour (wheat flour, calcium ca...",2.671103
2,Wheat flour sourdough bread,,"Fortified British Wheat Flour (Wheat Flour, Ca...",2.664444
3,White & Wheat Medium Sliced,,"Fortified Wheat Flour [Wheat Flour, Calcium Ca...",2.610072
4,Traditional wheat & rye sourdough bread,Polish Bread,"Wheat Flour (45%), water, rye flour 11%, rye s...",2.374334
5,Sprouted wheat bread,,,2.326651
6,Piadina Wheat Flatbreads,,,2.326651
7,Wheat and Rye Soughdough Half Bloomer,,"Fortified British Wheat Flour (Wheat Flour, Ca...",2.283336
8,Super soft medium tortilla with while wheat,,"Flour blend (WHEAT (45%), whole WHEAT, WHEAT b...",2.180981
9,Arabic Bread Whole Wheat,,,2.077649


In [72]:
# This command does not return any result
gds.run_cypher("""
    CALL db.index.fulltext.queryNodes("namesAndIng",'genericName:bred')
    YIELD node, score
    RETURN node.name, node.genericName, score
""")

Unnamed: 0,node.name,node.genericName,score


In [73]:
# Fuzzy search
gds.run_cypher("""
    CALL db.index.fulltext.queryNodes("namesAndIng",'genericName:bred~')
    YIELD node, score
    RETURN node.name, node.genericName, score
""")

Unnamed: 0,node.name,node.genericName,score
0,Seeded Batch,"Thick sliced white bread with sesame seed, sun...",1.662897
1,Pain wholemeal with Rye,sliced wholemeal bread with added rye bran,1.562331
2,Multiseed stonebaked baguettes,"Part-baked, wheat based baguettes, with sourdo...",1.536985
3,Gluten free bread,Bread Sinclair Nature (Gluten Free),1.506417
4,3 Choc Sticks,Diary free vanilla iced dessert with chocolate...,1.490883
...,...,...,...
89,Superseded sliced loaf bread,"Sliced white bread with sunflower seeds, linse...",0.413867
90,Seeded medium sliced white,"Medium sliced white bread, with linseed, sunfl...",0.413867
91,Meatless maverick,Plant-based patty formed from pea and bean pro...,0.346589
92,Garlic & herbs croutons,Oven baked croutons made with garlic purée and...,0.338035


In [74]:
# search and match
gds.run_cypher("""
    CALL db.index.fulltext.queryNodes("namesAndIng",'name:"chocolate ice cream"')
    YIELD node, score
    OPTIONAL MATCH (node)-[:HAS_BRAND]-(b:Brand)
    RETURN node.name, node.genericName, node.ingredients, b.name, score
""")

Unnamed: 0,node.name,node.genericName,node.ingredients,b.name,score
0,Chocolate Ice Cream,,"Water, Coconut Oil (10%), Sugar, Lupin Protein...",wicked,3.105481
1,Chocolate ice cream,,,asda,3.105481
2,Chocolate ice cream,,,waitrose,3.105481
3,Vegan Chocolate Ice Cream,Vegan chocolate flavoured ice cream with sugar...,"Water, soluble corn fibre, sweeteners (erythr...",jude-s,2.773128
4,Mini Chocolate Ice Cream,,,tesco,2.773128
5,milk chocolate ice cream,,,,2.773128
6,White chocolate ice cream,,,asda,2.773128
7,Triple Chocolate Ice Cream,,,by-sainsbury-s,2.773128
8,Belgian Chocolate Ice Cream,,"_Fresh cream_ 29%, condensed skimmed milk, Bel...",haagen-dazs,2.773128
9,Buzz Chocolate Ice Cream,,,,2.773128


### 4.2. Using OpenAI text embeddings

This part requires:
- an OpenAI key
- [APOC extended](https://github.com/neo4j-contrib/neo4j-apoc-procedures) version > 5.8

In [75]:
# Will be used only on Bread with genericName size > 20 characters
gds.run_cypher("""
    MATCH (b:Bread) where size(b.genericName) > 20 
    set b:Bgname 
    return count(b)
""")


Unnamed: 0,count(b)
0,88


In [76]:
# Compute OpenAi Embeddings
gds.run_cypher("""
    MATCH (b:Bgname)
    WITH b, b.genericName as genName
    CALL apoc.ml.openai.embedding([genName],$apiKey) 
    YIELD embedding
    SET b.openAiEmbedding = embedding
""", {'apiKey':openai_api_key})

In [77]:
# OpenAI embeddings dimension: 1536
gds.run_cypher("""
    MATCH (b:Bgname) return size(b.openAiEmbedding) limit 5
""")

Unnamed: 0,size(b.openAiEmbedding)
0,1536
1,1536
2,1536
3,1536
4,1536


In [78]:
# KNN on OpenAIEmbeddings
g3,_= gds.graph.project("TextEmbed2", {"Bgname": {"properties":"openAiEmbedding"}},'*')

In [79]:
gds.knn.write(g3, topK=10, nodeProperties=['openAiEmbedding'],writeRelationshipType='SIMILAR_OPENAI',writeProperty='score')

ranIterations                                                             5
didConverge                                                            True
nodePairsConsidered                                                   25669
preProcessingMillis                                                       0
computeMillis                                                            50
writeMillis                                                              16
postProcessingMillis                                                     -1
nodesCompared                                                            88
relationshipsWritten                                                    880
similarityDistribution    {'p1': 0.916046142578125, 'max': 1.00000381469...
configuration             {'topK': 10, 'maxIterations': 100, 'writeConcu...
Name: 0, dtype: object

In [80]:
gds.run_cypher("""
    MATCH (n:Bgname)-[r:SIMILAR_OPENAI]->(m)
    where r.score < 1 AND id(n)>id(m)
    return r.score as score, n.genericName, m.genericName
    order by score desc limit 20
""")

Unnamed: 0,score,n.genericName,m.genericName
0,0.999999,Part baked white baguettes,Part baked white baguettes
1,0.999999,Part baked white baguettes,Part baked white baguettes
2,0.999999,Part baked white baguettes,Part baked white baguettes
3,0.999983,Thick Sliced White Bread,Thick Sliced White Bread
4,0.99956,Lime and coriander chutney flavour potato & gr...,Lime and coriander chutney flavour potato and ...
5,0.994954,Part baked white bread baguettes,Part baked white baguettes
6,0.994954,Part baked white baguettes,Part baked white bread baguettes
7,0.994954,Part baked white bread baguettes,Part baked white baguettes
8,0.994951,Part baked white bread baguettes,Part baked white baguettes
9,0.994487,Plain wheat flour tortilla wraps,Plain wheat tortilla wraps


In [81]:
# Comparison between the different similarity scores
gds.run_cypher("""
    MATCH (n:Bgname)-[r:SIMILAR_OPENAI]->(m)
    where r.score < 1 AND id(n)>id(m)
    OPTIONAL MATCH (n)-[r2:SIMILAR_TO_ALL]->(m)
    return r2.score as nodescore, r.score as score, n.genericName, m.genericName
    order by score desc limit 20
""")

Unnamed: 0,nodescore,score,n.genericName,m.genericName
0,0.982505,0.999999,Part baked white baguettes,Part baked white baguettes
1,0.991141,0.999999,Part baked white baguettes,Part baked white baguettes
2,,0.999999,Part baked white baguettes,Part baked white baguettes
3,,0.999983,Thick Sliced White Bread,Thick Sliced White Bread
4,0.993377,0.99956,Lime and coriander chutney flavour potato & gr...,Lime and coriander chutney flavour potato and ...
5,,0.994954,Part baked white bread baguettes,Part baked white baguettes
6,0.985517,0.994954,Part baked white baguettes,Part baked white bread baguettes
7,0.994417,0.994954,Part baked white bread baguettes,Part baked white baguettes
8,,0.994951,Part baked white bread baguettes,Part baked white baguettes
9,0.998847,0.994487,Plain wheat flour tortilla wraps,Plain wheat tortilla wraps


### 4.3. It is possible to concatenate embeddings for improved similarities

In [82]:
g4, _= gds.graph.project('simall5', 
    ['Bread','Category','Label','Allergen','Ingredient'],
    {'HAS_ALLERGEN':{'orientation':'UNDIRECTED'},
     'HAS_CATEGORY':{'orientation':'UNDIRECTED'},
     'HAS_GROUP1':{'orientation':'UNDIRECTED'},
     'HAS_GROUP2':{'orientation':'UNDIRECTED'},
     'HAS_LABEL':{'orientation':'UNDIRECTED'},
     'NARROWER_THAN':{'orientation':'UNDIRECTED'},
     'HAS_INGREDIENT':{'orientation':'UNDIRECTED'},
     'SAME_AS':{'orientation':'UNDIRECTED'}
     }) 

In [83]:
# Compute FastRP embeddings
gds.fastRP.write(g4,embeddingDimension=1028, writeProperty="FastRP_all")

nodeCount                                                             3256
nodePropertiesWritten                                                 3256
preProcessingMillis                                                      0
computeMillis                                                           63
writeMillis                                                            164
configuration            {'writeConcurrency': 4, 'nodeSelfInfluence': 0...
Name: 0, dtype: object

In [84]:
# Concatenate fastRP and OpenAI Embeddings
gds.run_cypher("""
    MATCH (b:Bgname)
    SET b.totalEmbeddings = b.FastRP_all + b.openAiEmbedding
""")

In [85]:
# Embedding size is now 2564
gds.run_cypher("""
    MATCH (b:Bgname)
    RETURN size(b.totalEmbeddings) limit 1
""")

Unnamed: 0,size(b.totalEmbeddings)
0,2564


In [86]:
# KNN on Concatenated Embeddings
g5,_= gds.graph.project("AllEmbed2", {"Bgname": {"properties":"totalEmbeddings"}},'*')

In [87]:
gds.knn.write(g5, topK=10, nodeProperties=['totalEmbeddings'],writeRelationshipType='SIMILAR_TOTAL_EMB',writeProperty='score')

ranIterations                                                             5
didConverge                                                            True
nodePairsConsidered                                                   24681
preProcessingMillis                                                       0
computeMillis                                                            87
writeMillis                                                              14
postProcessingMillis                                                     -1
nodesCompared                                                            88
relationshipsWritten                                                    880
similarityDistribution    {'p1': 0.9131889343261719, 'max': 0.9994201660...
configuration             {'topK': 10, 'maxIterations': 100, 'writeConcu...
Name: 0, dtype: object

Show results in Bloom

### 4.4 Using OpenAI embeddings for online product matching

In [88]:
question = "bake at home french baguette"

gds.run_cypher("""

    CALL apoc.ml.openai.embedding([$question],$apiKey) 
    YIELD embedding
    MATCH (b:Bgname)
    WITH b, gds.similarity.cosine(embedding, b.openAiEmbedding) AS score
    RETURN b.name, score order by score desc
""", {'apiKey':openai_api_key, 'question':question})

Unnamed: 0,b.name,score
0,Bake at Home Brown Baguettes,0.926832
1,Baguettes,0.922320
2,Bake at home white baguettes,0.922320
3,Bake at home white baguettes,0.922320
4,2 White Baguettes,0.922295
...,...,...
83,Seeded tortilla wraps,0.775390
84,Quinoa & Chia Seed Wrap with Teff Seeds & Flax...,0.769985
85,Crunchy Taco Shells,0.762265
86,Crunchy Taco Shells,0.762265


In [89]:
# Results of OpenAI embedding matching queries can be chained with regular cypher queries
question = "bake at home french baguette"

gds.run_cypher("""

    CALL apoc.ml.openai.embedding([$question],$apiKey) 
    YIELD embedding
    MATCH (b:Bgname)-[:HAS_LABEL]->(:Label {name:"vegan"})
    WITH b, gds.similarity.cosine(embedding, b.openAiEmbedding) AS score
    RETURN b.name, score order by score desc
""", {'apiKey':openai_api_key, 'question':question})

Unnamed: 0,b.name,score
0,Bake at Home Brown Baguettes,0.926832
1,Bake at home white baguettes,0.92232
2,Baguettes,0.92232
3,Stone Oven Baguette White,0.881803
4,Bake at home white petits pains,0.858252
5,Bake at Home Rolls,0.855757
6,Seed Sensations,0.831724
7,Soft White Thick,0.831509
8,Medium Sliced Soft White Bread,0.829225
9,White Bread,0.829225


## NEXT 
- Any embeddings, like images, can be stored on external node (so several images per product is possible)

- Customers can be imported as nodes, with (:Customer)-[:PURCHASED]-(:Product) or (:Customer)-[:VIEWED]-(:Product) relationships

    - Similar segmentation can be done on Customers (vegan, halal...) to improve customer recommendation

    - SIMILAR_TO relationships can be computed on customer to allow hyper personalisation
    
    - Using apoc.ml.openai.completion, product descriptions can be personalised on the fly