In [1]:
import json
import pandas as pd
from itertools import combinations
import Levenshtein
import numpy as np
from sklearn.cluster import DBSCAN

In [2]:
with open('data/przepisy.json') as f:
    recipes = json.load(f)


In [3]:
ingredients = []
for recipe in recipes:
    for ingredient in recipe['ingredient_list']:
        ingredients.append([ recipe['url'], ingredient['ingredient_name'], 
                            ingredient['ingredient_amount'],
                            ingredient['ingredient_unit']])

In [4]:
df = pd.DataFrame(ingredients, columns=["URL", "nazwa", "ilosc", "jednostka"])

In [5]:
df["nazwa"].head()

0     makaron świderki
1    filet z kurczaka 
2               cebula
3              papryka
4              pomidor
Name: nazwa, dtype: object

In [6]:
names = df["nazwa"].unique()

In [7]:
pairs = combinations(enumerate(names), 2)
n_names = len(names)
distances = np.zeros((n_names, n_names))
for (index1, prod1), (index2, prod2) in pairs:
    distance = Levenshtein.distance(prod1, prod2) / max(len(prod1), len(prod2))
    distances[index1, index2] = distance
    distances[index2, index1] = distance

In [8]:
distances

array([[0.        , 0.88235294, 1.        , ..., 0.68      , 0.81818182,
        0.86666667],
       [0.88235294, 0.        , 0.82352941, ..., 0.8       , 0.81818182,
        0.86666667],
       [1.        , 0.82352941, 0.        , ..., 0.88      , 0.95454545,
        0.86666667],
       ...,
       [0.68      , 0.8       , 0.88      , ..., 0.        , 0.52      ,
        0.76666667],
       [0.81818182, 0.81818182, 0.95454545, ..., 0.52      , 0.        ,
        0.86666667],
       [0.86666667, 0.86666667, 0.86666667, ..., 0.76666667, 0.86666667,
        0.        ]])

In [9]:
clust = DBSCAN(eps=0.3, min_samples=2, metric='precomputed', n_jobs=-1)
clust.fit(distances)

DBSCAN(algorithm='auto', eps=0.3, leaf_size=30, metric='precomputed',
       metric_params=None, min_samples=2, n_jobs=-1, p=None)

In [13]:
labels = clust.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
clusters = clust.labels_
for cluster_id in range(n_clusters):
    indexes, = np.where(labels == cluster_id)
    cluster_names = [names[i] for i in indexes.tolist()]
    print(f"Cluster {cluster_id}:\n({len(cluster_names)} produktów): {set(cluster_names)}\n" )

Cluster 0:
(55 produktów): {'makaron chiński', 'makaron literki', 'makaron np: rurki', 'makaron macaroni', 'makaron spagetti', 'makaron np. świderki', 'makaron, np. świderki', 'makaron gniazda ', 'makaron krótki, np. świderki', 'makaron razowy', 'makaron sojowy ', 'makaron koraliki', 'makaron Rigatoni', 'makaron Bucatini', 'makaron ryżowy', 'makaron rurki', 'makaron gryczany', 'makaron muszelki', 'makaron orzo ', 'makaron udon', 'makaron drobny', 'makaron tortellini', 'makaron np. spaghetti', 'makaron orechiette', 'makaron spaghetti', 'makaron kokardki', 'makaron fusilli', 'makaron, np. penne', 'makaron penne', 'makaron Wonton', 'makaron typu Penne', 'makaron orecchiette ', 'makaron, np. rurki', 'makaron rigatoni', 'makaron typ muszelki', 'makaron diatali', 'makaron świderki', 'makaron gniazda nitki', 'makaron typu świderki', 'makaronowe rurki', 'makaron pennette', 'makaron gwiazdki', 'makaron pipette', 'makaron ryzowy', 'makaron wstążki', 'makaron duże muszle', 'makaron nitki', 'makar