# Summary

Below Code was generated by Copilot on asking: I have a list of restaurant names. Often there are variations of specific terms. I would like to create a classification by such terms. How can I find a list with terms that actually exist with variations in the list?

In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
import re

In [None]:
# ------------------------------------------------------
# 1. Load embedding model
# ------------------------------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# ------------------------------------------------------
# 2. Your restaurant list
# ------------------------------------------------------
restaurants = [
    "Joe's Pizza",
    "Joes Pizzeria",
    "Pizzeria Bella",
    "Thai Express",
    "Thai Xpress",
    "Sushi Bar Kyoto",
    "Kyoto Sushi House"
]

In [8]:
df = pd.read_csv("data/df_restaurants_counts.csv")
restaurants = df["name"].tolist()
print(f"Loaded {len(restaurants)} restaurant names from CSV.")


Loaded 76840 restaurant names from CSV.


In [9]:
# ------------------------------------------------------
# 3. Normalize + tokenize
# ------------------------------------------------------
def tokenize(name):
    name = name.lower()
    name = re.sub(r"[^a-zäöüß0-9 ]", " ", name)
    return [t for t in name.split() if t]

tokens = []
for r in restaurants:
    tokens.extend(tokenize(r))

unique_tokens = list(set(tokens))
print("Unique tokens:", unique_tokens)

# ------------------------------------------------------
# 4. Create embeddings
# ------------------------------------------------------
embeddings = model.encode(unique_tokens)

# ------------------------------------------------------
# 5. Cluster tokens based on similarity
# ------------------------------------------------------
# DBSCAN groups points based on distance; eps controls similarity threshold
clustering = DBSCAN(eps=0.35, min_samples=2, metric="cosine").fit(embeddings)

clusters = {}
for token, label in zip(unique_tokens, clustering.labels_):
    clusters.setdefault(label, []).append(token)

# ------------------------------------------------------
# 6. Print clusters
# ------------------------------------------------------
print("\nToken clusters:")
for label, group in clusters.items():
    if label == -1:
        print("Unclustered:", group)
    else:
        print(f"Cluster {label}:", group)


Unique tokens: ['schloßkeller', 'pumpstation', 'eninger', 'eisenhut', 'zollbrücke', 'mie', 'schüler', '18zwo', 'muhl', 'morgenroth', 'roeßing', 'emmas', 'schnoits', 'dönercenter', 'ilion', '89anju', 'siller', 'backes', 'stuttgarter', 'feines19', 'bufalo', 'edenbergen', 'laganini', 'dosa', 'orlenberg', 'jasmy', 'metall', 'bua', 'kühlungsborner', 'muldaer', 'meerbusch', 'adaro', 'shahrzad', 'barolo', 'hohenwarsleben', 'samira', 'wittmund', 'radha', 'lebenshilfe', 'gärtner', 'buhbe', 'fleischeslust', 'dragone', 'uhles', 'pölkenhof', 'alarico', 'saarbrigger', 'stuhlmacher', 'monchis', 'wohnwelt', 'liebesbier', 'hexenhäuschen', 'nr1', 'westensee', 'duo', 'maulwurf', 'vareler', 'speist', 'caverna', 'cava', 'oelkers', 'lönskrug', 'füssen', 'posta', 'moccachino', 'hoffis', 'altöttinger', 'ugo', 'people', '400', 'recke', 'artur', 'klostergarten', 'jit', 'ikea', 'welschof', 'daeli', 'omero', 'düll', 'chhurrinos', 'chocolate', 'plan', 'haueis', 'peperone', 'degustibus', 'inselmühle', 'blick', 'go