In [47]:
import jellyfish

In [48]:
# City-name variants to classify: mixed case, misspellings and unrelated strings.
varosnevek = [
    'Debrecen', 'Budapest', 'budapest', 'Futapest', 'debrecen', 'dabratyin',
    'necerbed', 'Bduapets', 'Budapest', 'Budalest', 'budapes', 'Debrecen',
    'derbrecen', 'Debercen', 'Debrecen',
]

# Canonical spellings that every variant is compared against.
varoslista = ['Debrecen', 'Budapest']

# A variant counts as "similar" only when its Jaro similarity to some
# canonical spelling is strictly greater than this value.
threshold = 0.9

hasonlo_lista = []       # variants similar to at least one canonical spelling
nem_hasonlo_lista = []   # variants similar to none of them

for raw_name in varosnevek:
    # Case-fold and trim so the comparison ignores case and outer whitespace.
    cleaned = raw_name.casefold().strip()

    # any() stops at the first canonical spelling that clears the threshold,
    # so each variant is appended to exactly one of the two lists.
    is_similar = any(
        jellyfish.jaro_similarity(canon.casefold().strip(), cleaned) > threshold
        for canon in varoslista
    )

    bucket = hasonlo_lista if is_similar else nem_hasonlo_lista
    bucket.append(raw_name)   # keep the original spelling, not the normalized one

hasonlo_lista, nem_hasonlo_lista


(['Debrecen',
  'Budapest',
  'budapest',
  'debrecen',
  'Bduapets',
  'Budapest',
  'Budalest',
  'budapes',
  'Debrecen',
  'derbrecen',
  'Debercen',
  'Debrecen'],
 ['Futapest', 'dabratyin', 'necerbed'])

### “Best canonical” version.  
When there are several canonical names (Debrecen, Budapest, and possibly more later), it is useful to record not only that a name is “similar” but also which canonical name it is closest to. That is what the “best canonical” method does.  
Why it’s useful  
  
✅ You can cluster variants of each city under its canonical form.  
  
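✅ You avoid duplicates: each name ends up in exactly one place, either under a single canonical name in `matches_by_canon` or in `unmatched` (the counterparts of `hasonlo_lista` and `nem_hasonlo_lista`).  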
  
✅ You preserve the mapping ("Bduapets" → "Budapest") which is often the real goal.  
  
✅ Easier to scale: if you later add "Miskolc" or "Szeged" to `varoslista`, the method still works without any other change.  

In [50]:
from collections import defaultdict

# Group every variant under the canonical spelling it resembles most.
# Variants whose best score does not exceed `threshold` go to `unmatched`.
matches_by_canon = defaultdict(list)
unmatched = []

for raw_name in varosnevek:
    cleaned = raw_name.casefold().strip()

    # Find the top-scoring canonical spelling. The strict `>` keeps the
    # earliest entry of `varoslista` on ties, and leaves `closest` as None
    # when no score is above 0.
    closest, top_score = None, 0
    for canon in varoslista:
        score = jellyfish.jaro_similarity(canon.casefold().strip(), cleaned)
        if score > top_score:
            closest, top_score = canon, score

    # Guard clause: a best score at or below the threshold means no match.
    if top_score <= threshold:
        unmatched.append(raw_name)
        continue

    # Store the original spelling under its closest canonical name.
    matches_by_canon[closest].append(raw_name)

matches_by_canon, unmatched

(defaultdict(list,
             {'Debrecen': ['Debrecen',
               'debrecen',
               'Debrecen',
               'derbrecen',
               'Debercen',
               'Debrecen'],
              'Budapest': ['Budapest',
               'budapest',
               'Bduapets',
               'Budapest',
               'Budalest',
               'budapes']}),
 ['Futapest', 'dabratyin', 'necerbed'])