In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.cluster import DBSCAN
from utils import *

In [2]:
# Load the pickled photo table used by every later cell.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle("data/flickr_data_with_features.pkl")

# Tags to exclude, one per line. The encoding is passed explicitly so that
# non-ASCII tags are decoded the same way on every platform instead of
# depending on the locale default.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open("data/tags_to_delete.txt", "r", encoding="utf-8") as f:
    tags_to_delete = {line.strip() for line in f}

In [3]:
# --- Parameters -------------------------------------------------------------
YEAR = "all"  # "all" keeps every row; any other value is compared to date_taken's year
DISTANCE_METERS = 25  # DBSCAN neighbourhood radius, in metres
MIN_SAMPLES = 250  # DBSCAN min_samples
ONLY_SIMILAR_YEARS = True  # when True, keep only rows where similar_year == 1

EARTH_RADIUS = 6371.0  # in km

# --- Row selection ----------------------------------------------------------
# Apply every filter first and copy exactly once at the end. Copying before the
# last filter (as was done previously) leaves `sample` as a filtered slice, so
# adding the "cluster" column below can raise SettingWithCopyWarning on pandas
# versions without copy-on-write; it also duplicated the full frame needlessly.
sample = df
if YEAR != "all":
    sample = sample[sample["date_taken"].dt.year == YEAR]
if ONLY_SIMILAR_YEARS:
    sample = sample[sample["similar_year"] == 1]
sample = sample.copy()

coords = sample[["lat", "long"]].values

# --- Clustering -------------------------------------------------------------
# The haversine metric works on (lat, lon) pairs in radians and returns
# distances on the unit sphere, so the radius in metres is divided by the
# Earth radius in metres to obtain eps.
eps_radians = DISTANCE_METERS / (EARTH_RADIUS * 1000)
clustering = DBSCAN(
    eps=eps_radians,
    min_samples=MIN_SAMPLES,
    metric="haversine",
    algorithm="brute",
).fit(np.radians(coords))

labels = clustering.labels_
sample["cluster"] = labels

# DBSCAN labels noise points with -1; every other distinct label is a cluster.
is_noise = labels == -1
n_noise = int(is_noise.sum())
n_clusters = int(np.unique(labels[~is_noise]).size)

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")
print("\nCluster distribution:")
print(sample["cluster"].value_counts().sort_index())

Number of clusters: 68
Number of noise points: 95516

Cluster distribution:
cluster
-1     95516
 0      7115
 1      5654
 2     14711
 3       332
       ...  
 63      769
 64      190
 65      269
 66     1208
 67      498
Name: count, Length: 69, dtype: int64


In [4]:
# How many of the most frequent tags to report for each cluster.
N_TOP_TAGS = 5  # Change this number as needed

# Mapping from cluster id to its top tags, computed by the project helper
# with the tags listed in tags_to_delete passed in for exclusion.
cluster_top_tags = get_cluster_top_tags(sample, tags_to_delete, N_TOP_TAGS)

print(f"\nTop {N_TOP_TAGS} tags per cluster:")
ordered_cluster_ids = sorted(cluster_top_tags.keys())
for cluster_id in ordered_cluster_ids:
    joined_tags = ", ".join(cluster_top_tags[cluster_id])
    print(f"Cluster {cluster_id}: {joined_tags}")


Top 5 tags per cluster:
Cluster 0: fourvière, basilique, church, rhône, fourviere
Cluster 1: terreaux, placedesterreaux, muséedesbeauxartsdelyon, museum, place
Cluster 2: demeureduchaos, abodeofchaos, thierryehrmann, alchemy, prophecy
Cluster 3: vieuxlyon, musée, histoire, historique, architequture
Cluster 4: bellecour, placebellecour, place, fêtedeslumières, places
Cluster 5: square, opéra, fêtedeslumières, foursquare:venue=5148ec5ce4b0c012d308cb24, opera
Cluster 6: vieuxlyon, square, placeduchange, rhône, unesco
Cluster 7: vieuxlyon, rhône, cathédrale, saintjean, musée
Cluster 8: rhône, night, river, nuit, jossaris
Cluster 9: théâtredescélestins, fêtedeslumières, rhône, parking, architecture
Cluster 10: jacobins, fontaine, placedesjacobins, fêtedeslumières, rhône
Cluster 11: fêtedeslumières, lumières, placedelarépublique, rhône, illuminations
Cluster 12: croixrousse, streetart, paper, wheatpaste, wheatpaper
Cluster 13: bridge, saône, pont, saxophone, les
Cluster 14: square, foursqua

In [5]:
# Output folder for the generated map; created on demand (including parents),
# and left alone if it already exists.
save_dir = Path("./data/dbscan/")
save_dir.mkdir(parents=True, exist_ok=True)

# Render the clusters with the project helper, which writes its result under
# save_dir. Noise points and individual photo points are switched off here.
create_cluster_map(
    sample=sample,
    cluster_top_tags=cluster_top_tags,
    year=YEAR,
    show_noise=False,
    show_points=False,
    save_dir=save_dir,
)

Cluster map saved to data/dbscan/all_clusters_map.html
