In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from utils import *

In [2]:
# Load the pickled DataFrame used by the rest of the notebook (presumably the
# Flickr photo table with precomputed feature columns, going by the file name).
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
df = pd.read_pickle("data/flickr_data_with_features.pkl")

# Read the tag blocklist, one tag per line, with surrounding whitespace removed.
# The encoding is passed explicitly so non-ASCII tags decode identically on
# every platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("data/tags_to_delete.txt", "r", encoding="utf-8") as f:
    tags_to_delete = {line.strip() for line in f}

In [3]:
# --- Configuration ---------------------------------------------------------
YEAR = 2016                # calendar year to keep, or the string "all" to skip the year filter
N_CLUSTERS = 240           # number of clusters requested from the agglomerative model
ONLY_SIMILAR_YEARS = True  # when True, keep only rows whose "similar_year" equals 1

# --- Row selection ---------------------------------------------------------
# Apply the filters first and take ONE explicit copy at the end. The "cluster"
# column is assigned below; doing that on a boolean-mask slice of another frame
# is the chained-assignment pattern pandas warns about (SettingWithCopyWarning),
# so `sample` must be an independent frame before it is written to.
sample = df if YEAR == "all" else df[df["date_taken"].dt.year == YEAR]
if ONLY_SIMILAR_YEARS:
    sample = sample[sample["similar_year"] == 1]
sample = sample.copy()

# --- Coordinates -> 3-D Cartesian points -----------------------------------
# np.radians is applied to both columns, i.e. "lat"/"long" are treated as
# degrees (NOTE(review): confirm the columns really are in degrees).
coords = sample[["lat", "long"]].to_numpy()
lat_rad = np.radians(coords[:, 0])
long_rad = np.radians(coords[:, 1])

# Standard spherical-to-Cartesian conversion onto the unit sphere, so the
# Euclidean distance used by the clustering is the straight-line (chord)
# distance between points rather than a difference of raw angles.
x = np.cos(lat_rad) * np.cos(long_rad)
y = np.cos(lat_rad) * np.sin(long_rad)
z = np.sin(lat_rad)
coords_cartesian = np.column_stack([x, y, z])

# --- Clustering ------------------------------------------------------------
clustering = AgglomerativeClustering(
    n_clusters=N_CLUSTERS,
    metric='euclidean',
    linkage='ward'
)
sample["cluster"] = clustering.fit_predict(coords_cartesian)

# --- Summary ---------------------------------------------------------------
# Count the distinct labels actually assigned.
n_clusters = sample["cluster"].nunique()

print(f"Number of clusters: {n_clusters}")
print("\nCluster distribution:")
print(sample["cluster"].value_counts().sort_index())

Number of clusters: 240

Cluster distribution:
cluster
0       19
1      500
2       12
3       77
4       59
      ... 
235     19
236     15
237      5
238     65
239      1
Name: count, Length: 240, dtype: int64


In [4]:
# Maximum number of tags requested per cluster from the helper below.
N_TOP_TAGS = 5

# get_cluster_top_tags comes from the star-import of `utils`; it is used here
# as a mapping from cluster id to an iterable of tag strings.
cluster_top_tags = get_cluster_top_tags(sample, tags_to_delete, N_TOP_TAGS)

# Report one line per cluster, in ascending cluster-id order.
print(f"\nTop {N_TOP_TAGS} tags per cluster:")
ordered_cluster_ids = sorted(cluster_top_tags.keys())
for cluster_id in ordered_cluster_ids:
    joined_tags = ', '.join(cluster_top_tags[cluster_id])
    print(f"Cluster {cluster_id}: {joined_tags}")


Top 5 tags per cluster:
Cluster 0: streetphotography, noiretblanc, blackandwhite, streetart, pavement
Cluster 1: rhône, fourvière, square, basilique, nuit
Cluster 2: las, streetart, paper, wheatpaste, lasgatas
Cluster 3: adcet, tect, sanscontact»nfcollectivités, territorialeslyonsanscontactnfccollectivités, territoriales
Cluster 4: rhone, alpes, tige, fleur, flowers
Cluster 5: square, clarendon, valencia
Cluster 6: rhône, light, night, nuit, rhone
Cluster 7: streetart, wheatpaste, wheatpaper, paper, 5dmkiii
Cluster 8: fontaine, placedesjacobins, fetedeslumieres, night, jacobins
Cluster 9: basket, petits, rois, scb, tournoi
Cluster 10: croixrousse, streetart, origo, 4ème, restaurant
Cluster 11: square, mural, mur, peint, trompeloeil
Cluster 12: square, clarendon, perpetua, gingham
Cluster 13: skateboarding, skateboard, street, skate, skatepark
Cluster 14: miniature, rhône, nikkor, musée, 18105
Cluster 15: gare, paul, saint, fêtedeslumières, 5ème
Cluster 16: ldollfestival, ldoll2016, ld

In [5]:
# Directory that receives the generated map; created (with parents) if missing.
save_dir = Path("./data/agglomerative/")
save_dir.mkdir(parents=True, exist_ok=True)

# create_cluster_map comes from the star-import of `utils`; per the captured
# output it writes an HTML cluster map under `save_dir`.
create_cluster_map(
    sample=sample,
    cluster_top_tags=cluster_top_tags,
    year=YEAR,
    show_noise=False,
    show_points=False,
    save_dir=save_dir,
)

Cluster map saved to data/agglomerative/2016_clusters_map.html
