In [12]:
import pandas as pd
import ast
import re
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm

tqdm.pandas()  # Enables df.progress_apply

def normalize_text(s):
    """Lowercase, remove quotes and extra spaces."""
    return re.sub(r"\s+", " ", re.sub(r"[\"']", "", s.strip().lower()))

def extract_unique_amenities(df):
    unique = set()
    parsed_amenities = []
    
    for val in tqdm(df["amenities"], desc="Parsing amenities"):
        try:
            lst = ast.literal_eval(val) if isinstance(val, str) else []
            lst = [normalize_text(str(a)) for a in lst]
        except Exception:
            lst = []
        parsed_amenities.append(lst)
        unique.update(lst)
    
    return list(unique), parsed_amenities

from collections import defaultdict

def cluster_amenities(unique_amenities, n_clusters=None):
    print(f"Encoding {len(unique_amenities)} unique amenities...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = list(tqdm(model.encode(unique_amenities, show_progress_bar=False), 
                            total=len(unique_amenities), 
                            desc="Encoding embeddings"))
    
    if not n_clusters:
        n_clusters = max(2, int(len(unique_amenities) ** 0.5))
    
    print(f"Clustering into {n_clusters} groups...")
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clustering.fit_predict(embeddings)
    
    # Group amenities by cluster
    cluster_to_amenities = defaultdict(list)
    for amenity, label in zip(unique_amenities, labels):
        cluster_to_amenities[label].append(amenity)
    
    # Pick a representative name for each cluster
    cluster_names = {}
    for cluster, items in cluster_to_amenities.items():
        # Pick the shortest name (after sorting)
        rep_name = sorted(items, key=len)[0]
        # Clean name for column use
        rep_name = rep_name.replace(" ", "_").replace("-", "_")
        cluster_names[cluster] = rep_name
    
    # Map amenity → cluster name
    amenity_to_cluster = {amenity: cluster_names[label] 
                          for amenity, label in zip(unique_amenities, labels)}
    
    return amenity_to_cluster, cluster_to_amenities


def expand_amenities_semantic(df, amenity_to_cluster, parsed_amenities):
    cluster_names = set(amenity_to_cluster.values())
    
    # Initialize binary columns for each cluster name
    for cname in cluster_names:
        df[f"amenity_{cname}"] = 0
    
    for idx, lst in tqdm(enumerate(parsed_amenities), 
                         total=len(parsed_amenities), 
                         desc="Assigning amenities to clusters"):
        cluster_ids = {amenity_to_cluster[a] for a in lst if a in amenity_to_cluster}
        for cname in cluster_ids:
            df.at[idx, f"amenity_{cname}"] = 1
    
    return df

def process_airbnb_with_semantic_amenities(path, n_clusters=None):
    df = pd.read_csv(path)
    unique_amenities, parsed_amenities = extract_unique_amenities(df)
    amenity_to_cluster, cluster_to_amenities = cluster_amenities(unique_amenities, n_clusters)
    df = expand_amenities_semantic(df, amenity_to_cluster, parsed_amenities)
    return df, cluster_to_amenities



In [15]:
if __name__ == "__main__":
    df, mapping = process_airbnb_with_semantic_amenities(r"C:\Users\hodos\Documents\Uni\Uni-Year-3\Semester2\Data\cleaned_listings.csv", n_clusters=20)
    df.to_csv("cleaned_with_clusters.csv", index=False)
    print("Cluster mapping:", mapping)


Parsing amenities: 100%|██████████| 7831/7831 [00:01<00:00, 6691.36it/s]


Encoding 2931 unique amenities...


Encoding embeddings: 100%|██████████| 2931/2931 [00:00<00:00, 1462467.88it/s]


Clustering into 20 groups...


Assigning amenities to clusters: 100%|██████████| 7831/7831 [00:01<00:00, 5984.87it/s]


Cluster mapping: defaultdict(<class 'list'>, {0: ['private bbq grill', 'aeg gas stove', 'wolf stove', 'hot plates electric stove', 'stainless steel induction stove', 'mabe stainless steel electric stove', 'ecotech induction stove', 'lg electric stove', '2 burner glass stovetop electric stove', 'bosch stainless steel electric stove', 'thermador stainless steel gas stove', 'indoor fireplace: electric, wood-burning', 'garland gas stove', 'bosch induction stove', 'noxton electric stove', 'cusimax electric stove', 'bbq grill: gas', 'samsung stainless steel induction stove', 'other induction stove', 'thermador stainless steel stove', 'dacor gas stove', 'hot plate and microwave electric stove', 'thermador stainless steel induction stove', 'blomberg gas stove', 'forno stainless steel gas stove', 'induction cook-top induction stove', 'bbq grill: charcoal, gas', 'portable stove electric stove', 'kitchenn aid stainless steel gas stove', 'caloric gas stove', 'bluestar stainless steel gas stove', '