In [None]:
# import sys
# 
# import bertopic
# import tqdm
# !{sys.executable} -m pip install stanza --upgrade
# !{sys.executable} -m pip install torch --upgrade
# 
# import matplotlib.pyplot as plt
# from scipy.cluster.hierarchy import dendrogram, linkage
# 
# def plot_dendrogram(embeddings, labels):
#     linked = linkage(embeddings, 'ward')
#     plt.figure(figsize=(12, 6))
#     dendrogram(linked, labels=labels, leaf_rotation=90)
#     plt.show()
# 
# import pandas as pd
# import ast
# import re
# from sentence_transformers import SentenceTransformer
# from sklearn.cluster import AgglomerativeClustering
# from tqdm import tqdm
# 
# tqdm.pandas()  # Enables df.progress_apply
# 
# from nltk.stem import PorterStemmer
# from nltk.corpus import stopwords
# import nltk
# nltk.download("stopwords")
# 
# stop_words = set(stopwords.words("english"))
# stemmer = PorterStemmer()
# 
# def normalize_text(text):
#     words = re.findall(r'[a-z0-9]+', text.lower())
#     return " ".join(stemmer.stem(w) for w in words if w not in stop_words)
# 
# 
# def extract_unique_amenities(df):
#     unique = set()
#     parsed_amenities = []
#     
#     for val in tqdm(df["amenities"], desc="Parsing amenities"):
#         try:
#             lst = ast.literal_eval(val) if isinstance(val, str) else []
#             lst = [normalize_text(str(a)) for a in lst]
#         except Exception:
#             lst = []
#         parsed_amenities.append(lst)
#         unique.update(lst)
#     print(len(unique))
#     
#     return list(unique), parsed_amenities
# 
# from collections import defaultdict
# 
# def cluster_amenities(unique_amenities, n_clusters=None):
#     print(f"Encoding {len(unique_amenities)} unique amenities...")
#     
# 
#     model = SentenceTransformer("all-MiniLM-L6-v2")
# 
#     embeddings = list(tqdm(model.encode(unique_amenities, show_progress_bar=True), 
#                             total=len(unique_amenities), 
#                             desc="Encoding embeddings"))
#     
#     if not n_clusters:
#         n_clusters = max(2, int(len(unique_amenities) ** 0.5))
#     
#     print(f"Clustering into {n_clusters} groups...")
#     clustering = AgglomerativeClustering(n_clusters=n_clusters)
#     labels = clustering.fit_predict(embeddings)
#     
#     # Group amenities by cluster
#     cluster_to_amenities = defaultdict(list)
#     for amenity, label in zip(unique_amenities, labels):
#         cluster_to_amenities[label].append(amenity)
#     
#     # Pick a representative name for each cluster
#     cluster_names = {}
#     for cluster, items in cluster_to_amenities.items():
#         # Pick the shortest name (after sorting)
#         rep_name = sorted(items, key=len)[0]
#         # Clean name for column use
#         rep_name = rep_name.replace(" ", "_").replace("-", "_")
#         cluster_names[cluster] = rep_name
#     print(cluster_names)
#     
#     # Map amenity → cluster name
#     amenity_to_cluster = {amenity: cluster_names[label] 
#                           for amenity, label in zip(unique_amenities, labels)}
#     plot_dendrogram(embeddings, unique_amenities)
# 
#     return amenity_to_cluster, cluster_to_amenities
# 
# 
# def expand_amenities_semantic(df, amenity_to_cluster, parsed_amenities):
#     cluster_names = set(amenity_to_cluster.values())
#     
#     # Initialize binary columns for each cluster name
#     for cname in cluster_names:
#         df[f"amenity_{cname}"] = 0
#     
#     for idx, lst in tqdm(enumerate(parsed_amenities), 
#                          total=len(parsed_amenities), 
#                          desc="Assigning amenities to clusters"):
#         cluster_ids = {amenity_to_cluster[a] for a in lst if a in amenity_to_cluster}
#         for cname in cluster_ids:
#             df.at[idx, f"amenity_{cname}"] = 1
#     
#     return df
# 
# def process_airbnb_with_semantic_amenities(path, n_clusters=None):
#     df = pd.read_csv(path)
#     unique_amenities, parsed_amenities = extract_unique_amenities(df)
#     print(type(unique_amenities))
#     amenity_to_cluster, cluster_to_amenities = cluster_amenities(unique_amenities, n_clusters)
#     df = expand_amenities_semantic(df, amenity_to_cluster, parsed_amenities)
#     return df, cluster_to_amenities
# 
# 
# if __name__ == "__main__":
#     df, mapping = process_airbnb_with_semantic_amenities(r"C:\Users\hodos\Documents\Uni\Uni-Year-3\Semester2\Data\cleaned_listings_amsterdam.csv")
#     # df.to_csv("cleaned_with_clusters.csv", index=False)
#     # print("Cluster mapping:", mapping)
# 
# df.head()


In [11]:
import pandas as pd
import re
import ast
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from collections import defaultdict
import spacy

# --- NLP Setup ---
try:
    NLP = spacy.load("en_core_web_sm")
    print("✓ spaCy NLP model loaded successfully.")
except OSError:
    print("Error: spaCy model 'en_core_web_sm' not found.")
    print("Please run: python -m spacy download en_core_web_sm")
    exit()

# --- Setup: Define Stop Words ---
CUSTOM_STOP_WORDS = set(CountVectorizer(stop_words='english').get_stop_words()).union(['listing', 'available', 'extra cost', 'stay'])

tqdm.pandas(desc="Processing DataFrame")

def clean_amenity_text(text):
    if not isinstance(text, str): return ""

    # 1. Convert to lowercase
    text = text.lower()

    # --- NEW: Handle the "(word) allowed" pattern ---
    # This regex finds a word followed by "allowed" and merges them.
    # e.g., "pets allowed" -> "pets_allowed"
    text = re.sub(r'(\w+)\s+allowed\b', r'\1_allowed', text)

    # 3. Remove other specific unwanted patterns
    text = re.sub(r'\brequest\b', '', text)
    text = re.sub(r'\d+\s+years?\s+old', '', text)
    text = re.sub(r'ages?\s+\d+', '', text)

    # 4. Keep only lowercase English letters and underscores (for our new pattern)
    text = re.sub(r'[^a-z\s_]', '', text)
    
    # 5. Remove stop words
    words = text.split()
    filtered_words = [word for word in words if word not in CUSTOM_STOP_WORDS]
    text = ' '.join(filtered_words)

    # 6. Consolidate multiple spaces and strip
    return re.sub(r'\s+', ' ', text).strip()

def _get_primary_object(phrase, nlp_model):
    """
    Uses a "right-most noun" heuristic to find the most important object.
    """
    doc = nlp_model(phrase)
    try:
        if not doc.noun_chunks:
            return None
        longest_chunk = max(doc.noun_chunks, key=lambda chunk: len(chunk.text.split()))
        for token in reversed(longest_chunk):
            if token.pos_ in ['NOUN', 'PROPN']:
                return token.lemma_
        return longest_chunk.root.lemma_
    except ValueError:
        return None

def refine_and_group_amenities(candidates, nlp_model):
    """
    Applies the hybrid refinement process.
    """
    # --- Stage 1: Prune ---
    print(f"--> Pruning {len(candidates)} initial candidates...")
    candidates.sort(key=len, reverse=True)
    temp_set = set(candidates)
    for c in candidates:
        if c in temp_set:
            substrings = {other for other in temp_set if c != other and other in c}
            temp_set.difference_update(substrings)
    pruned_amenities = sorted(list(temp_set), key=len, reverse=True)
    print(f"--> After pruning, {len(pruned_amenities)} candidates remain.")

    # --- Stage 2: Grouping with Hybrid Logic ---
    ORDERED_SYNONYM_GROUPS = [
        # == High-Priority Overrides ==
        ('hair_dryer',        ['hair dryer']),
        ('dishwasher',        ['dishwasher']),
        ('washing_machine',   ['washer', 'washing machine']),
        ('clothes_dryer',     ['dryer', 'tumble dryer']),
        ('security_camera',   ['security camera', 'camera']),
        ('pets_allowed',      ['pets_allowed']), # Catch our transformed pattern
        ('smoking_allowed',   ['smoking_allowed']),
        ('drying_rack',       ['drying rack']),
        ('conditioner',       ['conditioner']), # Override for the NLP weakness
        
        # == General Concepts and Synonyms ==
        ('air_conditioning',  ['air conditioning', 'ac', 'aircon']),
        ('closet',            ['closet', 'wardrobe', 'dresser']),
        ('tv',                ['tv', 'hdtv', 'television', 'hbo', 'cable', 'netflix', 'hulu', 'amazon prime', 'disney']),
        ('wifi',              ['wifi', 'wireless internet', 'internet', 'ethernet']),
        ('kitchen',           ['kitchen']),
        ('coffee_maker',      ['coffee', 'coffee maker', 'nespresso', 'keurig']),
        ('dinnerware',        ['dinnerware']),
        ('heating',           ['heating', 'heater']),
        ('parking',           ['parking']),
        ('pool',              ['pool']),
        ('hot_tub',           ['hot tub', 'jacuzzi']),
        ('bathtub',           ['bath', 'bathtub']),
        ('patio_balcony',     ['patio', 'balcony']),
        ('gym',               ['gym', 'fitness']),
        ('first_aid_kit',     ['first aid']),
        ('smoke_alarm',       ['smoke alarm', 'smoke detector']),
        ('carbon_monoxide_alarm', ['carbon monoxide alarm', 'co alarm', 'carbon monoxide detector']),
        ('fire_extinguisher', ['fire extinguisher']),
        ('refrigerator',      ['refrigerator', 'fridge']),
        ('microwave',         ['microwave']),
        ('oven',              ['oven']),
        ('stove',             ['stove', 'cooktop']),
        ('books',             ['books']),
        ('waterfront',        ['river', 'canal', 'waterfront']),
        ('shampoo',           ['shampoo']),
        ('body_soap',         ['soap', 'body soap'])
    ]

    groups = defaultdict(list)
    
    # First pass: Use the high-precision synonym list
    amenities_to_process = set(pruned_amenities)
    for standard_name, synonyms in tqdm(ORDERED_SYNONYM_GROUPS, desc="Grouping by Synonyms"):
        matched_this_pass = set()
        for amenity in amenities_to_process:
            for synonym in synonyms:
                if re.search(r'\b' + re.escape(synonym) + r'\b', amenity):
                    groups[standard_name].append(amenity)
                    matched_this_pass.add(amenity)
                    break 
        amenities_to_process.difference_update(matched_this_pass)

    # Second pass: Use NLP as a fallback for everything else
    print(f"--> {len(amenities_to_process)} amenities remaining for NLP fallback grouping...")
    for amenity in tqdm(list(amenities_to_process), desc="Grouping by NLP"):
        primary_object = _get_primary_object(amenity, nlp_model)
        if primary_object:
            groups[primary_object].append(amenity)
        else:
            groups[amenity].append(amenity)

    print(f"--> Consolidated into {len(groups)} final amenity groups.")
    return groups

def standardize_amenities_final(file_path, min_df_threshold=50):
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        return None

    df = pd.read_csv(file_path, engine='python', on_bad_lines='warn')

    df['parsed_amenities'] = df['amenities'].progress_apply(
        lambda s: ast.literal_eval(s.strip()) if isinstance(s, str) and s.strip().startswith('[') else []
    )
    
    corpus = [clean_amenity_text(a) for l in df['parsed_amenities'] for a in l if clean_amenity_text(a)]
    
    vectorizer = CountVectorizer(ngram_range=(1, 5), min_df=min_df_threshold)
    vectorizer.fit(corpus)
    initial_candidates = vectorizer.get_feature_names_out().tolist()

    final_amenity_groups = refine_and_group_amenities(initial_candidates, nlp)
    print(final_amenity_groups)
    df['cleaned_amenities_text'] = df['parsed_amenities'].progress_apply(
        lambda lst: ' | '.join([clean_amenity_text(item) for item in lst])
    )

    for group_name, search_terms in tqdm(final_amenity_groups.items(), desc="Creating Columns"):
        column_name = f"has_{group_name.replace(' ', '_')}"
        pattern = '|'.join([r'\b' + re.escape(term) + r'\b' for term in search_terms])
        df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)

    df = df.drop(columns=['parsed_amenities', 'cleaned_amenities_text'])
    print("✓ Binary columns created successfully.")
    return df

# --- Main execution block ---
if __name__ == '__main__':
    csv_file_path = r"C:\Users\hodos\Documents\Uni\Uni-Year-3\Semester2\Data\cleaned_listings_amsterdam.csv"
    MIN_DF_THRESHOLD = 100 

    transformed_df = standardize_amenities_final(
        csv_file_path, min_df_threshold=MIN_DF_THRESHOLD
    )

    if transformed_df is not None:
        print("\nTransformation complete. Here's a preview:")
        amenity_cols = sorted([col for col in transformed_df.columns if col.startswith('has_')])
        display_cols = ['id', 'name'] + amenity_cols
        
        if len(display_cols) > 20:
            print(f"(Showing a subset of the {len(amenity_cols)} new amenity columns)")
            display_cols = display_cols[:20]

        print(transformed_df[display_cols].head())

✓ spaCy NLP model loaded successfully.


Processing DataFrame: 100%|██████████| 10168/10168 [00:00<00:00, 15192.47it/s]


--> Pruning 572 initial candidates...
--> After pruning, 168 candidates remain.


Grouping by Synonyms: 100%|██████████| 35/35 [00:00<00:00, 1589.68it/s]


--> 93 amenities remaining for NLP fallback grouping...


Grouping by NLP: 100%|██████████| 93/93 [00:00<00:00, 183.97it/s]


--> Consolidated into 112 final amenity groups.
defaultdict(<class 'list'>, {'hair_dryer': ['hair dryer'], 'dishwasher': ['dishwasher'], 'washing_machine': ['free washer building', 'free washer unit', 'paid washer'], 'clothes_dryer': ['free dryer building', 'paid dryer', 'free dryer unit'], 'pets_allowed': ['pets_allowed'], 'smoking_allowed': ['smoking_allowed'], 'drying_rack': ['drying rack clothing'], 'conditioner': ['conditioner'], 'air_conditioning': ['ac split type ductless', 'portable air conditioning', 'central air conditioning'], 'closet': ['clothing storage wardrobe', 'clothing storage closet dresser', 'clothing storage closet wardrobe dresser', 'clothing storage dresser', 'clothing storage walkin closet'], 'tv': ['inch hdtv amazon prime video', 'inch hdtv netflix', 'inch hdtv chromecast', 'hbo max netflix', 'hdtv standard cable', 'apple tv', 'netflix premium cable', 'chromecast netflix', 'netflix standard cable', 'disney', 'inch tv'], 'wifi': ['pocket wifi', 'fast wifi mbps',

Processing DataFrame: 100%|██████████| 10168/10168 [00:02<00:00, 4552.72it/s]
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).asty

✓ Binary columns created successfully.

Transformation complete. Here's a preview:
(Showing a subset of the 113 new amenity columns)
      id                                               name  has_access  \
0  27886  Romantic, stylish B&B houseboat in canal district           1   
1  28871                            Comfortable double room           0   
2  29051                   Comfortable single / double room           0   
3  44391    Quiet 2-bedroom Amsterdam city centre apartment           0   
4  47061                   Charming apartment in old centre           0   

   has_aeg  has_air_conditioning  has_area  has_availability  has_backyard  \
0        0                     0         1                 1             1   
1        0                     0         0                 1             0   
2        0                     0         0                 1             0   
3        0                     0         0                 1             0   
4        0                




In [30]:

csv_file_path = r"C:\Users\hodos\Documents\Uni\Uni-Year-3\Semester2\Data\cleaned_listings_amsterdam.csv"
MIN_DF_THRESHOLD = 100 


transformed_df = standardize_amenities_final(
    csv_file_path, min_df_threshold=MIN_DF_THRESHOLD
)

if transformed_df is not None:
    print("\nTransformation complete. Here's a preview:")
    amenity_cols = sorted([col for col in transformed_df.columns if col.startswith('has_')])
    display_cols = ['id', 'name'] + amenity_cols
    
    if len(display_cols) > 20:
        print(f"(Showing a subset of the {len(amenity_cols)} new amenity columns)")
        display_cols = display_cols[:20]

    print(transformed_df[display_cols].head())

Attempting to load data from: C:\Users\hodos\Documents\Uni\Uni-Year-3\Semester2\Data\cleaned_listings_amsterdam.csv


Processing DataFrame: 100%|██████████| 10168/10168 [00:00<00:00, 15107.20it/s]


--> Pruning 575 initial candidates...
--> After pruning, 173 candidates remain.


Grouping Amenities: 100%|██████████| 173/173 [00:00<00:00, 9970.38it/s]


--> Consolidated into 125 final amenity groups.
defaultdict(<class 'list'>, {'closet': ['clothing storage closet wardrobe', 'storage closet wardrobe dresser', 'clothing storage closet dresser', 'clothing storage walkin closet', 'clothing storage wardrobe', 'clothing storage dresser'], 'coffee_maker': ['coffee maker espresso machine', 'coffee maker pourover coffee', 'drip coffee maker espresso', 'coffee maker french press', 'coffee maker drip coffee', 'maker drip coffee maker', 'coffee maker nespresso', 'machine nespresso'], 'parking': ['paid parking garage premises', 'paid street parking premises', 'paid parking lot premises', 'free parking premises', 'paid parking premises', 'free street parking'], 'oven': ['stainless steel single oven', 'stainless steel oven', 'double oven'], 'air_conditioning': ['portable air conditioning', 'central air conditioning', 'ac split type ductless'], 'stove': ['stainless steel gas stove', 'induction stove', 'electric stove'], 'tv': ['hdtv amazon prime vid

Processing DataFrame: 100%|██████████| 10168/10168 [00:02<00:00, 4993.14it/s]
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).astype(int)
  df[column_name] = df['cleaned_amenities_text'].str.contains(pattern, na=False).asty

✓ Binary columns created successfully.

Transformation complete. Here's a preview:
(Showing a subset of the 126 new amenity columns)
      id                                               name  has_aeg  \
0  27886  Romantic, stylish B&B houseboat in canal district        0   
1  28871                            Comfortable double room        0   
2  29051                   Comfortable single / double room        0   
3  44391    Quiet 2-bedroom Amsterdam city centre apartment        0   
4  47061                   Charming apartment in old centre        0   

   has_aid_kit  has_air_conditioning  has_availability  has_baby_monitor  \
0            0                     0                 1                 0   
1            0                     0                 1                 0   
2            0                     0                 1                 0   
3            0                     0                 1                 0   
4            1                     0                 1


