In [1]:
import os
import re
import sys
import warnings
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from skimpy import skim

warnings.filterwarnings("ignore")

In [2]:
data_dir = "/Users/ghadena/Desktop/Business analytics/DA3/DA3/assignment_1/data/clean"
path = os.path.join(data_dir,'airbnb_madrid_workfile_adj.csv') 
data = pd.read_csv(path, index_col=0)


In [3]:
import pandas as pd
import re

def clean_column_name(col):
    # Remove D_ prefix and any quotes/backslashes
    cleaned = re.sub(r'^d_[\'"]?|[\'"]$', '', col, flags=re.IGNORECASE)
    
    # Decode Unicode characters and special symbols
    cleaned = re.sub(r'\\u[0-9a-f]{4}', lambda m: bytes(m.group(0), 'utf-8').decode('unicode_escape'), cleaned)
    cleaned = re.sub(r'[^a-zA-Z0-9]', ' ', cleaned)  # Replace special chars with spaces
    
    # Standardize text
    cleaned = cleaned.lower().strip()
    cleaned = re.sub(r'\s+', '_', cleaned)  # Convert spaces to underscores
    
    return cleaned

# Apply to all columns
cleaned_columns = [clean_column_name(col) for col in data.columns]
data.columns = cleaned_columns 

In [4]:
aggregation_rules = {
    'shampoo': ['shampoo'],
    'conditioner': ['conditioner'],
    'body_soap': ['bodysoap'],
    'tv': ['tv', 'hdtv', 'television'],
    'streaming_services': ['netflix', 'amazonprime', 'hulu', 'disney+', 'streaming','appletv', 'chromecast','hbomax'],
    'refrigerator': ['refrigerator', 'fridge'],
    'stove': ['stove'],
    'child_friendly': ['child', 'baby', 'infant', 'toddler', 'highchair', 'crib', 'babybath', 'babymonitor', 'changingtable', 'children', 'kids', 'family'],
    'free_parking': ['free_parking', 'freeresidentialgarageonpremises', 'freestreetparking', 'freeparkingonpremises'],
    'paid_parking': ['paid_parking', 'paidparkingoffpremises','paidparkingonpremises','paid_garage'],
    'wifi': ['wifi', 'internet'],
    'bidet': ['bidet'],
    'oven': ['oven'],
    'cable': ['cable'],
    'sound_system': ['soundsystem', 'speaker'],
    'backyard': ['backyard', 'garden', 'yard'],
    'view': ['view'],
    'balcony': ['balcony'],
    'bbq': ['bbq'],
    'bathtub': ['bathtub'],
    'coffee_maker': ['coffee', 'espresso', 'keurig'],
    'excersise_equipment': ['excersise', 'freeweights', 'treadmill', 'elliptical', 'yoga', 'pilates','workoutbench','stationarybike'],
    'dryer': ['dryer', 'freedryer'],
    'washer': ['washer', 'freewasher'],
    'gym': ['gym'],
    'heating': ['heating','heated','centralheating'],
    'housekeeping_included': ['housekeeping', 'included', 'housekeepingavailable'],
    'housekeeping_extracost': ['housekeeping', 'extracost','housekeepingavailable'],
    'indoor_fireplace': ['indoorfireplace'],
    'paid_dryer_washer': ['paidwasher', 'paiddryer','laundromat'],
    'airconditioning': ['airconditioning', 'ac','centralairconditioning'],
    'kitchen': ['kitchen', 'kitchenette'],
    'game_console': ['gameconsole', 'playstation', 'xbox'],
    'clothing_storage': ['clothingstorage', 'closet', 'wardrobe', 'dresser'],
    'electric_car_charging': ['evcharger', 'freecarportonpremises'],
    'indoor_pool': ['indoorpool'],
    'outdoor_pool': ['outdoorpool','infinity','olympicsized','lappool','poolheathed'],
    'outdoor_space': ['outdoording','outdoorseating','outdoorshower','outdoorgrill','outdoorpatio','outdoorfirepit','outdoorbar','outdoorbarbecue','outdoorfurniture','outdoorplay'],
    'private_ameneties': ['private'],
    'shared_ameneties': ['shared'],
    'private_pool': ['privatepool'],
    'sauna': ['sauna','saunasteamroom','privatesauna'], 
    # Add more rules as needed...
}

In [6]:
import re

def aggregate_features(df, rules, strict_multi_word=None):
    """
    Aggregates binary features based on keyword matching in column names.
    
    - For `strict_multi_word` categories (e.g., housekeeping), ALL words must be present in the column name.
    - For everything else, a column matches if ANY keyword appears.
    
    Args:
        df (DataFrame): The dataset with binary dummy variables.
        rules (dict): Mapping of new feature names to keyword lists.
        strict_multi_word (list): List of category names that require ALL keywords in the column name.
    
    Returns:
        DataFrame: Updated dataset with aggregated features.
    """
    strict_multi_word = strict_multi_word or []  # Default to an empty list if None
    
    for new_col, keywords in rules.items():
        # Escape special regex characters in keywords
        patterns = [re.escape(k) for k in keywords]
        
        if new_col in strict_multi_word:
            # 🟢 Strict Mode: Ensure ALL keywords appear together (for housekeeping-related columns)
            matching_cols = [
                col for col in df.columns
                if all(re.search(p, col, flags=re.IGNORECASE) for p in patterns)
            ]
        else:
            # 🔵 Standard Mode: Match if ANY keyword appears (for all other features)
            pattern = r'(' + '|'.join(patterns) + r')'
            matching_cols = [
                col for col in df.columns
                if re.search(pattern, col, flags=re.IGNORECASE)
            ]
        
        print(f"\nCreating '{new_col}' from:")
        print(matching_cols)
        
        # Create new aggregated column
        if matching_cols:
            df[new_col] = df[matching_cols].any(axis=1).astype(int)
        else:
            print(f"No columns matched for {new_col}!")
    
    return df


# 🔵 Housekeeping Columns Require **Both Words**
strict_categories = ['housekeeping_extra_cost', 'housekeeping_included']

# 🚀 Apply Aggregation
data = aggregate_features(data, aggregation_rules, strict_categories)

# 🟢 Show Updated Dataset
import ace_tools as tools
tools.display_dataframe_to_user(name="Aggregated Data", dataframe=data)


Creating 'shampoo' from:
['elvivedel_orealshampoo', 'johnson_sbabyshampoo', 'l_occitaneenprovenceshampoo', 'l_or_alprimerasmarcasshampoo', 'l_or_alshampoo', 'shampoo', 'shampoo', '3marcasdistintasshampoo', 'shampoo', 'shampoo', 'ademandashampoo', 'acondicionadorygeldeduchashampoo', 'aesopshampoo', 'almendraalmondsshampoo', 'almondshampoo', 'aloeverashampoo', 'alphaparfshampoo', 'alqvimiashampoo', 'amenitieshampoo', 'amenitiesdebienvenidashampoo', 'amenitiesdebienvenida_shampoo', 'amenitiesshampoo', 'amenityshampoo', 'anyshampoo', 'anyahshampoo', 'apivitashampoo', 'aussieshampoo', 'avellanashampoo', 'avonshampoo', 'azlshampoo', 'b_sicoshampoo', 'babariadecocoshampoo', 'basicoshampoo', 'bienvenueshampoo', 'bioshampoo', 'blancshampoo', 'blancashampoo', 'blancasshampoo', 'bodyplusshampoo', 'bonteshampoo', 'botededosotresdosisshampoo', 'camomilashampoo', 'carrefourshampoo', 'celad_pendshampoo', 'champ_deintensahidrataci_ntresemm_shampoo', 'champ_depeloschwarkopfosimilarshampoo', 'champ_par

ModuleNotFoundError: No module named 'ace_tools'