In [117]:
import os
import re
import sys
import warnings
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from skimpy import skim

warnings.filterwarnings("ignore")

In [118]:
data_dir = "/Users/ghadena/Desktop/Business analytics/DA3/DA3/assignment_1/data/clean"
path = os.path.join(data_dir,'airbnb_madrid_workfile_adj.csv') 
data = pd.read_csv(path, index_col=0)


In [119]:
data.isnull().sum().loc[lambda x: x > 0]

Series([], dtype: int64)

In [120]:
data.columns.tolist()

['f_property_type',
 'f_room_type2',
 'f_neighbourhood_cleansed',
 'usd_price_day',
 'p_host_response_rate',
 'n_accommodates',
 'n_bathrooms',
 'n_review_scores_rating',
 'n_number_of_reviews',
 'n_reviews_per_month',
 'n_minimum_nights',
 'n_beds',
 'n_days_since',
 'd_"children\'splayroom"',
 'd_"elvivedel\'orealconditioner"',
 'd_"elvivedel\'orealshampoo"',
 'd_"johnson\'sbabyshampoo"',
 'd_"l\'occitanebodysoap"',
 'd_"l\'occitaneenprovencebodysoap"',
 'd_"l\'occitaneenprovenceconditioner"',
 'd_"l\'occitaneenprovenceshampoo"',
 'd_"l\'or\\\\u00e9alprimerasmarcasbodysoap"',
 'd_"l\'or\\\\u00e9alprimerasmarcasshampoo"',
 'd_"l\'or\\\\u00e9alshampoo"',
 'd_"l\'orealconditioner"',
 'd_"lov\'ycbodysoap"',
 "d_''",
 "d_'(contienealimentosnuestros'",
 "d_'bodysoap'",
 "d_'.bodysoap'",
 "d_'.conditioner'",
 "d_'.doubleoven'",
 "d_'.refrigerator'",
 "d_'.shampoo'",
 "d_'.singleoven'",
 "d_'.stainlesssteeloven'",
 "d_'..bodysoap'",
 "d_'..shampoo'",
 "d_'...refrigerator'",
 "d_'0%bodysoap'"

In [121]:
import pandas as pd
import re

def clean_column_name(col):
    # Remove D_ prefix and any quotes/backslashes
    cleaned = re.sub(r'^d_[\'"]?|[\'"]$', '', col, flags=re.IGNORECASE)
    
    # Decode Unicode characters and special symbols
    cleaned = re.sub(r'\\u[0-9a-f]{4}', lambda m: bytes(m.group(0), 'utf-8').decode('unicode_escape'), cleaned)
    cleaned = re.sub(r'[^a-zA-Z0-9]', ' ', cleaned)  # Replace special chars with spaces
    
    # Standardize text
    cleaned = cleaned.lower().strip()
    cleaned = re.sub(r'\s+', '_', cleaned)  # Convert spaces to underscores
    
    return cleaned

# Apply to all columns
cleaned_columns = [clean_column_name(col) for col in data.columns]
data.columns = cleaned_columns 

In [122]:
aggregation_rules = {
    'shampoo': ['shampoo'],
    'conditioner': ['conditioner'],
    'body_soap': ['bodysoap'],
    'tv': ['tv', 'hdtv', 'television'],
    'streaming_services': ['netflix', 'amazonprime', 'hulu', 'disney+', 'streaming','appletv', 'chromecast','hbomax'],
    'refrigerator': ['refrigerator', 'fridge'],
    'stove': ['stove'],
    'child_friendly': ['child', 'baby', 'infant', 'toddler', 'highchair', 'crib', 'babybath', 'babymonitor', 'changingtable', 'children', 'kids', 'family'],
    'free_parking': ['free_parking', 'freeresidentialgarageonpremises', 'freestreetparking', 'freeparkingonpremises'],
    'paid_parking': ['paid_parking', 'paidparkingoffpremises','paidparkingonpremises','paid_garage'],
    'wifi': ['wifi', 'internet'],
    'bidet': ['bidet'],
    'oven': ['oven'],
    'cable': ['cable'],
    'sound_system': ['soundsystem', 'speaker'],
    'backyard': ['backyard', 'garden', 'yard'],
    'view': ['view'],
    'balcony': ['balcony'],
    'bbq': ['bbq'],
    'bathtub': ['bathtub'],
    'coffee_maker': ['coffee', 'espresso', 'keurig'],
    'excersise_equipment': ['excersise', 'freeweights', 'treadmill', 'elliptical', 'yoga', 'pilates','workoutbench','stationarybike'],
    'dryer': ['dryer', 'freedryer'],
    'washer': ['washer', 'freewasher'],
    'gym': ['gym'],
    'heating': ['heating','heated','centralheating'],
    'housekeeping_included': ['housekeeping', 'included', 'housekeepingavailable'],
    'housekeeping_extracost': ['housekeeping', 'extracost','housekeepingavailable'],
    'indoor_fireplace': ['indoorfireplace'],
    'paid_dryer_washer': ['paidwasher', 'paiddryer','laundromat'],
    'airconditioning': ['airconditioning', 'ac','centralairconditioning'],
    'kitchen': ['kitchen', 'kitchenette'],
    'game_console': ['gameconsole', 'playstation', 'xbox'],
    'clothing_storage': ['clothingstorage', 'closet', 'wardrobe', 'dresser'],
    'electric_car_charging': ['evcharger', 'freecarportonpremises'],
    'indoor_pool': ['indoorpool'],
    'outdoor_pool': ['outdoorpool','infinity','olympicsized','lappool','poolheathed'],
    'outdoor_space': ['outdoording','outdoorseating','outdoorshower','outdoorgrill','outdoorpatio','outdoorfirepit','outdoorbar','outdoorbarbecue','outdoorfurniture','outdoorplay'],
    'private_ameneties': ['private'],
    'shared_ameneties': ['shared'],
    'private_pool': ['privatepool'],
    'sauna': ['sauna','saunasteamroom','privatesauna'], 
    # Add more rules as needed...
}

In [123]:
import re

def aggregate_features(df, rules, strict_multi_word=None):
    """
    Aggregates binary features based on keyword matching in column names.
    
    - For `strict_multi_word` categories (e.g., housekeeping), ALL words must be present in the column name.
    - For everything else, a column matches if ANY keyword appears.
    
    Args:
        df (DataFrame): The dataset with binary dummy variables.
        rules (dict): Mapping of new feature names to keyword lists.
        strict_multi_word (list): List of category names that require ALL keywords in the column name.
    
    Returns:
        DataFrame: Updated dataset with aggregated features.
    """
    strict_multi_word = strict_multi_word or []  # Default to an empty list if None
    columns_to_drop = []  # List to track original columns for removal

    for new_col, keywords in rules.items():
        # Escape special regex characters in keywords
        patterns = [re.escape(k) for k in keywords]
        
        if new_col in strict_multi_word:
            # 🟢 Strict Mode: Ensure ALL keywords appear together (for housekeeping-related columns)
            matching_cols = [
                col for col in df.columns
                if all(re.search(p, col, flags=re.IGNORECASE) for p in patterns)
            ]
        else:
            # 🔵 Standard Mode: Match if ANY keyword appears (for all other features)
            pattern = r'(' + '|'.join(patterns) + r')'
            matching_cols = [
                col for col in df.columns
                if re.search(pattern, col, flags=re.IGNORECASE)
            ]
        
        print(f"\nCreating '{new_col}' from:")
        print(matching_cols)
        
        # Create new aggregated column
        if matching_cols:
            df[new_col] = df[matching_cols].any(axis=1).astype(int)
            columns_to_drop.extend(matching_cols)  # ✅ Track columns to remove
    
    print(f"Columns before dropping: {df.columns.tolist()}")
    
    # ✅ Drop the original detailed columns that were aggregated
    # ✅ FIX: Prevent Aggregated Columns from Being Dropped
    columns_to_drop = [col for col in columns_to_drop if col not in rules.keys()]
    df.drop(columns=columns_to_drop, inplace=True, errors="ignore")

    print(f"Final columns after dropping: {df.columns.tolist()}")
    
    return df


# 🔵 Housekeeping Columns Require **Both Words**
strict_categories = ['housekeeping_extra_cost', 'housekeeping_included']

# 🚀 Apply Aggregation
data = aggregate_features(data, aggregation_rules, strict_categories)

print(data.info())  # Shows data types and missing values
print(data.describe())  # Shows summary statistics for numeric columns


Creating 'shampoo' from:
['elvivedel_orealshampoo', 'johnson_sbabyshampoo', 'l_occitaneenprovenceshampoo', 'l_or_alprimerasmarcasshampoo', 'l_or_alshampoo', 'shampoo', 'shampoo', '3marcasdistintasshampoo', 'shampoo', 'shampoo', 'ademandashampoo', 'acondicionadorygeldeduchashampoo', 'aesopshampoo', 'almendraalmondsshampoo', 'almondshampoo', 'aloeverashampoo', 'alphaparfshampoo', 'alqvimiashampoo', 'amenitieshampoo', 'amenitiesdebienvenidashampoo', 'amenitiesdebienvenida_shampoo', 'amenitiesshampoo', 'amenityshampoo', 'anyshampoo', 'anyahshampoo', 'apivitashampoo', 'aussieshampoo', 'avellanashampoo', 'avonshampoo', 'azlshampoo', 'b_sicoshampoo', 'babariadecocoshampoo', 'basicoshampoo', 'bienvenueshampoo', 'bioshampoo', 'blancshampoo', 'blancashampoo', 'blancasshampoo', 'bodyplusshampoo', 'bonteshampoo', 'botededosotresdosisshampoo', 'camomilashampoo', 'carrefourshampoo', 'celad_pendshampoo', 'champ_deintensahidrataci_ntresemm_shampoo', 'champ_depeloschwarkopfosimilarshampoo', 'champ_par

In [124]:
# Find all duplicated column names
duplicate_columns = data.columns[data.columns.duplicated(keep=False)].tolist()

# Print value counts for each duplicate version of the column
for col in set(duplicate_columns):  # Ensure we check each unique name
    duplicate_indices = [i for i, name in enumerate(data.columns) if name == col]
    
    for index in duplicate_indices:
        print(f"\n🔹 Value Counts for '{col}' (Column Position {index}):")
        print(data.iloc[:, index].value_counts())


🔹 Value Counts for 'conditioner' (Column Position 11):
conditioner
0    18654
1     2296
Name: count, dtype: int64

🔹 Value Counts for 'conditioner' (Column Position 21):
conditioner
0    18654
1     2296
Name: count, dtype: int64

🔹 Value Counts for 'conditioner' (Column Position 23):
conditioner
0    18654
1     2296
Name: count, dtype: int64

🔹 Value Counts for 'conditioner' (Column Position 54):
conditioner
0    18654
1     2296
Name: count, dtype: int64

🔹 Value Counts for 'shampoo' (Column Position 13):
shampoo
0    20310
1      640
Name: count, dtype: int64

🔹 Value Counts for 'shampoo' (Column Position 14):
shampoo
0    20310
1      640
Name: count, dtype: int64

🔹 Value Counts for 'shampoo' (Column Position 20):
shampoo
0    20310
1      640
Name: count, dtype: int64

🔹 Value Counts for 'shampoo' (Column Position 24):
shampoo
0    20310
1      640
Name: count, dtype: int64

🔹 Value Counts for 'shampoo' (Column Position 158):
shampoo
0    20310
1      640
Name: count, dtype: i

In [125]:
# Remove duplicate columns, keeping only the first occurrence
data = data.loc[:, ~data.columns.duplicated(keep="first")]

In [128]:
data.columns.tolist()

['f_property_type',
 'f_room_type2',
 'f_neighbourhood_cleansed',
 'usd_price_day',
 'p_host_response_rate',
 'n_bathrooms',
 'n_minimum_nights',
 'n_beds',
 'n_days_since',
 '',
 'contienealimentosnuestros',
 'conditioner',
 'refrigerator',
 'shampoo',
 '25yearsold',
 '3daysaweekincludedwithyourstay',
 '510yearsold',
 'airconditioning',
 'and10_yearsold',
 'and510yearsold',
 'arcadegames',
 'backyard',
 'bakingsheet',
 'barbecueutensils',
 'bathtub',
 'bedlinens',
 'beko_frigor_ficonuevo',
 'bidet',
 'bikes',
 'blanca',
 'blender',
 'boardgames',
 'boatslip',
 'booksandreadingmaterial',
 'bowlingalley',
 'breadmaker',
 'breakfast',
 'buildingstaff',
 'carbonmonoxidealarm',
 'carrefour',
 'ceilingfan',
 'cerave',
 'champ',
 'cleaningavailableduringstay',
 'cleaningproducts',
 'climbingwall',
 'cookingbasics',
 'deliplus',
 'diningtable',
 'dishesandsilverware',
 'disney',
 'dovehidratante',
 'dove',
 'dryer',
 'dvdplayer',
 'elevator',
 'esunhornillopeque_oyel_ctrico',
 'essentials',
 

In [129]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20950 entries, Entire to Entire
Columns: 219 entries, f_property_type to private_pool
dtypes: float64(11), int64(199), object(9)
memory usage: 35.2+ MB


In [114]:
print(data.columns.get_loc('body_soap'))

194


In [92]:
dummies = list(data.columns[10:178]) + list(data.columns[194:-1])

In [130]:
import re

# 1) Define exact names to exclude (case-insensitive match).
excluded_exact = [
    "property_type", "room_type", "neighborhood", 
    "cleansed", "host_id", "price", "property_category"
]

# 2) Define prefixes to exclude (case-insensitive).
excluded_prefixes = ("f_", "usd_", "p_", "n_", "ln_", "flag")

# 3) Build a list of columns that should become dummies.
dummy_candidates = []
for col in data.columns:
    # Check exact-name exclusion
    if col.lower() in [ec.lower() for ec in excluded_exact]:
        continue
    
    # Check prefix exclusion (case-insensitive)
    # We'll lowercase the column name and see if it starts with any prefix in `excluded_prefixes`.
    col_lower = col.lower()
    if any(col_lower.startswith(pref) for pref in excluded_prefixes):
        continue
    
    # If the column made it here, it's a valid dummy candidate
    dummy_candidates.append(col)

# 4) Create new columns with 'd_' prefix for each dummy candidate
for col in dummy_candidates:
    # Optionally sanitize the name (remove slashes, spaces, dashes, "(s)"), 
    # or just keep `col` as is if you only want to add `d_`.
    cleaned_col = re.sub(r"/|\s|-", "", col).replace("(s)", "s").lower()
    
    # Dummy column name
    d_col_name = f"d_{cleaned_col}"
    
    # Copy over the data from the original column
    data[d_col_name] = data[col]

# (Optional) Drop the original columns if you only want the new d_ versions:
# data.drop(columns=dummy_candidates, inplace=True)

In [93]:
for col in dummies:
    data["d_" + (re.sub("/|\s|-", "", col)).replace("(s)", "s").lower()] = data[col]

In [131]:
data.columns.to_list()

['f_property_type',
 'f_room_type2',
 'f_neighbourhood_cleansed',
 'usd_price_day',
 'p_host_response_rate',
 'n_bathrooms',
 'n_minimum_nights',
 'n_beds',
 'n_days_since',
 '',
 'contienealimentosnuestros',
 'conditioner',
 'refrigerator',
 'shampoo',
 '25yearsold',
 '3daysaweekincludedwithyourstay',
 '510yearsold',
 'airconditioning',
 'and10_yearsold',
 'and510yearsold',
 'arcadegames',
 'backyard',
 'bakingsheet',
 'barbecueutensils',
 'bathtub',
 'bedlinens',
 'beko_frigor_ficonuevo',
 'bidet',
 'bikes',
 'blanca',
 'blender',
 'boardgames',
 'boatslip',
 'booksandreadingmaterial',
 'bowlingalley',
 'breadmaker',
 'breakfast',
 'buildingstaff',
 'carbonmonoxidealarm',
 'carrefour',
 'ceilingfan',
 'cerave',
 'champ',
 'cleaningavailableduringstay',
 'cleaningproducts',
 'climbingwall',
 'cookingbasics',
 'deliplus',
 'diningtable',
 'dishesandsilverware',
 'disney',
 'dovehidratante',
 'dove',
 'dryer',
 'dvdplayer',
 'elevator',
 'esunhornillopeque_oyel_ctrico',
 'essentials',
 