#### airbnb cleaning file 

final data: airbnb_x_workfile.csv 

In [None]:
import pandas as pd
import os
import warnings
import re
import sys
from datetime import datetime
from pathlib import Path
import numpy as np
from skimpy import skim

warnings.filterwarnings("ignore")

In [38]:
print(os.getcwd()) 

/Users/ghadena/Desktop/Business analytics/DA3/DA3/assignment_1


In [39]:
data_dir = "/Users/ghadena/Desktop/Business analytics/DA3/DA3/assignment_1/"

# location of folders
data_in = data_dir + "data/raw/"
data_out = data_dir + "data/clean/"

In [40]:
# zero step
data = pd.read_csv(data_in + "barcelona.csv", delimiter=",", dtype="unicode")
drops = [
    "host_thumbnail_url",
    "host_picture_url",
    "listing_url",
    "picture_url",
    "host_url",
    "last_scraped",
    "description",
    "neighborhood_overview",
    "host_about",
    "host_response_time",
    "name",
    "host_location",
]
data.drop(columns=drops, inplace=True)
data.to_csv(data_in + "airbnb_barcelona.csv", index=False)

In [41]:
# opening dataset
df = pd.read_csv(data_in + "airbnb_barcelona.csv", delimiter=",")

In [42]:

# Step 1: Identify the Review Columns and Add the 'accommodates' Column
review_columns = [col for col in df.columns if 'review' in col.lower()]
columns_to_extract = review_columns + ['accommodates']
print(f"Columns to extract: {columns_to_extract}")

# Step 2: Extract the Columns from the Original DataFrame
review_df = df[columns_to_extract].copy()

Columns to extract: ['number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month', 'accommodates']


In [43]:
# drop broken lines - where id is not a character of numbers
df.id = pd.to_numeric(df.id, errors="coerce")
df = df[df.id.notna()]

In [44]:
# display the class and type of each columns
df.dtypes

id                                                int64
scrape_id                                         int64
source                                           object
host_id                                           int64
host_name                                        object
                                                 ...   
calculated_host_listings_count                    int64
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
Length: 63, dtype: object

In [45]:
#####################
# formatting columns
for perc in ["host_response_rate", "host_acceptance_rate"]:
    df[perc] = pd.to_numeric(df[perc], errors="coerce")

In [46]:
#remove dollar sign from price
if 'price' in df.columns:
    df['price'] = df['price'].dropna().astype(str).str.replace(r"\$", "", regex=True)
    df['price'] = pd.to_numeric(df['price'], errors="coerce")

# Check results
print(df['price'].head(10))
print(df['price'].isnull().sum())  # Count of remaining null values
print(df['price'].dtype)  # Should be float64

0    119.0
1    219.0
2     70.0
3    118.0
4    203.0
5     46.0
6     70.0
7    130.0
8    100.0
9      NaN
Name: price, dtype: float64
3895
float64


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19455 entries, 0 to 19454
Data columns (total 63 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            19455 non-null  int64  
 1   scrape_id                                     19455 non-null  int64  
 2   source                                        19455 non-null  object 
 3   host_id                                       19455 non-null  int64  
 4   host_name                                     19453 non-null  object 
 5   host_since                                    19453 non-null  object 
 6   host_response_rate                            0 non-null      float64
 7   host_acceptance_rate                          0 non-null      float64
 8   host_is_superhost                             19016 non-null  object 
 9   host_neighbourhood                            9644 non-null  

In [48]:
# format binary variables
for binary in [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "instant_bookable",
]:
    df[binary] = df[binary].map({"t": True, "f": False})

In [49]:
# amenities
df["amenities"] = df["amenities"].str.strip("{}").str.replace('"', "").str.split(",")

In [51]:

# Standardize amenities before creating dummies
df["amenities"] = (
    df["amenities"]
    .astype(str)  # Ensure it's a string
    .str.lower()  # Convert to lowercase for consistency
    .str.replace(r"[\[\]\{\}]", "", regex=True)  # Remove brackets
    .str.replace(r"\s+", " ", regex=True)  # Remove extra spaces
    .str.strip()  # Trim spaces at the beginning/end
)

# Convert amenities into separate columns
dummies = pd.get_dummies(df["amenities"].apply(lambda x: pd.Series(x.split(","))).stack()).groupby(level=0).sum()

# Concatenate with original DataFrame
df = pd.concat([df, dummies], axis=1)

# Drop the original 'amenities' column
df.drop(columns=["amenities"], inplace=True, errors="ignore")

In [52]:
# Check results
print(df.columns)

Index(['id', 'scrape_id', 'source', 'host_id', 'host_name', 'host_since',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_neighbourhood',
       ...
       ''smoke alarm'', ''smoking allowed'', ''stainless steel oven'',
       ''standard oven'', ''teka refrigerator'', ''todas shampoo'',
       ''tv with standard cable'', ''tv'', ''washer'', ''wifi''],
      dtype='object', length=2412)


In [53]:
df.columns.tolist()

['id',
 'scrape_id',
 'source',
 'host_id',
 'host_name',
 'host_since',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'calendar_updated',
 'has_availability',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'calendar_last_scraped',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'number_of_reviews_l30d',
 'first_review',
 'last_review',
 'review_scores_rating',
 're

In [54]:
import pandas as pd
import re

def clean_column_name(col):
    # Remove D_ prefix and any quotes/backslashes
    cleaned = re.sub(r'^d_[\'"]?|[\'"]$', '', col, flags=re.IGNORECASE)
    
    # Decode Unicode characters and special symbols
    cleaned = re.sub(r'\\u[0-9a-f]{4}', lambda m: bytes(m.group(0), 'utf-8').decode('unicode_escape'), cleaned)
    cleaned = re.sub(r'[^a-zA-Z0-9]', ' ', cleaned)  # Replace special chars with spaces
    
    # Standardize text
    cleaned = cleaned.lower().strip()
    cleaned = re.sub(r'\s+', '_', cleaned)  # Convert spaces to underscores
    
    return cleaned

# Apply to all columns
cleaned_columns = [clean_column_name(col) for col in df.columns]
df.columns = cleaned_columns 

In [55]:
aggregation_rules = {
    'shampoo': ['shampoo'],
    'conditioner': ['conditioner'],
    'body_soap': ['body_soap'],
    'tv': ['tv', 'hdtv', 'television'],
    'streaming_services': ['netflix', 'amazon_prime', 'hulu', 'disney+', 'streaming','appletv', 'chromecast','hbomax'],
    'refrigerator': ['refrigerator', 'fridge'],
    'stove': ['stove'],
    'child_friendly': ['child', 'baby', 'infant', 'toddler', 'high_chair', 'crib', 'babybath', 'babymonitor', 'changing_table', 'children', 'kids', 'family'],
    'free_parking': ['free_parking', 'freeresidentialgarageonpremises', 'freestreetparking', 'freeparkingonpremises'],
    'paid_parking': ['paid_parking', 'paidparkingoffpremises','paidparkingonpremises','paid_garage'],
    'wifi': ['wifi', 'internet'],
    'bidet': ['bidet'],
    'oven': ['oven'],
    'cable': ['cable'],
    'sound_system': ['sound_system', 'speaker'],
    'backyard': ['backyard', 'garden', 'yard'],
    'view': ['view'],
    'balcony': ['balcony'],
    'bbq': ['bbq'],
    'bathtub': ['bathtub'],
    'coffee_maker': ['coffee', 'espresso', 'keurig'],
    'exercise_equipment': ['exercise', 'free_weights', 'treadmill', 'elliptical', 'yoga', 'pilates','workout_bench','stationary_bike'],
    'dryer': ['dryer', 'free_dryer'],
    'washer': ['washer', 'free_washer'],
    'gym': ['gym'],
    'heating': ['heating','heated','central_heating'],
    'housekeeping_included': ['housekeeping', 'included', 'housekeeping_available'],
    'housekeeping_extracost': ['housekeeping', 'extra_cost','housekeeping_available'],
    'indoor_fireplace': ['indoor_fireplace'],
    'paid_dryer_washer': ['paid_washer', 'paid_dryer','laundromat'],
    'airconditioning': ['air_conditioning','central_airconditioning'],
    'kitchen': ['kitchen', 'kitchenette'],
    'game_console': ['game_console', 'playstation', 'xbox'],
    'clothing_storage': ['clothing_storage', 'closet', 'wardrobe', 'dresser'],
    'electric_car_charging': ['ev_charger', 'free_carport_on_premises'],
    'indoor_pool': ['indoor_pool'],
    'outdoor_pool': ['outdoor_pool','infinity','olympic_sized','lap_pool','pool_heathed'],
    'outdoor_space': ['outdoor_dining','outdoor_seating','outdoor_shower','outdoor_grill','outdoor_patio','outdoor_firepit','outdoor_bar','outdoor_barbecue','outdoor_furniture','outdoor_play'],
    'private_ameneties': ['private'],
    'shared_ameneties': ['shared'],
    'private_pool': ['private_pool'],
    'sauna': ['sauna','sauna_steam_room','private_sauna'], 
    # Add more rules as needed...
}

In [56]:
import re

def aggregate_features(df, rules, strict_multi_word=None):
    """
    Aggregates binary features based on keyword matching in column names.
    
    - For `strict_multi_word` categories (e.g., housekeeping), ALL words must be present in the column name.
    - For everything else, a column matches if ANY keyword appears.
    
    Args:
        df (DataFrame): The dataset with binary dummy variables.
        rules (dict): Mapping of new feature names to keyword lists.
        strict_multi_word (list): List of category names that require ALL keywords in the column name.
    
    Returns:
        DataFrame: Updated dataset with aggregated features.
    """
    strict_multi_word = strict_multi_word or []  # Default to an empty list if None
    columns_to_drop = []  # List to track original columns for removal

    for new_col, keywords in rules.items():
        # Escape special regex characters in keywords
        patterns = [re.escape(k) for k in keywords]
        
        if new_col in strict_multi_word:
            # 🟢 Strict Mode: Ensure ALL keywords appear together (for housekeeping-related columns)
            matching_cols = [
                col for col in df.columns
                if all(re.search(p, col, flags=re.IGNORECASE) for p in patterns)
            ]
        else:
            # 🔵 Standard Mode: Match if ANY keyword appears (for all other features)
            pattern = r'(' + '|'.join(patterns) + r')'
            matching_cols = [
                col for col in df.columns
                if re.search(pattern, col, flags=re.IGNORECASE)
            ]
        
        print(f"\nCreating '{new_col}' from:")
        print(matching_cols)
        
        # Create new aggregated column
        if matching_cols:
            df[new_col] = df[matching_cols].any(axis=1).astype(int)
            columns_to_drop.extend(matching_cols)  # ✅ Track columns to remove
    
    print(f"Columns before dropping: {df.columns.tolist()}")
    
    # ✅ Drop the original detailed columns that were aggregated
    # ✅ FIX: Prevent Aggregated Columns from Being Dropped
    columns_to_drop = [col for col in columns_to_drop if col not in rules.keys()]
    df.drop(columns=columns_to_drop, inplace=True, errors="ignore")

    print(f"Final columns after dropping: {df.columns.tolist()}")
    
    return df

# 🔵 Housekeeping Columns Require **Both Words**
strict_categories = ['housekeeping_extra_cost', 'housekeeping_included']

# 🚀 Apply Aggregation
df = aggregate_features(df, aggregation_rules, strict_categories)

print(df.info())  # Shows data types and missing values
print(df.describe())  # Shows summary statistics for numeric columns


Creating 'shampoo' from:
['l_occitane_shampoo', 'l_oreal_shampoo', 'l_oreal_shampoo', 'shampoo', '0_shampoo', '1_bote_grande_basico_al_entrar_no_se_repone_despues_shampoo', 'loe_vera_shampoo', 'actahotels_shampoo', 'aloe_vera_shampoo', 'amenities_rituals_shampoo', 'amenities_shampoo', 'any_shampoo', 'anyah_shampoo', 'argan_essencial_oil_shampoo', 'argan_oil_essence_shampoo', 'argan_oil_shampoo', 'atalis_shampoo', 'aveda_shampoo', 'basic_shampoo', 'basic_neutro_shampoo', 'blanca_shampoo', 'bont_or_similar_shampoo_conditioner_shampoo', 'bp_shampoo', 'c_o_bigelow_lavender_amenities_shampoo', 'c_o_bigelow_lavender_amenities_shampoo', 'champ_shampoo', 'champu_de_alta_calidad_shampoo', 'cien_shampoo', 'clarins_shampoo', 'corte_ingl_s_shampoo', 'cowshed_shampoo', 'dave_shampoo', 'de_ducha_shampoo', 'deli_plus_shampoo', 'deliplus_shampoo', 'delyplus_shampoo', 'depende_shampoo', 'dermatloico_shampoo', 'desconocido_shampoo', 'dove_shampoo', 'eco_shampoo', 'ecoderma_shampoo', 'ecol_gico_shampoo'

In [57]:
df.columns.to_list()

['id',
 'scrape_id',
 'source',
 'host_id',
 'host_name',
 'host_since',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'calendar_updated',
 'has_availability',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'calendar_last_scraped',
 'license',
 'instant_bookable',
 'calculated_host_listings_count',
 'calculated_host_listings_count_entire_homes',
 'body_soap',
 'body_

In [58]:
df.drop(columns=[''], inplace=True)



In [23]:
# write csv
#df.to_csv(data_out + "airbnb_madrid_cleaned.csv", index=False)

In [59]:
df.view.value_counts()
#df.airconditioning.value_counts()

view
1    14720
0     4735
Name: count, dtype: int64

In [60]:
import pandas as pd

# Assuming df is your original DataFrame


# Step 3: Merge the Extracted Columns Back into the Cleaned DataFrame
df = pd.concat([df, review_df], axis=1)

print("Merged the review columns and 'accommodates' column back into the cleaned DataFrame.")

Merged the review columns and 'accommodates' column back into the cleaned DataFrame.


In [61]:
import numpy as np
# Function to check if an element is multidimensional
def is_multidimensional(x):
    return isinstance(x, (list, tuple, np.ndarray))

# Apply the function to each element in the DataFrame
multidimensional_columns = df.applymap(is_multidimensional).any()

# Print columns with multidimensional data
print("Columns with multidimensional data:")
print(multidimensional_columns[multidimensional_columns].index.tolist())

# Inspect the dimensions of the multidimensional columns
for col in multidimensional_columns[multidimensional_columns].index:
    print(f"Column '{col}' dimensions:")
    print(df[col].apply(lambda x: np.array(x).shape if is_multidimensional(x) else (1,)).value_counts())

Columns with multidimensional data:
[]


In [62]:
import pandas as pd

# Assuming df is your original DataFrame

# Identify duplicated columns
duplicate_columns = df.columns[df.columns.duplicated(keep='first')]

# Drop the first instance of each duplicated column
df = df.loc[:, ~df.columns.duplicated(keep='first')]

print(f"Dropped the first instance of each duplicate column: {duplicate_columns.tolist()}")

Dropped the first instance of each duplicate column: ['body_soap', 'body_soap', 'body_soap', 'conditioner', 'dove', 'refrigerator', 'shampoo', 'backyard', 'bathtub', 'beach_access_beachfront', 'bed_linens', 'blender', 'books_and_reading_material', 'breakfast', 'carbon_monoxide_alarm', 'ceiling_fan', 'conditioner', 'dedicated_workspace', 'dryer', 'elevator', 'essentials', 'exercise_equipment', 'exterior_security_cameras_on_property', 'fire_extinguisher', 'first_aid_kit', 'free_street_parking', 'freezer', 'hangers', 'heating', 'host_greets_you', 'hot_water_kettle', 'hot_water', 'kitchen', 'lock_on_bedroom_door', 'long_term_stays_allowed', 'luggage_dropoff_allowed', 'pets_allowed', 'pool', 'portable_heater', 'private_pool', 'refrigerator', 'room_darkening_shades', 'self_check_in', 'smart_lock', 'smoke_alarm', 'smoking_allowed', 'tv', 'washer', 'wifi', 'accommodates']


In [63]:
df.shampoo

0        1
1        0
2        1
3        0
4        1
        ..
19450    1
19451    1
19452    1
19453    1
19454    0
Name: shampoo, Length: 19455, dtype: int64

In [64]:
# Identify duplicated columns
duplicate_columns = df.columns[df.columns.duplicated()]
duplicate_columns

Index([], dtype='object')

In [65]:
# Filter columns that contain the word "review"
review_columns = [col for col in df.columns if 'review' in col.lower()]

print("Columns that might contain review data:")
print(review_columns)

Columns that might contain review data:
['number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month']


In [66]:
df.shape

(19455, 266)

In [67]:
df['accommodates']

0        8
1        5
2        6
3        2
4        9
        ..
19450    6
19451    6
19452    3
19453    3
19454    2
Name: accommodates, Length: 19455, dtype: int64

In [68]:
print(df.view.value_counts())
print(df.airconditioning.value_counts())


view
1    14720
0     4735
Name: count, dtype: int64
airconditioning
1    12426
0     7029
Name: count, dtype: int64


In [69]:
df.shape

(19455, 266)

In [70]:
df.to_csv(data_out + "airbnb_barcelona.csv", index=False)

## Clean part 2 

In [None]:

data_dir = "/Users/ghadena/Desktop/Business analytics/DA3/DA3/assignment_1/data/clean"
path = os.path.join(data_dir,'airbnb_barcelona.csv') 
data = pd.read_csv(path)


In [None]:
import pandas as pd

# Define keyword-based mapping
property_mappings = {
    "Apartment": ["rental unit", "condo", "loft"],
    "House": ["home", "townhouse", "vacation home", "guesthouse", "chalet", "cabin", "bungalow", "villa", "cottage"],
    "Hotel/Serviced Stay": ["hotel", "serviced apartment", "boutique hotel", "bed and breakfast", "guest suite", "pension", "resort"],
    "Hostel": ["hostel"],
    "Traditional Stay": ["casa particular", "ryokan", "minsu"],
    "Alternative Stay": ["camper", "rv", "tiny home", "tent", "yurt", "cave", "dome", "barn", "earthen home", "religious building", "hut", "floor"],
    "Unknown": ["shared room", "entire place", "private room"],  # General category for shared spaces
}

# Function to assign property category
def categorize_property(property_type):
    property_type = str(property_type).lower()  # Ensure lowercase for comparison
    for category, keywords in property_mappings.items():
        if any(keyword in property_type for keyword in keywords):
            return category
    return "Unknown"  # Default category if no match is found

# Apply mapping function
data["property_category"] = data["property_type"].apply(categorize_property)

# Display the updated categories
property_category_counts = data["property_category"].value_counts()
print("Property Category Counts:\n", property_category_counts)

# Check remaining "Unknown" values
print("Property types classified as 'Unknown':")
print(data[data["property_category"] == "Unknown"]["property_type"].value_counts().head(20))

In [None]:
# Extract Room Type (Entire, Private, Shared)
data["f_room_type"] = data["property_type"].str.extract(r"^(Entire|Private room|Shared room)")
# Fill missing values with "Unknown" before mapping
data["f_room_type"].fillna("Unknown", inplace=True)
print(data["f_room_type"].value_counts())

In [None]:
#room type as factors 
data["f_property_type"] = data["property_category"].astype("category")
data["f_room_type"] = data["f_room_type"].astype("category")
data["f_room_type2"] = data["f_room_type"].map(
    {
        "Entire": "Entire/Apt",
        "Private room": "Private",
        "Shared room": "Shared",
        "Unknown": "Unknown"
    }
)
data["f_neighbourhood_cleansed"] = data["neighbourhood_cleansed"].astype("category")

## create numerical variables 

In [None]:
data["usd_price_day"] = data["price"]
data["p_host_response_rate"] = data["host_response_rate"].fillna(0).astype(int)
numericals = [
    "accommodates",
    "bathrooms",
    "review_scores_rating",
    "number_of_reviews",
    "reviews_per_month",
    "minimum_nights",
    "beds",
]
for col in numericals:
    data["n_" + col] = pd.to_numeric(data[col], errors="coerce")

In [None]:
data["n_days_since"] = (
    data.calendar_last_scraped.apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
    - data.first_review.fillna("1950-01-01").apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d")
    )
).dt.days

data["n_days_since"] = np.where(data.first_review.isnull(), np.nan, data.n_days_since)


In [None]:
# rename dummy cols 
data.columns.get_loc("shared_ameneties")
dummies = data.columns[47:254]
for col in dummies:
    data["d_" + (re.sub("/|\s|-", "", col)).replace("(s)", "s").lower()] = data[col]


In [None]:
#Keep columns if contain d_, n_,f_, p_, usd_ and some others
data = pd.concat(
    [
        data.filter(
            regex="^d_.*|^n_.*|^f_.*|^p_.*|^usd_.*",
        ),
        data[
            [
                "price",
                "host_id",
                "neighbourhood_cleansed",
                "room_type",
                "property_type",
            ]
        ],
    ],
    axis=1,
)


In [None]:
skim(data["price"].to_frame())

In [None]:
data["price"].hist()

In [None]:
data["price"].apply(np.log).hist()

In [None]:
# Squares and further values to create
data = data.assign(
    n_accommodates2=lambda x: x["n_accommodates"] ** 2,
    ln_accommodates=lambda x: np.log(x["n_accommodates"]),
    ln_accommodates2=lambda x: np.log(x["n_accommodates"]) ** 2,
    ln_beds=lambda x: np.log(x["n_beds"]),
    ln_number_of_reviews=lambda x: np.log(x["n_number_of_reviews"] + 1),
)


In [None]:
#creating a category for bathrooms 
bins = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 10)], closed="left")
f_bath = pd.cut(data["n_bathrooms"], bins=bins, labels=[0, 1, 2])
data["f_bathroom"] = f_bath
print(data["f_bathroom"].value_counts(dropna=False))

In [None]:
# pool num of reviews 
bins = pd.IntervalIndex.from_tuples(
    [(0, 1), (1, 51), (51, max(data["n_number_of_reviews"]))], closed="left")
fnor = pd.cut(data["n_number_of_reviews"], bins=bins, labels=[0, 1, 2])
data["f_number_of_reviews"] = fnor
print(data["f_number_of_reviews"].value_counts(dropna=False))

In [None]:
#pool min nights 
print("Max n_minimum_nights:", data["n_minimum_nights"].max())
print("Summary Statistics:\n", data["n_minimum_nights"].describe())
print("Unique values in n_minimum_nights:", data["n_minimum_nights"].unique())
#i assume those 3 extra values are outliers, so i will remove them
data = data[data["n_minimum_nights"] < 365] 

In [None]:
bins = pd.IntervalIndex.from_tuples(
    [(1, 2), (2, 3), (3, max(data.n_minimum_nights))], closed="left"
)
f_min_n = pd.cut(data["n_minimum_nights"].to_list(), bins, labels=["1", "2", "3"])
data["f_minimum_nights"] = f_min_n
data["f_minimum_nights"].value_counts(dropna=False)

In [None]:
# chnage inf oto nans 
data = data.replace([np.inf, -np.inf], np.nan)

## handling msssing vars 

In [None]:
data.isnull().sum().loc[lambda x: x > 0]

What to do with missing values?

1. drop if no target

In [None]:
data = data.loc[lambda x: x["price"].notnull()]

2. imput when few, not that important

In [None]:
# Ensure 'n_accommodates' has no missing values before using it in 'n_beds'
data["n_accommodates"] = data["n_accommodates"].fillna(data["n_accommodates"].median())

# Fix categorical columns by adding "99" as a category before filling NaNs
for col in ["f_bathroom", "f_minimum_nights", "f_number_of_reviews"]:
    if col in data.columns and pd.api.types.is_categorical_dtype(data[col]):
        data[col] = data[col].cat.add_categories([99]).fillna(99)  # Add category & fill NaNs
    else:
        data[col] = data[col].fillna(99)  # If not categorical, fill normally

# Assign and fill missing values in a single step
data = data.assign(
    n_bathrooms=lambda x: x["n_bathrooms"].fillna(np.median(x["n_bathrooms"].dropna())),
    n_beds=lambda x: np.where(x["n_beds"].isnull(), x["n_accommodates"], x["n_beds"]),
    ln_beds=lambda x: x["ln_beds"].fillna(0),
)

In [None]:
data.isnull().sum().loc[lambda x: x > 0]

 4. Replace missing variables re reviews with zero, when no review + add flags

In [None]:
data = data.assign(
    flag_days_since=np.multiply(data["n_days_since"].isnull(), 1),
    n_days_since=data["n_days_since"].fillna(np.median(data["n_days_since"].dropna())),
    flag_review_scores_rating=np.multiply(data["n_review_scores_rating"].isnull(), 1),
    n_review_scores_rating=data["n_review_scores_rating"].fillna(
        np.median(data["n_review_scores_rating"].dropna())
    ),
    flag_reviews_per_month=np.multiply(data["n_reviews_per_month"].isnull(), 1),
    n_reviews_per_month=data["n_reviews_per_month"].fillna(
        np.median(data["n_reviews_per_month"].dropna())
    ),
    flag_n_number_of_reviews=np.multiply(data["n_number_of_reviews"].isnull(), 1),
)

In [None]:
data.flag_days_since.value_counts()



Redo features

Create variables, measuring the time since: squared, cubic, logs

In [None]:
data = data.assign(
    ln_days_since=lambda x: np.log(x["n_days_since"] + 1),
    ln_days_since2=lambda x: np.log(x["n_days_since"] + 1) ** 2,
    ln_days_since3=lambda x: np.log(x["n_days_since"] + 1) ** 3,
    n_days_since2=lambda x: x["n_days_since"] ** 2,
    n_days_since3=lambda x: x["n_days_since"] ** 3,
    ln_review_scores_rating=lambda x: np.log(x["n_review_scores_rating"]),
)


In [None]:
data["n_days_since"] = data["ln_days_since"].fillna(0)
data["ln_days_since2"] = data["ln_days_since2"].fillna(0)
data["ln_days_since3"] = data["ln_days_since3"].fillna(0)

In [None]:
data.isnull().sum().loc[lambda x: x > 0]

In [None]:
data.describe()


In [None]:
data.d_view.value_counts()

In [None]:
data.to_csv("data/barcelona_workfile.csv", index=False)