# 02: Feature Engineering â€” NYC Airbnb

Goal: Create meaningful features (listing, location, temporal, demand, competition, host) from the cleaned NYC dataset (`nyc_clean.parquet`).


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

# Paths
PROJECT_ROOT = Path("..") if Path.cwd().name == "notebooks" else Path(".")
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Processed data dir:", DATA_PROCESSED)


Project root: .
Processed data dir: data/processed


In [2]:
clean_path = DATA_PROCESSED / "nyc_clean.parquet"
print("Loading:", clean_path)

nyc = pd.read_parquet(clean_path)
print(f"Shape: {nyc.shape}")
nyc.head()


Loading: data/processed/nyc_clean.parquet
Shape: (48602, 16)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [4]:
# Start from a copy of nyc
df = nyc.copy()

# 1. Basic numeric transforms
df["log_price"] = np.log(df["price"])
df["min_nights_log"] = np.log1p(df["minimum_nights"])

# 2. Review-related features
df["has_reviews"] = (df["number_of_reviews"] > 0).astype(int)
df["reviews_per_month_filled"] = df["reviews_per_month"].fillna(0.0)

# 3. Host-related counts
df["host_listing_count"] = df["calculated_host_listings_count"]

# 4. Simple availability features
df["is_fully_available"] = (df["availability_365"] >= 350).astype(int)
df["is_rarely_available"] = (df["availability_365"] <= 30).astype(int)

# Quick sanity check
df[[
    "price", "log_price", "minimum_nights", "min_nights_log",
    "number_of_reviews", "has_reviews", "reviews_per_month_filled",
    "host_listing_count", "availability_365",
    "is_fully_available", "is_rarely_available"
]].head()


Unnamed: 0,price,log_price,minimum_nights,min_nights_log,number_of_reviews,has_reviews,reviews_per_month_filled,host_listing_count,availability_365,is_fully_available,is_rarely_available
0,149,5.003946,1,0.693147,9,1,0.21,6,365,1,0
1,225,5.4161,1,0.693147,45,1,0.38,2,355,1,0
2,150,5.010635,3,1.386294,0,0,0.0,1,365,1,0
3,89,4.488636,1,0.693147,270,1,4.64,1,194,0,0
4,80,4.382027,10,2.397895,9,1,0.1,1,0,0,1


In [5]:
# Borough one-hot encoding
borough_dummies = pd.get_dummies(df["neighbourhood_group"], prefix="borough")
df = pd.concat([df, borough_dummies], axis=1)

# Neighbourhood-level price stats
neigh_stats = (
    df.groupby("neighbourhood")["price"]
      .agg(["mean", "median", "count"])
      .rename(columns={
          "mean": "neigh_price_mean",
          "median": "neigh_price_median",
          "count": "neigh_listing_count"
      })
)

df = df.merge(neigh_stats, on="neighbourhood", how="left")

# Price premium/discount vs neighbourhood mean
df["price_premium_vs_neigh"] = (
    (df["price"] - df["neigh_price_mean"]) / df["neigh_price_mean"]
)

df[[
    "neighbourhood_group", "neighbourhood",
    "price", "neigh_price_mean", "neigh_price_median",
    "neigh_listing_count", "price_premium_vs_neigh"
]].head()


Unnamed: 0,neighbourhood_group,neighbourhood,price,neigh_price_mean,neigh_price_median,neigh_listing_count,price_premium_vs_neigh
0,Brooklyn,Kensington,149,92.885714,70.0,175,0.604122
1,Manhattan,Midtown,225,265.693316,208.5,1526,-0.153159
2,Manhattan,Harlem,150,114.110524,89.0,2651,0.314515
3,Brooklyn,Clinton Hill,89,149.460457,120.0,569,-0.404525
4,Manhattan,East Harlem,80,120.560647,99.0,1113,-0.336434


In [7]:
# Convert last_review to datetime
df["last_review_date"] = pd.to_datetime(df["last_review"], errors="coerce")

# Review year and recency flags
df["last_review_year"] = df["last_review_date"].dt.year

# Was reviewed in the last 2 years? (relative to max date in data)
max_review_date = df["last_review_date"].max()
df["days_since_last_review"] = (max_review_date - df["last_review_date"]).dt.days
df["days_since_last_review"] = df["days_since_last_review"].fillna(df["days_since_last_review"].max())

df["recent_review"] = (df["days_since_last_review"] <= 365).astype(int)

df[[
    "last_review", "last_review_date",
    "last_review_year", "days_since_last_review", "recent_review"
]].head(10)


Unnamed: 0,last_review,last_review_date,last_review_year,days_since_last_review,recent_review
0,2018-10-19,2018-10-19,2018.0,262.0,1
1,2019-05-21,2019-05-21,2019.0,48.0,1
2,,NaT,,3024.0,0
3,2019-07-05,2019-07-05,2019.0,3.0,1
4,2018-11-19,2018-11-19,2018.0,231.0,1
5,2019-06-22,2019-06-22,2019.0,16.0,1
6,2017-10-05,2017-10-05,2017.0,641.0,0
7,2019-06-24,2019-06-24,2019.0,14.0,1
8,2017-07-21,2017-07-21,2017.0,717.0,0
9,2019-06-09,2019-06-09,2019.0,29.0,1


In [8]:
# Demand proxies
df["high_review_count"] = (df["number_of_reviews"] >= df["number_of_reviews"].median()).astype(int)
df["is_available_next_year"] = (df["availability_365"] > 0).astype(int)

# Host type: multi-listing vs single
df["is_multi_listing_host"] = (df["host_listing_count"] > 1).astype(int)

df[[
    "number_of_reviews", "high_review_count",
    "availability_365", "is_available_next_year",
    "host_listing_count", "is_multi_listing_host"
]].head()


Unnamed: 0,number_of_reviews,high_review_count,availability_365,is_available_next_year,host_listing_count,is_multi_listing_host
0,9,1,365,1,6,1
1,45,1,355,1,2,1
2,0,0,365,1,1,0
3,270,1,194,1,1,0
4,9,1,0,0,1,0


In [12]:
# Target
target_col = "log_price"

# Start from all numeric columns except id-like
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
exclude = ["id", "host_id", "price"]  # we use log_price instead of price
feature_cols = [c for c in numeric_cols if c not in exclude]

print("Number of features:", len(feature_cols))
feature_cols[:20]  # peek


Number of features: 24


['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365',
 'log_price',
 'min_nights_log',
 'has_reviews',
 'reviews_per_month_filled',
 'host_listing_count',
 'is_fully_available',
 'is_rarely_available',
 'neigh_price_mean',
 'neigh_price_median',
 'neigh_listing_count',
 'price_premium_vs_neigh',
 'last_review_year',
 'days_since_last_review']

In [13]:
# Ensure target is not in feature_cols
target_col = "log_price"
feature_cols = [c for c in feature_cols if c != target_col]

print("Number of features (cleaned):", len(feature_cols))
feature_cols


Number of features (cleaned): 23


['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365',
 'min_nights_log',
 'has_reviews',
 'reviews_per_month_filled',
 'host_listing_count',
 'is_fully_available',
 'is_rarely_available',
 'neigh_price_mean',
 'neigh_price_median',
 'neigh_listing_count',
 'price_premium_vs_neigh',
 'last_review_year',
 'days_since_last_review',
 'recent_review',
 'high_review_count',
 'is_available_next_year',
 'is_multi_listing_host']

In [14]:
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

X = df[feature_cols].copy()
y = df[target_col].copy()

print("X shape:", X.shape)
print("y shape:", y.shape)

features_path = DATA_PROCESSED / "features.parquet"
df_out = df[feature_cols + [target_col]].copy()
df_out.to_parquet(features_path, index=False)

print(f"\nSaved features to: {features_path}")
print(f"Rows: {df_out.shape[0]}, Cols: {df_out.shape[1]}")


X shape: (48602, 23)
y shape: (48602,)

Saved features to: data/processed/features.parquet
Rows: 48602, Cols: 24
