In [1]:
import pandas as pd
import numpy as np

In [None]:
# Read the listings and reviews tables from their `02_` parquet files and
# report the (rows, columns) shape of each.
lis_df, rev_df = map(pd.read_parquet, (
    'data/processed/02_listings.parquet',
    'data/processed/02_reviews.parquet',
))
print(lis_df.shape, rev_df.shape, sep=' \n ')

### 7. Handle Missing Values

#### Imputation for Listings

In [3]:
# Physical characteristics imputation
# Bathrooms - fill gaps with the median of the listing's room_type, then fall
# back to 1.0 wherever that group median is itself unavailable.
bathroom_group_median = lis_df.groupby('room_type')['bathrooms'].transform('median')
lis_df['bathrooms'] = lis_df['bathrooms'].fillna(bathroom_group_median).fillna(1.0)

# Bedrooms - missing values become 0 for shared/private rooms and the
# entire-home median for entire homes/apartments.
is_room_listing = lis_df['room_type'].isin(['Shared room', 'Private room'])
is_entire_home = lis_df['room_type'] == 'Entire home/apt'

lis_df.loc[is_room_listing, 'bedrooms'] = lis_df.loc[is_room_listing, 'bedrooms'].fillna(0)

# Compute the median once and fill directly instead of routing a Series-level
# lambda through Series.transform.
entire_home_median = lis_df.loc[is_entire_home, 'bedrooms'].median()
lis_df.loc[is_entire_home, 'bedrooms'] = (
    lis_df.loc[is_entire_home, 'bedrooms'].fillna(entire_home_median)
)
lis_df['bedrooms'] = lis_df['bedrooms'].fillna(0)  # Any remaining missing to 0

# Beds - when missing, fall back to the guest capacity (accommodates), but never below 1
lis_df['beds'] = lis_df['beds'].fillna(lis_df['accommodates'].clip(lower=1))

# Numeric imputation: each listed column gets a fixed fill value so that no
# numeric field is left NULL for the PostgreSQL numeric types.
# Percentages and review scores use 0 (rather than a mean/median); a missing
# host listing count is treated as a single listing.
numeric_cols_to_impute = dict(
    host_acceptance_rate_pct=0.0,
    review_scores_rating=0,
    review_scores_accuracy=0,
    review_scores_cleanliness=0,
    review_scores_checkin=0,
    review_scores_communication=0,
    review_scores_location=0,
    review_scores_value=0,
    reviews_per_month=0,
    host_total_listings_count=1,
)

for name in numeric_cols_to_impute:
    lis_df[name] = lis_df[name].fillna(value=numeric_cols_to_impute[name])

# Categorical imputation.
# Each entry maps a column to its fill value; the sentinel 'mode' means
# "fill with the column's most frequent value" instead of a literal string.
categorical_cols_to_impute = {
    'host_response_time': '',  # VARCHAR
    'host_neighbourhood': '',  # VARCHAR
    'bathrooms_text': '',  # VARCHAR
    'neighbourhood': 'mode',  # Keep mode for geographic consistency
    'neighbourhood_cleansed': 'mode',  # Keep mode for geographic consistency
    'host_location': 'Copenhagen, Denmark',  # Important location default
}

for col, strategy in categorical_cols_to_impute.items():
    if strategy == 'mode':
        modes = lis_df[col].mode()
        # mode() returns an empty Series when every value is missing; fall back
        # to an empty string instead of failing on the first-element lookup.
        fill_value = modes.iloc[0] if not modes.empty else ''
    else:
        fill_value = strategy
    lis_df[col] = lis_df[col].fillna(fill_value)

# Free-text (TEXT) columns: a missing value becomes an empty string
text_cols_to_impute = ['host_about', 'neighborhood_overview', 'description']
lis_df[text_cols_to_impute] = lis_df[text_cols_to_impute].fillna('')

# Date columns: the fill value is NaT, so a missing host_since, first_review
# or last_review stays missing (NULL) rather than receiving a concrete date.
current_date = pd.Timestamp.now().date()
for date_col in ('host_since', 'first_review', 'last_review'):
    lis_df[date_col] = lis_df[date_col].fillna(pd.NaT)

# Boolean flags: a missing value is treated as False so no flag is left NULL
boolean_cols = [
    'instant_bookable',
    'superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
]
for flag in boolean_cols:
    lis_df[flag] = lis_df[flag].fillna(value=False)

#### Imputation for Calendar

In [4]:
# Price imputation for calendar
# cal_df['price_USD'] = cal_df.groupby(['listing_id', 'available'])['price_USD'].transform(lambda x: x.fillna(x.median()))

# TODO: 0 may be a worse default than the median for missing prices; revisit before enabling this.
# cal_df['price_USD'] = cal_df['price_USD'].fillna(0)  # Default to 0 for any remaining NULL prices

# Handle minimum and maximum nights
# cal_df['minimum_nights'] = cal_df.groupby('listing_id')['minimum_nights'].transform(lambda x: x.fillna(x.median()))
# cal_df['maximum_nights'] = cal_df.groupby('listing_id')['maximum_nights'].transform(lambda x: x.fillna(x.median()))
# Default to 1 and 365 if still missing
# cal_df['minimum_nights'] = cal_df['minimum_nights'].fillna(1)
# cal_df['maximum_nights'] = cal_df['maximum_nights'].fillna(365)

#### Imputation for Reviews

In [5]:
# Reviews with no comment text get an empty string (TEXT field)
review_text = rev_df['comments']
rev_df['comments'] = review_text.fillna(value='')

In [None]:
# After all imputations, report which columns still have nulls
null_check = lis_df.isnull().sum()
print("\nColumns with remaining NULL values:")
print(null_check[null_check > 0])

# Fill whatever is still missing with a default chosen from the column dtype.
# pandas' dtype predicates are used instead of np.issubdtype / string
# comparison so that extension dtypes and datetime columns of any resolution
# or timezone are classified correctly.
for col in null_check[null_check > 0].index:
    dtype = lis_df[col].dtype

    if pd.api.types.is_bool_dtype(dtype):  # Boolean columns (checked first: bool also counts as numeric)
        lis_df[col] = lis_df[col].fillna(False)
    elif pd.api.types.is_numeric_dtype(dtype):  # Numeric columns
        lis_df[col] = lis_df[col].fillna(0)
    elif pd.api.types.is_datetime64_any_dtype(dtype):  # DateTime columns
        # Missing dates stay NaT; filling them with '' would turn the column
        # into strings and break later datetime arithmetic.
        continue
    else:  # String/object columns
        lis_df[col] = lis_df[col].fillna('')

In [None]:
# Report the total number of missing cells left in each table
remaining_nulls = {
    "\nListings nulls:": lis_df.isna().sum().sum(),
    "Reviews nulls:": rev_df.isna().sum().sum(),
}
print("\nVerifying no NULL values remain:")
for label, total in remaining_nulls.items():
    print(label, total)

### 8. Feature Engineering

In [8]:
# Snapshot the current column labels before any engineered features are added
original_columns = lis_df.columns.copy(deep=False)

In [None]:
# Print the column dtypes and non-null counts of the listings frame
lis_df.info()

In [10]:
# Listing Quality Indicators
# lis_df['is_superhost'] = lis_df['host_is_superhost']  # Already boolean
#lis_df['total_reviews'] = lis_df['number_of_reviews'] #TODO
# lis_df['avg_rating'] = lis_df['review_scores_rating'] #TODO 
# lis_df['review_frequency'] = lis_df['reviews_per_month']


In [11]:
# Location Features
COPENHAGEN_CENTER_LAT = 55.6761
COPENHAGEN_CENTER_LON = 12.5683
EARTH_RADIUS_KM = 6371.0088  # mean Earth radius

# Straight-line (great-circle) distance from each listing to the city centre,
# using the haversine formula. Scaling raw degree differences by a single
# constant overstates east-west distances, because a degree of longitude
# shrinks with cos(latitude).
listing_lat = np.radians(lis_df['latitude'])
listing_lon = np.radians(lis_df['longitude'])
center_lat = np.radians(COPENHAGEN_CENTER_LAT)
center_lon = np.radians(COPENHAGEN_CENTER_LON)

haversine_term = (
    np.sin((listing_lat - center_lat) / 2) ** 2
    + np.cos(center_lat) * np.cos(listing_lat) * np.sin((listing_lon - center_lon) / 2) ** 2
)
# Clip guards arcsin against floating-point values marginally outside [0, 1]
lis_df['distance_to_center_km'] = (
    2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(haversine_term.clip(lower=0, upper=1)))
)

# Neighborhood density: number of listings sharing the same neighbourhood_cleansed
lis_df['listings_in_neighborhood'] = lis_df.groupby('neighbourhood_cleansed')['id'].transform('count')

In [12]:
# Price Features
# Neighborhood price comparison: listing price relative to the mean price of
# its neighbourhood_cleansed group
lis_df['neighborhood_avg_price'] = lis_df.groupby('neighbourhood_cleansed')['price_DKK'].transform('mean')
lis_df['price_vs_neighborhood'] = lis_df['price_DKK'] / lis_df['neighborhood_avg_price']

# Room type price comparison: listing price relative to the mean price of its room_type
lis_df['room_type_avg_price'] = lis_df.groupby('room_type')['price_DKK'].transform('mean')
lis_df['price_vs_room_type'] = lis_df['price_DKK'] / lis_df['room_type_avg_price']

# Value indicators
# accommodates is clipped to at least 1 so a zero-capacity row cannot yield an
# infinite price per person.
lis_df['price_per_person'] = lis_df['price_DKK'] / lis_df['accommodates'].clip(lower=1)
# Non-positive or missing bedroom counts are treated as a single bedroom
lis_df['price_per_bedroom'] = lis_df['price_DKK'].div(lis_df['bedrooms'].where(lis_df['bedrooms'] > 0, 1))

# Host

In [13]:
# host_experience_years is derived from the review dates:
#   - first_review != last_review -> years between the two reviews
#   - first_review == last_review -> years elapsed since that review
#   - both dates missing          -> 0
SECONDS_PER_YEAR = 365.25 * 24 * 60 * 60

mask_diff_dates = lis_df['first_review'].ne(lis_df['last_review'])
mask_same_dates = lis_df['first_review'].eq(lis_df['last_review'])
mask_no_reviews = lis_df['last_review'].isna() & lis_df['first_review'].isna()

# Candidate values for every row; the masks below pick which one applies
review_span_years = (
    (lis_df['last_review'] - lis_df['first_review']).dt.total_seconds() / SECONDS_PER_YEAR
)
since_last_review_years = (
    (pd.Timestamp.now() - lis_df['last_review']).dt.total_seconds() / SECONDS_PER_YEAR
)

lis_df['host_experience_years'] = (
    review_span_years
    .where(mask_diff_dates, since_last_review_years)
    .mask(mask_no_reviews, 0)
)

In [14]:
# Response Quality: encode ordinal rating of each possible response time category
# lis_df['host_response_time']

In [15]:
# Host's total listing count relative to the number of listings in the same neighbourhood
host_portfolio_size = lis_df['host_total_listings_count']
neighbourhood_size = lis_df['listings_in_neighborhood']
lis_df['host_listings_ratio'] = host_portfolio_size.div(neighbourhood_size)

In [16]:
# Reviews per year of experience. A zero-year experience is treated as one
# year to avoid dividing by zero; an undefined result counts as 0 reviews per year.
years_active = lis_df['host_experience_years'].replace(0, 1)
lis_df['yearly_review'] = lis_df['number_of_reviews'].div(years_active).fillna(0)

# Variance across the review_scores_* columns of each listing: how consistent
# the scores received from guests are
review_score_cols = list(filter(lambda name: name.startswith('review_scores_'), lis_df.columns))
lis_df['review_score_variance'] = lis_df[review_score_cols].var(axis=1)

In [None]:
# Describe every column that was added since original_columns was captured
print("\nNew Feature Summary:")
new_features = lis_df.columns.difference(original_columns)
for feature in new_features:
    print(f"\n{feature}:", lis_df[feature].describe(), sep="\n")

# Count missing values per new feature
print("\nChecking for issues in new features:")
print(lis_df[new_features].isna().sum())

## Drop the first/last review dates and latitude/longitude once the derived features have been calculated

In [18]:
# Remove the raw coordinates. Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell re-runnable after the columns are gone.
lis_df = lis_df.drop(columns=['latitude', 'longitude'], errors='ignore')

In [19]:
# Remove the raw review dates. Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell re-runnable after the columns are gone.
lis_df = lis_df.drop(columns=['first_review', 'last_review'], errors='ignore')

### 9. Final Status Check

In [None]:
# Report the final (rows, columns) shape of both tables
for label, frame in (("Listings shape:", lis_df), ("Reviews shape:", rev_df)):
    print(label, frame.shape)

### 10. Save Processed Datasets

In [21]:
# Write both tables to their `03_` parquet files
for frame, destination in (
    (lis_df, 'data/processed/03_listings.parquet'),
    (rev_df, 'data/processed/03_reviews.parquet'),
):
    frame.to_parquet(destination)