In [1]:
import pandas as pd
import numpy as np

In [2]:
lis_df = pd.read_parquet('data/processed/02_listings.parquet')
rev_df = pd.read_parquet('data/processed/02_reviews.parquet')
print(lis_df.shape,'\n',rev_df.shape)

(20905, 66) 
 (366636, 5)


In [3]:
# Drop rows where price_DKK is missing, keep all other columns
#lis_df = lis_df.dropna(subset=['price_DKK'])

# Drop listings with same first/last review dates or no reviews
# lis_df = lis_df[
#     (lis_df['first_review'] != lis_df['last_review']) & 
#     ~(lis_df[['first_review', 'last_review']].isnull().all(axis=1))
# ]
# print(lis_df.shape,'\n',rev_df.shape)

### 7. Handle Missing Values

#### Imputation for Listings

In [4]:
# Physical characteristics imputation - revised approach
# Bathrooms - keep room_type median as it must exist
lis_df['bathrooms'] = lis_df.groupby('room_type')['bathrooms'].transform(lambda x: x.fillna(x.median()))
lis_df['bathrooms'] = lis_df['bathrooms'].fillna(1.0)  # Fallback to 1 if still missing

# Bedrooms - set to 0 for shared/studio, impute for homes
lis_df.loc[lis_df['room_type'].isin(['Shared room', 'Private room']), 'bedrooms'] = \
    lis_df.loc[lis_df['room_type'].isin(['Shared room', 'Private room']), 'bedrooms'].fillna(0)
lis_df.loc[lis_df['room_type'] == 'Entire home/apt', 'bedrooms'] = \
    lis_df.loc[lis_df['room_type'] == 'Entire home/apt', 'bedrooms'].transform(lambda x: x.fillna(x.median()))
lis_df['bedrooms'] = lis_df['bedrooms'].fillna(0)  # Any remaining missing to 0

# Beds - ensure at least 1 bed per listing based on accommodates
lis_df['beds'] = lis_df['beds'].fillna(lis_df['accommodates'].clip(lower=1))


# Price imputation
# If price is missing, use the median price for that room type
# lis_df['price_DKK'] = lis_df.groupby('room_type')['price_DKK'].transform(lambda x: x.fillna(x.median()))

# If any prices are still missing (very rare case), use overall median
# lis_df['price_DKK'] = lis_df['price_DKK'].fillna(lis_df['price_DKK'].median())
# lis_df['price_DKK'].isna().sum()


# Numeric imputation - ensure all numeric fields have appropriate values for PostgreSQL numeric types
numeric_cols_to_impute = {
    'host_response_rate_pct': 0.0,  # Use 0 instead of mean for percentage
    'host_acceptance_rate_pct': 0.0,  # Use 0 instead of mean for percentage
    'review_scores_rating': 0,  # Use 0 instead of median for scores
    'review_scores_accuracy': 0,
    'review_scores_cleanliness': 0,
    'review_scores_checkin': 0,
    'review_scores_communication': 0,
    'review_scores_location': 0,
    'review_scores_value': 0,
    'reviews_per_month': 0,
    'host_listings_count': 1,
    'host_total_listings_count': 1
}

for col, value in numeric_cols_to_impute.items():
    lis_df[col] = lis_df[col].fillna(value)

# Categorical imputation - use empty string for VARCHAR fields where appropriate
categorical_cols_to_impute = {
    'host_response_time': '',  # VARCHAR
    'host_neighbourhood': '',  # VARCHAR
    'bathrooms_text': '',  # VARCHAR
    'neighbourhood': 'mode',  # Keep mode for geographic consistency
    'neighbourhood_cleansed': 'mode',  # Keep mode for geographic consistency
    'host_location': 'Copenhagen, Denmark',  # Important location default
}

for col, strategy in categorical_cols_to_impute.items():
    if strategy == 'mode':
        lis_df[col] = lis_df[col].fillna(lis_df[col].mode()[0])
    else:
        lis_df[col] = lis_df[col].fillna(strategy)

# Text columns - use empty string for TEXT fields
text_cols_to_impute = ['host_about', 'neighborhood_overview', 'description']
for col in text_cols_to_impute:
    lis_df[col] = lis_df[col].fillna('')

# Date columns - use explicit PostgreSQL-compatible dates
current_date = pd.Timestamp.now().date()
lis_df['host_since'] = lis_df['host_since'].fillna(pd.Timestamp(pd.NaT))  # Use explicit default date
lis_df['first_review'] = lis_df['first_review'].fillna(pd.NaT)  # Keep as NULL for no reviews
lis_df['last_review'] = lis_df['last_review'].fillna(pd.NaT)  # Keep as NULL for no reviews

# Boolean columns - ensure True/False (not NULL)
boolean_cols = ['instant_bookable', 'host_is_superhost', 'host_has_profile_pic', 
                'host_identity_verified', 'has_availability']
for col in boolean_cols:
    lis_df[col] = lis_df[col].fillna(False)  # Default to False for missing booleans

#### Imputation for Calendar

In [5]:
# Price imputation for calendar
# cal_df['price_USD'] = cal_df.groupby(['listing_id', 'available'])['price_USD'].transform(lambda x: x.fillna(x.median()))

# Todo: Maybe 0 is not as good as median for default?
# cal_df['price_USD'] = cal_df['price_USD'].fillna(0)  # Default to 0 for any remaining NULL prices

# Handle minimum and maximum nights
# cal_df['minimum_nights'] = cal_df.groupby('listing_id')['minimum_nights'].transform(lambda x: x.fillna(x.median()))
# cal_df['maximum_nights'] = cal_df.groupby('listing_id')['maximum_nights'].transform(lambda x: x.fillna(x.median()))
# Default to 1 and 365 if still missing
# cal_df['minimum_nights'] = cal_df['minimum_nights'].fillna(1)
# cal_df['maximum_nights'] = cal_df['maximum_nights'].fillna(365)

#### Imputation for Reviews

In [6]:
# Handle missing comments
rev_df['comments'] = rev_df['comments'].fillna('')  # Empty string for TEXT field

In [7]:
# After all imputations, check which columns still have nulls
null_check = lis_df.isnull().sum()
print("\nColumns with remaining NULL values:")
print(null_check[null_check > 0])

# Fix remaining nulls based on their data types
for col in lis_df.columns[lis_df.isnull().any()]:
    dtype = lis_df[col].dtype
    
    if np.issubdtype(dtype, np.number):  # Numeric columns
        lis_df[col] = lis_df[col].fillna(0)
    elif dtype == 'datetime64[ns]':  # DateTime columns
        lis_df[col] = lis_df[col].fillna(pd.NaT)
    elif dtype == 'bool':  # Boolean columns
        lis_df[col] = lis_df[col].fillna(False)
    else:  # String/object columns
        lis_df[col] = lis_df[col].fillna('')


Columns with remaining NULL values:
host_since                     1
first_review                3218
last_review                 3218
host_verifications_count       1
dtype: int64


In [8]:
# Verify imputation
print("\nVerifying no NULL values remain:")
print("\nListings nulls:", lis_df.isnull().sum().sum())
print("Reviews nulls:", rev_df.isnull().sum().sum())


Verifying no NULL values remain:

Listings nulls: 6437
Reviews nulls: 0


### 8. Feature Engineering

In [9]:
# Store original columns
original_columns = lis_df.columns.copy()

In [10]:
lis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20905 entries, 0 to 20904
Data columns (total 66 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   id                                            20905 non-null  int64         
 1   listing_url                                   20905 non-null  string        
 2   last_scraped                                  20905 non-null  datetime64[ns]
 3   source                                        20905 non-null  string        
 4   name                                          20905 non-null  string        
 5   description                                   20905 non-null  string        
 6   neighborhood_overview                         20905 non-null  string        
 7   host_id                                       20905 non-null  int64         
 8   host_since                                    20904 non-null  date

In [11]:
# Listing Quality Indicators
# lis_df['is_superhost'] = lis_df['host_is_superhost']  # Already boolean
lis_df['total_reviews'] = lis_df['number_of_reviews'] # fix this
lis_df['avg_rating'] = lis_df['review_scores_rating'] #TODO 
# lis_df['review_frequency'] = lis_df['reviews_per_month']


In [12]:
# Location Features
COPENHAGEN_CENTER_LAT = 55.6761
COPENHAGEN_CENTER_LON = 12.5683

# Distance to center in birdeye view
lis_df['distance_to_center_km'] = np.sqrt(
    (lis_df['latitude'] - COPENHAGEN_CENTER_LAT)**2 + 
    (lis_df['longitude'] - COPENHAGEN_CENTER_LON)**2
) * 111  # Rough conversion to kilometers

# Neighborhood density
lis_df['listings_in_neighborhood'] = lis_df.groupby('neighbourhood_cleansed')['id'].transform('count')

In [13]:
# Price Features
# Neighborhood price comparison
lis_df['neighborhood_avg_price'] = lis_df.groupby('neighbourhood_cleansed')['price_DKK'].transform('mean')
lis_df['price_vs_neighborhood'] = lis_df['price_DKK'] / lis_df['neighborhood_avg_price']

# Room type price comparison
lis_df['room_type_avg_price'] = lis_df.groupby('room_type')['price_DKK'].transform('mean')
lis_df['price_vs_room_type'] = lis_df['price_DKK'] / lis_df['room_type_avg_price']

# Value indicators
lis_df['price_per_person'] = lis_df['price_DKK'] / lis_df['accommodates']
lis_df['price_per_bedroom'] = lis_df['price_DKK'].div(lis_df['bedrooms'].where(lis_df['bedrooms'] > 0, 1))

# Host

In [14]:
# Create boolean masks for different conditions
mask_diff_dates = lis_df['first_review'] != lis_df['last_review']
mask_same_dates = lis_df['first_review'] == lis_df['last_review']

# Initialize host_experience_years with zeros
# lis_df['host_experience_years'] = 0

# Calculate for different first and last review dates
lis_df.loc[mask_diff_dates, 'host_experience_years'] = (
    (lis_df.loc[mask_diff_dates, 'last_review'] - 
     lis_df.loc[mask_diff_dates, 'first_review']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
)

# Calculate for same first and last review date
lis_df.loc[mask_same_dates, 'host_experience_years'] = (
    (pd.Timestamp.now() - lis_df.loc[mask_same_dates, 'last_review']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
)

# Set host_experience_years to 0 where both review dates are missing
mask_no_reviews = lis_df['last_review'].isna() & lis_df['first_review'].isna()
lis_df.loc[mask_no_reviews, 'host_experience_years'] = 0


In [15]:
# Response Quality: ordinal rating of each possible response time category
# One hot encode: lis_df['host_response_time']

In [16]:
# How big of a scale does the host operate on relatively to the rest in this neighborhood
lis_df['host_listings_ratio'] = lis_df['host_total_listings_count'] / lis_df['listings_in_neighborhood']

In [17]:
# Avoid division by zero in yearly_review calculation
lis_df['yearly_review'] = lis_df['number_of_reviews'] / lis_df['host_experience_years'].replace(0, 1)
lis_df['yearly_review'] = lis_df['yearly_review'].fillna(0)  # No reviews = 0 reviews per year

# Calculate review score variance; consistency of the host's recieved reviews by guests
review_score_cols = [col for col in lis_df.columns if col.startswith('review_scores_')]
lis_df['review_score_variance'] = lis_df[review_score_cols].var(axis=1)

In [18]:
# Print summary of new features
print("\nNew Feature Summary:")
new_features = lis_df.columns.difference(original_columns)
for col in new_features:
    print(f"\n{col}:")
    print(lis_df[col].describe())

# Check for any issues in new features
print("\nChecking for issues in new features:")
print(lis_df[new_features].isnull().sum())


New Feature Summary:

avg_rating:
count    20905.000000
mean         4.088264
std          1.760024
min          0.000000
25%          4.590000
50%          4.860000
75%          5.000000
max          5.000000
Name: avg_rating, dtype: float64

distance_to_center_km:
count    20905.000000
mean         3.733406
std          2.014533
min          0.096576
25%          2.259348
50%          3.445544
75%          4.762564
max         13.204881
Name: distance_to_center_km, dtype: float64

host_experience_years:
count    20905.000000
mean         2.159031
std          2.661824
min          0.000000
25%          0.134155
50%          1.013005
75%          3.107461
max         13.859001
Name: host_experience_years, dtype: float64

host_listings_ratio:
count    20905.000000
mean         0.005180
std          0.033082
min          0.000268
25%          0.000334
50%          0.000558
75%          0.001115
max          0.802680
Name: host_listings_ratio, dtype: float64

listings_in_neighborhood:
c

### 9. Final Status Check

In [19]:
print("Listings shape:", lis_df.shape)
print("Reviews shape:", rev_df.shape)

Listings shape: (20905, 81)
Reviews shape: (366636, 5)


### 10. Save Processed Datasets

In [20]:
lis_df.to_parquet('data/processed/03_listings.parquet')
rev_df.to_parquet('data/processed/03_reviews.parquet')