# Data Cleaning and Feature Engineering

### 1. Setup and Data Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three raw 2024 CSV extracts into DataFrames: reviews, listings, calendar.
rev_df = pd.read_csv('data/raw/reviews2024.csv')
lis_df = pd.read_csv('data/raw/listings2024.csv')
cal_df = pd.read_csv('data/raw/calendar2024.csv')

### 2. Initial Column Cleanup

In [3]:
# Listings: discard every column that holds no data at all.
all_null_mask = lis_df.isna().all()
null_cols_lis = lis_df.columns[all_null_mask].tolist()
lis_df = lis_df.drop(columns=null_cols_lis)

# Discard the columns that are not carried forward in any of the three frames.
# drop() raises KeyError if a listed column is absent, so this cell is meant to
# run once against freshly loaded data.
listing_cols_to_drop = [
    'scrape_id',
    'host_name',
    'picture_url',
    'host_url',
    'host_thumbnail_url',
    'host_picture_url',
]
lis_df = lis_df.drop(columns=listing_cols_to_drop)
cal_df = cal_df.drop(columns=['adjusted_price'])
rev_df = rev_df.drop(columns=['reviewer_name'])

### 3. Data Type Conversions

In [4]:
# Helper functions
def convert_to_boolean(df, columns, true_value='t'):
    """Overwrite each listed column with an equality test against ``true_value``.

    Args:
        df: DataFrame to modify; the listed columns are replaced on this object.
        columns: Iterable of column names to convert.
        true_value: Marker that becomes True. Every value that does not compare
            equal to it becomes False.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in columns:
        flags = df[name].eq(true_value)
        df[name] = flags
    return df

def convert_to_datetime(df, columns):
    """Parse each listed column with ``pd.to_datetime`` and store the result back.

    Args:
        df: DataFrame to modify; the listed columns are replaced on this object.
        columns: Iterable of column names to parse.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in columns:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed
    return df

def convert_to_type(df, columns, dtype):
    """Cast each listed column to ``dtype`` via ``Series.astype``.

    Args:
        df: DataFrame to modify; the listed columns are replaced on this object.
        columns: Iterable of column names to cast.
        dtype: Target dtype, in any form ``Series.astype`` accepts.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in columns:
        casted = df[name].astype(dtype)
        df[name] = casted
    return df

### 4. Apply Type Conversions

In [5]:
# Flag columns: True where the raw value equals 't', False otherwise.
boolean_cols = [
    'instant_bookable',
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
]
lis_df = convert_to_boolean(lis_df, boolean_cols)
cal_df = convert_to_boolean(cal_df, ['available'])

# Date columns: parse to datetime in all three frames.
datetime_cols_lis = [
    'calendar_last_scraped',
    'first_review',
    'last_review',
    'last_scraped',
    'host_since',
]
lis_df = convert_to_datetime(lis_df, datetime_cols_lis)
cal_df = convert_to_datetime(cal_df, ['date'])
rev_df = convert_to_datetime(rev_df, ['date'])

# Free-text and label columns: cast to the pandas "string" dtype.
string_columns = [
    'bathrooms_text',
    'neighbourhood',
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'host_location',
    'host_about',
    'host_neighbourhood',
    'listing_url',
    'host_response_time',
    'source',
    'name',
    'description',
    'neighborhood_overview',
]
lis_df = convert_to_type(lis_df, string_columns, "string")
rev_df = convert_to_type(rev_df, ['comments'], "string")

### 5. Handle Percentage and Currency Columns

In [6]:
# Percentage columns: rename with a "_pct" suffix, strip the trailing '%'
# and divide by 100 so the stored value is a fraction.
percentage_cols = ['host_response_rate', 'host_acceptance_rate']
for col in percentage_cols:
    pct_name = f"{col}_pct"
    lis_df = lis_df.rename(columns={col: pct_name})
    pct_text = lis_df[pct_name].str.rstrip('%')
    lis_df[pct_name] = pct_text.astype('float') / 100

# Price columns: remove '$' and thousands separators, then convert to numbers.
# Values that still fail to parse become NaN (errors='coerce').
# NOTE(review): both raw columns are cleaned by stripping '$', yet the listings
# price is labelled DKK and the calendar price USD — confirm the actual currencies.

# Listings -> price_DKK
lis_df = lis_df.rename(columns={'price': 'price_DKK'})
listing_price_text = lis_df['price_DKK'].str.replace(r'[\$,]', '', regex=True)
lis_df['price_DKK'] = pd.to_numeric(listing_price_text, errors='coerce')

# Calendar -> price_USD
cal_df = cal_df.rename(columns={'price': 'price_USD'})
calendar_price_text = cal_df['price_USD'].str.replace(r'[\$,]', '', regex=True)
cal_df['price_USD'] = pd.to_numeric(calendar_price_text, errors='coerce')

### 6. Process Array-Type Columns

In [7]:
# Process amenities
# Count the comma-separated entries inside the bracketed amenities string.
# An empty list ("[]") strips down to "", which split(',') turns into [''];
# that would be counted as one amenity, so those rows are set to 0 explicitly.
# Missing values stay missing.
# NOTE(review): an entry that itself contains a comma is still counted more
# than once by this split — confirm whether the raw data has such entries.
amenities_inner = lis_df['amenities'].str.strip('[]').str.strip()
lis_df['amenities_count'] = (
    amenities_inner.str.split(',').str.len().mask(amenities_inner == '', 0)
)

def clean_amenity(text):
    """Clean individual amenity strings"""
    import re
    text = str(text) # Convert to string if not already
    text = text.strip().strip('"\'').strip('.- ') # Basic cleaning
    text = text.encode('ascii', 'ignore').decode('ascii') # Replace unicode escape sequences with their characters
    text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with single space
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Remove special characters, keeping only alphanumeric and spaces
    text = text.lower().strip() # Convert to lowercase, strip again, and remove any remaining leading/trailing spaces
    return text

# Clean and convert amenities to comma-separated string
def _normalize_amenity_list(items):
    """Return the cleaned, non-empty amenity labels of one listing, sorted and comma-joined.

    ``items`` is the list produced by splitting the raw amenities string. A
    missing value arrives here as a non-list (NaN); it is passed through as
    ``pd.NA`` instead of raising ``TypeError`` when iterated.
    """
    if not isinstance(items, list):
        return pd.NA
    cleaned = (clean_amenity(item) for item in items)
    # Labels that clean down to '' are dropped; sorting keeps the output stable.
    return ','.join(sorted(label for label in cleaned if label))

lis_df['amenities'] = (
    lis_df['amenities'].str.strip('[]').str.split(',').apply(_normalize_amenity_list)
)

lis_df['amenities'] = lis_df['amenities'].astype('string')  # Convert to string dtype




# Process host verifications
# Strip the surrounding brackets, remove the single quotes and split on ', '
# to get one list of verification names per listing.
verifications_inner = lis_df['host_verifications'].str.strip('[]')
verification_lists = verifications_inner.str.replace("'", "").str.split(', ')

# Count number of verifications per host.
# An empty list ("[]") splits to [''] and would be counted as 1, so those rows
# are set to 0 explicitly. Missing values stay missing.
lis_df['host_verifications_count'] = (
    verification_lists.str.len().mask(verifications_inner == '', 0)
)
lis_df['host_verifications'] = verification_lists

# Create one-hot encoded columns, one "verification_<name>" column per distinct name.
verification_dummies = lis_df['host_verifications'].str.join('|').str.get_dummies()
verification_dummies = verification_dummies.add_prefix('verification_')
lis_df = pd.concat([lis_df, verification_dummies], axis=1)

# The raw list-like columns are dropped once the derived columns exist.
lis_df = lis_df.drop(columns=['host_verifications', 'amenities'])

In [8]:
# Checkpoint: write the type-converted frames to parquet.
stage_02_outputs = (
    (lis_df, 'data/processed/02_listings.parquet'),
    (cal_df, 'data/processed/02_calendar.parquet'),
    (rev_df, 'data/processed/02_reviews.parquet'),
)
for frame, target_path in stage_02_outputs:
    frame.to_parquet(target_path)

### 7. Handle Missing Values

#### Imputation for Listings

In [9]:
def _fill_with_own_median(values):
    """Return ``values`` with its missing entries replaced by its own median."""
    return values.fillna(values.median())


# Physical characteristics imputation - revised approach

# Bathrooms: median within the listing's room_type, then 1.0 for anything still missing.
lis_df['bathrooms'] = lis_df.groupby('room_type')['bathrooms'].transform(_fill_with_own_median)
lis_df['bathrooms'] = lis_df['bathrooms'].fillna(1.0)

# Bedrooms: missing values become 0 for 'Shared room' and 'Private room' listings,
# the median of entire homes for 'Entire home/apt', and 0 for whatever is left.
room_only_mask = lis_df['room_type'].isin(['Shared room', 'Private room'])
lis_df.loc[room_only_mask, 'bedrooms'] = lis_df.loc[room_only_mask, 'bedrooms'].fillna(0)

entire_home_mask = lis_df['room_type'] == 'Entire home/apt'
lis_df.loc[entire_home_mask, 'bedrooms'] = _fill_with_own_median(
    lis_df.loc[entire_home_mask, 'bedrooms']
)
lis_df['bedrooms'] = lis_df['bedrooms'].fillna(0)

# Beds: a missing bed count falls back to `accommodates`, floored at 1.
beds_fallback = lis_df['accommodates'].clip(lower=1)
lis_df['beds'] = lis_df['beds'].fillna(beds_fallback)


# Price imputation
# Missing prices take the median of the same room_type ...
lis_df['price_DKK'] = lis_df.groupby('room_type')['price_DKK'].transform(_fill_with_own_median)

# ... and the overall median if that still leaves gaps.
overall_median_price = lis_df['price_DKK'].median()
lis_df['price_DKK'] = lis_df['price_DKK'].fillna(overall_median_price)



# Numeric imputation - every listed column gets a constant in place of missing
# values so the numeric fields are fully populated for PostgreSQL numeric types.
numeric_cols_to_impute = {
    # Rates: 0.0 rather than the mean
    **dict.fromkeys(['host_response_rate_pct', 'host_acceptance_rate_pct'], 0.0),
    # Review scores and review frequency: 0 rather than the median
    **dict.fromkeys(
        [
            'review_scores_rating',
            'review_scores_accuracy',
            'review_scores_cleanliness',
            'review_scores_checkin',
            'review_scores_communication',
            'review_scores_location',
            'review_scores_value',
            'reviews_per_month',
        ],
        0,
    ),
    # Host listing counts: 1
    **dict.fromkeys(['host_listings_count', 'host_total_listings_count'], 1),
}

for col, fill_value in numeric_cols_to_impute.items():
    lis_df[col] = lis_df[col].fillna(fill_value)

# Categorical imputation. Each entry maps a column to its replacement value;
# the sentinel 'mode' means "use the column's most frequent value" instead.
categorical_cols_to_impute = {
    'host_response_time': '',  # VARCHAR -> empty string
    'host_neighbourhood': '',  # VARCHAR -> empty string
    'bathrooms_text': '',  # VARCHAR -> empty string
    'neighbourhood': 'mode',  # most frequent value, for geographic consistency
    'neighbourhood_cleansed': 'mode',  # most frequent value, for geographic consistency
    'host_location': 'Copenhagen, Denmark',  # fixed location default
}

for col, strategy in categorical_cols_to_impute.items():
    replacement = lis_df[col].mode()[0] if strategy == 'mode' else strategy
    lis_df[col] = lis_df[col].fillna(replacement)

# Long free-text columns (TEXT fields): missing becomes an empty string.
text_cols_to_impute = ['host_about', 'neighborhood_overview', 'description']
for col in text_cols_to_impute:
    lis_df[col] = lis_df[col].fillna('')

# Date columns - use explicit PostgreSQL-compatible dates
current_date = pd.Timestamp.now().date()  # not referenced again in this cell

# A missing host_since gets an explicit placeholder date.
host_since_placeholder = pd.Timestamp('2000-01-01')
lis_df['host_since'] = lis_df['host_since'].fillna(host_since_placeholder)

# Review dates are meant to stay NULL for listings without reviews:
# filling with NaT leaves the missing entries missing.
for review_date_col in ('first_review', 'last_review'):
    lis_df[review_date_col] = lis_df[review_date_col].fillna(pd.NaT)

# Boolean columns - any missing flag defaults to False so only True/False remain.
boolean_cols = [
    'instant_bookable',
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
]
for col in boolean_cols:
    lis_df[col] = lis_df[col].fillna(False)

#### Imputation for Calendar

In [10]:
def _median_filled(values):
    """Return ``values`` with its missing entries replaced by its own median."""
    return values.fillna(values.median())


# Calendar price: median within the same (listing, availability) group,
# then 0 for any price that is still missing.
price_groups = cal_df.groupby(['listing_id', 'available'])['price_USD']
cal_df['price_USD'] = price_groups.transform(_median_filled)
cal_df['price_USD'] = cal_df['price_USD'].fillna(0)

# Minimum / maximum nights: median within the listing, then a fixed default
# (1 night minimum, 365 nights maximum) if still missing.
for nights_col, default_nights in (('minimum_nights', 1), ('maximum_nights', 365)):
    cal_df[nights_col] = cal_df.groupby('listing_id')[nights_col].transform(_median_filled)
    cal_df[nights_col] = cal_df[nights_col].fillna(default_nights)

#### Imputation for Reviews

In [11]:
# Missing review dates (none expected) get an explicit placeholder date.
review_date_placeholder = pd.Timestamp('2000-01-01')
rev_df['date'] = rev_df['date'].fillna(review_date_placeholder)

# Missing review comments become an empty string (TEXT field).
rev_df['comments'] = rev_df['comments'].fillna('')

In [12]:
# After all imputations, check which columns still have nulls
null_check = lis_df.isnull().sum()
print("\nColumns with remaining NULL values:")
print(null_check[null_check > 0])

# Fix remaining nulls based on their data types.
# The dtype tests use pandas.api.types instead of np.issubdtype / string
# comparison: np.issubdtype raises TypeError for pandas extension dtypes such as
# the "string" columns created earlier, so a leftover null in one of those
# columns would have aborted the loop.
# NOTE(review): this also replaces missing first_review / last_review values
# (left as NaT by the listings imputation cell) with the placeholder date —
# confirm that is intended.
for col in lis_df.columns[lis_df.isnull().any()]:
    column = lis_df[col]

    if pd.api.types.is_bool_dtype(column):  # Boolean columns (checked first: bool also counts as numeric)
        lis_df[col] = column.fillna(False)
    elif pd.api.types.is_numeric_dtype(column):  # Numeric columns
        lis_df[col] = column.fillna(0)
    elif pd.api.types.is_datetime64_any_dtype(column):  # DateTime columns
        lis_df[col] = column.fillna(pd.Timestamp('2000-01-01'))
    else:  # String/object columns
        lis_df[col] = column.fillna('')


Columns with remaining NULL values:
first_review                3220
last_review                 3220
host_verifications_count       1
dtype: int64


In [16]:
# Report the total number of missing cells left in each frame.
print("\nVerifying no NULL values remain:")
frames_to_check = {
    "\nListings nulls:": lis_df,
    "Calendar nulls:": cal_df,
    "Reviews nulls:": rev_df,
}
for label, frame in frames_to_check.items():
    print(label, frame.isnull().sum().sum())


Verifying no NULL values remain:

Listings nulls: 0
Calendar nulls: 0
Reviews nulls: 0


### 8. Feature Engineering

In [17]:
# Store original columns: a copy of the listings column index taken before the
# engineered features are added. The feature-summary cell diffs the final
# columns against this snapshot, so this cell has to run before it.
original_columns = lis_df.columns.copy()

In [27]:
# Print the listings frame summary: index range, column names, non-null counts and dtypes.
lis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20909 entries, 0 to 20908
Data columns (total 89 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   id                                            20909 non-null  int64         
 1   listing_url                                   20909 non-null  string        
 2   last_scraped                                  20909 non-null  datetime64[ns]
 3   source                                        20909 non-null  string        
 4   name                                          20909 non-null  string        
 5   description                                   20909 non-null  string        
 6   neighborhood_overview                         20909 non-null  string        
 7   host_id                                       20909 non-null  int64         
 8   host_since                                    20909 non-null  date

In [15]:
# Listing quality indicators: straight copies of existing columns under shorter names.
quality_aliases = {
    'is_superhost': 'host_is_superhost',  # already boolean
    'total_reviews': 'number_of_reviews',
    'avg_rating': 'review_scores_rating',
}
for alias, source_col in quality_aliases.items():
    lis_df[alias] = lis_df[source_col]
# Not created (left disabled): lis_df['review_frequency'] = lis_df['reviews_per_month']

# Availability and demand: occupancy is the complement of the share of days
# still available within each look-ahead window.
occupancy_windows = {
    'occupancy_rate_next_30days': ('availability_30', 30),
    'occupancy_rate_next_60days': ('availability_60', 60),
    'occupancy_rate_next_90days': ('availability_90', 90),
    'occupancy_rate_next_year': ('availability_365', 365),
}
for occupancy_col, (availability_col, window_days) in occupancy_windows.items():
    lis_df[occupancy_col] = 1 - (lis_df[availability_col] / window_days)

# Show the first 20 rows of the four availability columns.
lis_df[['availability_30', 'availability_60', 'availability_90', 'availability_365']].iloc[:20]

Unnamed: 0,availability_30,availability_60,availability_90,availability_365
0,0,0,0,0
1,0,0,0,0
2,0,4,32,32
3,18,22,22,22
4,10,17,45,45
5,0,0,0,0
6,0,0,0,0
7,0,0,0,266
8,1,1,1,1
9,0,7,20,20


In [16]:
# Location Features
COPENHAGEN_CENTER_LAT = 55.6761
COPENHAGEN_CENTER_LON = 12.5683
KM_PER_DEGREE = 111  # rough length of one degree of latitude, in km

# Straight-line ("bird's-eye") distance to the city centre, using a flat-earth
# (equirectangular) approximation. A degree of longitude is shorter than a
# degree of latitude by a factor of cos(latitude), so the east-west offset is
# scaled by that factor before the two offsets are combined. Treating both
# axes as 111 km per degree overstates east-west distances at this latitude.
lat_offset_km = (lis_df['latitude'] - COPENHAGEN_CENTER_LAT) * KM_PER_DEGREE
lon_offset_km = (
    (lis_df['longitude'] - COPENHAGEN_CENTER_LON)
    * KM_PER_DEGREE
    * np.cos(np.radians(COPENHAGEN_CENTER_LAT))
)
lis_df['distance_to_center_km'] = np.sqrt(lat_offset_km**2 + lon_offset_km**2)

# Neighborhood density: number of listings sharing the same neighbourhood_cleansed.
lis_df['listings_in_neighborhood'] = lis_df.groupby('neighbourhood_cleansed')['id'].transform('count')

In [22]:
# Price Features
# Neighborhood price comparison: listing price relative to the mean price of its neighbourhood.
lis_df['neighborhood_avg_price'] = lis_df.groupby('neighbourhood_cleansed')['price_DKK'].transform('mean')
lis_df['price_vs_neighborhood'] = lis_df['price_DKK'] / lis_df['neighborhood_avg_price']

# Room type price comparison: listing price relative to the mean price of its room type.
lis_df['room_type_avg_price'] = lis_df.groupby('room_type')['price_DKK'].transform('mean')
lis_df['price_vs_room_type'] = lis_df['price_DKK'] / lis_df['room_type_avg_price']

# Value indicators
# Both denominators are floored at 1 so a zero count cannot produce an infinite
# value: `accommodates` is clipped the same way the beds imputation clips it,
# and listings with 0 bedrooms are treated as having one.
lis_df['price_per_person'] = lis_df['price_DKK'] / lis_df['accommodates'].clip(lower=1)
lis_df['price_per_bedroom'] = lis_df['price_DKK'].div(lis_df['bedrooms'].where(lis_df['bedrooms'] > 0, 1))

In [23]:
# Host Features
# Host tenure in years, measured up to the date the listing was scraped rather
# than up to "now": anchoring on last_scraped keeps the feature identical no
# matter when the notebook is re-run.
SECONDS_PER_YEAR = 365.25 * 24 * 60 * 60
host_tenure = lis_df['last_scraped'] - lis_df['host_since']
lis_df['host_experience_years'] = host_tenure.dt.total_seconds() / SECONDS_PER_YEAR

# How big of a scale does the host operate on relatively to the rest in this neighborhood
lis_df['host_listings_ratio'] = lis_df['host_total_listings_count'] / lis_df['listings_in_neighborhood']

# Response Quality
# One hot encode: lis_df['host_response_time']

In [38]:
# Review Features
# Days between the last review and the date the listing was scraped. Anchoring
# on last_scraped instead of the current time keeps the feature identical no
# matter when the notebook is re-run.
SECONDS_PER_DAY = 24 * 60 * 60
since_last_review = lis_df['last_scraped'] - lis_df['last_review']
lis_df['days_since_last_review'] = since_last_review.dt.total_seconds() / SECONDS_PER_DAY

# Reviews received per year of host tenure.
lis_df['yearly_review'] = lis_df['number_of_reviews'] / lis_df['host_experience_years']

##### CHECKPOINT ELIAS #####


# Calculate review score variance: row-wise variance across every column whose
# name starts with 'review_scores_'.
review_score_cols = [col for col in lis_df.columns if col.startswith('review_scores_')]
lis_df['review_score_variance'] = lis_df[review_score_cols].var(axis=1)

In [25]:
# Calendar Features
cal_df['date'] = pd.to_datetime(cal_df['date'])
calendar_dates = cal_df['date'].dt

# Weekend flag: dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
cal_df['is_weekend'] = calendar_dates.dayofweek >= 5
# "Holiday" flag: any date in June, July, August or December.
cal_df['is_holiday'] = calendar_dates.month.isin([6, 7, 8, 12])

# Aggregate to listing level. Taking the mean of a boolean column gives the
# share of calendar days for which the flag is True.
per_listing_aggregations = {
    'price_USD': ['mean', 'std'],
    'is_weekend': 'mean',  # share of weekend days
    'is_holiday': 'mean',  # share of holiday-season days
    'available': 'mean',   # share of available days
}
calendar_by_listing = cal_df.groupby('listing_id')
calendar_features = calendar_by_listing.agg(per_listing_aggregations).reset_index()

In [26]:
# Describe every column that was added since `original_columns` was captured.
# Requires the `original_columns` snapshot to exist in the kernel.
print("\nNew Feature Summary:")
new_features = lis_df.columns.difference(original_columns)
for col in new_features:
    print(f"\n{col}:", lis_df[col].describe(), sep="\n")

# Count missing values per new feature to spot problems.
print("\nChecking for issues in new features:")
missing_per_feature = lis_df[new_features].isnull().sum()
print(missing_per_feature)


New Feature Summary:


NameError: name 'original_columns' is not defined

### 9. Final Status Check

In [25]:
# Print the (rows, columns) shape of each frame before saving.
frame_shapes = {
    "Listings shape:": lis_df.shape,
    "Calendar shape:": cal_df.shape,
    "Reviews shape:": rev_df.shape,
}
for label, shape in frame_shapes.items():
    print(label, shape)

Listings shape: (20909, 89)
Calendar shape: (7631731, 8)
Reviews shape: (366636, 5)


### 10. Save Processed Datasets

In [26]:
# Write the cleaned, feature-enriched frames to parquet.
stage_03_outputs = (
    (lis_df, 'data/processed/03_listings_cleaned.parquet'),
    (cal_df, 'data/processed/03_calendar_cleaned.parquet'),
    (rev_df, 'data/processed/03_reviews_cleaned.parquet'),
)
for frame, target_path in stage_03_outputs:
    frame.to_parquet(target_path)