# Data Cleaning

### 1. Setup and Data Import

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Load the three raw 2024 CSV files (paths are relative to the working directory).
lis_df = pd.read_csv('data/raw/listings2024.csv')
cal_df = pd.read_csv('data/raw/calendar2024.csv')
rev_df = pd.read_csv('data/raw/reviews2024.csv')

# Print each frame's (rows, columns) shape as a quick sanity check.
for label, frame in (('rev:', rev_df), ('lis:', lis_df), ('cal:', cal_df)):
    print(label, frame.shape)

### 2. Initial Column Cleanup

In [3]:
# Drop listing columns that hold no data at all (every value is missing).
all_null = lis_df.isna().all()
null_cols_lis = all_null[all_null].index.tolist()
lis_df = lis_df.drop(columns=null_cols_lis)

# Drop listing columns that are not needed downstream. Every name listed here
# must still exist at this point, otherwise drop() raises a KeyError.
irrelevant_lis_cols = [
    'scrape_id',
    'host_name',
    'picture_url',
    'host_url',
    'host_thumbnail_url',
    'host_picture_url',
    'host_listings_count',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'number_of_reviews_l30d',
    'number_of_reviews_ltm',
    'last_scraped',
    'source',
    'calendar_last_scraped',
    'host_about',
    'neighborhood_overview',
]
lis_df = lis_df.drop(columns=irrelevant_lis_cols)

# Drop every remaining column whose name contains one of these substrings.
# Note that 'host_total_listings_count' does not contain 'host_listings' and
# 'has_availability' does not contain 'availability_', so both are kept.
substrings_to_drop = ('host_listings', 'availability_', 'minimum_nights', 'maximum_nights')
lis_df = lis_df.drop(
    columns=[col for col in lis_df.columns if any(part in col for part in substrings_to_drop)]
)

# Reviewer names are not needed in the reviews table.
rev_df = rev_df.drop(columns=['reviewer_name'])

In [4]:
# Rename 'host_is_superhost' to 'superhost'. pop() removes the old column and the
# assignment appends the new one, so 'superhost' ends up as the last column.
lis_df['superhost'] = lis_df.pop('host_is_superhost')

#### Neighborhood Attribute Processing

In [None]:
# Map neighbourhood spellings found in the data to the names used from here on.
# The keys lack their Danish letters (presumably lost to an encoding issue --
# TODO confirm against the raw CSV); two hyphenated names are also shortened.
mapping = {
    'Nrrebro': 'Nørrebro',
    'sterbro': 'Østerbro',
    'Vanlse': 'Vanløse',
    'Brnshj-Husum': 'Brønshøj',
    'Vesterbro-Kongens Enghave': 'Vesterbro',
}
lis_df['neighbourhood_cleansed'] = lis_df['neighbourhood_cleansed'].replace(mapping)

# Where host_neighbourhood is missing or an empty string, fall back to the
# listing's neighbourhood_cleansed value.
mask = lis_df['host_neighbourhood'].isna() | lis_df['host_neighbourhood'].eq('')
lis_df.loc[mask, 'host_neighbourhood'] = lis_df.loc[mask, 'neighbourhood_cleansed']

#########################################################

# Host neighbourhoods treated as lying outside Copenhagen.
non_cph_neighborhoods = [
    '', 'Nyboder', 'Niagara',
    '6th Arrondissement', 'Södermalm',
    'Embajadores', 'Batignolles', 'Montmartre', 'Vesturbær', 'Passy',
    'Almagro', 'Clinton Hill', 'Jakkur Layout', 'Jardim das Bandeiras',
    'Williamsburg', 'Notting Hill', 'Belém', 'Campo Belo', 'El Madroñal',
    'Alphabet City', 'Buzovna', 'Bastille',
]

# Number of listings per non-Copenhagen host neighbourhood.
is_non_cph = lis_df['host_neighbourhood'].isin(non_cph_neighborhoods)
non_cph_counts = lis_df.loc[is_non_cph, 'host_neighbourhood'].value_counts()

# Of those, the neighbourhoods that occur fewer than 9 times.
small_non_cph = non_cph_counts.loc[non_cph_counts.lt(9)].index

# For listings in those rare neighbourhoods, overwrite host_neighbourhood with
# the listing's neighbourhood_cleansed value.
mask = lis_df['host_neighbourhood'].isin(small_non_cph)
lis_df.loc[mask, 'host_neighbourhood'] = lis_df.loc[mask, 'neighbourhood_cleansed']

# Remap Hackney to Nørrebro.
is_hackney = lis_df['host_neighbourhood'].eq('Hackney')
lis_df.loc[is_hackney, 'host_neighbourhood'] = 'Nørrebro'

# Only 'neighbourhood_cleansed' and 'host_neighbourhood' are kept; the original
# author describes 'neighbourhood_cleansed' as a simplified host_neighbourhood
# with fewer unique values.
lis_df = lis_df.drop(columns=['neighbourhood', 'host_location'])
print(lis_df['host_neighbourhood'].unique())

### 3. Data Type Conversions

In [6]:
# Helper functions
def convert_to_boolean(df, columns, true_value='t'):
    """Turn indicator columns into booleans by comparing them with ``true_value``.

    Args:
        df: DataFrame to modify; the listed columns are overwritten in place.
        columns: Iterable of column names to convert.
        true_value: The value that maps to True. Anything else, including
            missing values in object columns, compares unequal and becomes False.

    Returns:
        The same ``df`` object, to allow ``df = convert_to_boolean(df, ...)``.
    """
    for name in columns:
        df[name] = df[name].eq(true_value)
    return df

def convert_to_datetime(df, columns):
    """Parse the given columns with ``pd.to_datetime``.

    Args:
        df: DataFrame to modify; the listed columns are overwritten in place.
        columns: Iterable of column names to parse.

    Returns:
        The same ``df`` object.
    """
    for name in columns:
        df[name] = df[name].pipe(pd.to_datetime)
    return df

def convert_to_type(df, columns, dtype):
    """Cast the given columns to ``dtype`` via ``Series.astype``.

    Args:
        df: DataFrame to modify; the listed columns are overwritten in place.
        columns: Iterable of column names to cast.
        dtype: Any dtype specification accepted by ``Series.astype``.

    Returns:
        The same ``df`` object.
    """
    for name in columns:
        converted = df[name].astype(dtype)
        df[name] = converted
    return df

In [7]:
# Boolean conversions: columns compared against 't' become True/False.
boolean_cols = [
    'instant_bookable',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
    'superhost',
]
lis_df = convert_to_boolean(lis_df, boolean_cols)
# cal_df['available'] = cal_df['available'] == 't'

# Datetime conversions
datetime_cols_lis = ['first_review', 'last_review', 'host_since']
lis_df = convert_to_datetime(lis_df, datetime_cols_lis)
# cal_df['date'] = pd.to_datetime(cal_df['date'])
rev_df['date'] = rev_df['date'].pipe(pd.to_datetime)

# String conversions: cast free-text and categorical columns to pandas' "string" dtype.
string_columns = [
    'bathrooms_text',
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'host_neighbourhood',
    'listing_url',
    'host_response_time',
    'name',
    'description',
]
lis_df = convert_to_type(lis_df, string_columns, "string")
rev_df = convert_to_type(rev_df, ['comments'], "string")

### 4. Handle Percentage Columns

In [8]:
# Convert percentage columns: strip the trailing '%', parse as float and divide
# by 100. Each column is renamed with a '_pct' suffix, but the stored values are
# fractions (e.g. '95%' becomes 0.95).
percentage_cols = ['host_response_rate', 'host_acceptance_rate'] # used later for response stats
for col in percentage_cols:
    pct_col = f"{col}_pct"
    lis_df = lis_df.rename(columns={col: pct_col})
    lis_df[pct_col] = lis_df[pct_col].str.rstrip('%').astype('float').div(100)

### 5. Process Array-Type Columns

In [9]:
# Process amenities
# Count the comma-separated entries of the bracketed list string.
# An empty list ("[]") strips down to "" and str.split(',') turns that into [''],
# which would be counted as one amenity, so those rows are forced to 0.
# Rows where amenities is missing keep a missing count.
amenities_inner = lis_df['amenities'].str.strip('[]')
lis_df['amenities_count'] = (
    amenities_inner.str.split(',').str.len()
    .mask(amenities_inner.str.strip().eq(''), 0)
)

def clean_amenity(text):
    """Normalise one raw amenity token into a lowercase ASCII label.

    The input is coerced with ``str()``, so non-string values (including None)
    are cleaned as their string representation.

    Args:
        text: A single amenity token, typically still wrapped in quotes.

    Returns:
        The cleaned, lower-cased label; may be an empty string.
    """
    import re

    # Peel off, in order: outer whitespace, wrapping quote characters, then any
    # leading/trailing dots, dashes and spaces.
    label = str(text).strip().strip('"\'').strip('.- ')

    # Drop every character that is not ASCII. Because of this, the Danish
    # letters whitelisted in the pattern below can never be present when it runs.
    label = label.encode('ascii', 'ignore').decode('ascii')

    substitutions = (
        (r'\s+', ' '),  # collapse whitespace runs into one space
        (r'[^a-zA-Z0-9\s\-.,:/+&æøåÆØÅ]', ''),  # keep letters, digits, whitespace and - . , : / + &
    )
    for pattern, replacement in substitutions:
        label = re.sub(pattern, replacement, label)

    return label.lower().strip()

# Clean and convert amenities to a comma-separated string of normalised labels.
# NOTE: the cleaned 'amenities' column is dropped again at the end of this cell,
# so only the count columns survive.
lis_df['amenities'] = lis_df['amenities'].str.strip('[]').str.split(',').apply(
    lambda x: ','.join(
        sorted(  # Sort for consistency
            filter(None,  # Remove empty strings
                [clean_amenity(item) for item in x]
            )
        )
    )
)

lis_df['amenities'] = lis_df['amenities'].astype('string') # Convert to string dtype

# Process host verifications. Count number of verifications per host.
# An empty list ("[]") strips down to "" and str.split(', ') turns that into [''],
# which would be counted as one verification, so those rows are forced to 0.
# Rows where host_verifications is missing keep a missing count.
verifications_inner = lis_df['host_verifications'].str.strip('[]')
lis_df['host_verifications_count'] = (
    verifications_inner.str.split(', ').str.len()
    .mask(verifications_inner.str.strip().eq(''), 0)
)

# The raw list columns are no longer needed once the counts exist.
lis_df.drop(columns=['host_verifications', 'amenities'], inplace=True)

### 6. Handle Missing Values (Imputation)

#### Function to analyze missing values

In [None]:
def analyze_missing_values(df, df_name):
    """Print a per-column report of missing values in ``df``.

    The report lists, for every column with at least one missing value, the
    number of missing entries and their share of all rows (in percent, rounded
    to two decimals), sorted by that share in descending order. It ends with
    the total row count.

    Args:
        df: DataFrame to inspect.
        df_name: Label used in the report heading.

    Returns:
        None; the report is written to stdout.
    """
    row_count = len(df)
    null_counts = df.isnull().sum()

    summary = pd.DataFrame({
        'Missing Values': null_counts,
        'Missing Percentage': ((null_counts / row_count) * 100).round(2),
    })
    # Keep only the columns that actually have gaps, worst first.
    summary = summary.loc[summary['Missing Values'] > 0].sort_values(
        'Missing Percentage', ascending=False
    )

    print(f"\nMissing Values Analysis for {df_name}:")
    print("-" * 50)
    print("No missing values found!" if summary.empty else summary)
    print(f"\nTotal rows in dataset: {row_count}")

# Print the missing-value report for the listings and the reviews tables.
for frame, label in ((lis_df, "Listings"), (rev_df, "Reviews")):
    analyze_missing_values(frame, label)

#### Price Imputation

In [11]:
# Fix price columns

# DKK: strip '$' and thousands separators from the listing price and parse it
# as a number; anything that still cannot be parsed becomes NaN.
lis_df = lis_df.rename(columns={'price': 'price_DKK'})
lis_df['price_DKK'] = pd.to_numeric(
    lis_df['price_DKK'].str.replace(r'[\$,]', '', regex=True),
    errors='coerce',
)

# Drop the 4 most expensive listings (the original author judged them to be
# non-genuine outliers).
expensive_indices = lis_df['price_DKK'].nlargest(4).index
lis_df = lis_df.drop(index=expensive_indices)

# USD: same parsing for the calendar price.
cal_df = cal_df.rename(columns={'price': 'price_USD'})
cal_df['price_USD'] = pd.to_numeric(
    cal_df['price_USD'].str.replace(r'[\$,]', '', regex=True),
    errors='coerce',
)

# Average calendar price per listing.
cal_prices = cal_df.groupby('listing_id')['price_USD'].mean().reset_index()
# Averages below 500 are multiplied by 5.9 -- presumably these are treated as
# USD amounts and converted to DKK, while larger averages are assumed to be in
# DKK already (TODO confirm the rate and the threshold).
below_threshold = cal_prices['price_USD'].lt(500)
cal_prices.loc[below_threshold, 'price_USD'] = cal_prices.loc[below_threshold, 'price_USD'] * 5.9

# Attach the averaged calendar price to each listing; listings without calendar
# rows get NaN. The duplicate key column from the right-hand side is dropped.
lis_df = lis_df.merge(
    cal_prices,
    how='left',
    left_on='id',
    right_on='listing_id',
).drop(columns='listing_id')

# Fill missing DKK prices with the calendar-derived price.
price_missing = lis_df['price_DKK'].isna()
lis_df.loc[price_missing, 'price_DKK'] = lis_df.loc[price_missing, 'price_USD']
# price_USD is dropped only when every listing now has a price_DKK; otherwise
# the helper column stays in the frame.
if lis_df['price_DKK'].notna().all():
    lis_df = lis_df.drop(columns='price_USD')

#### Listing Data Imputation

In [12]:
# Host fields: default value written into each column wherever it is missing.
host_cols_to_impute = {
    'host_acceptance_rate_pct': 0.0,   # no acceptance rate recorded -> 0.0
    'host_total_listings_count': 1,    # no listing count recorded -> 1
    'host_verifications_count': 0,     # no verification list -> 0
    'host_response_time': '',          # empty string; remapped to 'never' further down
    'host_since': pd.NaT,              # filling with NaT leaves missing dates missing
                                       # (used to calculate host_experience_years)
}

for col, fill_value in host_cols_to_impute.items():
    lis_df[col] = lis_df[col].fillna(fill_value)

In [13]:
# Review score fields: fill gaps in every 'review_scores_*' column with that
# column's own median.
review_score_cols = [name for name in lis_df.columns if name.startswith('review_scores_')]
for col in review_score_cols:
    lis_df[col] = lis_df[col].fillna(lis_df[col].median())

In [None]:
# Show the distinct bathrooms_text values before they are parsed into numbers.
lis_df['bathrooms_text'].unique()

In [None]:
# Handle half baths and NAs in bathrooms_text.
# The three half-bath labels carry no leading number, so write '0.5' for them
# before the numeric extraction below.
half_bath_mask = lis_df['bathrooms_text'].isin(['Half-bath', 'Shared half-bath', 'Private half-bath'])
lis_df.loc[half_bath_mask, 'bathrooms_text'] = '0.5'

# Pull the leading number (e.g. '1', '1.5') out of each description. The column
# is cast to plain str first; values without a leading number, including
# missing ones, yield NaN and are then set to 0.0.
leading_number = lis_df['bathrooms_text'].astype(str).str.extract(r'^(\d+\.?\d?)', expand=False)
lis_df['bathrooms_text'] = leading_number.astype(float).fillna(0.0)

# Where the numeric 'bathrooms' column is missing, use the parsed text value.
bathrooms_missing = lis_df['bathrooms'].isna()
lis_df.loc[bathrooms_missing, 'bathrooms'] = lis_df.loc[bathrooms_missing, 'bathrooms_text']
print(len(lis_df['bathrooms_text'].unique()), lis_df['bathrooms_text'].unique())

In [16]:
# Accommodation fields: default value written into each column wherever it is missing.
accommodation_cols_to_impute = {
    'room_type': pd.NA,          # filling with NA leaves missing values missing
    'property_type': pd.NA,      # same: stays missing
    'amenities_count': 0,        # no amenities string -> 0
    'accommodates': 1,           # unknown capacity -> 1
    'has_availability': False,   # unknown -> False
    'instant_bookable': False,   # unknown -> False
}

for col, fill_value in accommodation_cols_to_impute.items():
    lis_df[col] = lis_df[col].fillna(fill_value)

#### Response Time Imputation

In [None]:
# Check whether host_response_rate_pct is always 0 for hosts whose
# host_response_time is the empty string.
# Note: value_counts() leaves out NaN by default, whereas the "All zeros?" check
# counts a NaN rate as not equal to 0.
empty_response_time = lis_df.loc[lis_df['host_response_time'].eq('')]
rates_when_empty = empty_response_time['host_response_rate_pct']
print("\nHost response rates when response time is empty:")
print(rates_when_empty.value_counts())
print(f"\nAll zeros? {rates_when_empty.eq(0).all()}")

# Remap empty strings to 'never'
no_response_time = lis_df['host_response_time'].eq('')
lis_df.loc[no_response_time, 'host_response_time'] = 'never'

# Display the resulting set of response-time categories.
lis_df['host_response_time'].unique()

#### Review Data Imputation

The reviews data has no missing values that need to be filled, so no imputation is performed on it.

### 7. Save Processed Data

In [None]:
# Write the cleaned listings and reviews tables to parquet.
# Create the output folder first if it does not exist: to_parquet is not
# expected to create missing parent directories, so without this a fresh
# checkout would fail here, after all of the cleaning above has already run.
processed_dir = Path('data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

lis_df.to_parquet(processed_dir / '02_listings.parquet')
rev_df.to_parquet(processed_dir / '02_reviews.parquet')

# Final (rows, columns) shapes of what was saved.
print('rev:',rev_df.shape)
print('lis:',lis_df.shape)

 —————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————

### Reasoning for Currency Issue

In [19]:
#lis_df[['listing_url', 'price_DKK', 'price_USD']][:20]
# Check missing DKK prices where USD price < 500
# missing_dkk = lis_df[lis_df['price_DKK'].isna()]
# missing_dkk_high_usd = missing_dkk[missing_dkk['price_USD'] < 500]
# print(f"Out of {len(missing_dkk)} listings with missing DKK prices,")
# print(f"{len(missing_dkk_high_usd)} have USD prices < 500")

# Extract listing IDs from URLs and check their review dates
# missing_dkk_listings = lis_df.loc[missing_dkk_high_usd.index, ['listing_url', 'price_DKK', 'price_USD', 'first_review', 'last_review']]
# missing_dkk_listings = missing_dkk_listings.sort_values('first_review')

# print("First listing's last review:", missing_dkk_listings['first_review'].iloc[0])
# print("Last listing's last review:", missing_dkk_listings['last_review'].iloc[0])
# # missing_dkk_listings[['listing_url', 'price_DKK', 'price_USD']]

# lis_df.loc[missing_dkk_high_usd.index, ['listing_url', 'price_DKK', 'price_USD']]

# Compare price columns
# price_df = lis_df[lis_df.columns[lis_df.columns.str.startswith('price')]]

# # Count missing values
# missing_both = price_df[['price_DKK', 'price_USD']].isna().all(axis=1).sum()
# missing_dkk_only = price_df['price_DKK'].isna().sum() - missing_both
# missing_usd_only = price_df['price_USD'].isna().sum() - missing_both

# # Count same/different prices (excluding missing)
# valid_prices = price_df.dropna()
# same_prices = (valid_prices['price_DKK'] == valid_prices['price_USD']).sum()
# diff_prices = len(valid_prices) - same_prices

# print(f"Missing both prices: {missing_both}")
# print(f"Missing only DKK price: {missing_dkk_only}")
# print(f"Missing only USD price: {missing_usd_only}")

# print(f"Number of listings with same prices in DKK and USD: {same_prices}")
# print(f"Number of listings with different prices in DKK and USD: {diff_prices}")
# same_prices = (valid_prices['price_DKK'] == valid_prices['price_USD']).sum()
# diff_prices = len(valid_prices) - same_prices


# # Check how many rows with missing DKK prices have USD prices available
# missing_dkk = lis_df[lis_df['price_DKK'].isna()]
# dkk_missing_usd_available = missing_dkk['price_USD'].notna().sum()

# print(f"Out of {len(missing_dkk)} rows with missing DKK prices,")
# print(f"{dkk_missing_usd_available} have USD prices available")


# import matplotlib.pyplot as plt
# import numpy as np

# # Calculate 5th and 95th percentiles
# lower_bound = lis_df['price_USD'].quantile(0.05)
# upper_bound = lis_df['price_USD'].quantile(0.95)

# # Filter out values outside the 5th-95th percentile range
# clean_prices = lis_df['price_USD'][(lis_df['price_USD'] >= lower_bound) & 
#                                  (lis_df['price_USD'] <= upper_bound)]

# # Plot distribution of cleaned USD prices
# plt.figure(figsize=(10, 6))
# plt.hist(clean_prices, bins=50)
# plt.title('Distribution of USD Prices (5% Extreme Outliers Removed)')
# plt.xlabel('Price (USD)')
# plt.ylabel('Frequency')
# plt.show()

# # Print summary statistics
# print("\nSummary statistics for cleaned USD prices:")
# print(clean_prices.describe())
# print(f"\nRemoved {len(lis_df['price_USD'].dropna()) - len(clean_prices)} outliers")
