# Data Cleaning

### 1. Setup and Data Import

In [1]:
import json

import numpy as np
import pandas as pd

In [None]:
# Read the three raw 2024 CSV files (calendar, listings, reviews).
cal_df = pd.read_csv('data/raw/calendar2024.csv')
lis_df = pd.read_csv('data/raw/listings2024.csv')
rev_df = pd.read_csv('data/raw/reviews2024.csv')

# Report (rows, columns) of each frame as a quick sanity check.
for label, frame in (('rev:', rev_df), ('lis:', lis_df), ('cal:', cal_df)):
    print(label, frame.shape)

### 2. Initial Column Cleanup

In [3]:
# Remove listing columns that contain no data at all (every value missing).
null_cols_lis = lis_df.columns[lis_df.isna().all()].tolist()
lis_df = lis_df.drop(columns=null_cols_lis)

# Listing columns that are dropped as unnecessary. Every name is listed
# exactly once so the list is easy to audit.
unused_listing_cols = [
    'scrape_id',
    'host_name',
    'picture_url',
    'host_url',
    'host_thumbnail_url',
    'host_picture_url',
    'host_listings_count',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'number_of_reviews_l30d',
    'number_of_reviews_ltm',
]
# Rebinding (instead of inplace=True) keeps the step explicit; a missing
# column still raises KeyError so schema drift in the raw file is noticed.
lis_df = lis_df.drop(columns=unused_listing_cols)

# Calendar clean-up is currently disabled:
# cal_df.drop(columns=['adjusted_price'], inplace=True)
rev_df = rev_df.drop(columns=['reviewer_name'])

In [4]:
# Expose `host_is_superhost` under the shorter name `superhost`.
# pop() removes the old column and the assignment appends the new one at the
# end of the frame.
# NOTE(review): the values are left exactly as read from the CSV; `superhost`
# is not part of the boolean conversion list used later in this notebook —
# confirm whether that is intended.
lis_df['superhost'] = lis_df.pop('host_is_superhost')

### 3. Data Type Conversions

In [5]:
# Helper functions
def convert_to_boolean(df, columns, true_value='t'):
    """Turn indicator columns into booleans by comparing against `true_value`.

    Each listed column is overwritten in `df` with the element-wise result of
    `column == true_value`; every other value maps to False. For object-dtype
    columns this includes missing values, which compare unequal.

    Args:
        df: DataFrame to modify (mutated in place).
        columns: iterable of column names to convert.
        true_value: the value that represents True (default 't').

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in columns:
        is_true = df[name].eq(true_value)
        df[name] = is_true
    return df

def convert_to_datetime(df, columns):
    """Parse the listed columns with `pd.to_datetime`.

    Args:
        df: DataFrame to modify (mutated in place).
        columns: iterable of column names to parse.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in columns:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed
    return df

def convert_to_type(df, columns, dtype):
    """Cast the listed columns to `dtype` via `Series.astype`.

    Args:
        df: DataFrame to modify (mutated in place).
        columns: iterable of column names to cast.
        dtype: any dtype specification accepted by `Series.astype`.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in columns:
        converted = df[name].astype(dtype)
        df[name] = converted
    return df

### 4. Apply Type Conversions

In [None]:
# 't'-style flag columns -> booleans
boolean_cols = [
    'instant_bookable',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
]
lis_df = convert_to_boolean(lis_df, boolean_cols)
# Calendar conversion is currently disabled:
# cal_df['available'] = cal_df['available'] == 't'

# Date-like columns -> datetime
datetime_cols_lis = [
    'calendar_last_scraped',
    'first_review',
    'last_review',
    'last_scraped',
    'host_since',
]
lis_df = convert_to_datetime(lis_df, datetime_cols_lis)
# Calendar conversion is currently disabled:
# cal_df['date'] = pd.to_datetime(cal_df['date'])
rev_df = convert_to_datetime(rev_df, ['date'])

# Text columns -> pandas "string" dtype
string_columns = [
    'bathrooms_text',
    'neighbourhood',
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'host_location',
    'host_about',
    'host_neighbourhood',
    'listing_url',
    'host_response_time',
    'source',
    'name',
    'description',
    'neighborhood_overview',
]
lis_df = convert_to_type(lis_df, string_columns, "string")
rev_df = convert_to_type(rev_df, ['comments'], "string")

### 5. Handle Percentage and Currency Columns

In [8]:
# Percentage columns hold text ending in '%'. Rename them with a `_pct`
# suffix, then strip the sign and rescale to a fraction (value / 100).
percentage_cols = ['host_response_rate', 'host_acceptance_rate']
lis_df = lis_df.rename(columns={raw: f"{raw}_pct" for raw in percentage_cols})
for pct_col in [f"{raw}_pct" for raw in percentage_cols]:
    as_number = lis_df[pct_col].str.rstrip('%').astype('float')
    lis_df[pct_col] = as_number / 100

### 6. Process Array-Type Columns

In [9]:
# Process amenities: count the entries in each listing's amenity list.
def _count_amenities(raw):
    """Return the number of amenities encoded in one raw `amenities` cell.

    The cell is parsed as a JSON list so that an empty list counts as 0 and a
    comma inside an amenity name does not inflate the count (a plain split on
    ',' gets both cases wrong).

    Args:
        raw: one cell of the `amenities` column.

    Returns:
        The number of list entries, or NaN when the cell is not a string
        (i.e. missing), matching what `.str.len()` yields for missing cells.
    """
    if not isinstance(raw, str):
        return np.nan
    try:
        parsed = json.loads(raw)
    except ValueError:
        parsed = None
    if isinstance(parsed, list):
        return len(parsed)
    # Not a JSON list: fall back to counting non-empty comma-separated tokens.
    return sum(1 for token in raw.strip('[]').split(',') if token.strip())


lis_df['amenities_count'] = lis_df['amenities'].map(_count_amenities)

def clean_amenity(text):
    """Normalise a single amenity string.

    Steps, in order:
      1. Convert the input to `str`.
      2. Decode literal ``\\uXXXX`` escape sequences (backslash, 'u', four hex
         digits) into the characters they stand for.
      3. Trim surrounding whitespace, quotes and leading/trailing '.', '-', ' '.
      4. Remove every character that is not a letter (a-z, æ, ø, å in either
         case), a digit, whitespace or one of ``- . , : / + &``.
      5. Collapse runs of whitespace to a single space, lower-case and trim.

    Args:
        text: the raw amenity value; any object is accepted and stringified.

    Returns:
        The cleaned, lower-case amenity string (possibly empty).
    """
    import re

    text = str(text)  # Accept non-string input
    # Turn literal \uXXXX sequences into real characters. Without this the
    # backslash is stripped below and "u2019"-style residue stays in the text.
    text = re.sub(
        r'\\u([0-9a-fA-F]{4})',
        lambda match: chr(int(match.group(1), 16)),
        text,
    )
    text = text.strip().strip('"\'').strip('.- ')  # Basic trimming
    # Whitelist filter. Non-ASCII characters are NOT removed wholesale before
    # this point, so the æøå letters allowed here actually survive.
    text = re.sub(r'[^a-zA-Z0-9\s\-.,:/+&æøåÆØÅ]', '', text)
    # Collapse whitespace after the filter so that removing a symbol between
    # two spaces cannot leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Rewrite `amenities` as one comma-separated string of cleaned amenity names.
# NOTE(review): `amenities` is dropped again at the end of this cell, so the
# cleaned column does not survive the cell — confirm whether it should be kept.
def _normalise_amenity_tokens(raw_tokens):
    """Clean each token, discard empty results, sort, and join with ','."""
    cleaned = [clean_amenity(token) for token in raw_tokens]
    kept = [token for token in cleaned if token]  # drop empty strings
    kept.sort()  # stable order for consistency
    return ','.join(kept)


amenity_tokens = lis_df['amenities'].str.strip('[]').str.split(',')
lis_df['amenities'] = amenity_tokens.apply(_normalise_amenity_tokens).astype('string')

# Process host verifications: count the number of verifications per host.
def _count_verifications(raw):
    """Return how many entries one raw `host_verifications` cell lists.

    Only non-empty tokens are counted, so an empty list ('[]') yields 0
    instead of 1.

    Args:
        raw: one cell of the `host_verifications` column.

    Returns:
        The number of non-empty comma-separated entries, or NaN when the cell
        is not a string (i.e. missing), as `.str.len()` yields for missing cells.
    """
    if not isinstance(raw, str):
        return np.nan
    return sum(1 for token in raw.strip('[]').split(',') if token.strip())


lis_df['host_verifications_count'] = lis_df['host_verifications'].map(_count_verifications)

# The raw list columns are dropped once their counts exist.
lis_df = lis_df.drop(columns=['host_verifications', 'amenities'])

### 7. Handle Missing Values (Imputation)

In [21]:
# Print each column's dtype and non-null count for the listings frame.
lis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20905 entries, 0 to 20904
Data columns (total 59 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   id                           20905 non-null  int64         
 1   listing_url                  20905 non-null  string        
 2   last_scraped                 20905 non-null  datetime64[ns]
 3   source                       20905 non-null  string        
 4   name                         20905 non-null  string        
 5   description                  20232 non-null  string        
 6   neighborhood_overview        8984 non-null   string        
 7   host_id                      20905 non-null  int64         
 8   host_since                   20904 non-null  datetime64[ns]
 9   host_location                20905 non-null  string        
 10  host_about                   8863 non-null   string        
 11  host_response_time           20905 non-nu

In [20]:
# Function to analyze missing values
def analyze_missing_values(df, df_name):
    """Print a per-column summary of missing values in `df`.

    Only columns with at least one missing value are listed, ordered by the
    percentage of missing rows (highest first, rounded to two decimals).

    Args:
        df: DataFrame to inspect.
        df_name: label used in the printed heading.

    Returns:
        None. The report is written to stdout.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df) * 100).round(2)

    summary = pd.DataFrame({
        'Missing Values': null_counts,
        'Missing Percentage': null_share,
    })
    # Keep only the columns that actually have gaps, worst first.
    has_gaps = summary['Missing Values'] > 0
    summary = summary.loc[has_gaps].sort_values('Missing Percentage', ascending=False)

    print(f"\nMissing Values Analysis for {df_name}:")
    print("-" * 50)
    print("No missing values found!" if summary.empty else summary)
    print(f"\nTotal rows in dataset: {len(df)}")

# Report missing values for the listings frame, then the reviews frame.
for frame, label in ((lis_df, "Listings"), (rev_df, "Reviews")):
    analyze_missing_values(frame, label)


Missing Values Analysis for Listings:
--------------------------------------------------
                             Missing Values  Missing Percentage
host_neighbourhood                    15489               74.09
host_about                            12042               57.60
neighborhood_overview                 11921               57.02
bathrooms                              7249               34.68
beds                                   7249               34.68
host_response_rate_pct                 6466               30.93
review_scores_communication            3242               15.51
review_scores_value                    3243               15.51
review_scores_location                 3243               15.51
review_scores_accuracy                 3242               15.51
review_scores_cleanliness              3242               15.51
review_scores_checkin                  3242               15.51
first_review                           3218               15.39
reviews_per_mo

#### Price Imputation

In [10]:
# --- Listing prices (treated as DKK) ---
# Strip '$' and ',' from the text, parse to numbers (anything unparseable
# becomes NaN) and rename the column to make the currency explicit.
lis_df['price'] = pd.to_numeric(
    lis_df['price'].str.replace(r'[\$,]', '', regex=True),
    errors='coerce',
)
lis_df = lis_df.rename(columns={'price': 'price_DKK'})

# Remove the four highest-priced listings; they did not look like genuine
# prices.
expensive_indices = lis_df['price_DKK'].nlargest(4).index
lis_df = lis_df.drop(index=expensive_indices)

# --- Calendar prices (treated as USD) ---
# Same text-to-number clean-up as for the listing prices.
cal_df['price'] = pd.to_numeric(
    cal_df['price'].str.replace(r'[\$,]', '', regex=True),
    errors='coerce',
)
cal_df = cal_df.rename(columns={'price': 'price_USD'})

# Mean calendar price per listing.
cal_prices = cal_df.groupby('listing_id')['price_USD'].mean().reset_index()
# Averages below 500 are multiplied by 5.9; averages of 500 or more (and
# missing averages) are left unchanged.
# NOTE(review): the 500 cut-off and the 5.9 factor are unexplained here —
# presumably a currency conversion for values that look like USD; confirm.
avg_price = cal_prices['price_USD']
cal_prices['price_USD'] = avg_price.mask(avg_price < 500, avg_price * 5.9)

# Attach the calendar-based price to each listing; the join key from the
# calendar side is not needed afterwards.
lis_df = lis_df.merge(
    cal_prices,
    left_on='id',
    right_on='listing_id',
    how='left',
).drop(columns='listing_id')

# Use the calendar-based price wherever the listing price is missing.
lis_df['price_DKK'] = lis_df['price_DKK'].fillna(lis_df['price_USD'])
# `price_USD` is removed only when every listing now has a DKK price;
# otherwise it is kept in the frame.
if lis_df['price_DKK'].notna().all():
    lis_df = lis_df.drop(columns='price_USD')

#### Listing Data Imputation

In [None]:
# Host fields: default value used wherever the column is missing.
host_cols_to_impute = {
    'host_acceptance_rate_pct': 0.0,
    'host_total_listings_count': 1,   # per the original note: used to calculate host_listings_ratio
    'host_verifications_count': 0,
    'host_response_time': '',
    'host_location': 'Copenhagen, Denmark',
    # per the original note: used to calculate host_experience_years.
    # NOTE(review): filling with pd.NaT leaves missing dates missing.
    'host_since': pd.NaT,
}

# assign() replaces each existing column in place (column order unchanged).
lis_df = lis_df.assign(**{
    name: lis_df[name].fillna(default)
    for name, default in host_cols_to_impute.items()
})

In [15]:
# Accommodation fields: default value used wherever the column is missing.
accommodation_cols_to_impute = {
    'room_type': 'Unknown',
    'property_type': 'Unknown',
    'neighbourhood': 'Unknown',
    'amenities_count': 0,
    'accommodates': 1,
    'has_availability': False,
    'instant_bookable': False,
}

for column in accommodation_cols_to_impute:
    default = accommodation_cols_to_impute[column]
    lis_df[column] = lis_df[column].fillna(default)

In [16]:
# Listings fields: both flags default to False where missing.
# NOTE(review): the same two columns are also filled with False by the
# accommodation imputation cell — confirm whether this cell is still needed.
misc_cols_to_impute = dict.fromkeys(['has_availability', 'instant_bookable'], False)

for column, default in misc_cols_to_impute.items():
    filled = lis_df[column].fillna(default)
    lis_df[column] = filled

#### Review Data Imputation

Since the missing values are not missing at random, we will not impute them.

### 8. Save Processed Data

In [None]:
#lis_df.to_parquet('data/processed/02_listings.parquet')
#rev_df.to_parquet('data/processed/02_reviews.parquet')
#print('rev:',rev_df.shape)
#print('lis:',lis_df.shape)

 —————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————

### Reasoning for Currency Issue

In [19]:
#lis_df[['listing_url', 'price_DKK', 'price_USD']][:20]
# Check missing DKK prices where USD price < 500
# missing_dkk = lis_df[lis_df['price_DKK'].isna()]
# missing_dkk_high_usd = missing_dkk[missing_dkk['price_USD'] < 500]
# print(f"Out of {len(missing_dkk)} listings with missing DKK prices,")
# print(f"{len(missing_dkk_high_usd)} have USD prices < 500")

# Extract listing IDs from URLs and check their review dates
# missing_dkk_listings = lis_df.loc[missing_dkk_high_usd.index, ['listing_url', 'price_DKK', 'price_USD', 'first_review', 'last_review']]
# missing_dkk_listings = missing_dkk_listings.sort_values('first_review')

# print("First listing's last review:", missing_dkk_listings['first_review'].iloc[0])
# print("Last listing's last review:", missing_dkk_listings['last_review'].iloc[0])
# # missing_dkk_listings[['listing_url', 'price_DKK', 'price_USD']]

# lis_df.loc[missing_dkk_high_usd.index, ['listing_url', 'price_DKK', 'price_USD']]

# Compare price columns
# price_df = lis_df[lis_df.columns[lis_df.columns.str.startswith('price')]]

# # Count missing values
# missing_both = price_df[['price_DKK', 'price_USD']].isna().all(axis=1).sum()
# missing_dkk_only = price_df['price_DKK'].isna().sum() - missing_both
# missing_usd_only = price_df['price_USD'].isna().sum() - missing_both

# # Count same/different prices (excluding missing)
# valid_prices = price_df.dropna()
# same_prices = (valid_prices['price_DKK'] == valid_prices['price_USD']).sum()
# diff_prices = len(valid_prices) - same_prices

# print(f"Missing both prices: {missing_both}")
# print(f"Missing only DKK price: {missing_dkk_only}")
# print(f"Missing only USD price: {missing_usd_only}")

# print(f"Number of listings with same prices in DKK and USD: {same_prices}")
# print(f"Number of listings with different prices in DKK and USD: {diff_prices}")
# same_prices = (valid_prices['price_DKK'] == valid_prices['price_USD']).sum()
# diff_prices = len(valid_prices) - same_prices


# # Check how many rows with missing DKK prices have USD prices available
# missing_dkk = lis_df[lis_df['price_DKK'].isna()]
# dkk_missing_usd_available = missing_dkk['price_USD'].notna().sum()

# print(f"Out of {len(missing_dkk)} rows with missing DKK prices,")
# print(f"{dkk_missing_usd_available} have USD prices available")


# import matplotlib.pyplot as plt
# import numpy as np

# # Calculate 5th and 95th percentiles
# lower_bound = lis_df['price_USD'].quantile(0.05)
# upper_bound = lis_df['price_USD'].quantile(0.95)

# # Filter out extreme outliers (values outside the 5th–95th percentile range)
# clean_prices = lis_df['price_USD'][(lis_df['price_USD'] >= lower_bound) & 
#                                  (lis_df['price_USD'] <= upper_bound)]

# # Plot distribution of cleaned USD prices
# plt.figure(figsize=(10, 6))
# plt.hist(clean_prices, bins=50)
# plt.title('Distribution of USD Prices (5% Extreme Outliers Removed)')
# plt.xlabel('Price (USD)')
# plt.ylabel('Frequency')
# plt.show()

# # Print summary statistics
# print("\nSummary statistics for cleaned USD prices:")
# print(clean_prices.describe())
# print(f"\nRemoved {len(lis_df['price_USD'].dropna()) - len(clean_prices)} outliers")
