# Airbnb Data Cleaning & Processing - Multi-Dataset

**Objective**: Clean and process Lisbon and Porto Airbnb listing data for analysis and modeling.

**Datasets**: 
- `listings_2025Q1_Lisbon.csv`
- `listings_2025Q3_Lisbon.csv`
- `listings_2025Q3_Porto.csv`

---

## 1. Setup & Data Loading

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import re

# Notebook-wide pandas display limits: show every column, at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" style and a (12, 6) default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [2]:
# Locations of the three raw listing exports (in ../data relative to the notebook)
file_path_lisbon_q1 = os.path.join(os.pardir, 'data', 'listings_2025Q1_Lisbon.csv')
file_path_lisbon_q3 = os.path.join(os.pardir, 'data', 'listings_2025Q3_Lisbon.csv')
file_path_porto_q3 = os.path.join(os.pardir, 'data', 'listings_2025Q3_Porto.csv')

# Read each CSV whole (low_memory=False) so column dtypes are inferred in one pass
df_lisbon_q1, df_lisbon_q3, df_porto_q3 = [
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (file_path_lisbon_q1, file_path_lisbon_q3, file_path_porto_q3)
]

# Report the raw size of every dataset
print("\n".join(
    f"{label}: {frame.shape[0]} rows × {frame.shape[1]} columns"
    for label, frame in (
        ("Lisbon Q1 2025", df_lisbon_q1),
        ("Lisbon Q3 2025", df_lisbon_q3),
        ("Porto Q3 2025", df_porto_q3),
    )
))

Lisbon Q1 2025: 24264 rows × 79 columns
Lisbon Q3 2025: 25449 rows × 79 columns
Porto Q3 2025: 14806 rows × 79 columns


---
## 2. Initial Data Exploration

In [3]:
# Show the first five rows of each raw dataset under its own banner
print("=== LISBON Q1 2025 ===", df_lisbon_q1.head(), sep="\n")

print("\n=== LISBON Q3 2025 ===", df_lisbon_q3.head(), sep="\n")

print("\n=== PORTO Q3 2025 ===", df_porto_q3.head(), sep="\n")

=== LISBON Q1 2025 ===
      id                         listing_url       scrape_id last_scraped  \
0   6499   https://www.airbnb.com/rooms/6499  20250308054758   2025-03-16   
1  25659  https://www.airbnb.com/rooms/25659  20250308054758   2025-03-09   
2  29396  https://www.airbnb.com/rooms/29396  20250308054758   2025-03-11   
3  29720  https://www.airbnb.com/rooms/29720  20250308054758   2025-03-09   
4  29915  https://www.airbnb.com/rooms/29915  20250308054758   2025-03-16   

        source                                           name  \
0  city scrape           Belém 1 Bedroom Historical Apartment   
1  city scrape  Heart of Alfama - Le cœur d'Alfama (3 people)   
2  city scrape               Alfama Hill - Boutique apartment   
3  city scrape                    TheHOUSE - Your luxury home   
4  city scrape        Modern and Spacious Apartment in Lisboa   

                                         description  \
0  This apartment is all about Location, next to ...   
1  Charming

---
## 3. Data Cleaning

### 3.1 Define Cleaning Function

In [4]:
def clean_airbnb_data(df, dataset_name):
    """
    Clean and process Airbnb listing data.

    Works on a copy of ``df``, so the caller's frame is not modified. Steps,
    in order:

    1. Derive a numeric ``bathrooms`` column from ``bathrooms_text``.
    2. Drop columns that are not used downstream.
    3. Convert 't'/'f' flags to 1/0, and parse price, percentage, date and
       numeric columns (unparseable values become NaN/NaT).
    4. Drop rows missing any of price, latitude, longitude, bedrooms, beds.
    5. Keep only rows whose price lies within the 1st-99th percentile and
       whose ``minimum_nights`` is at most 365.

    Every step silently skips columns that are absent from ``df``. The frame
    shape is printed after the main stages.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings table.
    dataset_name : str
        Label used only in the printed progress banner.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """

    df = df.copy()

    print(f"\n{'='*60}")
    print(f"Processing: {dataset_name}")
    print(f"{'='*60}")
    print(f"Initial shape: {df.shape}")

    # Build `bathrooms` from `bathrooms_text` BEFORE that column is dropped below.
    if 'bathrooms_text' in df.columns:
        bath_text = df['bathrooms_text'].astype(str)
        # First number found in the text, e.g. "1.5 shared baths" -> 1.5
        bath_num = pd.to_numeric(
            bath_text.str.extract(r'(\d+\.?\d*)', expand=False),
            errors='coerce'
        )
        # Values such as "Half-bath" / "Shared half-bath" contain no digit and
        # would otherwise become NaN; they describe half a bathroom.
        is_half_bath = bath_text.str.lower().str.contains('half', regex=False, na=False)
        df['bathrooms'] = bath_num.mask(bath_num.isna() & is_half_bath, 0.5)

    # Drop unnecessary columns
    cols_to_drop = [
        # IDs and URLs
        'listing_url', 'scrape_id', 'picture_url', 'host_url',
        'host_thumbnail_url', 'host_picture_url',

        # Text descriptions
        'name', 'description', 'neighborhood_overview', 'host_name',
        'host_location', 'host_about', 'host_verifications',

        # Columns not needed
        'calendar_last_scraped', 'calendar_updated', 'last_scraped',
        'bathrooms_text', 'license',

        # Columns specified for removal
        'id', 'host_neighbourhood', 'neighbourhood_cleansed', 'neighbourhood',
        'minimum_minimum_nights', 'maximum_minimum_nights',
        'minimum_maximum_nights', 'maximum_maximum_nights',
        'availability_eoy', 'estimated_occupancy_l365d', 'estimated_revenue_l365d',
        'number_of_reviews_ly'
    ]

    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop_existing)
    print(f"After dropping {len(cols_to_drop_existing)} columns: {df.shape}")

    # Convert binary columns (t/f → 1/0); anything else (including missing) becomes NaN
    binary_cols = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
                   'has_availability', 'instant_bookable']
    for col in binary_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.lower().map({'t': 1, 'f': 0})

    # Clean price column: strip currency symbols and thousands separators
    if 'price' in df.columns:
        df['price'] = df['price'].astype(str).str.replace(r'[$,€]', '', regex=True)
        df['price'] = pd.to_numeric(df['price'], errors='coerce')

    # Convert percentage columns ("95%" → 0.95); non-numeric text becomes NaN
    percentage_cols = ['host_response_rate', 'host_acceptance_rate']
    for col in percentage_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace('%', '', regex=False).replace('', np.nan)
            df[col] = pd.to_numeric(df[col], errors='coerce') / 100.0

    # Convert date columns
    date_cols = ['host_since', 'first_review', 'last_review']
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    # Convert numeric columns
    numeric_cols = ['host_listings_count', 'host_total_listings_count',
                   'bedrooms', 'beds', 'accommodates', 'minimum_nights', 'maximum_nights',
                   'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
                   'availability_30', 'availability_60', 'availability_90', 'availability_365',
                   'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
                   'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                   'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                   'review_scores_value', 'calculated_host_listings_count',
                   'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
                   'calculated_host_listings_count_shared_rooms', 'reviews_per_month']

    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Remove rows with missing critical values
    critical_cols = ['price', 'latitude', 'longitude', 'bedrooms', 'beds']
    existing_critical = [col for col in critical_cols if col in df.columns]
    df = df.dropna(subset=existing_critical)
    print(f"After removing critical missing values: {df.shape}")

    # Remove outliers: keep prices inside the 1st-99th percentile (bounds inclusive)
    if 'price' in df.columns:
        price_lower = df['price'].quantile(0.01)
        price_upper = df['price'].quantile(0.99)
        df = df[(df['price'] >= price_lower) & (df['price'] <= price_upper)]

    # NOTE: rows with a missing minimum_nights are dropped here as well,
    # because NaN <= 365 evaluates to False.
    if 'minimum_nights' in df.columns:
        df = df[df['minimum_nights'] <= 365]

    print(f"After removing outliers: {df.shape}")

    return df

### 3.2 Apply Cleaning to All Datasets

In [5]:
# Run the cleaning pipeline on each raw frame (Lisbon Q1, Lisbon Q3, Porto Q3 in that order)
df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean = [
    clean_airbnb_data(raw_frame, label)
    for raw_frame, label in (
        (df_lisbon_q1, "Lisbon Q1 2025"),
        (df_lisbon_q3, "Lisbon Q3 2025"),
        (df_porto_q3, "Porto Q3 2025"),
    )
]


Processing: Lisbon Q1 2025
Initial shape: (24264, 79)
After dropping 30 columns: (24264, 49)
After removing critical missing values: (21022, 49)
After removing outliers: (20611, 49)

Processing: Lisbon Q3 2025
Initial shape: (25449, 79)
After dropping 30 columns: (25449, 49)
After removing critical missing values: (21802, 49)
After removing outliers: (21378, 49)

Processing: Porto Q3 2025
Initial shape: (14806, 79)
After dropping 30 columns: (14806, 49)
After removing critical missing values: (13157, 49)
After removing outliers: (12896, 49)


---
## 4. Feature Engineering

### 4.1 Geographic Features - Distance to City Center

Latitude and longitude are not very informative when used as two separate variables.\
Instead, I combined them into a single new variable: Distance to City Center.

In [6]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points on Earth (haversine formula).

    All four arguments are in decimal degrees and may be scalars or
    array-likes that NumPy can broadcast together.
    Returns the distance in kilometers, using an Earth radius of 6371 km.
    """
    # Work in radians from here on
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine of the central angle between the two points
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2

    # Central angle (radians) scaled by the Earth radius in kilometers
    return 2 * np.arcsin(np.sqrt(hav)) * 6371

# City-center reference points (latitude, longitude in decimal degrees)
lisbon_center_lat, lisbon_center_lon = 38.7071, -9.1359
porto_center_lat, porto_center_lon = 41.1579, -8.6291

# Add each listing's distance to its own city's center as a new column
for _frame, _center_lat, _center_lon in (
    (df_lisbon_q1_clean, lisbon_center_lat, lisbon_center_lon),
    (df_lisbon_q3_clean, lisbon_center_lat, lisbon_center_lon),
    (df_porto_q3_clean, porto_center_lat, porto_center_lon),
):
    _frame['distance_to_center_km'] = haversine_distance(
        _frame['latitude'],
        _frame['longitude'],
        _center_lat,
        _center_lon
    )

print("Distance to city center calculated for all datasets")

Distance to city center calculated for all datasets


### 4.2 Amenity Features

In [7]:
def extract_amenities(df):
    """
    Extract binary features from the amenities column.

    For every keyword in the fixed list below a ``has_<keyword>`` 0/1 column
    is added (case-insensitive substring match against the raw amenities
    text), plus ``amenities_count`` with the number of listed amenities.
    The ``amenities`` column itself is removed from the result.

    The input frame is not modified; a new frame is returned. If ``df`` has
    no ``amenities`` column it is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table; ``amenities`` is expected to hold a JSON-style list
        of quoted amenity names per row.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``amenities`` and with the new feature columns.
    """
    import json  # local import: only needed for counting amenities

    if 'amenities' not in df.columns:
        return df

    amenity_keywords = [
        'Wifi', 'Air conditioning', 'Kitchen', 'Heating', 'Washer', 'Dryer',
        'TV', 'Elevator', 'Pool', 'Gym', 'Parking', 'Breakfast',
        'Hot tub', 'Balcony', 'Patio', 'Garden', 'Beach', 'Waterfront',
        'workspace', 'Dishwasher', 'Coffee maker', 'Microwave',
        'Refrigerator', 'Oven', 'Iron', 'Hair dryer', 'Smoke alarm',
        'Carbon monoxide alarm', 'Fire extinguisher', 'First aid kit'
    ]

    # Keywords that are substrings of a *different* amenity in the list.
    # A plain substring test would set has_washer for "Dishwasher" and
    # has_dryer for "Hair dryer", so the longer name is removed first.
    shadowed_by = {'Washer': 'dishwasher', 'Dryer': 'hair dryer'}

    amenities_lower = df['amenities'].astype(str).str.lower()

    def _count_amenities(raw):
        # Preferred path: the cell is a JSON list -> its length.
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError):
            # Fallback for non-JSON text: each quoted name contributes two
            # quote characters, so count pairs. Missing values give 0.
            return str(raw).count('"') // 2
        return len(parsed) if isinstance(parsed, list) else 0

    new_cols = {}
    for amenity in amenity_keywords:
        col_name = f'has_{amenity.lower().replace(" ", "_")}'
        haystack = amenities_lower
        if amenity in shadowed_by:
            haystack = haystack.str.replace(shadowed_by[amenity], '', regex=False)
        new_cols[col_name] = haystack.str.contains(
            amenity.lower(), regex=False, na=False
        ).astype(int)

    # Count total amenities
    new_cols['amenities_count'] = df['amenities'].map(_count_amenities).astype(int)

    # Build the result in one step instead of inserting ~30 columns one by one
    return df.drop(columns='amenities').assign(**new_cols)

# Replace each cleaned frame with its amenity-expanded version
df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean = [
    extract_amenities(frame)
    for frame in (df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean)
]

print("Amenity features extracted for all datasets")

Amenity features extracted for all datasets


### 4.4 Host Features

In [8]:
def create_host_features(df, reference_date='2025-03-16'):
    """
    Create host-related features.

    Adds, when the required source column exists:

    - ``host_is_multi_listing``: 1 if the host has more than one listing.
    - ``host_is_professional``: 1 if the host has more than three listings.
    - ``host_days_active``: days between ``host_since`` and
      ``reference_date`` (NaN where ``host_since`` is missing);
      ``host_since`` is then dropped.

    New columns are written onto the passed frame before the (possibly new)
    frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table; ``host_since`` must already be datetime-typed.
    reference_date : str or datetime-like, default '2025-03-16'
        Date the host tenure is measured against. The default is the value
        that used to be hard-coded. NOTE(review): it looks like a Q1 scrape
        date; for other snapshots (e.g. the Q3 files) pass that snapshot's
        own date, otherwise hosts who joined after the reference date get
        negative day counts.

    Returns
    -------
    pandas.DataFrame
    """

    if 'calculated_host_listings_count' in df.columns:
        df['host_is_multi_listing'] = (df['calculated_host_listings_count'] > 1).astype(int)
        df['host_is_professional'] = (df['calculated_host_listings_count'] > 3).astype(int)

    if 'host_since' in df.columns:
        reference_ts = pd.Timestamp(reference_date)
        df['host_days_active'] = (reference_ts - df['host_since']).dt.days
        df = df.drop('host_since', axis=1)

    return df

# Add the host-level features to every cleaned frame
df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean = [
    create_host_features(frame)
    for frame in (df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean)
]

print("Host features created for all datasets")

Host features created for all datasets


### 4.5 Review Features

In [9]:
def create_review_features(df, reference_date='2025-03-16'):
    """
    Create review-related features.

    Adds, when the required source columns exist:

    - ``review_scores_average``: row-wise mean of the available review
      sub-scores (accuracy, cleanliness, check-in, communication, location,
      value).
    - ``is_highly_rated``: 1 if ``review_scores_rating`` >= 4.5, else 0
      (a missing rating yields 0).
    - ``days_since_first_review`` / ``days_since_last_review``: days between
      the review date and ``reference_date``; listings without a review
      date get the sentinel 9999. The source date columns are dropped.

    New columns are written onto the passed frame before the (possibly new)
    frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table; review date columns must already be datetime-typed.
    reference_date : str or datetime-like, default '2025-03-16'
        Date the review ages are measured against. The default is the value
        that used to be hard-coded. NOTE(review): it looks like a Q1 scrape
        date; for other snapshots (e.g. the Q3 files) pass that snapshot's
        own date, otherwise reviews written after the reference date get
        negative day counts.

    Returns
    -------
    pandas.DataFrame
    """

    review_score_cols = [
        'review_scores_accuracy', 'review_scores_cleanliness',
        'review_scores_checkin', 'review_scores_communication',
        'review_scores_location', 'review_scores_value'
    ]

    existing_score_cols = [col for col in review_score_cols if col in df.columns]
    if existing_score_cols:
        df['review_scores_average'] = df[existing_score_cols].mean(axis=1)

    if 'review_scores_rating' in df.columns:
        df['is_highly_rated'] = (df['review_scores_rating'] >= 4.5).astype(int)

    reference_ts = pd.Timestamp(reference_date)
    for date_col, age_col in (
        ('first_review', 'days_since_first_review'),
        ('last_review', 'days_since_last_review'),
    ):
        if date_col in df.columns:
            # 9999 marks "never reviewed" so the column stays fully numeric
            df[age_col] = (reference_ts - df[date_col]).dt.days.fillna(9999)
            df = df.drop(date_col, axis=1)

    return df

# Add the review-level features to every cleaned frame
df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean = [
    create_review_features(frame)
    for frame in (df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean)
]

print("Review features created for all datasets")

Review features created for all datasets


### 4.6 Property Features

In [10]:
def create_property_features(df):
    """
    Create property-related features.

    All features depend on ``accommodates``; without that column the frame
    is returned untouched. Otherwise the following columns are written onto
    ``df`` (which is also the return value):

    - ``bedrooms_per_person``, ``beds_per_person``, ``bathrooms_per_person``:
      the room count divided by ``accommodates`` (each only if its source
      column exists; a capacity of 0 gives NaN instead of a division by zero).
    - ``is_large_property``: 1 if ``accommodates`` >= 6, else 0.
    """

    if 'accommodates' not in df.columns:
        return df

    # Treat a capacity of zero as missing so the ratios become NaN
    capacity = df['accommodates'].replace(0, np.nan)

    ratio_sources = (
        ('bedrooms', 'bedrooms_per_person'),
        ('beds', 'beds_per_person'),
        ('bathrooms', 'bathrooms_per_person'),
    )
    for source_col, ratio_col in ratio_sources:
        if source_col in df.columns:
            df[ratio_col] = df[source_col] / capacity

    df['is_large_property'] = (df['accommodates'] >= 6).astype(int)

    return df

# Add the property-level features to every cleaned frame
df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean = [
    create_property_features(frame)
    for frame in (df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean)
]

print("Property features created for all datasets")

Property features created for all datasets


---
## 5. Final Dataset Preparation

### 5.1 Handle Missing Values

In [11]:
def handle_missing_values(df):
    """
    Handle missing values in the dataset.

    Numeric columns with gaps are filled with the column median; object
    columns with gaps are filled with the column's first mode, or with
    'Unknown' when the column has no non-missing value at all. Columns
    without missing values are left as they are. ``df`` is modified in
    place and returned.
    """

    # Numeric columns: median imputation
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].isnull().sum() <= 0:
            continue
        df[col] = df[col].fillna(df[col].median())

    # Text columns: most frequent value, falling back to 'Unknown'
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].isnull().sum() <= 0:
            continue
        modes = df[col].mode()
        replacement = 'Unknown' if modes.empty else modes[0]
        df[col] = df[col].fillna(replacement)

    return df

# Impute the remaining gaps in every cleaned frame
df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean = [
    handle_missing_values(frame)
    for frame in (df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean)
]

print("Missing values handled for all datasets")

Missing values handled for all datasets


### 5.2 One-Hot Encoding of Categorical Variables

In [12]:
def encode_categorical_variables(df):
    """
    Encode categorical variables using one-hot encoding.

    Every object-dtype column is expanded into integer dummy columns, with
    the first level of each column dropped. Object columns whose name
    contains 'date' (case-insensitive) are left as they are. When nothing
    qualifies for encoding, ``df`` itself is returned.
    """

    object_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Skip anything that looks like a date column
    columns_to_encode = [
        col for col in object_cols
        if not ('date' in col.lower() or df[col].dtype == 'datetime64[ns]')
    ]

    if not columns_to_encode:
        return df

    return pd.get_dummies(df, columns=columns_to_encode, drop_first=True, dtype=int)

# One-hot encode each cleaned frame independently
df_lisbon_q1_encoded, df_lisbon_q3_encoded, df_porto_q3_encoded = [
    encode_categorical_variables(frame)
    for frame in (df_lisbon_q1_clean, df_lisbon_q3_clean, df_porto_q3_clean)
]

# Report the resulting shapes
print(
    f"Lisbon Q1 shape after encoding: {df_lisbon_q1_encoded.shape}",
    f"Lisbon Q3 shape after encoding: {df_lisbon_q3_encoded.shape}",
    f"Porto Q3 shape after encoding: {df_porto_q3_encoded.shape}",
    sep="\n",
)

Lisbon Q1 shape after encoding: (20611, 188)
Lisbon Q3 shape after encoding: (21378, 184)
Porto Q3 shape after encoding: (12896, 168)


### 5.3 Align Columns Across All Datasets

In [13]:
# Get column sets for each dataset
cols_lisbon_q1 = set(df_lisbon_q1_encoded.columns)
cols_lisbon_q3 = set(df_lisbon_q3_encoded.columns)
cols_porto_q3 = set(df_porto_q3_encoded.columns)

# Find common columns (intersection of all three sets)
common_cols = cols_lisbon_q1 & cols_lisbon_q3 & cols_porto_q3

print(f"Total columns - Lisbon Q1: {len(cols_lisbon_q1)}, Lisbon Q3: {len(cols_lisbon_q3)}, Porto Q3: {len(cols_porto_q3)}")
print(f"Common columns across all datasets: {len(common_cols)}")

unique_to_lisbon_q1 = cols_lisbon_q1 - common_cols
unique_to_lisbon_q3 = cols_lisbon_q3 - common_cols
unique_to_porto_q3 = cols_porto_q3 - common_cols

if unique_to_lisbon_q1:
    print(f"\nColumns unique to Lisbon Q1 ({len(unique_to_lisbon_q1)}): {list(unique_to_lisbon_q1)[:10]}...")
if unique_to_lisbon_q3:
    print(f"Columns unique to Lisbon Q3 ({len(unique_to_lisbon_q3)}): {list(unique_to_lisbon_q3)[:10]}...")
if unique_to_porto_q3:
    print(f"Columns unique to Porto Q3 ({len(unique_to_porto_q3)}): {list(unique_to_porto_q3)[:10]}...")

# Keep only common columns
common_cols_sorted = sorted(list(common_cols))

df_lisbon_q1_final = df_lisbon_q1_encoded[common_cols_sorted].copy()
df_lisbon_q3_final = df_lisbon_q3_encoded[common_cols_sorted].copy()
df_porto_q3_final = df_porto_q3_encoded[common_cols_sorted].copy()

print(f"\nFinal shape - Lisbon Q1: {df_lisbon_q1_final.shape}")
print(f"Final shape - Lisbon Q3: {df_lisbon_q3_final.shape}")
print(f"Final shape - Porto Q3: {df_porto_q3_final.shape}")

assert list(df_lisbon_q1_final.columns) == list(df_lisbon_q3_final.columns) == list(df_porto_q3_final.columns)
print("\n✓ All datasets have identical columns")

Total columns - Lisbon Q1: 188, Lisbon Q3: 184, Porto Q3: 168
Common columns across all datasets: 149

Columns unique to Lisbon Q1 (39): ['property_type_Private room in dome', 'property_type_Lighthouse', 'neighbourhood_group_cleansed_Mafra', 'neighbourhood_group_cleansed_Arruda Dos Vinhos', 'property_type_Entire hostel', 'property_type_Castle', 'property_type_Private room in minsu', 'neighbourhood_group_cleansed_Cascais', 'property_type_Holiday park', 'property_type_Yurt']...
Columns unique to Lisbon Q3 (35): ['property_type_Lighthouse', 'neighbourhood_group_cleansed_Mafra', 'neighbourhood_group_cleansed_Arruda Dos Vinhos', 'property_type_Entire hostel', 'neighbourhood_group_cleansed_Cascais', 'property_type_Holiday park', 'property_type_Yurt', 'neighbourhood_group_cleansed_Sintra', 'property_type_Private room in windmill', 'property_type_Private room in tent']...
Columns unique to Porto Q3 (19): ['neighbourhood_group_cleansed_VILA NOVA DE GAIA', 'neighbourhood_group_cleansed_PORTO', '

### 5.4 Clean Column Names for Regression Compatibility

In [14]:
def clean_column_name(col_name):
    """
    Clean column names to be compatible with statsmodels formulas.

    Parentheses, commas and apostrophes are removed outright; every other
    run of characters outside ``a-z``, ``A-Z``, ``0-9`` is collapsed into a
    single underscore, and leading/trailing underscores are trimmed.
    Non-string input is converted with ``str`` first.
    """
    # Characters that are deleted rather than replaced by an underscore
    without_punctuation = str(col_name).translate(str.maketrans('', '', "(),'"))
    # Any run of non-alphanumeric characters (underscores included) -> one '_'
    collapsed = re.sub(r'[^a-zA-Z0-9]+', '_', without_punctuation)
    return collapsed.strip('_')

# Rename the columns of every final frame in place
for _frame in (df_lisbon_q1_final, df_lisbon_q3_final, df_porto_q3_final):
    _frame.columns = list(map(clean_column_name, _frame.columns))

print("Column names cleaned for regression compatibility")

Column names cleaned for regression compatibility


### 5.5 Data Summary

In [None]:
print("=== FINAL DATASET SUMMARIES ===")

# One summary block per final dataset: size, dtype counts and price statistics
for name, df in (
    ("Lisbon Q1 2025", df_lisbon_q1_final),
    ("Lisbon Q3 2025", df_lisbon_q3_final),
    ("Porto Q3 2025", df_porto_q3_final),
):
    print(
        f"\n{name}:",
        f"  Shape: {df.shape[0]} rows × {df.shape[1]} columns",
        f"  Column types: {df.dtypes.value_counts().to_dict()}",
        sep="\n",
    )

    # Price statistics are only reported when the column survived alignment
    if 'price' in df.columns:
        print(
            "  Price statistics:",
            f"    Mean: {df['price'].mean():.2f}",
            f"    Median: {df['price'].median():.2f}",
            f"    Min: {df['price'].min():.2f}",
            f"    Max: {df['price'].max():.2f}",
            sep="\n",
        )

=== FINAL DATASET SUMMARIES ===

Lisbon Q1 2025:
  Shape: 20611 rows × 149 columns
  Column types: {dtype('int32'): 100, dtype('float64'): 32, dtype('int64'): 17}
  Price statistics:
    Mean: €114.26
    Median: €87.00
    Min: €22.00
    Max: €928.00

Lisbon Q3 2025:
  Shape: 21378 rows × 149 columns
  Column types: {dtype('int32'): 100, dtype('float64'): 32, dtype('int64'): 17}
  Price statistics:
    Mean: €141.01
    Median: €110.00
    Min: €27.00
    Max: €1096.00

Porto Q3 2025:
  Shape: 12896 rows × 149 columns
  Column types: {dtype('int32'): 100, dtype('float64'): 32, dtype('int64'): 17}
  Price statistics:
    Mean: €109.70
    Median: €89.00
    Min: €25.00
    Max: €571.00


### 5.6 Save Cleaned Datasets

In [16]:
# Define output paths
output_path_listings_2025Q1_Lisbon = os.path.join(os.pardir, 'data', 'Lisbon_2025Q1_cleaned.csv')
output_path_listings_2025Q3_Lisbon = os.path.join(os.pardir, 'data', 'Lisbon_2025Q3_cleaned.csv')
output_path_listings_2025Q3_Porto = os.path.join(os.pardir, 'data', 'Porto_2025Q3_cleaned.csv')

# Save all three datasets
df_lisbon_q1_final.to_csv(output_path_listings_2025Q1_Lisbon, index=False)
df_lisbon_q3_final.to_csv(output_path_listings_2025Q3_Lisbon, index=False)
df_porto_q3_final.to_csv(output_path_listings_2025Q3_Porto, index=False)

print(f"✓ Lisbon Q1 2025 saved: {df_lisbon_q1_final.shape[0]} rows × {df_lisbon_q1_final.shape[1]} columns")
print(f"✓ Lisbon Q3 2025 saved: {df_lisbon_q3_final.shape[0]} rows × {df_lisbon_q3_final.shape[1]} columns")
print(f"✓ Porto Q3 2025 saved: {df_porto_q3_final.shape[0]} rows × {df_porto_q3_final.shape[1]} columns")
print("\nAll datasets are ready for modeling!")

✓ Lisbon Q1 2025 saved: 20611 rows × 149 columns
✓ Lisbon Q3 2025 saved: 21378 rows × 149 columns
✓ Porto Q3 2025 saved: 12896 rows × 149 columns

All datasets are ready for modeling!
