In [None]:
from housing_scraper import TokyoHousingScraper
import pandas as pd
import numpy as np
import re, sqlite3, prettytable 
prettytable.DEFAULT = 'DEFAULT'

# Load SQL magic extension to run SQL queries directly in notebook cells
%load_ext sql

## Housing Data Collection & Loading
>In this section, we initialize the `TokyoHousingScraper` to:  
>1. **Collect raw HTML** from *SUUMO.jp*.  
>2. **Transform the HTML** into structured, readable data and housing metrics.  
>3. **Store the processed data** in an SQLite database.

In [2]:
# Path to the SQLite database where scraped housing data will be stored
db = 'tokyo_housing.db'

# Base URL of Suumo (Japanese housing site)
base_url = 'https://suumo.jp/'

# URLs for initial listings pages 
starting_url = 'https://suumo.jp/jj/chintai/ichiran/FR301FC001/?url=%2Fchintai%2Fichiran%2FFR301FC001%2F&ar=030&bs=040&pc=50&smk=&po1=25&po2=99&shkr1=03&shkr2=03&shkr3=03&shkr4=03&cb=0.0&ct=25.0&md=01&md=02&md=03&md=04&md=05&md=06&md=07&md=08&md=09&md=10&et=20&mb=0&mt=9999999&cn=9999999&ra=013&ek=035017990&ek=035026830&rn=0350&ae=03501'

In [None]:
# Initialize scraper
scraper = TokyoHousingScraper(db, base_url, starting_url)

# Scrape housing listings
scraper.scrape_listings()

# Parse listing details and save dataset to SQLite
scraper.build_housing_dataset()

## Extracting & Engineering Housing Metrics 
>- Connect to local SQLite database `db` containing listing information and housing metrics.
>- Initialize SQL Magic (`%sql`) to run queries directly from the notebook. 

In [32]:
# Connect to SQLite database for querying listings 
conn = sqlite3.connect(db)
cursor = conn.cursor()

# Initialize SQL Magic with database connection
%sql sqlite:///tokyo_housing.db

### Create SQL View

>**Step 1: Standardize core listing fields**  
>- `img`, `title`, `address`: Basic identifiers  
>- `rent`, `management_fee`, `deposit`, `key_money`: Convert to numeric values  
>- `floor`: Convert floor labels to integers  
>- `floor_plan`: Normalize labels (e.g., 'ワンルーム' → '1R')  
>- `area`: Convert to numeric (square meters)  
>- `building_age`: Extract age in years  
>- `building_size`: Standardize number of floors  
>- `stations`, `nearest_station`, `distance_to_nearest_station`, `avg_distance_to_stations`: Station-related features  
>
>**Step 2: Handle missing or invalid values**  
>- Replace 0 or invalid values in `management_fee`, `deposit`, `key_money` with NULL  
>
>**Step 3: Feature engineering**  
>- `avg_rent_by_station`: Average rent per nearest station  
>- `avg_rent_by_floor_plan`: Average rent per floor plan  
>- `count_listings_per_station`: Number of listings per station (nearest)
>- `count_listings_per_floor_plan`: Number of listings per floor plan (e.g. `1DK`, `2LDK`) 
>
>**Step 4: Build final view**  
>- Combine standardized fields and engineered features into `FEATURED_LISTINGS`  
>- Output all listings in `TOKYO_HOUSING` view

In [None]:
%%sql 
-- Remove the view if it already exists
DROP VIEW IF EXISTS TOKYO_HOUSING;

-- Create a cleaned + feature-engineered housing view
CREATE VIEW TOKYO_HOUSING AS

-- Deduplicate listings that appear multiple times due to scraping artifacts.
-- Listings are considered duplicates if they share the same title, address,
-- rent, floor plan, and floor.

-- ROW_NUMBER() is used to retain a single representative row per duplicate group.
-- Ordering by `img` provides a stable (though not necessarily unique) tie-breaker
WITH DEDUPLICATED_LISTINGS AS (
    SELECT * 
    FROM  (
        SELECT 
            *,
            ROW_NUMBER() OVER (
                PARTITION BY title, address, rent, floor_plan, floor
                ORDER BY img
                ) 
                AS rn
        FROM HOUSING_DATA 
    )
    WHERE rn = 1
),

STANDARDIZED_LISTINGS AS (
    SELECT 
        -- Basic identifiers
        img, title, address, 
        
        -- Convert rent/deposit/key money into numeric
        CAST(RTRIM(rent, '万円') AS FLOAT) * 10000 AS rent,
        CAST(RTRIM(management_fee, '円') AS FLOAT) AS management_fee,
        CAST(RTRIM(deposit, '万円') AS FLOAT) * 10000 AS deposit,
        CAST(RTRIM(key_money, '万円') AS FLOAT) * 10000 AS key_money,
        
        -- Remove floor label
        RTRIM(floor, '階') AS floor,
        
        -- Normalize floor plan categories 
        CASE
            WHEN floor_plan = 'ワンルーム' THEN '1R'
            ELSE floor_plan
        END AS floor_plan,
        
        -- Convert area to numeric (square meters)
        CAST(RTRIM(area, 'm2') AS FLOAT) AS area,
        
        -- Extract building age in years
        CAST(LTRIM(RTRIM(building_age, '年'), '築') AS INTEGER) AS building_age,
        
        -- Remove building size label 
        RTRIM(building_size, '階建') AS building_size,
        
        -- Station-related features
        stations,
        nearest_station,
        distance_to_nearest_station,
        ROUND(avg_distance_to_stations, 2) AS avg_distance_to_stations
    FROM DEDUPLICATED_LISTINGS
),

FEATURED_LISTINGS AS (
    SELECT 
        img, title, address, rent, 
        
        -- Replace 0 values with NULLs
        NULLIF(management_fee, 0.0) AS management_fee,
        NULLIF(deposit, 0.0) AS deposit,
        NULLIF(key_money, 0.0) AS key_money,
        floor, floor_plan, area, building_age,
        building_size, nearest_station,
        distance_to_nearest_station, avg_distance_to_stations,
        
        -- Average rent by station, floor plan
        ROUND(AVG(rent) 
            OVER (PARTITION BY nearest_station), 2) 
            AS avg_rent_by_station, 
        ROUND(AVG(rent)
            OVER (PARTITION BY floor_plan), 2) 
            AS avg_rent_by_floor_plan,
        
        -- Number of listings per station, floor plan
        COUNT(title)
            OVER (PARTITION BY nearest_station)
            AS count_listings_per_station,
        COUNT(title)
            OVER (PARTITION BY floor_plan) 
            AS count_listings_per_floor_plan
    FROM STANDARDIZED_LISTINGS
)

-- Final output 
SELECT * FROM FEATURED_LISTINGS

### Load SQL View Into DataFrame
>- Use `%sql` to query `TOKYO_HOUSING` and convert results to a Dataframe for further analysis.
>- Once data is in pandas, we close the database connection. 

In [None]:
# Query the engineered SQL view into a pandas DataFrame for analysis
tokyo_housing = %sql SELECT * FROM TOKYO_HOUSING 
tokyo_housing_df = tokyo_housing.DataFrame()

# Close the DB connection 
conn.close()

## Data Overview & Normalization

### Dataset Overview
> We begin by reviewing column data types, non-null counts, and summary statistics to
identify potential inconsistencies and guide downstream cleaning decisions.

In [34]:
# Display column names, non-null counts, and dtypes
tokyo_housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1203 entries, 0 to 1202
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   img                            1198 non-null   object 
 1   title                          1203 non-null   object 
 2   address                        1203 non-null   object 
 3   rent                           1203 non-null   float64
 4   management_fee                 1030 non-null   float64
 5   deposit                        824 non-null    float64
 6   key_money                      786 non-null    float64
 7   floor                          1203 non-null   object 
 8   floor_plan                     1203 non-null   object 
 9   area                           1203 non-null   float64
 10  building_age                   1203 non-null   int64  
 11  building_size                  1203 non-null   object 
 12  nearest_station                1203 non-null   o

In [35]:
# Show summary statistics for all columns (numeric + categorical)
tokyo_housing_df.describe(include = 'all')

Unnamed: 0,img,title,address,rent,management_fee,deposit,key_money,floor,floor_plan,area,building_age,building_size,nearest_station,distance_to_nearest_station,avg_distance_to_stations,avg_rent_by_station,avg_rent_by_floor_plan,count_listings_per_station,count_listings_per_floor_plan
count,1198,1203,1203,1203.0,1030.0,824.0,786.0,1203.0,1203,1203.0,1203.0,1203.0,1203,1203.0,1203.0,1203.0,1203.0,1203.0,1203.0
unique,1198,1156,45,,,,,20.0,15,,,36.0,11,,,,,,
top,https://img01.suumo.com/front/gazo/fr/bukken/4...,ＪＲ山手線 高田馬場駅 4階建 築3年,東京都新宿区高田馬場３,,,,,2.0,1K,,,2.0,中井駅,,,,,,
freq,1,4,100,,,,,386.0,517,,,326.0,233,,,,,,
mean,,,,101623.607648,6966.893204,106165.898058,115197.328244,,,26.688662,22.610141,,,5.354946,9.411297,101623.608504,101623.60916,170.833749,336.311721
std,,,,42567.392682,4201.619506,59825.66097,64333.560542,,,12.979141,15.707395,,,2.518061,2.296133,8099.992902,34584.02553,59.459817,182.800388
min,,,,30000.0,200.0,30000.0,30000.0,,,0.0,0.0,,,1.0,2.0,85901.96,63000.0,1.0,1.0
25%,,,,69000.0,3000.0,69000.0,74000.0,,,19.03,9.0,,,3.0,7.67,97947.06,74316.05,127.0,152.0
50%,,,,89000.0,6000.0,87000.0,94000.0,,,24.61,21.0,,,5.0,9.67,105018.91,87431.53,201.0,324.0
75%,,,,125000.0,10000.0,125000.0,138000.0,,,31.0,35.0,,,7.0,11.0,106436.45,108070.42,214.0,517.0


### Floor Normalization
> The `floor` variable contains a mix of single values and ranges (e.g., `"1-2"`).  
To support analysis and modeling, floor ranges are expanded into individual observations:
>
> - Split floor ranges into their component values
> - Expand each floor into a separate row (**explode('floor')**)
> - Coerce non-numeric values to `NaN` and remove them
> - Convert the resulting values to integer type
> 
> This results in one floor value per row, enabling accurate aggregation and analysis.

In [36]:
# Inspect raw floor distribution (pre-cleaning)
tokyo_housing_df['floor'].value_counts()

floor
2       386
1       354
3       166
4       112
5        40
7        37
6        31
8        21
9        16
10        8
B1        7
11        6
13        4
1-2       4
2-3       3
-         3
B2        2
24        1
B1-1      1
14        1
Name: count, dtype: int64

In [37]:
# Normalize floor values:
#    - Split floor ranges (e.g., "1-2" → ["1", "2"])
#    - Expand to one floor per row
#    - Coerce non-numeric values to NaN and drop them
tokyo_housing_df = (
    tokyo_housing_df
        .assign(
            floor = lambda df: df['floor'].str.split('-')
        )
        .explode('floor')
        .assign(
            floor = lambda df: pd.to_numeric(df['floor'], errors = 'coerce')
        )
        .dropna(subset = ['floor'])
)
# Convert floor to integer type
tokyo_housing_df['floor'] = tokyo_housing_df['floor'].astype('int64')

### Building Size Parsing
>The `building_size` column includes semi-structured text (e.g., `"地下1地上3"`),
representing underground and above-ground floors.
>To standardize this feature:
> - All numeric components are extracted using **regular expressions**
> - Values are summed to compute total building height
>
>The resulting feature represents total building size as a single numeric value.

In [38]:
# Inspect raw building size distribution (pre-parsing)
tokyo_housing_df['building_size'].value_counts()

building_size
2          326
3          212
4          177
5          106
6           53
10          49
9           47
7           46
8           38
地下1地上3      25
12          15
11          14
地下1地上4      13
14          10
地下1地上5       8
地下1地上8       8
15           8
地下1地上6       6
13           6
地下1地上9       4
地上2          4
地下1地上14      3
地下1地上11      3
地下1地上7       2
地下1地上12      2
地下1地上10      2
地下2地上2       2
19           2
地下7地上9       1
1            1
地下1地上2       1
31           1
地下2地上31      1
地下2地上14      1
地下2地上10      1
Name: count, dtype: int64

In [39]:
# Parse total building size by extracting numeric components
#    from semi-structured strings (e.g., "地下1地上3" → 1 + 3 = 4)
tokyo_housing_df['building_size'] = (
    tokyo_housing_df['building_size']
        .map(
            lambda x: sum(map(int, re.findall(r'\d+', x)))
        )
)

In [40]:
# Save cleaned DataFrame to CSV file
tokyo_housing_df.to_csv('tokyo_housing.csv', index = False)