In [None]:
from housing_scraper import TokyoHousingScraper
import pandas as pd
import numpy as np
import re, sqlite3, prettytable 
prettytable.DEFAULT = 'DEFAULT'

# Load SQL magic extension to run SQL queries directly in notebook cells
%load_ext sql

## Housing Data Collection & Loading
>In this section, we initialize the `TokyoHousingScraper` to:  
>1. **Collect raw HTML** from *SUUMO.jp*.  
>2. **Transform the HTML** into structured, readable data and housing metrics.  
>3. **Store the processed data** in an SQLite database.

In [11]:
# Path to the SQLite database where scraped housing data will be stored
db = 'tokyo_housing.db'

# Base URL of Suumo (Japanese housing site)
base_url = 'https://suumo.jp/'

# URLs for initial listings pages 
starting_url = 'https://suumo.jp/jj/chintai/ichiran/FR301FC001/?url=%2Fchintai%2Fichiran%2FFR301FC001%2F&ar=030&bs=040&pc=50&smk=&po1=25&po2=99&shkr1=03&shkr2=03&shkr3=03&shkr4=03&cb=0.0&ct=25.0&md=01&md=02&md=03&md=04&md=05&md=06&md=07&md=08&md=09&md=10&et=20&mb=0&mt=9999999&cn=9999999&ra=013&ek=035017990&ek=035026830&rn=0350&ae=03501'

In [None]:
# Initialize scraper
scraper = TokyoHousingScraper(db, base_url, starting_url)

# Scrape housing listings
scraper.scrape_listings()

# Parse listing details and save dataset to SQLite
scraper.build_housing_dataset()

## Extracting & Engineering Housing Metrics 
>- Connect to local SQLite database `db` containing listing information and housing metrics.
>- Initialize SQL Magic (`%sql`) to run queries directly from the notebook. 

In [12]:
# Connect to SQLite database for querying listings 
conn = sqlite3.connect(db)
cursor = conn.cursor()

# Initialize SQL Magic with database connection
%sql sqlite:///tokyo_housing.db

### Create SQL View

>**Step 1: Standardize core listing fields**  
>- `img`, `title`, `address`: Basic identifiers  
>- `rent`, `management_fee`, `deposit`, `key_money`: Convert to numeric values  
>- `floor`: Convert floor labels to integers  
>- `floor_plan`: Normalize labels (e.g., 'ワンルーム' → '1R')  
>- `area`: Convert to numeric (square meters)  
>- `building_age`: Extract age in years  
>- `building_size`: Standardize number of floors  
>- `stations`, `nearest_station`, `distance_to_nearest_station`, `avg_distance_to_stations`: Station-related features  
>
>**Step 2: Handle missing or invalid values**  
>- Replace 0 or invalid values in `management_fee`, `deposit`, `key_money` with NULL  
>
>**Step 3: Feature engineering**  
>- `avg_rent_by_station`: Average rent per nearest station  
>- `avg_rent_by_floor_plan`: Average rent per floor plan  
>- `price_rank_by_station`: Rank rent relative to other listings near the same station  
>
>**Step 4: Build final view**  
>- Combine standardized fields and engineered features into `FEATURED_LISTINGS`  
>- Output all listings in `TOKYO_HOUSING` view

In [None]:
%%sql 
-- Remove the view if it already exists
DROP VIEW IF EXISTS TOKYO_HOUSING;

-- Create a cleaned + feature-engineered housing view
CREATE VIEW TOKYO_HOUSING AS

WITH STANDARDIZED_LISTINGS AS (
    SELECT 
        -- Basic identifiers
        img, title, address, 
        
        -- Convert rent/deposit/key money into numeric
        CAST(RTRIM(rent, '万円') AS FLOAT) * 10000 AS rent,
        CAST(RTRIM(management_fee, '円') AS FLOAT) AS management_fee,
        CAST(RTRIM(deposit, '万円') AS FLOAT) * 10000 AS deposit,
        CAST(RTRIM(key_money, '万円') AS FLOAT) * 10000 AS key_money,
        
        -- Remove floor label
        RTRIM(floor, '階') AS floor,
        
        -- Normalize floor plan categories 
        CASE
            WHEN floor_plan = 'ワンルーム' THEN '1R'
            ELSE floor_plan
        END AS floor_plan,
        
        -- Convert area to numeric (square meters)
        CAST(RTRIM(area, 'm2') AS FLOAT) AS area,
        
        -- Extract building age in years
        CAST(LTRIM(RTRIM(building_age, '年'), '築') AS INTEGER) AS building_age,
        
        -- Remove building size label 
        RTRIM(building_size, '階建') AS building_size,
        
        -- Station-related features
        stations,
        nearest_station,
        distance_to_nearest_station,
        ROUND(avg_distance_to_stations, 2) AS avg_distance_to_stations
    FROM HOUSING_DATA
),

FEATURED_LISTINGS AS (
    SELECT 
        img, title, address, rent, 
        
        -- Replace 0 values with NULLs
        NULLIF(management_fee, 0.0) AS management_fee,
        NULLIF(deposit, 0.0) AS deposit,
        NULLIF(key_money, 0.0) AS key_money,
        floor, floor_plan, area, building_age,
        building_size, nearest_station,
        distance_to_nearest_station, avg_distance_to_stations,
        
        -- Feature engineering: average rents by station, floor plan, and distance to nearest station
        ROUND(AVG(rent) 
            OVER (PARTITION BY nearest_station), 2) 
            AS avg_rent_by_station, 
        ROUND(AVG(rent)
            OVER (PARTITION BY floor_plan), 2) 
            AS avg_rent_by_floor_plan,
        
        -- Price rank relative to other listings near the same station
        DENSE_RANK() 
            OVER (PARTITION BY nearest_station ORDER BY rent DESC)
            AS price_rank_by_station
    FROM STANDARDIZED_LISTINGS
)

-- Final output 
SELECT * FROM FEATURED_LISTINGS

### Load SQL View Into DataFrame
>- Use `%sql` to query `TOKYO_HOUSING` and convert results to a Dataframe for further analysis.
>- Once data is in pandas, we close the database connection. 

In [None]:
# Query the engineered SQL view into a pandas DataFrame for analysis
tokyo_housing = %sql SELECT * FROM TOKYO_HOUSING 
tokyo_housing_df = tokyo_housing.DataFrame()

# Close the DB connection 
conn.close()

## Data Cleaning & Overview
>- Display the DataFrame structure, column data types, and non-null counts.
>- Generate summary statistics for both numeric and categorical columns.
>- Drop duplicate rows.
>- Normalize `floor` and `building_size` columns.
>- Save cleaned DataFrame to CSV file

In [14]:
# Drop duplicate listings 
tokyo_housing_df.drop_duplicates(inplace = True)

In [15]:
# Display column names, non-null counts, and dtypes
tokyo_housing_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1207 entries, 0 to 1210
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   img                          1202 non-null   object 
 1   title                        1207 non-null   object 
 2   address                      1207 non-null   object 
 3   rent                         1207 non-null   float64
 4   management_fee               1034 non-null   float64
 5   deposit                      825 non-null    float64
 6   key_money                    788 non-null    float64
 7   floor                        1207 non-null   object 
 8   floor_plan                   1207 non-null   object 
 9   area                         1207 non-null   float64
 10  building_age                 1207 non-null   int64  
 11  building_size                1207 non-null   object 
 12  nearest_station              1207 non-null   object 
 13  distance_to_nearest_sta

In [16]:
# Show summary statistics for all columns (numeric + categorical)
tokyo_housing_df.describe(include = 'all')

Unnamed: 0,img,title,address,rent,management_fee,deposit,key_money,floor,floor_plan,area,building_age,building_size,nearest_station,distance_to_nearest_station,avg_distance_to_stations,avg_rent_by_station,avg_rent_by_floor_plan,price_rank_by_station
count,1202,1207,1207,1207.0,1034.0,825.0,788.0,1207.0,1207,1207.0,1207.0,1207.0,1207,1207.0,1207.0,1207.0,1207.0,1207.0
unique,1202,1156,45,,,,,20.0,15,,,36.0,11,,,,,
top,https://img01.suumo.com/front/gazo/fr/bukken/4...,ＪＲ山手線 高田馬場駅 4階建 築3年,東京都新宿区高田馬場３,,,,,2.0,1K,,,2.0,中井駅,,,,,
freq,1,4,100,,,,,386.0,517,,,326.0,235,,,,,
mean,,,,101696.520298,6964.119923,106328.121212,115307.233503,,,26.720017,22.607291,,,5.35377,9.407026,101620.114391,101677.018028,43.293289
std,,,,42696.128135,4196.799459,59970.635488,64419.889365,,,13.045711,15.697647,,,2.520552,2.298308,7978.506092,34782.337937,24.968109
min,,,,30000.0,200.0,30000.0,30000.0,,,0.0,0.0,,,1.0,2.0,85730.77,63000.0,1.0
25%,,,,69000.0,3000.0,69000.0,74000.0,,,19.03,9.0,,,3.0,7.67,97854.65,74262.69,22.0
50%,,,,89000.0,6000.0,87000.0,94000.0,,,24.61,21.0,,,5.0,9.67,104994.06,87425.19,43.0
75%,,,,125000.0,10000.0,125000.0,138000.0,,,31.05,35.0,,,7.0,11.0,106420.47,108070.42,63.0


In [17]:
# Inspect raw floor distribution (pre-cleaning)
tokyo_housing_df['floor'].value_counts().to_frame()

Unnamed: 0_level_0,count
floor,Unnamed: 1_level_1
2,386
1,356
3,167
4,112
5,40
7,37
6,31
8,21
9,16
B1,8


In [18]:
# Normalize floor values:
#    - Split floor ranges (e.g., "1-3" → ["1", "3"])
#    - Expand to one floor per row
#    - Coerce non-numeric values to NaN and drop them
tokyo_housing_df = (
    tokyo_housing_df
        .assign(
            floor = lambda df: df['floor'].str.split('-')
        )
        .explode('floor')
        .assign(
            floor = lambda df: pd.to_numeric(df['floor'], errors = 'coerce')
        )
        .dropna(subset = ['floor'])
)
# Convert floor to integer type
tokyo_housing_df['floor'] = tokyo_housing_df['floor'].astype('int64')

In [19]:
# Inspect raw building size distribution (pre-parsing)
tokyo_housing_df['building_size'].value_counts().to_frame()

Unnamed: 0_level_0,count
building_size,Unnamed: 1_level_1
2,326
3,212
4,179
5,106
6,53
10,49
9,47
7,46
8,38
地下1地上3,25


In [20]:
# Parse total building size by extracting numeric components
#    from semi-structured strings (e.g., "地下1地上3" → 1 + 3 = 4)
tokyo_housing_df['building_size'] = (
    tokyo_housing_df['building_size']
        .map(
            lambda x: sum(map(int, re.findall(r'\d+', x)))
        )
)

In [None]:
# Save cleaned DataFrame to CSV file
tokyo_housing_df.to_csv('tokyo_housing.csv', index = False)