# data/processed/0001_kodate/0010_unit_overlap preview

`data/processed/0001_kodate/0010_unit_overlap/{train,test}.parquet` のスキーマと先頭データを確認する。


## 共通設定


In [1]:
from pathlib import Path

import pandas as pd

# Raise the pandas display limits so the wide, transposed previews in this
# notebook are rendered in full instead of being elided.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 200)


def find_project_root(start: Path) -> Path:
    """Return the nearest directory containing both ``data`` and ``notebooks``.

    The search begins at ``start`` itself and then walks up through each of
    its parents, returning the first directory that has both sub-directories.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first directory (``start`` or one of its ancestors) that contains
        a ``data`` directory and a ``notebooks`` directory.

    Raises:
        FileNotFoundError: If no directory on the path to the filesystem root
            contains both sub-directories.
    """
    required_dirs = ("data", "notebooks")
    candidates = (start, *start.parents)
    for candidate in candidates:
        if all((candidate / name).is_dir() for name in required_dirs):
            return candidate
    raise FileNotFoundError("プロジェクトルートを特定できません")


# Resolve the repository root from the current working directory, then derive
# the locations of the processed train/test parquet files beneath it.
PROJECT_ROOT = find_project_root(Path.cwd().resolve())
DATASET_DIR = PROJECT_ROOT.joinpath(
    "data", "processed", "0001_kodate", "0010_unit_overlap"
)
TRAIN_PATH, TEST_PATH = (
    DATASET_DIR / "train.parquet",
    DATASET_DIR / "test.parquet",
)

# Trailing expression so the notebook echoes both resolved paths.
TRAIN_PATH, TEST_PATH


(PosixPath('/Users/takamiya/work/0000_repos/signate_comp_2nd/data/processed/0001_kodate/0010_unit_overlap/train.parquet'),
 PosixPath('/Users/takamiya/work/0000_repos/signate_comp_2nd/data/processed/0001_kodate/0010_unit_overlap/test.parquet'))

In [2]:
# Load both splits of the processed dataset into memory.
train_df, test_df = pd.read_parquet(TRAIN_PATH), pd.read_parquet(TEST_PATH)

# Report the (rows, columns) shape of each split, one per line.
print(
    f"train shape: {train_df.shape}",
    f"test shape : {test_df.shape}",
    sep="\n",
)


train shape: (165310, 76)
test shape : (52892, 75)


## カラム概要


In [3]:
def summarize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-column overview of ``df``.

    Args:
        df: Frame whose columns are to be summarized.

    Returns:
        A frame indexed by the column names of ``df`` (sorted by index) with
        these columns:

        * ``dtype``: the column dtype rendered as a string.
        * ``non_null``: number of non-missing values.
        * ``null_ratio``: fraction of missing values.
        * ``unique``: result of ``DataFrame.nunique`` with its defaults.
    """
    missing_mask = df.isna()

    dtype_names = df.dtypes.astype(str)
    present_counts = (~missing_mask).sum()
    missing_share = missing_mask.mean()

    overview = pd.DataFrame(
        {
            "dtype": dtype_names,
            "non_null": present_counts,
            "null_ratio": missing_share,
        }
    )
    overview["unique"] = df.nunique()
    return overview.sort_index()

# Summarize every column of the training frame, then display the first 20
# summary rows transposed so each summarized column becomes a display column.
# summarize_columns sorts by column name, so these are the first 20 names in
# sorted order, not the first 20 columns of the frame.
column_summary = summarize_columns(train_df)
column_summary.head(20).T


Unnamed: 0,addr1_1,addr_all,balcony_area,building_land_area,building_land_chimoku,building_structure,building_tag_210401,building_tag_294201,building_tag_334001,building_tag_334101,building_tag_334201,building_tag_340301,convenience_distance,data_id,dwelling_unit_window_angle,eki_name1,eki_name2,flg_investment,floor_count,floor_plan_code
dtype,int64,string,float64,float64,float64,float64,uint8,uint8,uint8,uint8,uint8,uint8,float64,int64,float64,string,string,float64,float64,float64
non_null,165310,165310,12694,153256,148627,153323,165310,165310,165310,165310,165310,165310,81337,165310,82272,158645,76868,76397,164026,153150
null_ratio,0.0,0.0,0.923211,0.072918,0.100919,0.072512,0.0,0.0,0.0,0.0,0.0,0.0,0.507973,0.0,0.502317,0.040318,0.535007,0.537856,0.007767,0.073559
unique,47,662,429,31619,9,12,2,2,2,2,2,2,2772,165310,9,4810,4018,2,19,78


## 先頭データ (train)


In [4]:
# Preview the first 20 training rows, transposed so that each record is a
# display column and each feature is a display row.
train_df.head(20).T


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,5,6,7,8,9,10,11,13,14,15,18,19,20,21,22,23
money_room,13980000,24480000,24480000,16300000,9000000,9900000,5400000,14500000,11680000,14800000,22800000,16000000,16800000,16500000,32000000,16000000,18800000,18000000,12980000,17800000
building_structure,1.0,10.0,1.0,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0
total_floor_area,106.82,,,106.809998,78.739998,74.519997,,74.519997,70.160004,,78.0,144.899994,112.620003,105.160004,112.629997,,,138.919998,94.400002,139.110001
floor_count,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0
year_built,199204.0,198108.0,199506.0,200203.0,196605.0,199010.0,,201605.0,198603.0,,201006.0,200102.0,200706.0,198812.0,200812.0,201201.0,,197604.0,198907.0,200302.0
years_old,26.75154,37.418207,23.586585,16.837782,52.670773,28.251882,,2.669405,32.837782,,8.5859,17.913758,11.586585,30.083504,10.083504,7.000684,,42.75154,29.50308,15.915127
building_land_area,188.490005,290.519989,235.649994,169.729996,105.779999,97.190002,,99.370003,112.510002,,165.0,170.309998,141.259995,199.679993,176.600006,124.389999,,253.119995,186.059998,509.089996
land_area_all,188.490005,,,,105.779999,,,,,,,,141.259995,,,,,,186.059998,509.089996
building_land_chimoku,1.0,,,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0


## 先頭データ (test)


In [5]:
# Preview the first 20 test rows, transposed so that each record is a
# display column and each feature is a display row.
test_df.head(20).T


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,19,20,21,22,23
building_structure,1.0,10.0,1.0,1.0,1.0,1.0,,10.0,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,9.0
total_floor_area,171.820007,92.129997,,105.980003,146.559998,,,,100.190002,100.589996,138.919998,,,,,171.5,116.75,80.519997,,116.860001
floor_count,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0
year_built,199206.0,197511.0,201603.0,199411.0,200310.0,195701.0,,200405.0,199509.0,198904.0,197604.0,,198811.0,202307.0,,201212.0,199107.0,198101.0,199908.0,197410.0
years_old,30.584531,47.167693,6.836413,28.167009,19.252567,65.998631,,18.669405,27.334702,33.752225,46.75154,,34.16564,-0.495551,,10.083504,31.504449,41.998631,23.419576,48.251882
building_land_area,197.529999,223.309998,100.589996,150.990005,344.440002,636.940002,,177.729996,140.0,110.290001,253.119995,,203.279999,129.740005,,205.130005,206.529999,276.940002,174.470001,202.830002
land_area_all,197.529999,223.309998,,150.990005,344.440002,,,,,,,,,,,,,276.940002,,202.830002
building_land_chimoku,1.0,1.0,5.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
land_youto,1.0,1.0,3.0,1.0,1.0,99.0,,11.0,11.0,11.0,99.0,,99.0,12.0,,99.0,99.0,14.0,12.0,6.0
