# data/processed/0001_kodate/0010_unit_overlap preview

`data/processed/0001_kodate/0010_unit_overlap/{train,test}.parquet` のスキーマと先頭データを確認する。


## 共通設定


In [1]:
from pathlib import Path

import pandas as pd

# Raise pandas' display limits (400 rows, 200 columns) for the previews below.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 200)


def find_project_root(start: Path) -> Path:
    """Locate the project root by searching upward from ``start``.

    The root is the first directory, checking ``start`` itself and then each
    of its parents in order, that contains both a ``data`` and a ``notebooks``
    sub-directory.

    Args:
        start: Directory at which the upward search begins.

    Returns:
        The first matching directory.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its parents
            contains both sub-directories.
    """
    required_dirs = ("data", "notebooks")
    candidates = (start, *start.parents)
    root = next(
        (
            candidate
            for candidate in candidates
            if all((candidate / name).is_dir() for name in required_dirs)
        ),
        None,
    )
    if root is None:
        raise FileNotFoundError("プロジェクトルートを特定できません")
    return root


# Resolve the project root from the current working directory, then derive
# the dataset directory and the train/test parquet paths from it.
PROJECT_ROOT = find_project_root(Path.cwd().resolve())
DATASET_DIR = PROJECT_ROOT.joinpath("data", "processed", "0001_kodate", "0010_unit_overlap")
TRAIN_PATH = DATASET_DIR.joinpath("train.parquet")
TEST_PATH = DATASET_DIR.joinpath("test.parquet")

# Echo both paths as the cell output.
(TRAIN_PATH, TEST_PATH)


(PosixPath('/Users/takamiya/work/0000_repos/signate_comp_2nd/data/processed/0001_kodate/0010_unit_overlap/train.parquet'),
 PosixPath('/Users/takamiya/work/0000_repos/signate_comp_2nd/data/processed/0001_kodate/0010_unit_overlap/test.parquet'))

In [2]:
# Load the train split first, then the test split, and report each frame's
# (rows, columns) shape on its own line.
train_df, test_df = pd.read_parquet(TRAIN_PATH), pd.read_parquet(TEST_PATH)

print(
    f"train shape: {train_df.shape}",
    f"test shape : {test_df.shape}",
    sep="\n",
)


train shape: (165310, 76)
test shape : (52892, 75)


## カラム概要


In [3]:
def summarize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-column overview of ``df``.

    Args:
        df: Frame whose columns are summarized.

    Returns:
        A frame with one row per column of ``df``, sorted by column name,
        with these columns:

        - ``dtype``: the column dtype rendered as a string
        - ``non_null``: number of non-missing values
        - ``null_ratio``: fraction of missing values
        - ``unique``: result of ``df.nunique()`` for the column
    """
    # Compute the missing-value mask once and derive both null metrics from it.
    missing = df.isna()
    per_column = {
        "dtype": df.dtypes.astype(str),
        "non_null": (~missing).sum(),
        "null_ratio": missing.mean(),
    }
    overview = pd.DataFrame(per_column).assign(unique=df.nunique())
    return overview.sort_index()

# Summarize every train column (rows come back sorted by column name), then
# display the first 20 summary rows transposed.
column_summary = summarize_columns(train_df)
column_summary.head(n=20).transpose()


Unnamed: 0,addr1_1,addr_all,balcony_area,building_land_area,building_land_chimoku,building_structure,building_tag_210401,building_tag_294201,building_tag_334001,building_tag_334101,building_tag_334201,building_tag_340301,convenience_distance,data_id,dwelling_unit_window_angle,eki_name1,eki_name2,flg_investment,floor_count,floor_plan_code
dtype,int64,string,float64,float64,float64,float64,uint8,uint8,uint8,uint8,uint8,uint8,float64,int64,float64,string,string,float64,float64,float64
non_null,165310,165310,12694,153256,148627,153323,165310,165310,165310,165310,165310,165310,81337,165310,82272,158645,76868,76397,164026,153150
null_ratio,0.0,0.0,0.923211,0.072918,0.100919,0.072512,0.0,0.0,0.0,0.0,0.0,0.0,0.507973,0.0,0.502317,0.040318,0.535007,0.537856,0.007767,0.073559
unique,47,662,429,31619,9,12,2,2,2,2,2,2,2772,165310,9,4810,4018,2,19,78


## 先頭データ (train)


In [4]:
# First 20 train rows, transposed so each dataset column is shown as a row.
train_df.head(n=20).transpose()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,4,12,16,17,34,35,36,37,38,75,76,77,82,91,92,101,109,110,111,112
money_room,18800000,16900000,16700000,14480000,8300000,11800000,8480000,8500000,9800000,14500000,17800000,18800000,18300000,34800000,43800000,18900000,13500000,11900000,12800000,9200000
building_structure,4.0,5.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0
floor_count,6.0,15.0,5.0,5.0,14.0,14.0,14.0,14.0,14.0,7.0,5.0,5.0,14.0,24.0,23.0,10.0,11.0,14.0,14.0,10.0
year_built,200703.0,200203.0,200302.0,200302.0,199506.0,199506.0,199810.0,199810.0,199604.0,200702.0,200609.0,200609.0,200711.0,200401.0,200403.0,199502.0,198910.0,199203.0,199203.0,198810.0
building_land_chimoku,,,,,,,,,,,,,,,,,,1.0,1.0,
land_youto,3.0,12.0,11.0,11.0,2.0,2.0,2.0,2.0,2.0,11.0,11.0,11.0,4.0,11.0,11.0,11.0,12.0,12.0,12.0,12.0
land_toshi,1.0,1.0,1.0,1.0,1.0,1.0,,,,1.0,,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0
land_chisei,,,,,,,,,,,,,,0.0,1.0,,1.0,1.0,1.0,1.0
management_form,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


## 先頭データ (test)


In [5]:
# First 20 test rows, transposed so each dataset column is shown as a row.
test_df.head(n=20).transpose()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,7,17,18,40,51,52,110,111,119,121,122,123,125,127,128,136,137,138,139
building_structure,5.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0
floor_count,14.0,11.0,6.0,7.0,14.0,7.0,7.0,24.0,24.0,7.0,6.0,4.0,15.0,14.0,10.0,8.0,10.0,10.0,10.0,11.0
year_built,199510.0,199901.0,199201.0,199707.0,199810.0,199803.0,199803.0,200401.0,200401.0,199204.0,198911.0,196601.0,200901.0,199103.0,198908.0,199203.0,198901.0,198901.0,198901.0,198910.0
building_land_chimoku,,1.0,,,,,,,,,,,,,,,,,,
land_youto,7.0,4.0,11.0,6.0,2.0,3.0,3.0,11.0,11.0,6.0,11.0,11.0,3.0,11.0,6.0,,12.0,12.0,12.0,12.0
land_toshi,1.0,1.0,1.0,1.0,,,,,,1.0,1.0,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0
land_chisei,,,,,,,,0.0,,,,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0
management_form,3.0,3.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
room_floor,4.0,4.0,2.0,7.0,1.0,6.0,3.0,1.0,11.0,4.0,1.0,3.0,1.0,3.0,6.0,5.0,9.0,4.0,2.0,6.0
