In [36]:
import pandas as pd
import wandb
import h2o
from h2o.automl import H2OAutoML
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import re

In [37]:
config = {
    "train_test_valid_ratio": [0.8],
    "max_models": 10
}

run = wandb.init(project="h2o tokyo immo", config=config)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112262411107724, max=1.0…

In [38]:
# Add the table to an Artifact to increase the row
# limit to 200000 and make it easier to reuse
data_artifact = wandb.Artifact("data", type="dataset")
# log the raw csv file within an artifact to preserve our data


In [39]:
df_land = pd.read_csv("tokyo_land_only.csv")
df_land.head()

station_average = df_land.groupby(["取引時期", "最寄駅：名称"])["取引価格（㎡単価）"].mean().reset_index()
district_average = df_land.groupby(["取引時期", "地区名"])["取引価格（㎡単価）"].mean().reset_index()
ward_average = df_land.groupby(["取引時期", "市区町村名"])["取引価格（㎡単価）"].mean().reset_index()

station_average.to_csv("station_average.csv", index=False)
district_average.to_csv("district_average.csv", index=False)
ward_average.to_csv("ward_average.csv", index=False)

station_average = pd.read_csv("station_average.csv")
district_average = pd.read_csv("district_average.csv")
ward_average = pd.read_csv("ward_average.csv")

data_artifact.add_file("station_average.csv")
data_artifact.add_file("district_average.csv")
data_artifact.add_file("ward_average.csv")

ArtifactManifestEntry(path='ward_average.csv', digest='fMznWiYdBWrZRekCcYvcoA==', size=33561, local_path='/home/dgg32/.local/share/wandb/artifacts/staging/tmpmutbz8nw', skip_cache=False)

In [40]:
period_station_average_dict = {}
station_average_dict = {}

for i, row in station_average.iterrows():
    
    time = row["取引時期"]
    if time not in period_station_average_dict:
        period_station_average_dict[time] = {}
    place = row["最寄駅：名称"]
    period_station_average_dict[time][place] = row["取引価格（㎡単価）"]

    if place not in station_average_dict:
        station_average_dict[place] = []
    station_average_dict[place].append(row["取引価格（㎡単価）"])

period_district_average_dict = {}
district_average_dict = {}

for i, row in district_average.iterrows():

    time = row["取引時期"]
    if time not in period_district_average_dict:
        period_district_average_dict[time] = {}
    place = row["地区名"]
    
    period_district_average_dict[time][place] = row["取引価格（㎡単価）"]

    if place not in district_average_dict:
        district_average_dict[place] = []
    district_average_dict[place].append(row["取引価格（㎡単価）"])


period_ward_average_dict = {}
ward_average_dict = {}
for i, row in ward_average.iterrows():
    
    time = row["取引時期"]
    if time not in period_ward_average_dict:
        period_ward_average_dict[time] = {}
    place = row["市区町村名"]
    period_ward_average_dict[time][place] = row["取引価格（㎡単価）"]

    if place not in ward_average_dict:
        ward_average_dict[place] = []
    ward_average_dict[place].append(row["取引価格（㎡単価）"])

In [41]:
class filter_format_data(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Perform arbitary transformation
        
        filter_df = X[X["建築年"].str.contains("年")==True]
        filter_df.loc[:, "建築年"] = filter_df["建築年"].str.replace("年", "")
        filter_df.loc[:, "建築年"] = filter_df["建築年"].astype(int)
        filter_df = filter_df[filter_df["建築年"] >= 2020]

        filter_df = filter_df[filter_df['延床面積（㎡）'].notna()]
        filter_df.loc[:, "延床面積（㎡）"] = filter_df["延床面積（㎡）"].astype(str).str.replace("㎡以上", "")
        filter_df.loc[:, "延床面積（㎡）"] = filter_df["延床面積（㎡）"].str.replace(",", "")
        filter_df.loc[:, "延床面積（㎡）"] = filter_df["延床面積（㎡）"].astype(int)

        walk_time_from_station_mapping = {"30分～60分": 45, "1H30～2H": 105, "1H～1H30": 75, "2H～": 120}
        filter_df.loc[:, "最寄駅：距離（分）"] = filter_df["最寄駅：距離（分）"].replace(walk_time_from_station_mapping)

        #filter_df = filter_df[filter_df["用途"].str.contains("住宅", na=False)]
        #filter_df = filter_df[filter_df["今後の利用目的"].str.contains("住宅", na=False)]
        if "用途" in filter_df.columns and "今後の利用目的" in filter_df.columns:
            filter_df = filter_df[(filter_df["用途"] == "住宅") & (filter_df["今後の利用目的"] == "住宅")]

        construction_mapping = {"軽量鉄骨造": "not_wood", "ＲＣ、木造": "not_wood", "ＲＣ": "not_wood", "ＳＲＣ": "not_wood", "木造": "wood", "ブロック造": "not_wood", "鉄骨造": "not_wood"}
        filter_df.loc[:, "建物の構造"] = filter_df["建物の構造"].replace(construction_mapping)


        filter_df["建物の構造"] = filter_df["建物の構造"].fillna(value="wood")
        filter_df["wood"] = filter_df["建物の構造"].apply(lambda x: True if x == "wood" else False)
        #print (filter_df["wood"].value_counts())

        filter_df["south"] = filter_df["前面道路：方位"].str.contains('南')
        filter_df["west"] = filter_df["前面道路：方位"].str.contains('西')
        filter_df["east"] = filter_df["前面道路：方位"].str.contains('東')
        filter_df["north"] = filter_df["前面道路：方位"].str.contains('北')

        return filter_df

class calculate_land_price(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, filter_df, y=None):
        # Perform arbitary transformation
        for io, row in filter_df.iterrows():
            if row["取引時期"] in period_station_average_dict and row["最寄駅：名称"] in period_station_average_dict[row["取引時期"]]:
                filter_df.at[io, "land_price_estimated"] = round(period_station_average_dict[row["取引時期"]][row["最寄駅：名称"]], 0) * int(row["面積（㎡）"])
            elif row["取引時期"] in period_district_average_dict and row["地区名"] in period_district_average_dict[row["取引時期"]]:
                filter_df.at[io, "land_price_estimated"] = round(period_district_average_dict[row["取引時期"]][row["地区名"]], 0) * int(row["面積（㎡）"])
            elif row["取引時期"] in period_ward_average_dict and row["市区町村名"] in period_ward_average_dict[row["取引時期"]]:
                filter_df.at[io, "land_price_estimated"] = round(period_ward_average_dict[row["取引時期"]][row["市区町村名"]], 0) * int(row["面積（㎡）"])
            elif row["最寄駅：名称"] in station_average_dict:
                filter_df.at[io, "land_price_estimated"] = round(np.mean(station_average_dict[row["最寄駅：名称"]]), 0) * int(row["面積（㎡）"])
            elif row["地区名"] in district_average_dict:
                filter_df.at[io, "land_price_estimated"] = round(np.mean(district_average_dict[row["地区名"]]), 0) * int(row["面積（㎡）"])
            elif row["市区町村名"] in ward_average_dict:
                filter_df.at[io, "land_price_estimated"] = round(np.mean(ward_average_dict[row["市区町村名"]]), 0) * int(row["面積（㎡）"])
            else:
                print (row)
        
        return filter_df


class select_rename_columns(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self

    def transform(self, filter_df, y=None):
        #print (filter_df.columns)
        X_columns = ["市区町村名", "最寄駅：距離（分）", "延床面積（㎡）", "建築年", "前面道路：幅員（ｍ）", "land_price_estimated", "wood", "south", "west", "east", "north"]
        y_column = "取引価格（総額）"
        desired_columns = X_columns + [y_column]
        filter_df.dropna(subset=desired_columns, inplace=True)

        filter_df = filter_df[desired_columns]

        filter_df.rename(columns = {"市区町村名": "district", "最寄駅：距離（分）": "walk_min_from_nearest_station", "延床面積（㎡）": "construction_area", "建築年": "construction_year", "前面道路：幅員（ｍ）": "street_width", "取引価格（総額）": "price"}, inplace=True)

        scale_factor = 1000000
        filter_df["price"] = filter_df["price"]/scale_factor
        filter_df["land_price_estimated"] = filter_df["land_price_estimated"]/scale_factor
        return filter_df

class def_remove_district(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, filter_df, y=None):
        #print (filter_df.columns)
        filter_df.drop(['district'], axis=1, inplace=True)
        
        return filter_df

groupby_pipe = Pipeline(
    steps=[
        ("filter_format_data", filter_format_data()),
        ("calculate_land_price", calculate_land_price()),
        ("select_rename_columns", select_rename_columns())
        ]
)

remove_district_pipe = Pipeline(
    steps=[
        ("def_remove_district", def_remove_district())
    ]
)

whole_pipe = Pipeline(
    steps=[
        ("filter_format_data", filter_format_data()),
        ("calculate_land_price", calculate_land_price()),
        ("select_rename_columns", select_rename_columns()),
        ("def_remove_district", def_remove_district())
        ]
)

In [42]:
df = pd.read_csv("tokyo_reinfolib.csv")
filter_df = groupby_pipe.fit_transform(df)

#filter_df.to_csv("tokyo_reinfolib_filtered_temp.csv")
print (filter_df.head(n = 10))
print (len(filter_df))



     district walk_min_from_nearest_station construction_area  \
24       千代田区                             3                80   
502        港区                             3                90   
633        港区                            12                70   
634        港区                            12                70   
689        港区                             4                75   
738        港区                            12               130   
1079      新宿区                             4               105   
1086      新宿区                             4                80   
1098      新宿区                             5                70   
1099      新宿区                             5                65   

     construction_year  street_width  land_price_estimated  wood  south  \
24                2021           4.0             130.50000  True  False   
502               2022           4.0             176.65716  True  False   
633               2021           2.7             209.44445 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df.rename(columns = {"市区町村名": "district", "最寄駅：距離（分）": "walk_min_from_nearest_station", "延床面積（㎡）": "construction_area", "建築年": "construction_year", "前面道路：幅員（ｍ）": "street_width", "取引価格（総額）": "price"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df["price"] = filter_df["price"]/scale_factor
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [43]:
filter_df["district"].value_counts().rename_axis('Name').reset_index(name='Values').to_csv("district_count.csv", index=False, header = False)

In [44]:
filter_df = remove_district_pipe.fit_transform(filter_df)


In [45]:
filter_df["price"].describe()

count    12578.000000
mean        57.826045
std         28.678136
min         10.000000
25%         43.000000
50%         53.000000
75%         66.000000
max       1400.000000
Name: price, dtype: float64

In [46]:
filter_df["land_price_estimated"].describe()

count    12578.000000
mean        37.348424
std         23.547836
min          0.187500
25%         22.410156
50%         32.462865
75%         46.000000
max        475.000000
Name: land_price_estimated, dtype: float64

In [47]:
filter_df.to_csv("tokyo_reinfolib_filtered.csv", index=False)
filter_df = pd.read_csv("tokyo_reinfolib_filtered.csv")
filter_df.dtypes

walk_min_from_nearest_station      int64
construction_area                  int64
construction_year                  int64
street_width                     float64
land_price_estimated             float64
wood                                bool
south                               bool
west                                bool
east                                bool
north                               bool
price                            float64
dtype: object

In [48]:
filter_df["construction_area"].describe()

count    12578.000000
mean        93.426618
std         28.484016
min         25.000000
25%         85.000000
50%         95.000000
75%        100.000000
max       2000.000000
Name: construction_area, dtype: float64

In [49]:
(filter_df["price"] - filter_df["land_price_estimated"]).describe()

count    12578.000000
mean        20.477622
std         21.513321
min       -355.000000
25%         12.250000
50%         20.239530
75%         28.364966
max       1038.517520
dtype: float64

In [50]:
filter_df[filter_df["price"] - filter_df["land_price_estimated"] < 0]

Unnamed: 0,walk_min_from_nearest_station,construction_area,construction_year,street_width,land_price_estimated,wood,south,west,east,north,price
0,3,80,2021,4.0,130.50000,True,False,True,False,True,100.0
1,3,90,2022,4.0,176.65716,True,False,False,True,True,170.0
2,12,70,2021,2.7,209.44445,True,True,True,False,False,100.0
3,12,70,2021,2.7,209.44445,True,True,True,False,False,100.0
5,12,130,2022,3.0,250.70000,True,False,True,False,False,200.0
...,...,...,...,...,...,...,...,...,...,...,...
12336,18,75,2023,4.0,26.13336,True,False,True,False,False,22.0
12369,6,95,2022,4.5,75.93750,True,False,False,True,False,60.0
12409,6,90,2022,4.0,54.67500,True,False,False,False,True,54.0
12426,9,90,2021,4.0,51.75000,True,False,False,False,True,49.0


In [51]:
h2o.init()
h2o_filter_df = h2o.H2OFrame(filter_df)

train,test = h2o_filter_df.split_frame(ratios = wandb.config.get("train_test_valid_ratio"))

# log the raw csv file within an artifact to preserve our data
data_artifact.add_file("tokyo_reinfolib_filtered.csv")
run.log_artifact(data_artifact)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,25 mins 23 secs
H2O_cluster_timezone:,Asia/Tokyo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_dgg32_r9v0zo
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.550 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


<Artifact data>

In [52]:
aml = H2OAutoML(max_models=wandb.config.get("max_models"))
aml.train(x=list(filter_df.columns).remove("price"), y="price", training_frame=train)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,44.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,9.224113,0.3925852,9.351232,8.92726,8.930174,9.854896,9.057006
mean_residual_deviance,317.60916,273.52924,189.23332,180.80173,170.59578,804.34106,243.07394
mse,317.60916,273.52924,189.23332,180.80173,170.59578,804.34106,243.07394
r2,0.6469632,0.0529552,0.6579039,0.713545,0.6534705,0.5656334,0.6442631
residual_deviance,317.60916,273.52924,189.23332,180.80173,170.59578,804.34106,243.07394
rmse,16.843086,6.511491,13.75621,13.446254,13.061232,28.360907,15.590829
rmsle,0.1993135,0.0036158,0.2028965,0.1986361,0.1987262,0.2023952,0.1939138

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2024-08-04 22:28:06,1.101 sec,0.0,64.2682103,57.3394391,4130.4028587
,2024-08-04 22:28:06,1.165 sec,5.0,19.314958,11.9497509,373.0676021
,2024-08-04 22:28:06,1.201 sec,10.0,14.0188791,8.8059266,196.5289714
,2024-08-04 22:28:06,1.228 sec,15.0,13.0740937,8.6674049,170.9319258
,2024-08-04 22:28:06,1.258 sec,20.0,12.757809,8.5930045,162.7616896
,2024-08-04 22:28:06,1.286 sec,25.0,12.3474798,8.4901512,152.4602582
,2024-08-04 22:28:06,1.315 sec,30.0,12.1724006,8.4204985,148.1673352
,2024-08-04 22:28:06,1.344 sec,35.0,11.9388577,8.3771494,142.5363227
,2024-08-04 22:28:06,1.383 sec,40.0,11.7553274,8.3259485,138.1877213
,2024-08-04 22:28:06,1.407 sec,44.0,11.5917614,8.2707105,134.3689315

variable,relative_importance,scaled_importance,percentage
land_price_estimated,4865457.5,1.0,0.435309
construction_area,3636762.75,0.7474657,0.3253786
walk_min_from_nearest_station,1739908.25,0.3576042,0.1556683
wood.False,315748.78125,0.064896,0.0282498
street_width,221381.515625,0.0455007,0.0198068
north.True,129256.0,0.0265661,0.0115644
construction_year,106845.7265625,0.0219601,0.0095594
south.True,42002.03125,0.0086327,0.0037579
west.False,27606.0488281,0.0056739,0.0024699
north.False,26215.9257812,0.0053882,0.0023455


In [53]:
lb = aml.leaderboard

lb = h2o.as_list(lb, use_pandas=True)
#print(lb)


run.log({"leaderboard": wandb.Table(dataframe = lb)})




In [54]:
aml.leader

Unnamed: 0,number_of_trees
,44.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,9.224113,0.3925852,9.351232,8.92726,8.930174,9.854896,9.057006
mean_residual_deviance,317.60916,273.52924,189.23332,180.80173,170.59578,804.34106,243.07394
mse,317.60916,273.52924,189.23332,180.80173,170.59578,804.34106,243.07394
r2,0.6469632,0.0529552,0.6579039,0.713545,0.6534705,0.5656334,0.6442631
residual_deviance,317.60916,273.52924,189.23332,180.80173,170.59578,804.34106,243.07394
rmse,16.843086,6.511491,13.75621,13.446254,13.061232,28.360907,15.590829
rmsle,0.1993135,0.0036158,0.2028965,0.1986361,0.1987262,0.2023952,0.1939138

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2024-08-04 22:28:06,1.101 sec,0.0,64.2682103,57.3394391,4130.4028587
,2024-08-04 22:28:06,1.165 sec,5.0,19.314958,11.9497509,373.0676021
,2024-08-04 22:28:06,1.201 sec,10.0,14.0188791,8.8059266,196.5289714
,2024-08-04 22:28:06,1.228 sec,15.0,13.0740937,8.6674049,170.9319258
,2024-08-04 22:28:06,1.258 sec,20.0,12.757809,8.5930045,162.7616896
,2024-08-04 22:28:06,1.286 sec,25.0,12.3474798,8.4901512,152.4602582
,2024-08-04 22:28:06,1.315 sec,30.0,12.1724006,8.4204985,148.1673352
,2024-08-04 22:28:06,1.344 sec,35.0,11.9388577,8.3771494,142.5363227
,2024-08-04 22:28:06,1.383 sec,40.0,11.7553274,8.3259485,138.1877213
,2024-08-04 22:28:06,1.407 sec,44.0,11.5917614,8.2707105,134.3689315

variable,relative_importance,scaled_importance,percentage
land_price_estimated,4865457.5,1.0,0.435309
construction_area,3636762.75,0.7474657,0.3253786
walk_min_from_nearest_station,1739908.25,0.3576042,0.1556683
wood.False,315748.78125,0.064896,0.0282498
street_width,221381.515625,0.0455007,0.0198068
north.True,129256.0,0.0265661,0.0115644
construction_year,106845.7265625,0.0219601,0.0095594
south.True,42002.03125,0.0086327,0.0037579
west.False,27606.0488281,0.0056739,0.0024699
north.False,26215.9257812,0.0053882,0.0023455


In [55]:
model_path = h2o.save_model(model=aml.leader, path="model/winning_model")
model_path

'/home/dgg32/Documents/tokyo_housing_suumo/model/winning_model/XGBoost_3_AutoML_5_20240804_222747'

In [56]:
run.log_model(path=model_path, name="winning_model")

In [57]:
varimp = aml.varimp(use_pandas=True)
varimp

Unnamed: 0,GLM_1,GBM_1,XRT_1,GBM_2,GBM_4,GBM_3,XGBoost_3,DRF_1,XGBoost_2,XGBoost_1
wood,0.330867,0.037915,0.021868,0.017284,0.02553,0.018458,0.029197,0.027731,0.030541,0.017442
street_width,0.006043,0.011029,0.042638,0.016734,0.027923,0.031504,0.019807,0.041957,0.041111,0.039995
construction_year,0.03629,0.009767,0.028599,0.009023,0.013579,0.013043,0.009559,0.029033,0.029647,0.017715
north,0.011938,0.002103,0.003996,0.001401,0.002817,0.001969,0.01391,0.011427,0.013217,0.006177
east,0.008082,0.002168,0.00423,0.001628,0.005254,0.006639,0.00387,0.021552,0.013057,0.009798
west,0.002815,0.00154,0.004897,0.001323,0.004116,0.002117,0.002575,0.011996,0.006165,0.005185
south,0.002339,0.002115,0.006817,0.001401,0.003864,0.002209,0.004726,0.012351,0.005225,0.006015
walk_min_from_nearest_station,0.09867,0.073141,0.093005,0.078088,0.084003,0.076699,0.155668,0.110283,0.100989,0.091888
land_price_estimated,0.30049,0.586853,0.488133,0.5467,0.533206,0.535603,0.435309,0.39896,0.431298,0.418559
construction_area,0.202466,0.273368,0.305817,0.326418,0.299709,0.31176,0.325379,0.334712,0.32875,0.387226


In [58]:
lb

Unnamed: 0,model_id,rmse,mse,mae,rmsle,mean_residual_deviance
0,XGBoost_3_AutoML_5_20240804_222747,17.820858,317.58298,9.224097,0.19934,317.58298
1,GBM_3_AutoML_5_20240804_222747,18.146072,329.279946,9.151828,0.196671,329.279946
2,StackedEnsemble_BestOfFamily_1_AutoML_5_202408...,18.27324,333.911296,9.181486,0.198908,333.911296
3,GBM_4_AutoML_5_20240804_222747,18.32391,335.765678,9.235089,0.198093,335.765678
4,GBM_2_AutoML_5_20240804_222747,18.323983,335.768364,9.252556,0.198755,335.768364
5,XGBoost_2_AutoML_5_20240804_222747,18.495675,342.089988,9.599815,0.206341,342.089988
6,XGBoost_1_AutoML_5_20240804_222747,18.688315,349.253127,9.896311,0.215158,349.253127
7,StackedEnsemble_AllModels_1_AutoML_5_20240804_...,18.712031,350.140096,9.173515,0.198137,350.140096
8,DRF_1_AutoML_5_20240804_222747,18.9324,358.435759,9.163898,0.19867,358.435759
9,GLM_1_AutoML_5_20240804_222747,19.420053,377.138447,10.295489,,377.138447


In [59]:
winning_short_name = re.findall(r'^[A-Za-z]+_[0-9]+', aml.leader.model_id)
winning_short_name

['XGBoost_3']

In [60]:
varimp[winning_short_name[0]]

wood                             0.029197
street_width                     0.019807
construction_year                0.009559
north                            0.013910
east                             0.003870
west                             0.002575
south                            0.004726
walk_min_from_nearest_station    0.155668
land_price_estimated             0.435309
construction_area                0.325379
Name: XGBoost_3, dtype: float64

In [61]:
data = [[label, val] for (label, val) in varimp["XRT_1"].items()]

table = wandb.Table(data=data, columns = ["feature", "importance"])

wandb.log({"my_bar_chart_id" : wandb.plot.bar(table, "feature", "importance",
                               title="Winning model feature importance")})

In [62]:
prediction = aml.predict(test)

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%


In [63]:

predict_df = pd.concat([h2o.as_list(test, use_pandas=True), h2o.as_list(prediction, use_pandas=True)], axis = 1)

predict_df["diff"] = predict_df["predict"] - predict_df["price"]


run.log({"prediction": wandb.Table(dataframe = predict_df) })






In [64]:
diff_df = predict_df["diff"].describe().to_frame()

diff_df.reset_index(inplace = True)

In [65]:
diff_df

Unnamed: 0,index,diff
0,count,2451.0
1,mean,0.048723
2,std,15.601432
3,min,-197.783066
4,25%,-5.907545
5,50%,0.973919
6,75%,7.333496
7,max,296.132874


In [66]:
run.log({"Prediction Diff": wandb.Table(dataframe = diff_df) })
wandb.finish()

VBox(children=(Label(value='0.620 MB of 0.620 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [67]:
new_house_df = pd.read_csv("new_houses.csv")
#print (new_house_df.dtypes)

filter_new_house_df = whole_pipe.fit_transform(new_house_df)
filter_new_house_df.head()

  filter_df.loc[:, "延床面積（㎡）"] = filter_df["延床面積（㎡）"].astype(str).str.replace("㎡以上", "")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df.rename(columns = {"市区町村名": "district", "最寄駅：距離（分）": "walk_min_from_nearest_station", "延床面積（㎡）": "construction_area", "建築年": "construction_year", "前面道路：幅員（ｍ）": "street_width", "取引価格（総額）": "price"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df["price"] = filter_df["price"]/scale_factor
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

Unnamed: 0,walk_min_from_nearest_station,construction_area,construction_year,street_width,land_price_estimated,wood,south,west,east,north,price
0,8,44,2024,4,42.21,True,True,False,True,True,56.8
1,8,50,2024,4,47.88,True,False,False,False,True,57.8


In [68]:
aml.predict(h2o.H2OFrame(filter_new_house_df))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%


predict
59.321
58.7116
