In [1]:
import lightgbm as lgbm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
plt.style.use("ggplot")

## 正解データの準備

In [16]:
dfpollen=pd.read_csv("../input/pollen_2010_2022.csv.gz", index_col="date_time",parse_dates=True)
dfpollen

Unnamed: 0_level_0,pollen,log_pollen
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-02-01 01:00:00,4.0,1.609438
2010-02-01 02:00:00,4.0,1.609438
2010-02-01 03:00:00,0.0,0.000000
2010-02-01 04:00:00,0.0,0.000000
2010-02-01 05:00:00,4.0,1.609438
...,...,...
2020-06-30 20:00:00,,
2020-06-30 21:00:00,,
2020-06-30 22:00:00,,
2020-06-30 23:00:00,,


## 気象データの準備

In [20]:
def make_ame_features(df, suffix):
    df_d=df.resample("D").mean()
    cols=["temp","precip","rh","wind","u_wind","v_wind"]
    df_d=df.resample("D").agg({"temp":"mean","precip":"sum","wind":"mean","u_wind":"mean", "v_wind":"mean"})
    df_d.columns=[f"{col}_d" for col in df_d.columns]
    df_d["temp_max"]=df_d["temp_d"].resample("D").max()
    df_d["temp_min"]=df_d["temp_d"].resample("D").min()
    df_d["is_rainy"]=df_d["precip_d"].apply(lambda x: 1 if x>=5 else 0)
    for iyy in range(2010,2021):
        df_d.loc[str(iyy),"cum_temp"]=df_d.loc[str(iyy),"temp_max"].cumsum()
    df_d_1d_before= df_d.copy()
    df_d_1d_before.index=df_d_1d_before.index+pd.offsets.Day(1)
    df_d_1d_before.columns=[f"{col}_1d_before" for col in df_d_1d_before.columns]
    df_d_merge=pd.merge(df_d, df_d_1d_before, how="left",left_index=True, right_index=True)
    df_d_merge["delta_temp_max"]=df_d_merge["temp_max"]- df_d_merge["temp_max_1d_before"]
    index=df.index
    df["yyyy-mm-dd"]=df.index.strftime("%Y-%m-%d")
    df_d_merge["yyyy-mm-dd"]=df_d_merge.index.strftime("%Y-%m-%d")
    dfout=pd.merge(df, df_d_merge ,how="left", on="yyyy-mm-dd")
    dfout.index=index
    dfout.drop(columns=["yyyy-mm-dd"], inplace=True)
    dfout.columns=[f"{col}_{suffix}" for col in dfout.columns]
    return dfout    

In [21]:
station_names_dict={"恵那":"ena","稲武":"inabu","亀山":"kameyama","岡崎":"okazaki","大垣":"oogaki","新城":"shinshiro","多治見":"tajimi"}
ame_list=[]
for st_jpname in station_names_dict:
    st_name=station_names_dict[st_jpname]
    dfame=pd.read_csv(f"../data/amedas_obs_hourly_{st_name}_2010_2020.csv.gz", parse_dates=True, index_col="date_time")
    ame_list.append(make_ame_features(dfame, st_name))

In [22]:
dfame_all=pd.concat(ame_list, axis=1)
dfame_all

Unnamed: 0_level_0,precip_ena,temp_ena,wind_ena,wind_direction_ena,u_wind_ena,v_wind_ena,lat_ena,lon_ena,temp_d_ena,precip_d_ena,...,temp_d_1d_before_tajimi,precip_d_1d_before_tajimi,wind_d_1d_before_tajimi,u_wind_d_1d_before_tajimi,v_wind_d_1d_before_tajimi,temp_max_1d_before_tajimi,temp_min_1d_before_tajimi,is_rainy_1d_before_tajimi,cum_temp_1d_before_tajimi,delta_temp_max_tajimi
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01 01:00:00,0.0,2.5,3.3,202.5,-1.262855e+00,-3.048802,35.446667,137.403333,1.508696,1.5,...,,,,,,,,,,
2010-01-01 02:00:00,0.0,2.6,2.4,202.5,-9.184402e-01,-2.217311,35.446667,137.403333,1.508696,1.5,...,,,,,,,,,,
2010-01-01 03:00:00,0.0,2.6,2.0,202.5,-7.653669e-01,-1.847759,35.446667,137.403333,1.508696,1.5,...,,,,,,,,,,
2010-01-01 04:00:00,0.0,2.7,2.4,202.5,-9.184402e-01,-2.217311,35.446667,137.403333,1.508696,1.5,...,,,,,,,,,,
2010-01-01 05:00:00,0.0,2.9,1.9,180.0,2.326829e-16,-1.900000,35.446667,137.403333,1.508696,1.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 20:00:00,7.5,22.1,0.8,67.5,7.391036e-01,0.306147,35.446667,137.403333,20.662500,74.0,...,24.866667,0.0,1.358333,-0.412064,-0.029489,24.866667,24.866667,0.0,2339.470612,-1.887500
2020-06-30 21:00:00,3.0,22.5,1.1,247.5,-1.016267e+00,-0.420952,35.446667,137.403333,20.662500,74.0,...,24.866667,0.0,1.358333,-0.412064,-0.029489,24.866667,24.866667,0.0,2339.470612,-1.887500
2020-06-30 22:00:00,1.0,22.3,0.9,45.0,6.363961e-01,0.636396,35.446667,137.403333,20.662500,74.0,...,24.866667,0.0,1.358333,-0.412064,-0.029489,24.866667,24.866667,0.0,2339.470612,-1.887500
2020-06-30 23:00:00,2.0,23.7,4.2,202.5,-1.607270e+00,-3.880294,35.446667,137.403333,20.662500,74.0,...,24.866667,0.0,1.358333,-0.412064,-0.029489,24.866667,24.866667,0.0,2339.470612,-1.887500


In [23]:
dfnagoya=pd.read_csv("../data/situ_obs_hourly_nagoya_2010_2020.csv.gz", parse_dates=True, index_col="date_time")
dfnagoya

Unnamed: 0_level_0,hour,P0,precip,temp,dew_temp,e_vapor,rh,wind,wind_direction,solar_radiation,...,is_rainy,temp_d_1D_before,precip_d_1D_before,wind_d_1D_before,rh_d_1D_before,u_wind_d_1D_before,v_wind_d_1D_before,temp_max_1D_before,temp_min_1D_before,is_rainy_1D_before
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01 01:00:00,1,1009.9,0.5,1.2,3.7,4.6,83.0,4.9,292.5,0.0,...,1,,,,,,,,,
2010-01-01 02:00:00,2,1010.1,0.0,1.0,3.5,4.7,83.0,4.6,292.5,0.0,...,1,,,,,,,,,
2010-01-01 03:00:00,3,1010.3,0.0,0.9,3.2,4.8,84.0,4.5,292.5,0.0,...,1,,,,,,,,,
2010-01-01 04:00:00,4,1009.9,0.5,1.2,3.4,4.8,85.0,5.3,292.5,0.0,...,1,,,,,,,,,
2010-01-01 05:00:00,5,1009.8,0.0,1.1,3.4,4.7,84.0,4.0,292.5,0.0,...,1,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 20:00:00,20,1001.2,1.0,24.6,24.3,30.3,98.0,4.4,157.5,0.0,...,1,25.891667,0.0,3.3500,57.916667,-1.070598,0.929152,30.7,20.9,0.0
2020-06-30 21:00:00,21,1001.6,1.0,24.7,24.4,30.5,98.0,5.0,157.5,0.0,...,1,25.891667,0.0,3.3500,57.916667,-1.070598,0.929152,30.7,20.9,0.0
2020-06-30 22:00:00,22,1001.5,1.5,24.9,24.6,30.9,98.0,5.6,157.5,0.0,...,1,25.891667,0.0,3.3500,57.916667,-1.070598,0.929152,30.7,20.9,0.0
2020-06-30 23:00:00,23,1000.7,3.5,24.9,24.7,31.2,99.0,5.2,157.5,0.0,...,1,25.891667,0.0,3.3500,57.916667,-1.070598,0.929152,30.7,20.9,0.0


In [24]:
dfweather=pd.concat([dfame_all, dfnagoya],axis=1)
dfweather

Unnamed: 0_level_0,precip_ena,temp_ena,wind_ena,wind_direction_ena,u_wind_ena,v_wind_ena,lat_ena,lon_ena,temp_d_ena,precip_d_ena,...,is_rainy,temp_d_1D_before,precip_d_1D_before,wind_d_1D_before,rh_d_1D_before,u_wind_d_1D_before,v_wind_d_1D_before,temp_max_1D_before,temp_min_1D_before,is_rainy_1D_before
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01 01:00:00,0.0,2.5,3.3,202.5,-1.262855e+00,-3.048802,35.446667,137.403333,1.508696,1.5,...,1,,,,,,,,,
2010-01-01 02:00:00,0.0,2.6,2.4,202.5,-9.184402e-01,-2.217311,35.446667,137.403333,1.508696,1.5,...,1,,,,,,,,,
2010-01-01 03:00:00,0.0,2.6,2.0,202.5,-7.653669e-01,-1.847759,35.446667,137.403333,1.508696,1.5,...,1,,,,,,,,,
2010-01-01 04:00:00,0.0,2.7,2.4,202.5,-9.184402e-01,-2.217311,35.446667,137.403333,1.508696,1.5,...,1,,,,,,,,,
2010-01-01 05:00:00,0.0,2.9,1.9,180.0,2.326829e-16,-1.900000,35.446667,137.403333,1.508696,1.5,...,1,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 20:00:00,7.5,22.1,0.8,67.5,7.391036e-01,0.306147,35.446667,137.403333,20.662500,74.0,...,1,25.891667,0.0,3.3500,57.916667,-1.070598,0.929152,30.7,20.9,0.0
2020-06-30 21:00:00,3.0,22.5,1.1,247.5,-1.016267e+00,-0.420952,35.446667,137.403333,20.662500,74.0,...,1,25.891667,0.0,3.3500,57.916667,-1.070598,0.929152,30.7,20.9,0.0
2020-06-30 22:00:00,1.0,22.3,0.9,45.0,6.363961e-01,0.636396,35.446667,137.403333,20.662500,74.0,...,1,25.891667,0.0,3.3500,57.916667,-1.070598,0.929152,30.7,20.9,0.0
2020-06-30 23:00:00,2.0,23.7,4.2,202.5,-1.607270e+00,-3.880294,35.446667,137.403333,20.662500,74.0,...,1,25.891667,0.0,3.3500,57.916667,-1.070598,0.929152,30.7,20.9,0.0


In [175]:
dfdata=pd.merge(dfpollen, dfweather, how="left",left_index=True, right_index=True)#.dropna(subset=["pollen"])
dfdata

Unnamed: 0_level_0,pollen,log_pollen,precip_ena,temp_ena,wind_ena,wind_direction_ena,u_wind_ena,v_wind_ena,lat_ena,lon_ena,...,is_rainy,temp_d_1D_before,precip_d_1D_before,wind_d_1D_before,rh_d_1D_before,u_wind_d_1D_before,v_wind_d_1D_before,temp_max_1D_before,temp_min_1D_before,is_rainy_1D_before
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-01 01:00:00,4.0,1.609438,0.0,0.7,0.7,202.5,-2.678784e-01,-6.467157e-01,35.446667,137.403333,...,1,6.079167,1.5,1.591667,67.666667,-0.448285,1.367330,8.3,4.4,1.0
2010-02-01 02:00:00,4.0,1.609438,0.0,1.5,0.9,90.0,9.000000e-01,5.510911e-17,35.446667,137.403333,...,1,6.079167,1.5,1.591667,67.666667,-0.448285,1.367330,8.3,4.4,1.0
2010-02-01 03:00:00,0.0,0.000000,0.0,1.9,0.5,202.5,-1.913417e-01,-4.619398e-01,35.446667,137.403333,...,1,6.079167,1.5,1.591667,67.666667,-0.448285,1.367330,8.3,4.4,1.0
2010-02-01 04:00:00,0.0,0.000000,0.0,2.6,0.5,225.0,-3.535534e-01,-3.535534e-01,35.446667,137.403333,...,1,6.079167,1.5,1.591667,67.666667,-0.448285,1.367330,8.3,4.4,1.0
2010-02-01 05:00:00,4.0,1.609438,0.0,2.0,0.9,180.0,1.102182e-16,-9.000000e-01,35.446667,137.403333,...,1,6.079167,1.5,1.591667,67.666667,-0.448285,1.367330,8.3,4.4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 20:00:00,,,7.5,22.1,0.8,67.5,7.391036e-01,3.061467e-01,35.446667,137.403333,...,1,25.891667,0.0,3.350000,57.916667,-1.070598,0.929152,30.7,20.9,0.0
2020-06-30 21:00:00,,,3.0,22.5,1.1,247.5,-1.016267e+00,-4.209518e-01,35.446667,137.403333,...,1,25.891667,0.0,3.350000,57.916667,-1.070598,0.929152,30.7,20.9,0.0
2020-06-30 22:00:00,,,1.0,22.3,0.9,45.0,6.363961e-01,6.363961e-01,35.446667,137.403333,...,1,25.891667,0.0,3.350000,57.916667,-1.070598,0.929152,30.7,20.9,0.0
2020-06-30 23:00:00,,,2.0,23.7,4.2,202.5,-1.607270e+00,-3.880294e+00,35.446667,137.403333,...,1,25.891667,0.0,3.350000,57.916667,-1.070598,0.929152,30.7,20.9,0.0


In [176]:
dfdata["pollen_smooth"]=dfdata["pollen"].interpolate()

## 特徴量

In [177]:
for iyy in range(2010,2021):
    dfdata.loc[f"{iyy}","pollen_smooth"]=dfdata.loc[f"{iyy}","pollen_smooth"].rolling(3,center=True).mean()

In [184]:
dfdata.to_csv("../input/input.csv.gz", compression="gzip")

In [138]:
dfdata["log_pollen"]=np.log1p(dfdata["pollen_smooth"])

In [158]:
features=[
 'cum_temp',
 'cum_temp_ena',
 'cum_temp_inabu',
 'cum_temp_kameyama',
 'cum_temp_okazaki',
 'cum_temp_oogaki',
 'cum_temp_shinshiro',
 'cum_temp_tajimi',
 'delta_temp_max_ena',
 'delta_temp_max_inabu',
 'delta_temp_max_kameyama',
 'delta_temp_max_okazaki',
 'delta_temp_max_oogaki',
 'delta_temp_max_shinshiro',
 'delta_temp_max_tajimi',
 'dew_temp',
 'e_vapor',
 'hour',
 'precip',
 'precip_d',
 'precip_d_1D_before',
 'precip_d_1d_before_ena',
 'precip_d_1d_before_inabu',
 'precip_d_1d_before_kameyama',
 'precip_d_1d_before_okazaki',
 'precip_d_1d_before_oogaki',
 'precip_d_1d_before_shinshiro',
 'precip_d_1d_before_tajimi',
 'precip_d_ena',
 'precip_d_inabu',
 'precip_d_kameyama',
 'precip_d_okazaki',
 'precip_d_oogaki',
 'precip_d_shinshiro',
 'precip_d_tajimi',
 'rh',
 'rh_d',
 'rh_d_1D_before',
 'temp',
 'temp_d',
 'temp_d_1D_before',
 'temp_d_1d_before_ena',
 'temp_d_1d_before_inabu',
 'temp_d_1d_before_kameyama',
 'temp_d_1d_before_okazaki',
 'temp_d_1d_before_oogaki',
 'temp_d_1d_before_shinshiro',
 'temp_d_1d_before_tajimi',
 'temp_d_ena',
 'temp_d_inabu',
 'temp_d_kameyama',
 'temp_d_okazaki',
 'temp_d_oogaki',
 'temp_d_shinshiro',
 'temp_d_tajimi',
 'temp_ena',
 'temp_inabu',
 'temp_kameyama',
 'temp_max',
 'temp_max_1D_before',
 'temp_max_1d_before_ena',
 'temp_max_1d_before_inabu',
 'temp_max_1d_before_kameyama',
 'temp_max_1d_before_okazaki',
 'temp_max_1d_before_oogaki',
 'temp_max_1d_before_shinshiro',
 'temp_max_1d_before_tajimi',
 'temp_max_ena',
 'temp_max_inabu',
 'temp_max_kameyama',
 'temp_max_okazaki',
 'temp_max_oogaki',
 'temp_max_shinshiro',
 'temp_max_tajimi',
 'temp_min',
 'temp_min_1D_before',
 'temp_min_1d_before_ena',
 'temp_min_1d_before_inabu',
 'temp_min_1d_before_kameyama',
 'temp_min_1d_before_okazaki',
 'temp_min_1d_before_oogaki',
 'temp_min_1d_before_shinshiro',
 'temp_min_1d_before_tajimi',
 'temp_min_ena',
 'temp_min_inabu',
 'temp_min_kameyama',
 'temp_min_okazaki',
 'temp_min_oogaki',
 'temp_min_shinshiro',
 'temp_min_tajimi',
 'temp_okazaki',
 'temp_oogaki',
 'temp_shinshiro',
 'temp_tajimi',
 'u_wind',
 'u_wind_d',
 'u_wind_d_ena',
 'u_wind_d_inabu',
 'u_wind_d_kameyama',
 'u_wind_d_okazaki',
 'u_wind_d_oogaki',
 'u_wind_d_shinshiro',
 'u_wind_d_tajimi',
 'u_wind_ena',
 'u_wind_inabu',
 'u_wind_kameyama',
 'u_wind_okazaki',
 'u_wind_oogaki',
 'u_wind_shinshiro',
 'u_wind_tajimi',
 'v_wind',
 'v_wind_d',
 'v_wind_d_ena',
 'v_wind_d_inabu',
 'v_wind_d_kameyama',
 'v_wind_d_okazaki',
 'v_wind_d_oogaki',
 'v_wind_d_shinshiro',
 'v_wind_d_tajimi',
 'v_wind_ena',
 'v_wind_inabu',
 'v_wind_kameyama',
 'v_wind_okazaki',
 'v_wind_oogaki',
 'v_wind_shinshiro',
 'v_wind_tajimi',
 'wind',
 'wind_d',
 'wind_d_ena',
 'wind_d_inabu',
 'wind_d_kameyama',
 'wind_d_okazaki',
 'wind_d_oogaki',
 'wind_d_shinshiro',
 'wind_d_tajimi',
 'wind_ena',
 'wind_inabu',
 'wind_kameyama',
 'wind_okazaki',
 'wind_oogaki',
 'wind_shinshiro',
 'wind_tajimi']


In [166]:
dfdata2=dfdata

In [167]:
features2=features
train_begin,train_end="2010-01-01","2017-12-31"
eval_begin, eval_end="2018-01-01","2018-12-31"
pred_begin,pred_end="2019-01-01","2020-12-31"
lgb_train=lgbm.Dataset(dfdata2.loc[train_begin:train_end, features2], dfdata2.loc[train_begin:train_end,"log_pollen"])
lgb_eval=lgbm.Dataset(dfdata2.loc[eval_begin:eval_end, features2],dfdata2.loc[eval_begin:eval_end, "log_pollen"] , reference=lgb_train)

params={
    "objective":"regression",
    "boosting_type":"gbdt",
    "metric":"rmse",
    "colsample_bytree":0.7,
    "num_leaves":12,
    "subsample":0.8,
    "min_child_samples":5,
}
model=lgbm.train(params, train_set=lgb_train, valid_sets=lgb_eval,num_boost_round=500,early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31589
[LightGBM] [Info] Number of data points in the train set: 27408, number of used features: 142
[LightGBM] [Info] Start training from score 2.376781
[1]	valid_0's rmse: 1.41214
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 1.37915
[3]	valid_0's rmse: 1.34373
[4]	valid_0's rmse: 1.3147
[5]	valid_0's rmse: 1.29187
[6]	valid_0's rmse: 1.2688
[7]	valid_0's rmse: 1.24608
[8]	valid_0's rmse: 1.22859
[9]	valid_0's rmse: 1.21267
[10]	valid_0's rmse: 1.19652
[11]	valid_0's rmse: 1.18091
[12]	valid_0's rmse: 1.16862
[13]	valid_0's rmse: 1.15957
[14]	valid_0's rmse: 1.15065
[15]	valid_0's rmse: 1.14421
[16]	valid_0's rmse: 1.14146
[17]	valid_0's rmse: 1.1378
[18]	valid_0's rmse: 1.13406
[19]	valid_0's rmse: 1.1285
[20]	valid_0's rmse: 1.12429
[21]	valid_0's rmse: 1.12152
[22]	valid_0's rmse: 1.11826
[23]	valid_0's rmse: 1.11418
[24]	valid_0's rmse: 1.11277
[25]	valid_0's 

In [168]:
result=dfdata.loc[pred_begin:pred_end].copy()

In [169]:
result["pred"]=model.predict(result[features2])

In [172]:
result["pred_pollen"]=np.expm1(result["pred"])