In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [2]:
# Directory (with trailing slash) holding the ground-measure input CSVs.
data_path = 'data/input/'

In [3]:
# Pre-built train / test feature tables (comma-separated, pandas' default).
train_df = pd.read_csv('data/raw/dataset_train.csv')
test_df = pd.read_csv('data/raw/dataset_test.csv')

### Interpolate ground measurements

In [4]:
def _load_input(filename, index_col):
    """Read one CSV from `data_path`, using `index_col` as the row index."""
    return pd.read_csv(data_path + filename, index_col=index_col)


# Ground-measure value tables, indexed by their first (unnamed) CSV column.
org_train = _load_input('ground_measures_train_features.csv', 'Unnamed: 0')
org_test = _load_input('ground_measures_test_features.csv', 'Unnamed: 0')
org_new = _load_input('ground_measures_features.csv', 'Unnamed: 0')
# Station metadata, indexed by station_id.
gm_metadata = _load_input('ground_measures_metadata.csv', 'station_id')

In [5]:
def _wide_to_long(wide):
    """Unstack a wide table into long form with columns date / id / org_value."""
    long = wide.unstack().reset_index()
    long.columns = ['date', 'id', 'org_value']
    return long


# One row per (column label, row label) pair of each wide ground-measure table.
org_train_df = _wide_to_long(org_train)
org_test_df = _wide_to_long(org_test)
org_new_df = _wide_to_long(org_new)

In [6]:
# Stack the long train and test ground measures, then attach station metadata
# by matching `id` to the metadata's `station_id`.
org = (
    pd.concat([org_train_df, org_test_df], axis=0)
    .merge(gm_metadata, how='left', left_on='id', right_on='station_id')
    .reset_index(drop=True)
)

In [7]:
# Parse the ISO-formatted date strings into datetimes.
org = org.assign(dt_date=pd.to_datetime(org['date'], format='%Y-%m-%d'))

In [8]:
# Day-of-year feature derived from the parsed date.
org = org.assign(dayofyear=org['dt_date'].dt.dayofyear)

In [9]:
# Calendar-year feature derived from the parsed date.
org = org.assign(year=org['dt_date'].dt.year)

In [10]:
# Attach station metadata to the long-form `ground_measures_features` table.
org_new_df2 = org_new_df.merge(
    gm_metadata,
    how='left',
    left_on='id',
    right_on='station_id',
)

In [11]:
# Same date-derived features as for `org`: parsed date, day of year, year.
org_new_df2 = org_new_df2.assign(
    dt_date=lambda d: pd.to_datetime(d['date'], format='%Y-%m-%d'),
    dayofyear=lambda d: d['dt_date'].dt.dayofyear,
    year=lambda d: d['dt_date'].dt.year,
)

In [12]:
# Keep only the five columns the interpolation model takes as input.
station_feature_cols = ['latitude', 'longitude', 'elevation_m', 'year', 'dayofyear']
org_new_df2 = org_new_df2[station_feature_cols]

In [13]:
# Take the location/time columns of the train and test tables and rename them
# to the ground-measure naming so all frames share one schema.
grid_to_station_names = {'lat': 'latitude', 'lon': 'longitude', 'alt': 'elevation_m'}
grid_cols = ['lat', 'lon', 'alt', 'year', 'dayofyear']

org_new_df3 = train_df[grid_cols].rename(columns=grid_to_station_names)
org_new_df5 = test_df[grid_cols].rename(columns=grid_to_station_names)

In [14]:
# All points to interpolate at: train rows, ground-measure rows, then test
# rows, with a fresh 0..n-1 index.
org_new_df4 = pd.concat(
    [org_new_df3, org_new_df2, org_new_df5],
    ignore_index=True,
)

In [15]:
# Training data for the interpolation model: location/time features -> value.
X_org = org[['latitude', 'longitude', 'elevation_m', 'year', 'dayofyear']]
# Missing ground measurements are treated as 0. A non-inplace fillna is used so
# that `org` itself is left untouched and no view/copy semantics are relied on.
y_org = org['org_value'].fillna(0)
# Points at which the fitted model will be evaluated.
Z_org = org_new_df4

In [16]:
# Random forest used to interpolate ground measurements; seeded, all cores.
rf = RandomForestRegressor(
    n_estimators=250,
    random_state=0,
    n_jobs=-1,
)

In [17]:
%%time
rf.fit(X=X_org, y=y_org)

Wall time: 4.84 s


RandomForestRegressor(n_estimators=250, n_jobs=-1, random_state=0)

In [18]:
# Interpolated value for every row of Z_org, as a single-column frame.
rf_int = pd.DataFrame({'rf_org_value_v2': rf.predict(Z_org)})

In [19]:
# Put the predictions next to their inputs and switch back to the short
# lat / lon / alt column names used by train_df and test_df.
rf_int_res = pd.concat([Z_org, rf_int], axis=1).rename(
    columns={'latitude': 'lat', 'longitude': 'lon', 'elevation_m': 'alt'}
)

In [20]:
# Average the interpolated value per (lat, lon, dayofyear), i.e. across the
# remaining columns (alt, year). The value column is selected explicitly:
# GroupBy.mean() takes no column-name argument, its first positional parameter
# is `numeric_only`.
rf_int2 = (
    rf_int_res
    .groupby(['lat', 'lon', 'dayofyear'], as_index=False)['rf_org_value_v2']
    .mean()
)

In [21]:
# Attach the averaged interpolation feature to both tables; a left join keeps
# every original row.
rf_merge_keys = ['lat', 'lon', 'dayofyear']
train_df = train_df.merge(rf_int2, how='left', on=rf_merge_keys)
test_df = test_df.merge(rf_int2, how='left', on=rf_merge_keys)

In [22]:
# `year` was only needed for the interpolation step; remove it from both tables.
train_df = train_df.drop(columns=['year'])
test_df = test_df.drop(columns=['year'])

### Final data

In [23]:
# Feature matrix / target for training, and the matching test feature matrix.
# Identifier columns are removed; `swe` is the target.
X = train_df.drop(columns=['cell_id', 'valid_time', 'swe'])
y = train_df['swe']
Z = test_df.drop(columns=['cell_id', 'valid_time'])

### Model zoo

In [24]:
# cv=4.39 (author's recorded score)
xgb_1 = XGBRegressor(
    n_estimators=2500,
    learning_rate=0.05,
    max_depth=7,
    reg_alpha=0.1,
    reg_lambda=1.5,
    subsample=0.8,
    random_state=0,
    n_jobs=-1,
)

In [25]:
# cv=4.33 (author's recorded score)
ctb_1 = CatBoostRegressor(
    depth=8,
    iterations=2000,
    learning_rate=0.1,
    logging_level='Silent',
    random_seed=155,
)

In [26]:
# cv=4.22 (author's recorded score)
lgb_1 = LGBMRegressor(
    n_estimators=2500,
    num_leaves=64,
    metric='rmse',
    random_state=0,
)

### Stack

In [27]:
# Number of folds and the shuffled, seeded splitter shared by every
# cross-validation step below.
nf = 5
cv = KFold(n_splits=nf, shuffle=True, random_state=18)

In [28]:
# Base learners and their column names in the meta-feature frames.
# The two lists must stay in the same order.
zoo_names = ['xgb_1', 'lgb_1', 'ctb_1']
zoo = [xgb_1, lgb_1, ctb_1]

In [29]:
# Silence pandas' chained-assignment (SettingWithCopy) warning; default is 'warn'.
# NOTE(review): this hides the warning for every later cell, not just one statement.
pd.set_option('mode.chained_assignment', None)

In [30]:
%%time
# Out-of-fold (OOF) predictions for stacking.
# For each base model, fit on the training part of every fold and predict the
# held-out part, so each training row gets a prediction from a model that did
# not see it. The columns of `meta_f` are the inputs of the meta model.
res = np.zeros(nf)                          # per-fold RMSE of the current model
oof_pred = np.zeros((len(X), len(zoo)))     # rows follow X, columns follow zoo

for col, (model_name, model) in enumerate(zip(zoo_names, zoo)):
    for fold_idx, (train, test) in enumerate(cv.split(X, y)):
        # KFold yields positional indices, so index with .iloc (not .loc).
        model.fit(X.iloc[train], y.iloc[train])
        fold_pred = model.predict(X.iloc[test])
        # Write straight into the buffer: the previous chained assignment
        # `meta_f[col].loc[test] = ...` may write to a temporary copy.
        oof_pred[test, col] = fold_pred
        res[fold_idx] = np.sqrt(mean_squared_error(y.iloc[test], fold_pred))

    print(model_name, '||', np.mean(res))

meta_f = pd.DataFrame(oof_pred, columns=zoo_names, index=X.index)

xgb_1 || 3.7526374867169396
lgb_1 || 3.6514706097455987
ctb_1 || 3.7841241869845206
Wall time: 23min 39s


In [31]:
# Set a meta-feature row to 0 in every column as soon as any base model
# predicted a negative value for it.
# NOTE(review): this clears the whole row, not only the negative cell. The
# test-time meta-features (Z_meta_f) get the same treatment, so train and test
# are consistent. If a per-cell clip was intended, both places need changing
# together (e.g. `.clip(lower=0)`).
negative_rows = (meta_f < 0).any(axis=1)
meta_f.loc[negative_rows] = 0

In [32]:
# Meta model trained on the base models' out-of-fold predictions.
clf_meta = XGBRegressor(
    learning_rate=0.01,
    max_depth=4,
    n_estimators=500,
)

In [33]:
%%time
# Cross-validated negative MSE of the meta model, one value per fold, using
# the same splitter as the base models.
cv_result = cross_val_score(
    clf_meta,
    meta_f,
    y,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

Wall time: 12.9 s


In [34]:
# Scores are negative MSE: flip the sign and take the root to get per-fold
# RMSE, then report the mean over folds.
fold_rmse = np.sqrt(-cv_result)
print(cv_result, '||', np.mean(fold_rmse))

[-12.03362821 -12.34396492 -11.75727126 -12.46829675 -12.4932873 ] || 3.4953739417774683


In [35]:
# Final fit of the meta model on all out-of-fold meta-features.
clf_meta.fit(X=meta_f, y=y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=32,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [36]:
%%time
# Test-time meta-features: refit every base model on the full training set and
# store its test predictions in the column named after the model.
Z_meta_f = pd.DataFrame(index=Z.index)

for model_name, model in zip(zoo_names, zoo):
    model.fit(X, y)
    Z_meta_f[model_name] = model.predict(Z)
    
    

Wall time: 5min 40s


In [37]:
# Set a test meta-feature row to 0 in every column as soon as any base model
# predicted a negative value for it.
# NOTE(review): this clears the whole row, not only the negative cell. The
# training meta-features (meta_f) get the same treatment, so train and test
# are consistent. If a per-cell clip was intended, both places need changing
# together (e.g. `.clip(lower=0)`).
negative_rows = (Z_meta_f < 0).any(axis=1)
Z_meta_f.loc[negative_rows] = 0

In [38]:
# Final stacked prediction for every test row.
# Note: `res` previously held the per-fold RMSE array; it is re-bound here.
res = pd.DataFrame({'swe_pred': clf_meta.predict(Z_meta_f)})

In [39]:
# Replace negative predictions with 0.
res['swe_pred'] = res['swe_pred'].clip(lower=0)

In [40]:
# Put predictions next to their identifiers (rows are matched by index), then
# reshape to one row per cell_id and one column per valid_time.
res_long = pd.concat([test_df[['cell_id', 'valid_time']], res], axis=1)
res_pivot = res_long.pivot(
    index='cell_id',
    columns='valid_time',
    values='swe_pred',
)

In [41]:
# Write the pivoted predictions to disk, keeping cell_id as the first column.
res_pivot.to_csv(path_or_buf='sub.csv', index=True)

In [42]:
# Models to persist, in dump order: interpolation forest, the three base
# learners, then the meta model.
dmp_models = [
    rf,
    xgb_1,
    lgb_1,
    ctb_1,
    clf_meta,
]

In [43]:
# Persist all models into one file as consecutive pickle records; they must be
# read back with repeated pickle.load() calls in the same order.
# Create the target directory first so a missing folder does not throw away
# the long training run above.
os.makedirs('models', exist_ok=True)
with open('models/'+'models_final.pkl', 'wb') as f:
    for mdl in dmp_models:
        pickle.dump(mdl, f)

In [44]:
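# Example: read the pickled models back, in the order they were dumped.
# Only unpickle files that come from a trusted source.
# models_loaded = []
# with open('models/'+'models_final.pkl', 'rb') as f:
#     while True:
#         try:
#             models_loaded.append(pickle.load(f))
#         except EOFError:
#             break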