In [None]:
!pip install catboost

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import time
import os
import json
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from joblib import dump

from sklearn import linear_model
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# For catboost output
from google.colab import output
output.enable_custom_widget_manager()

## Load Ground Measures, Train, and Test Data

In [None]:
# Root folder (on the mounted Drive) that every input file below is read from
data_dir = '/content/drive/MyDrive/snocast/train/data'

# Get ground measures for train dataset
# The first column is renamed to 'station_id'; every remaining column is melted into
# (date, swe) pairs, giving one row per station and original column.
ground_measures_train = pd.read_csv(os.path.join(data_dir, 'static/ground_measures_train_features.csv'))
ground_measures_train.columns = ['station_id'] + list(ground_measures_train.columns[1:])
gm_melt_train = ground_measures_train.melt(id_vars=["station_id"],
                                            var_name="date",
                                            value_name="swe")
            
# Get ground measures for test dataset (same reshaping as the train file)
ground_measures_test = pd.read_csv(os.path.join(data_dir, 'static/ground_measures_test_features.csv'))
ground_measures_test.columns = ['station_id'] + list(ground_measures_test.columns[1:])
gm_melt_test = ground_measures_test.melt(id_vars=["station_id"],
                           var_name="date",
                           value_name="swe")

ground_measures_metadata = pd.read_csv(os.path.join(data_dir, 'static/ground_measures_metadata.csv'))
# NOTE(review): axis=1 places the two wide tables side by side (aligned on row index), so
# 'station_id' appears twice. ground_measures_all is not referenced again in the visible notebook.
ground_measures_all = pd.concat([ground_measures_train, ground_measures_test], axis=1)
# Long-form train and test station measurements stacked row-wise
gm_melt_all = pd.concat([gm_melt_train, gm_melt_test])

# Get the train labels data
# Melt every column other than 'cell_id' into (date, swe) pairs and drop rows that
# contain a missing value.
train_labels = pd.read_csv(os.path.join(data_dir, 'static/train_labels.csv'))
labels_melt_train = train_labels.melt(id_vars=["cell_id"],
                  var_name="date",
                  value_name="swe").dropna()

# Get the test labels data (same reshaping as the train labels)
test_labels = pd.read_csv(os.path.join(data_dir, 'static/labels_2020_2021.csv'))
labels_melt_test = test_labels.melt(id_vars=["cell_id"],
                  var_name="date",
                  value_name="swe").dropna()

# Get elevation data for train and test
train_label_elev = pd.read_parquet(os.path.join(data_dir, 'static/train_elevation.parquet'))
test_pred_elev = pd.read_parquet(os.path.join(data_dir, 'static/test_elevation.parquet'))

# Elevation gradient tables for train cells, test cells and ground measure stations
train_elev_grad = pd.read_parquet(os.path.join(data_dir, 'static/train_elevation_grads.parquet'))
test_elev_grad = pd.read_parquet(os.path.join(data_dir, 'static/test_elevation_grads.parquet'))
gm_elev_grad = pd.read_parquet(os.path.join(data_dir, 'static/gm_elevation_grads.parquet'))

# MODIS Aqua / Terra tables, loaded separately for ground stations, train cells and test cells
gm_modis_aqua = pd.read_parquet(os.path.join(data_dir, 'modis/modis_aqua_gm.parquet'))
gm_modis_terra = pd.read_parquet(os.path.join(data_dir, 'modis/modis_terra_gm.parquet'))

train_modis_aqua = pd.read_parquet(os.path.join(data_dir, 'modis/modis_aqua_train.parquet'))
train_modis_terra = pd.read_parquet(os.path.join(data_dir, 'modis/modis_terra_train.parquet'))

test_modis_aqua = pd.read_parquet(os.path.join(data_dir, 'modis/modis_aqua_test.parquet'))
test_modis_terra = pd.read_parquet(os.path.join(data_dir, 'modis/modis_terra_test.parquet'))

# Water tables; the 'water' value is shifted down by one after loading.
# NOTE(review): presumably this re-bases a 1-based code to start at 0 — confirm against
# how the parquet files were produced.
train_water = pd.read_parquet(os.path.join(data_dir, 'static/train_water.parquet'))
train_water['water'] = train_water['water'] - 1
test_water = pd.read_parquet(os.path.join(data_dir, 'static/test_water.parquet'))
test_water['water'] = test_water['water'] - 1

# Land cover (lccs) tables for test cells, train cells and ground measure stations
test_lccs = pd.read_parquet(os.path.join(data_dir, 'static/test_lccs.parquet'))
train_lccs = pd.read_parquet(os.path.join(data_dir, 'static/train_lccs.parquet'))
gm_lccs = pd.read_parquet(os.path.join(data_dir, 'static/gm_lccs.parquet'))

# Pull in the NOAA HRRR Climate Data
gm_climate = pd.read_parquet(os.path.join(data_dir, 'hrrr/gm_climate.parquet'))
train_climate = pd.read_parquet(os.path.join(data_dir, 'hrrr/train_climate.parquet'))
test_climate = pd.read_parquet(os.path.join(data_dir, 'hrrr/test_climate.parquet'))

In [None]:
# Create sequential dataframes for train and test
# Inner merges: a label row is kept only when its cell_id also appears in the elevation table.
train_label_seq = pd.merge(labels_melt_train, train_label_elev, how='inner', on='cell_id')
test_label_seq = pd.merge(labels_melt_test, test_pred_elev, how='inner', on='cell_id')

# Create sequential dataframe for ground measure stations
gm_seq = pd.merge(gm_melt_all, ground_measures_metadata, how='inner', on='station_id')

In [None]:
# Combine sequential datasets
# Give all three frames a shared naming scheme ('location_id' as the key column).
# NOTE(review): the names are assigned by position, so this relies on the column order
# produced by the merges above — confirm if any input schema changes.
gm_seq.columns = ['location_id', 'date', 'swe', 'name', 'elevation_m', 'latitude', 'longitude', 'state']

train_label_seq.columns = ['location_id', 'date', 'swe', 'latitude', 'longitude', 'region', 'elevation_m', 'elevation_var_m']

test_label_seq.columns = ['location_id', 'date', 'swe', 'latitude', 'longitude', 'region', 'elevation_m', 'elevation_var_m']


In [None]:
# Merge Aqua and Terra Modis datasets
def transform_modis(df_modis_terra, df_modis_aqua):
  '''Combine Terra and Aqua MODIS readings into one frame per location and date.

  Each input is first averaged per (location_id, date). The two averaged frames
  are then outer-merged, so a row is kept when either satellite has a value;
  overlapping column names receive the suffixes '_aqua' and '_terra'.
  Underscores in the 'date' strings are replaced with dashes and the result is
  ordered by location_id and date with a fresh index.
  '''
  group_keys = ['location_id','date']

  def _mean_per_location_date(frame):
    # Collapse repeated readings for the same location and date into their mean
    return frame.groupby(group_keys).mean().reset_index()

  terra_daily = _mean_per_location_date(df_modis_terra)
  aqua_daily = _mean_per_location_date(df_modis_aqua)

  merged = aqua_daily.merge(terra_daily, how='outer', on=['date','location_id'], suffixes=('_aqua','_terra'))
  merged['date'] = merged['date'].str.replace('_','-')

  return merged.sort_values(group_keys).reset_index(drop=True)

# Build the merged Aqua/Terra frame for ground stations, train cells and test cells
gm_modis = transform_modis(gm_modis_terra, gm_modis_aqua)
train_modis = transform_modis(train_modis_terra, train_modis_aqua)
test_modis = transform_modis(test_modis_terra, test_modis_aqua)

In [None]:
# https://stackoverflow.com/questions/13996302/python-rolling-functions-for-groupby-object
def get_rolling_avgs(df, roll_cols, rolling_days_list):
  '''Append per-location rolling means of roll_cols to df.

  df is sorted by location_id and date. For every window size in
  rolling_days_list, the mean over the last `window` rows of each location
  (at least one row is enough) is computed for each column in roll_cols and
  merged back as '<col>_<window>_day'.

  Returns a tuple (df with the new columns, list of the new column names).
  '''
  merge_keys = ['location_id','date']
  df = df.sort_values(merge_keys)
  all_roll_cols = []

  for roll_days in rolling_days_list:
    suffix = f'_{roll_days}_day'
    all_roll_cols += [col + suffix for col in roll_cols]

    per_location = df.groupby('location_id', sort=False)[['date'] + roll_cols]
    df_roll = per_location.rolling(roll_days, min_periods=1, on='date').mean()
    # reset_index exposes the group key plus the original row label ('level_1');
    # only the group key is needed for the merge back
    df_roll = df_roll.reset_index().drop(columns='level_1')

    df = df.merge(df_roll, how='left', on=merge_keys, suffixes=('', suffix))

  return df, all_roll_cols

In [None]:
# Get the 5-day and 15-day rolling average of the Modis data
# (get_rolling_avgs uses a window counted in rows per location)
roll_cols = [
             'NDSI_Snow_Cover_aqua',
             'NDSI_Snow_Cover_terra',
             ]

rolling_days_list = [5, 15]

# modis_roll_cols is overwritten by each call; the returned name list depends only on
# roll_cols and rolling_days_list, so all three calls return the same names.
gm_modis, modis_roll_cols = get_rolling_avgs(gm_modis, roll_cols, rolling_days_list)
train_modis, modis_roll_cols = get_rolling_avgs(train_modis, roll_cols, rolling_days_list)
test_modis, modis_roll_cols = get_rolling_avgs(test_modis, roll_cols, rolling_days_list)

In [None]:
# Merge the sequence data with the Modis data
# Left merges: every sequence row is kept; rows without a MODIS match get NaN.
gm_xgboost = pd.merge(gm_seq, gm_modis, how='left', on=['date','location_id'])
train_xgboost = pd.merge(train_label_seq, train_modis, how='left', on=['date','location_id'])
test_xgboost = pd.merge(test_label_seq, test_modis, how='left', on=['date','location_id'])

In [None]:
# Get the 3-day rolling average of the climate data
climate_cols_2_roll = [
             'TMP', 
             'SNOD', 
             'WEASD', 
             'SPFH', 
             'SNOWC', 
             'REFC',
             'PRES', 
             'PWAT'
             ]

# Rebinds the name used for the MODIS windows above; only a 3-row window is used here
rolling_days_list = [3]

gm_climate, climate_roll_cols = get_rolling_avgs(gm_climate, climate_cols_2_roll, rolling_days_list)
train_climate, climate_roll_cols = get_rolling_avgs(train_climate, climate_cols_2_roll, rolling_days_list)
test_climate, climate_roll_cols = get_rolling_avgs(test_climate, climate_cols_2_roll, rolling_days_list)

In [None]:
# Merge the climate data with the Modis data
# Left merges onto the frames that already hold the sequence + MODIS columns
gm_xgboost = pd.merge(gm_xgboost, gm_climate, how='left', on=['date','location_id'])
train_xgboost = pd.merge(train_xgboost, train_climate, how='left', on=['date','location_id'])
test_xgboost = pd.merge(test_xgboost, test_climate, how='left', on=['date','location_id'])

In [None]:
# Derive a day-of-snow-season counter from each observation date:
# day-of-year 335 and later becomes 0, 1, 2, ...; earlier days continue at day-of-year + 30
gm_xgboost['datetime'] = pd.to_datetime(gm_xgboost['date'])
gm_xgboost['snow_season_day'] = gm_xgboost['datetime'].dt.dayofyear.map(lambda doy: doy + 30 if doy < 335 else doy - 335)

train_xgboost['datetime'] = pd.to_datetime(train_xgboost['date'])
train_xgboost['snow_season_day'] = train_xgboost['datetime'].dt.dayofyear.map(lambda doy: doy + 30 if doy < 335 else doy - 335)

test_xgboost['datetime'] = pd.to_datetime(test_xgboost['date'])
test_xgboost['snow_season_day'] = test_xgboost['datetime'].dt.dayofyear.map(lambda doy: doy + 30 if doy < 335 else doy - 335)

In [None]:
# Bucket the snow season into consecutive 14-day periods
days_in_period = 14
total_days = 213
# Index of the last period; it also absorbs the days left over after the final full block
total_periods = int(total_days/days_in_period) - 1

# snow season day -> period index, capped at total_periods
snow_season_period_dict = {
  day: min(day // days_in_period, total_periods)
  for day in range(total_days)
}

In [None]:
# Look up the period for every row's snow_season_day (a day missing from the dict raises KeyError)
train_xgboost['snow_season_period'] = train_xgboost['snow_season_day'].map(lambda day: snow_season_period_dict[day])
test_xgboost['snow_season_period'] = test_xgboost['snow_season_day'].map(lambda day: snow_season_period_dict[day])
gm_xgboost['snow_season_period'] = gm_xgboost['snow_season_day'].map(lambda day: snow_season_period_dict[day])

In [None]:
# Get the snow season period historical mean and standard deviation to calculate the relative swe for
# each ground station measurement
gm_period = (gm_xgboost.groupby(['location_id','snow_season_period'])
                            .agg(swe_period_mean=('swe','mean'), swe_period_std=('swe','std'))
                            .reset_index())
gm_xgboost = pd.merge(gm_xgboost, gm_period, how='left', on=['location_id', 'snow_season_period'], suffixes=('','_period'))
# relative_swe is the z-score of swe within its (station, period) history
gm_xgboost['relative_swe'] = (gm_xgboost['swe'] - gm_xgboost['swe_period_mean'])/gm_xgboost['swe_period_std']
# A zero standard deviation means every measurement in that (station, period) is the same
# value, so the division above yields NaN or +/-inf. Such rows sit exactly at their
# historical mean, i.e. relative_swe = 0. This covers the all-zero case (mean == 0 and
# std == 0) as well as constant non-zero periods, and is done with a vectorized mask
# instead of a row-wise apply.
gm_xgboost.loc[gm_xgboost['swe_period_std'] == 0., 'relative_swe'] = 0.0

In [None]:
# Backfill most recent date relative_swe if NaN
# A 2-row rolling mean (min_periods=1) is computed per station; where relative_swe is NaN
# it is replaced by that rolling value.
# Note: this rebinds roll_cols, which previously held the MODIS column names.
roll_cols = [
             'relative_swe'
             ]

roll_window = [2]

gm_xgboost, relative_swe_roll_cols = get_rolling_avgs(gm_xgboost, roll_cols, roll_window)

gm_xgboost['relative_swe'] = gm_xgboost['relative_swe'].fillna(gm_xgboost['relative_swe_2_day'])
# Keep only rows that have an actual swe measurement
gm_xgboost = gm_xgboost[gm_xgboost['swe'].notna()]

In [None]:
# Need to map train measurement dates to the most recent past ground measurement date
def map_dates_to_most_recent_past_date(unique_dates_from, unique_dates_to):
  '''Map every date in unique_dates_from to the latest date in unique_dates_to
  that is not after it.

  A date that also appears in unique_dates_to maps to itself. A date that has
  no earlier-or-equal date in unique_dates_to maps to None, so callers never
  receive a date from the future.

  Args:
    unique_dates_from: iterable of sortable dates to look up.
    unique_dates_to: iterable of sortable candidate dates.

  Returns:
    dict with one key per date in unique_dates_from.
  '''
  # Tag 'to' dates with 0 and 'from' dates with 1 so that, on equal dates, the 'to'
  # entry sorts first and is available to the 'from' entry that follows it
  tagged_dates = sorted([(d, 1) for d in unique_dates_from] + [(d, 0) for d in unique_dates_to])

  date_dict = {}
  latest_to_date = None
  # Single pass: remember the most recent 'to' date seen so far and hand it to
  # every 'from' date encountered
  for date, is_from in tagged_dates:
    if is_from:
      date_dict[date] = latest_to_date
    else:
      latest_to_date = date
  return date_dict

# Map each train label date to a ground measure date.
# NOTE(review): this module-level date_dict is not referenced again in the visible
# notebook; get_k_neighbor_swe_data builds its own mapping internally.
unique_train_dates = train_label_seq.date.unique()
unique_gm_dates = gm_seq.date.unique()
date_dict = map_dates_to_most_recent_past_date(unique_train_dates, unique_gm_dates)

In [None]:
def get_k_neighbor_swe_data(location_df, neighbor_df, location_seq_df, neighbor_seq_df, k):
  ''' Map each location to its k nearest ground measurement stations and return
  the inverse-distance-weighted relative SWE of those stations for every
  location date.

  Nearest neighbors are found with a KD-tree over standardized longitude and
  latitude (elevation is not part of the distance). Each location date is
  mapped to a neighbor date with map_dates_to_most_recent_past_date, and the
  neighbors' relative SWE on that date is summed using weights proportional to
  1/distance. Neighbors at distance 0 (the location itself when it is also a
  station) are excluded. Weights are normalized over all remaining neighbors.

  Args:
    location_df: one row per location with 'location_id', 'longitude', 'latitude'.
    neighbor_df: one row per station with 'location_id', 'longitude', 'latitude'.
    location_seq_df: frame whose 'date' column lists the dates to produce.
    neighbor_seq_df: station history with 'location_id', 'date', 'relative_swe'.
    k: number of nearest stations to query.

  Returns:
    DataFrame with columns ['location_id', 'date', 'neighbor_relative_swe'].
  '''
  result_cols = ['location_id','date','neighbor_relative_swe']
  distance_cols = ['longitude','latitude']
  scaler = StandardScaler()
  scaler.fit(neighbor_df[distance_cols])
  X_neighbor = scaler.transform(neighbor_df[distance_cols])
  X_location = scaler.transform(location_df[distance_cols])

  location_unique_dates = location_seq_df.date.unique()
  neighbor_unique_dates = neighbor_seq_df.date.unique()
  date_dict = map_dates_to_most_recent_past_date(location_unique_dates, neighbor_unique_dates)

  # Builds the tree on the neighbor data
  tree = KDTree(X_neighbor, leaf_size=2)

  # Get neighbors for location dataset; both arrays are positional (row i of
  # X_location, values are row positions in X_neighbor)
  location_dist, location_idx = tree.query(X_location, k=k)

  # The KD-tree speaks in row positions, so resolve ids by position rather than by
  # index label; this stays correct when either frame has a non-default index
  neighbor_ids = neighbor_df['location_id'].to_numpy()
  location_ids = location_df['location_id'].to_numpy()

  # Loop-invariant pieces: the renamed neighbor history and the date -> neighbor date table
  neighbor_swe_hist_all = (neighbor_seq_df[['location_id','date','relative_swe']]
                           .rename(columns={'date': 'neighbor_date', 'relative_swe': 'neighbor_relative_swe'}))
  date_template = pd.DataFrame({'date': location_unique_dates})
  date_template['neighbor_date'] = date_template['date'].apply(lambda x: date_dict[x])

  neighbor_frames = []
  # iterate through locations by position
  for pos, location_id in enumerate(location_ids):
    if pos % 1000 == 0:
      print(pos)
    # build df for neighbors with distances to the location
    distance_df = pd.DataFrame({'location_id': neighbor_ids[location_idx[pos]], 'distance': location_dist[pos]})
    # drop zero-distance matches; .copy() so the column added below is written to an
    # independent frame instead of a filtered view
    distance_df = distance_df[distance_df['distance'] != 0].copy()
    # get the inverse distance weight to figure out the contribution for each neighbor
    distance_df['inverse_distance_weight'] = distance_df['distance']**-1/(distance_df['distance']**-1).sum()
    # get historical relative swe data for neighbors
    neighbor_swe_hist_df = neighbor_swe_hist_all[neighbor_swe_hist_all['location_id'].isin(distance_df['location_id'])]
    # build a lookup df for the neighbor sequential data
    lookup_df = pd.merge(neighbor_swe_hist_df, distance_df, how='inner', on='location_id')
    lookup_df['swe_contrib'] = lookup_df['neighbor_relative_swe']*lookup_df['inverse_distance_weight']
    # build sequential df for the location to capture predictions
    location_swe_pred_df = date_template.assign(location_id=location_id)
    combined_df = pd.merge(location_swe_pred_df, lookup_df[['neighbor_date','swe_contrib']], how='inner', on='neighbor_date')
    combined_df = combined_df[['location_id','date','swe_contrib']].groupby(['location_id','date']).sum().reset_index()
    if not combined_df.empty:
      neighbor_frames.append(combined_df)

  if not neighbor_frames:
    return pd.DataFrame(columns=result_cols)

  # Concatenating frames (instead of rebuilding from raw object arrays) keeps the
  # weighted SWE column numeric
  all_locations_df = pd.concat(neighbor_frames, ignore_index=True)
  all_locations_df.columns = result_cols
  return all_locations_df

In [None]:
# Number of nearest ground measure stations queried per location
k = 15
# Get ground measure neighbor relative SWE for train data
# location_df
location_df = train_label_elev[['cell_id', 'latitude', 'longitude', 'elevation_m']]
location_df.columns = ['location_id', 'latitude', 'longitude', 'elevation_m']
# location_seq_df
location_seq_df = train_label_seq[['date']]
# neighbor_df
neighbor_df = ground_measures_metadata[['station_id', 'elevation_m', 'latitude', 'longitude']]
neighbor_df.columns = ['location_id', 'elevation_m', 'latitude', 'longitude']
# neighbor_seq_df
neighbor_seq_df = gm_xgboost[['location_id', 'date', 'relative_swe']]

train_neighbor_swe_df = get_k_neighbor_swe_data(location_df, neighbor_df, location_seq_df, neighbor_seq_df, k)

In [None]:
# Get ground measure neighbor relative SWE for gm data
# The stations act as both locations and neighbors here; get_k_neighbor_swe_data drops
# zero-distance matches, so a station does not count as its own neighbor.
gm_neighbor_swe_df = get_k_neighbor_swe_data(neighbor_df, neighbor_df, neighbor_seq_df, neighbor_seq_df, k)

In [None]:
# Get ground measure neighbor relative SWE for test data
# location_df
location_df = test_pred_elev[['cell_id', 'latitude', 'longitude', 'elevation_m']]
location_df.columns = ['location_id', 'latitude', 'longitude', 'elevation_m']
# location_seq_df
location_seq_df = test_label_seq[['date']]
# neighbor_seq_df
neighbor_seq_df = gm_xgboost[['location_id', 'date', 'relative_swe']]

# neighbor_df is reused from the train cell above
test_neighbor_swe_df = get_k_neighbor_swe_data(location_df, neighbor_df, location_seq_df, neighbor_seq_df, k)

In [None]:
# Attach the neighbor relative SWE feature to each frame (left merge keeps every row)
train_xgboost = pd.merge(train_xgboost, train_neighbor_swe_df, how='left', on=['location_id','date'])
test_xgboost = pd.merge(test_xgboost, test_neighbor_swe_df, how='left', on=['location_id','date'])
gm_xgboost = pd.merge(gm_xgboost, gm_neighbor_swe_df, how='left', on=['location_id','date'])

In [None]:
# Add in the water feature
# Only train and test receive 'water'; no water table is merged into gm_xgboost.
train_water.columns = ['location_id','water']
test_water.columns = ['location_id','water']
train_xgboost = pd.merge(train_xgboost, train_water, how='left', on=['location_id'])
test_xgboost = pd.merge(test_xgboost, test_water, how='left', on=['location_id'])

In [None]:
# Add in land category feature
# Train/test tables are given three (lccs, lccs_pct) pairs; the gm table only 'lccs_0'.
# NOTE(review): columns are renamed by position — confirm the parquet column order.
train_lccs.columns = ['location_id','lccs_0', 'lccs_pct_0', 'lccs_1', 'lccs_pct_1', 'lccs_2', 'lccs_pct_2']
test_lccs.columns = ['location_id','lccs_0', 'lccs_pct_0', 'lccs_1', 'lccs_pct_1', 'lccs_2', 'lccs_pct_2']
gm_lccs.columns = ['location_id','lccs_0']
train_xgboost = pd.merge(train_xgboost, train_lccs, how='left', on=['location_id'])
test_xgboost = pd.merge(test_xgboost, test_lccs, how='left', on=['location_id'])
gm_xgboost = pd.merge(gm_xgboost, gm_lccs, how='left', on=['location_id'])

In [None]:
# Add in the elevation gradient features
train_elev_grad.columns = ['location_id','east_elev_grad','south_elev_grad','east_elev_pct','south_elev_pct']
test_elev_grad.columns = ['location_id','east_elev_grad','south_elev_grad','east_elev_pct','south_elev_pct']
gm_elev_grad.columns = ['location_id','east_elev_grad','south_elev_grad','east_elev_pct','south_elev_pct']
train_xgboost = pd.merge(train_xgboost, train_elev_grad, how='left', on='location_id')
test_xgboost = pd.merge(test_xgboost, test_elev_grad, how='left', on='location_id')
gm_xgboost = pd.merge(gm_xgboost, gm_elev_grad, how='left', on='location_id')

In [None]:
# Training frame for model selection: ground station rows stacked on top of train cell rows.
# Columns present in only one of the two frames are NaN for the other frame's rows.
all_xgboost = pd.concat([gm_xgboost, train_xgboost])

In [None]:
# Keep rows dated on or before 2019-12-31 ('date' is compared against a string literal)
all_xgboost = all_xgboost[all_xgboost['date'] <= '2019-12-31']

In [None]:
# Sanity check: latest date remaining after the filter
all_xgboost.date.max()

In [None]:
all_xgboost.shape

In [None]:
# concat and the filter kept the source frames' index labels; replace them with 0..n-1
all_xgboost.reset_index(drop=True, inplace=True)

In [None]:
# Climate features: three raw columns plus the 3-row rolling means of four others
climate_cols = ['SNOD', 'WEASD', 'SNOWC'] + ['TMP_3_day','SPFH_3_day','PRES_3_day','PWAT_3_day']
# Feature list for XGBoost: location/static features, then MODIS rolling means, then climate
xgb_cols = [
            'latitude',
            'longitude',
            'elevation_m',
            'elevation_var_m',
            'snow_season_day',
            'water',
            'neighbor_relative_swe',
            'east_elev_grad',
            'south_elev_grad',
            ] \
            + modis_roll_cols + climate_cols

label_col = ['swe']

# Train on the gm + train rows, evaluate on the test rows
X = all_xgboost[xgb_cols]
y = all_xgboost[label_col]

X_test = test_xgboost[xgb_cols]
y_test = test_xgboost[label_col]

In [None]:
# scale data
# The scaler is fit on the training features only and then applied to both sets
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [None]:
# define data_dmatrix
# NOTE(review): xgb_dmatrix is not referenced again in the visible notebook; the
# sklearn-style XGBRegressor below is fit on X and y directly.
xgb_dmatrix = xgb.DMatrix(data=X,label=y)

In [None]:
# Hyperparameters passed to xgb.XGBRegressor.
# NOTE(review): the XGBoost parameter docs list `gamma` as an alias of `min_split_loss`,
# yet the two are given different values here (0.1 vs 0.08) — confirm which one the
# installed version applies and drop the other.
params = {
          'n_estimators': 1000,
          'min_split_loss': 0.08,
          'gamma': 0.1,
          'max_depth': 32,
          'min_child_weight': 45,
          'learning_rate': 0.04,
          'objective':'reg:squarederror',
          'n_jobs': 4,
          'subsample': 0.95,
          'colsample_bytree': 0.95,
          }

## Make Predictions

In [None]:
# instantiate the XGB regressor
xgb_reg = xgb.XGBRegressor(**params)

# fit the regressor to the training data
xgb_reg.fit(X, y)

In [None]:
# make predictions on test data
xgb_pred = xgb_reg.predict(X_test)

In [None]:
# Clip negative predictions to zero
xgb_pred[xgb_pred < 0] = 0.0

In [None]:
test_xgboost['xgb_swe_pred'] = xgb_pred

In [None]:
# squared=False makes mean_squared_error return the RMSE, so lower is better
# even though the printed label says "accuracy score"
print('XGBoost model accuracy score: {0:0.4f}'
.format(mean_squared_error(test_xgboost['swe'], test_xgboost['xgb_swe_pred'], squared=False)))
# best 3.5091

## LightGBM

In [None]:
# Cast the neighbor feature to float in both frames before handing them to LightGBM
all_xgboost['neighbor_relative_swe'] = all_xgboost['neighbor_relative_swe'].astype(float)
test_xgboost['neighbor_relative_swe'] = test_xgboost['neighbor_relative_swe'].astype(float)

In [None]:
# LightGBM uses the XGBoost features plus the three land cover class columns
cat_cols = ['lccs_0', 'lccs_1', 'lccs_2']
lgb_cols = [
            'latitude',
            'longitude',
            'elevation_m',
            'elevation_var_m',
            'snow_season_day',
            'water',
            'neighbor_relative_swe',
            'east_elev_grad',
            'south_elev_grad',
            ] \
            + modis_roll_cols + climate_cols + cat_cols

label_col = ['swe']

# Unlike the XGBoost path, these features are not passed through the StandardScaler
X_lgb = all_xgboost[lgb_cols]
y = all_xgboost[label_col]

X_lgb_test = test_xgboost[lgb_cols]
y_test = test_xgboost[label_col]

In [None]:
# NOTE(review): train_data / test_data are not referenced again before being rebound
# later in the visible notebook; the LGBMRegressor below is fit on the DataFrames directly.
train_data = lgb.Dataset(X_lgb, label=y)
test_data = lgb.Dataset(X_lgb_test, label=y_test)

In [None]:
# NOTE(review): no categorical_feature argument is passed at fit time, so whether the
# lccs_* columns are treated as categorical depends on their dtype — confirm.
# NOTE(review): `silent` may not be recognised by newer LightGBM releases — confirm
# against the installed version.
lgb_reg = lgb.LGBMRegressor(
            nthread=4,
            n_estimators=20000,
            num_leaves=64,
            max_depth=32,
            learning_rate=0.04,
            min_child_weight=30,
            subsample=1,
            colsample_bytree=0.95,
            reg_alpha=0.0,
            reg_lambda=0.075,
            min_split_gain=0.0,
            silent=-1,
            verbose=-1,
            )


In [None]:
lgb_reg.fit(X_lgb,y)

In [None]:
lgb_pred = lgb_reg.predict(X_lgb_test)

In [None]:
# Clip negative predictions to zero
lgb_pred[lgb_pred < 0] = 0.0

In [None]:
test_xgboost['lgb_swe_pred'] = lgb_pred

In [None]:
# squared=False makes mean_squared_error return the RMSE (lower is better)
print('LGB model accuracy score: {0:0.4f}'
.format(mean_squared_error(test_xgboost['swe'], test_xgboost['lgb_swe_pred'], squared=False)))
# best 3.3145

## Catboost

In [None]:
# Replace missing lccs_1 / lccs_2 with 0 and cast to int in the training frame.
# NOTE(review): the same conversion is not applied to test_xgboost in the visible
# notebook — confirm its lccs_* columns are already integer and free of NaN.
all_xgboost['lccs_1'] = all_xgboost['lccs_1'].fillna(0).astype(int)
all_xgboost['lccs_2'] = all_xgboost['lccs_2'].fillna(0).astype(int)

In [None]:
# Rows without a region get the placeholder region 'gm'
all_xgboost['region'] = all_xgboost['region'].fillna('gm')

In [None]:
# CatBoost additionally uses 'region' as a categorical feature
cat_cols = ['lccs_0', 'lccs_1', 'lccs_2', 'region']
cb_cols = [
            'latitude',
            'longitude',
            'elevation_m',
            'elevation_var_m',
            'snow_season_day',
            'water',
            'neighbor_relative_swe',
            'east_elev_grad',
            'south_elev_grad',
            ] \
            + modis_roll_cols + climate_cols + cat_cols

label_col = ['swe']

X_cb = all_xgboost[cb_cols]
y = all_xgboost[label_col]

X_cb_test = test_xgboost[cb_cols]
y_test = test_xgboost[label_col]

In [None]:
# Build the CatBoost pools. The categorical columns are passed by name (cat_cols, the
# trailing entries of cb_cols) rather than by hard-coded position, so the declaration
# stays correct if the number or order of the other feature columns changes.
train_dataset = cb.Pool(data=X_cb,
                        label=y,
                        cat_features=cat_cols)
test_dataset = cb.Pool(data=X_cb_test,
                       label=y_test,
                       cat_features=cat_cols)

In [None]:
# CatBoost regressor trained with the RMSE loss
cb_model = cb.CatBoostRegressor(iterations=550,
                             learning_rate=0.04,
                             depth=16,
                             l2_leaf_reg=0.05,
                             model_size_reg=None,
                             loss_function="RMSE",
)

In [None]:
cb_model.fit(train_dataset)

In [None]:
cb_pred = cb_model.predict(test_dataset)

In [None]:
# Clip negative predictions to zero
cb_pred[cb_pred < 0] = 0.0

In [None]:
test_xgboost['cb_swe_pred'] = cb_pred

In [None]:
# squared=False makes mean_squared_error return the RMSE (lower is better)
print('CB model accuracy score: {0:0.4f}'
.format(mean_squared_error(test_xgboost['swe'], test_xgboost['cb_swe_pred'], squared=False)))
# best 3.4749

In [None]:
# Feature names ordered from least to most important (argsort is ascending)
cb_fi = cb_model.feature_importances_.argsort()
np.array(cb_cols)[cb_fi]

## Ensemble XGB, LGB, and Catboost

In [None]:
# Use Linear Regression to calculate weight to assign to each model
# The regression has no intercept, and the printed coefficients are normalized to sum to 1.
# NOTE(review): the weights are fit on test_xgboost and the RMSE below is computed on
# the same rows, so this score is an optimistic estimate.
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(test_xgboost[['lgb_swe_pred','xgb_swe_pred','cb_swe_pred']], test_xgboost['swe'])
test_xgboost['optimal_swe'] = reg.predict(test_xgboost[['lgb_swe_pred','xgb_swe_pred','cb_swe_pred']])
print(reg.coef_/sum(reg.coef_))
mean_squared_error(test_xgboost['swe'], test_xgboost['optimal_swe'], squared=False)

In [None]:
# Same weight fit restricted to the 'sierras' region (reg is rebound each time)
sierras = test_xgboost[test_xgboost['region'] == 'sierras']
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(sierras[['lgb_swe_pred','xgb_swe_pred','cb_swe_pred']], sierras['swe'])
print(reg.coef_/sum(reg.coef_))

In [None]:
# Same weight fit restricted to the 'central rockies' region
rockies = test_xgboost[test_xgboost['region'] == 'central rockies']
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(rockies[['lgb_swe_pred','xgb_swe_pred','cb_swe_pred']], rockies['swe'])
print(reg.coef_/sum(reg.coef_))

In [None]:
# Same weight fit restricted to the 'other' region
other = test_xgboost[test_xgboost['region'] == 'other']
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(other[['lgb_swe_pred','xgb_swe_pred','cb_swe_pred']], other['swe'])
print(reg.coef_/sum(reg.coef_))

In [None]:
# See rmse by region
test_xgboost['rse'] = (test_xgboost['swe'] - test_xgboost['optimal_swe'])**2
np.sqrt(test_xgboost.rse.mean())
#3.194904948806366

In [None]:
np.sqrt(test_xgboost.groupby('region').mean()['rse'])
# Best
# central rockies    3.025345
# other              2.597061
# sierras            3.433604

In [None]:
np.sqrt(test_xgboost.groupby('snow_season_period').mean()['rse'])

In [None]:
def gb_ensemble(row):
  '''Blend the LightGBM, XGBoost and CatBoost predictions of one row using
  fixed, region-specific weights.

  row must provide 'region', 'lgb_swe_pred' and 'xgb_swe_pred'; 'cb_swe_pred'
  is read for every region except 'central rockies'. Any region other than
  'sierras' or 'central rockies' uses the fallback weights.
  '''
  region = row['region']
  lgb_part = row['lgb_swe_pred']
  xgb_part = row['xgb_swe_pred']

  # central rockies: the CatBoost prediction is left out entirely
  if region == 'central rockies':
    return 0.80*lgb_part + 0.20*xgb_part

  cb_part = row['cb_swe_pred']
  if region == 'sierras':
    return 0.40*lgb_part + 0.25*xgb_part + 0.35*cb_part

  # fallback weights for every remaining region
  return 0.70*lgb_part + 0.20*xgb_part + 0.10*cb_part

In [None]:
test_xgboost['best_swe_pred'] = test_xgboost.apply(lambda x: gb_ensemble(x), axis=1)

In [None]:
print('Ensemble model accuracy score: {0:0.4f}'
.format(mean_squared_error(test_xgboost['swe'], test_xgboost['best_swe_pred'], squared=False)))
# best 3.2217

In [None]:
test_xgboost['rse'] = (test_xgboost['swe'] - test_xgboost['best_swe_pred'])**2

In [None]:
np.sqrt(test_xgboost.groupby('region').mean()['rse'])

In [None]:
test_xgboost[['swe','region','lgb_swe_pred','xgb_swe_pred','cb_swe_pred','best_swe_pred']].sample(5)

In [None]:
lgb.plot_importance(lgb_reg, importance_type="gain")

In [None]:
lgb.plot_importance(lgb_reg) # importance_type = "split"

## Train Model on all data for Eval

In [None]:
# Final training frame: ground station, train and test rows stacked together.
# Unlike all_xgboost, no date filter is applied here.
final_dataset = pd.concat([gm_xgboost, train_xgboost, test_xgboost])

#### XGBoost

In [None]:
X_all = final_dataset[xgb_cols]
y_all = final_dataset[label_col]

In [None]:
# Refit the scaler on the full feature set (rebinds the scaler used during evaluation)
scaler = StandardScaler()
scaler.fit(X_all)
X_all = scaler.transform(X_all)

In [None]:
# Persist the fitted scaler next to the models
dump(scaler, '/content/drive/MyDrive/snocast/eval/models/std_scaler.bin', compress=True)

In [None]:
# define data_dmatrix
# NOTE(review): xgb_dmatrix is not referenced again in the visible notebook
xgb_dmatrix = xgb.DMatrix(data=X_all,label=y_all)

In [None]:
# Same hyperparameters as the evaluation run, except n_estimators (1100 instead of 1000).
# NOTE(review): the XGBoost parameter docs list `gamma` as an alias of `min_split_loss`,
# yet the two are given different values here (0.1 vs 0.08) — confirm which one the
# installed version applies and drop the other.
params = {
          'n_estimators': 1100,
          'min_split_loss': 0.08,
          'gamma': 0.1,
          'max_depth': 32,
          'min_child_weight': 45,
          'learning_rate': 0.04,
          'objective':'reg:squarederror',
          'n_jobs': 4,
          'subsample': 0.95,
          'colsample_bytree': 0.95,
          }

# instantiate the regressor
xgb_reg_all = xgb.XGBRegressor(**params) # Params defined during cross-validation above

# fit the regressor to the training data
xgb_reg_all.fit(X_all, y_all)

In [None]:
xgb_reg_all.save_model('/content/drive/MyDrive/snocast/eval/models/xgb_all.txt')

In [None]:
# dump model with feature map
# NOTE(review): this dump goes to .../train/models while the saved model above goes to
# .../eval/models — confirm the differing directory is intentional.
xgb_reg_all.get_booster().dump_model('/content/drive/MyDrive/snocast/train/models/xgb_all.json', dump_format='json')

#### LightGBM

In [None]:
# Cast the neighbor feature to float before handing the frame to LightGBM
final_dataset['neighbor_relative_swe'] = final_dataset['neighbor_relative_swe'].astype(float)

In [None]:
X_all = final_dataset[lgb_cols]
y_all = final_dataset[label_col]

In [None]:
# NOTE(review): train_data is rebound again in the CatBoost section without being used
# in between in the visible notebook; the regressor below is fit on X_all directly.
train_data = lgb.Dataset(X_all, label=y_all)

In [None]:
# Same settings as the evaluation run, except n_estimators (21000 instead of 20000).
# Rebinds lgb_reg, replacing the model evaluated above.
lgb_reg = lgb.LGBMRegressor(
            nthread=4,
            n_estimators=21000,
            num_leaves=64,
            max_depth=32,
            learning_rate=0.04,
            min_child_weight=30,
            subsample=1,
            colsample_bytree=0.95,
            reg_alpha=0.0,
            reg_lambda=0.075,
            min_split_gain=0.0,
            silent=-1,
            verbose=-1,
            )


In [None]:
lgb_reg.fit(X_all,y_all)

In [None]:
# Save the underlying booster of the fitted regressor
lgb_reg.booster_.save_model('/content/drive/MyDrive/snocast/eval/models/lgb_all.txt')

#### Catboost

In [None]:
# Replace missing lccs_1 / lccs_2 with 0 and cast to int; rows without a region get
# the placeholder region 'gm'
final_dataset['lccs_1'] = final_dataset['lccs_1'].fillna(0).astype(int)
final_dataset['lccs_2'] = final_dataset['lccs_2'].fillna(0).astype(int)
final_dataset['region'] = final_dataset['region'].fillna('gm')

In [None]:
X_all = final_dataset[cb_cols]
y_all = final_dataset[label_col]

In [None]:
# Categorical columns are passed by name (cat_cols, the trailing entries of cb_cols)
# rather than by hard-coded position, so the declaration stays correct if the number
# or order of the other feature columns changes.
train_dataset = cb.Pool(data=X_all,
                        label=y_all,
                        cat_features=cat_cols)

In [None]:
# Same settings as the evaluation run, except iterations (600 instead of 550).
# Rebinds cb_model, replacing the model evaluated above.
cb_model = cb.CatBoostRegressor(iterations=600,
                             learning_rate=0.04,
                             depth=16,
                             l2_leaf_reg=0.05,
                             model_size_reg=None,
                             loss_function="RMSE")

In [None]:
cb_model.fit(train_dataset)

In [None]:
cb_model.save_model('/content/drive/MyDrive/snocast/eval/models/cb_all.txt')