In [None]:
!pip install catboost

In [None]:
# Mount Google Drive; every data, model and submission path used below
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

### Import Base Data Files

In [None]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

from joblib import load
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree

In [None]:
# Date this run predicts for; it must match a column label of
# submission_format.csv and the date suffix of the hrrr/modis input files.
run_date = '2022-06-30'
# When False, the previous date's submission file is loaded and extended;
# when True, the blank submission template is used instead.
first_run = False # '2022-01-13' <- first week model run
# Root folder of the input data on the mounted Drive.
data_dir = '/content/drive/MyDrive/snocast/eval/data'

In [None]:
# Load the submission template; its first (unnamed) column is renamed to cell_id.
submission_format = pd.read_csv(os.path.join(data_dir, 'submission_format.csv')).rename(columns = {'Unnamed: 0':'cell_id'})
# Copy of the template with missing values replaced by 0.
test_base = submission_format.fillna(0.)

# On every run after the first, load the submission written for the previous date.
if not first_run:
  # The previous date is the column immediately to the left of run_date in the template.
  prev_date = submission_format.columns[submission_format.columns.get_loc(run_date) - 1]
  prev_submission = pd.read_csv(f'/content/drive/MyDrive/snocast/eval/submissions/submission_{prev_date}.csv')
  print(prev_date)

In [None]:
def transform_gm(gm_df, dropna=True):
  '''Reshape a wide ground-measure table into long format.

  The first column of `gm_df` is treated as the station identifier and every
  other column as one measurement date.

  Parameters
  ----------
  gm_df : pandas.DataFrame
      Wide table with one row per station and one column per date.
      The frame passed in is left unmodified.
  dropna : bool, default True
      If True, rows of the long table containing a missing value are dropped.

  Returns
  -------
  pandas.DataFrame
      Long table with columns `location_id`, `date` (the original column
      label) and `swe`.
  '''
  # Rename on a new frame. Assigning to `gm_df.columns` would rename the first
  # column of the caller's DataFrame as a hidden side effect.
  renamed = gm_df.set_axis(['location_id'] + list(gm_df.columns[1:]), axis=1)
  gm_melt = renamed.melt(id_vars=["location_id"],
                         var_name="date",
                         value_name="swe")
  if dropna:
    gm_melt = gm_melt.dropna()
  return gm_melt

In [None]:
# Transform ground measures
# Station metadata (ids, coordinates and elevation are read from it further down).
ground_measures_metadata = pd.read_csv(os.path.join(data_dir, 'ground_measures_metadata.csv'))

# "Recent" measurements: melted with dropna=False so rows with a missing swe are kept.
gm = pd.read_csv(os.path.join(data_dir, 'ground_measures/ground_measures_features.csv'))
gm_recent = transform_gm(gm, False)
# Historical measurements: the test and train feature files, melted (missing
# values dropped) and stacked into one long table.
gm_test = pd.read_csv(os.path.join(data_dir, 'ground_measures/ground_measures_test_features.csv'))
gm_train = pd.read_csv(os.path.join(data_dir, 'ground_measures/ground_measures_train_features.csv'))
gm_hist = pd.concat([transform_gm(gm_test), transform_gm(gm_train)])

In [None]:
# Long list of (cell_id, date) pairs from the zero-filled template ...
preds_melt_test = test_base.melt(id_vars=["cell_id"],
                  var_name="date",
                  value_name="swe").dropna()[['cell_id','date']]
# ... restricted to the date being predicted in this run.
preds_melt_test = preds_melt_test[preds_melt_test['date'] == run_date]

# Static per-cell inputs.
grid_elev = pd.read_parquet(os.path.join(data_dir, 'static/grid_cells_elev.parquet'))
grid_elev_grad = pd.read_parquet(os.path.join(data_dir, 'static/test_elevation_grads.parquet'))
grid_water = pd.read_parquet(os.path.join(data_dir, 'static/grid_water.parquet'))
# NOTE(review): shifts the stored water value down by one; presumably this makes
# it zero-based to match training - TODO confirm.
grid_water['water'] = grid_water['water'] - 1
grid_lccs = pd.read_parquet(os.path.join(data_dir, 'static/grid_lccs.parquet'))
# Inputs whose file name carries the run_date.
grid_climate = pd.read_parquet(os.path.join(data_dir, f'hrrr/climate_{run_date}.parquet'))
modis_terra = pd.read_parquet(os.path.join(data_dir, f'modis/modis_terra_pc_{run_date}.parquet'))
modis_aqua = pd.read_parquet(os.path.join(data_dir, f'modis/modis_aqua_pc_{run_date}.parquet'))

### Perform Data Transformations

In [None]:
# Create the sequential dataframe of (cell, date) rows to predict, with the
# per-cell columns of grid_elev attached. The inner join drops cells that are
# missing from grid_elev.
test_pred_seq = pd.merge(preds_melt_test, grid_elev, how='inner', on='cell_id')
# NOTE(review): positional rename - assumes the merged columns arrive in exactly
# this order (cell_id, date, then grid_elev's remaining columns) - TODO confirm.
test_pred_seq.columns = ['location_id', 'date', 'latitude', 'longitude', 'region', 'elevation_m', 'elevation_var_m']

In [None]:
# Merge Aqua and Terra Modis datasets
def transform_modis(df_modis_terra, df_modis_aqua):
  '''Combine the Terra and Aqua Modis tables into one table per location/date.

  Each input is first averaged per (location_id, date). The two results are
  outer-joined, so a pair present in only one input is kept with NaN for the
  other. Value columns that share a name get the suffixes `_aqua` / `_terra`.
  Underscores in `date` are replaced by dashes, rows are ordered by
  location_id then date, and a constant `flag` column equal to 1 is added.
  '''
  key_cols = ['location_id','date']

  aqua_mean = df_modis_aqua.groupby(key_cols).mean().reset_index()
  terra_mean = df_modis_terra.groupby(key_cols).mean().reset_index()

  combined = aqua_mean.merge(terra_mean,
                             how='outer',
                             on=['date','location_id'],
                             suffixes=('_aqua','_terra'))
  combined['date'] = combined['date'].str.replace('_','-')
  combined = combined.sort_values(key_cols).reset_index(drop=True)
  # Marks rows that come from real Modis records.
  combined['flag'] = 1

  return combined

In [None]:
# Combine the Terra and Aqua tables into one frame keyed by location_id and date.
test_modis = transform_modis(modis_terra, modis_aqua)

In [None]:
# Need to run this in case we aren't able to pull Modis data all the way up to the run_date
def prepare_modis_for_roll(modis_df, seq_df):
    """Pad the Modis table with empty rows for location/dates it does not cover.

    Every (location_id, date) pair of `seq_df` that has no row in `modis_df`
    is appended with NaN in all non-key columns (including `flag`). The
    result is ordered by location_id then date with a fresh index.
    """
    key_cols = ['location_id', 'date']

    # After the left join, pairs without a Modis record have no flag.
    flagged = seq_df[key_cols].merge(modis_df[key_cols + ['flag']],
                                     how='left',
                                     on=key_cols)
    gap_rows = flagged.loc[flagged['flag'].isna(), key_cols].reset_index(drop=True)

    # Give the placeholder rows every non-key Modis column, all missing.
    for column in modis_df.columns:
        if column in ('location_id', 'date'):
            continue
        gap_rows[column] = np.nan

    padded = pd.concat([modis_df, gap_rows])
    return padded.sort_values(key_cols).reset_index(drop=True)

In [None]:
# Add all-NaN rows for the location/date pairs of test_pred_seq that have no Modis record.
test_modis = prepare_modis_for_roll(test_modis, test_pred_seq)

In [None]:
# https://stackoverflow.com/questions/13996302/python-rolling-functions-for-groupby-object
def get_rolling_avgs(df, roll_cols, rolling_days_list):
  '''Append per-location trailing-window means of `roll_cols` to `df`.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain `location_id`, `date` and every column in `roll_cols`.
      (location_id, date) pairs should be unique; the means are joined back
      on those keys, so duplicated pairs would multiply rows.
  roll_cols : list of str
      Columns to average.
  rolling_days_list : list of int
      Window sizes. A window counts rows (observations) per location in
      date order, not calendar days. Missing values are skipped and a
      single valid observation is enough to produce a mean (min_periods=1).

  Returns
  -------
  (pandas.DataFrame, list of str)
      `df` sorted by location_id and date with one `<col>_<n>_day` column
      per requested column and window, and the list of those new names.
  '''
  all_roll_cols = []

  df = df.sort_values(['location_id','date'])

  for roll_days in rolling_days_list:
    suffix = f'_{roll_days}_day'
    all_roll_cols.extend(col + suffix for col in roll_cols)
    df_roll = (df
               .groupby('location_id', sort=False)[['date'] + roll_cols]
               .rolling(roll_days, min_periods=1, on='date')
               .mean()
               # Move the group key back into a column, then discard what is
               # left of the index (the original row labels). Doing this by
               # position avoids relying on the auto-generated name 'level_1',
               # which only exists when the input index is unnamed.
               .reset_index(level=0)
               .reset_index(drop=True))

    df = pd.merge(df, df_roll, how='left', on=['location_id','date'], suffixes=('', suffix))

  return df, all_roll_cols

In [None]:
# Get the 5-day and 15-day rolling average of the Modis data.
# The windows passed to get_rolling_avgs are integer row counts per location.
roll_cols = [
             'NDSI_Snow_Cover_aqua',
             'NDSI_Snow_Cover_terra',
             ]

rolling_days_list = [5, 15]

# modis_roll_cols holds the names of the new rolling columns; the model
# feature lists further down are built from it.
test_modis, modis_roll_cols = get_rolling_avgs(test_modis, roll_cols, rolling_days_list)

In [None]:
# Start the feature table: one row per prediction row, with the Modis features attached.
test_dataset = pd.merge(test_pred_seq, test_modis, how='left', on=['date','location_id'])

In [None]:
# Get the 3-day rolling average of the climate data
# (the window is an integer row count per location).
climate_cols_2_roll = [
             'TMP', 
             'SNOD', 
             'WEASD', 
             'SPFH', 
             'SNOWC', 
             'REFC',
             'PRES', 
             'PWAT'
             ]

rolling_days_list = [3]

# NOTE: grid_climate is replaced by its own transformed version, so this cell
# should be run only once after grid_climate has been loaded.
grid_climate, climate_roll_cols = get_rolling_avgs(grid_climate, climate_cols_2_roll, rolling_days_list)

In [None]:
# Attach the climate features (raw and rolling) to the prediction rows.
test_dataset = pd.merge(test_dataset, grid_climate, how='left', on=['date','location_id'])

In [None]:
# Add in the snow season day feature
def _to_snow_season_day(day_of_year):
  '''Map a calendar day-of-year to its position in the snow season.

  Day-of-year 335 (1 December in a non-leap year) and later count up from 0;
  earlier days are shifted by 30, so day-of-year 1 becomes 31.
  '''
  if day_of_year >= 335:
    return day_of_year - 335
  return day_of_year + 30

# The same two columns are added to the prediction rows and to both ground-measure tables.
for _frame in (test_dataset, gm_hist, gm_recent):
  _frame['datetime'] = pd.to_datetime(_frame['date'])
  _frame['snow_season_day'] = _frame['datetime'].dt.dayofyear.apply(_to_snow_season_day)

In [None]:
# Separate the snow season into periods of 14 days
days_in_period = 14
total_days = 213
# Index of the last period. It also absorbs the days left over after the
# final full 14-day block, so no shorter extra period is created.
total_periods = int(total_days/days_in_period) - 1

# snow_season_day -> period index: whole 14-day blocks, capped at the last period.
snow_season_period_dict = {
    day: min(day // days_in_period, total_periods)
    for day in range(total_days)
}

# Look up the period of every row. A snow_season_day outside the dictionary
# raises KeyError rather than producing a missing value.
for _frame in (test_dataset, gm_hist, gm_recent):
  _frame['snow_season_period'] = _frame['snow_season_day'].apply(lambda day: snow_season_period_dict[day])

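For each location/snow_season_period pair, calculate the Z-score (relative SWE) of every recent measurement $x$, using the historical mean $\mu$ and standard deviation $\sigma$ of that station in that period:

$z = \frac{x-\mu}{\sigma}$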

In [None]:
# Get the snow season period historical mean and standard deviation to calculate the relative swe for
# each recent ground station measurement
gm_period = (gm_hist.groupby(['location_id','snow_season_period'])
                            .agg(swe_period_mean=('swe','mean'), swe_period_std=('swe','std'))
                            .reset_index())
gm_recent = pd.merge(gm_recent, gm_period, how='left', on=['location_id', 'snow_season_period'], suffixes=('','_period_mean'))
gm_recent['relative_swe'] = (gm_recent['swe'] - gm_recent['swe_period_mean'])/(gm_recent['swe_period_std'])
# A station/period whose historical mean and std are both exactly 0 has no usable
# scale (the division above yields NaN or inf), so its relative SWE is set to 0.
# This is a vectorized mask rather than a row-wise apply: same values, no
# per-row Python call, and it also works when gm_recent has no rows.
no_hist_variation = (gm_recent['swe_period_mean'] == 0.) & (gm_recent['swe_period_std'] == 0.)
# Clip outliers of relative_swe due to small sample sizes
gm_recent['relative_swe'] = gm_recent['relative_swe'].mask(no_hist_variation, 0.0).clip(-5, 5)

In [None]:
# Backfill most recent date relative_swe if NaN.
# A 2-row rolling mean with min_periods=1 skips NaN, so for a row whose
# relative_swe is missing it equals the preceding row's value for that station
# (when that one is present).
roll_cols = [
             'relative_swe'
             ]

roll_window = [2]

gm_recent, relative_swe_roll_cols = get_rolling_avgs(gm_recent, roll_cols, roll_window)
gm_recent['relative_swe'] = gm_recent['relative_swe'].fillna(gm_recent['relative_swe_2_day'])
# Keep only rows with an actual swe measurement.
gm_recent = gm_recent[gm_recent['swe'].notna()]

In [None]:
# Visual check of the relative SWE distribution (values were clipped to [-5, 5] above).
gm_recent.relative_swe.hist()

In [None]:
def get_k_neighbor_swe_data(location_df, neighbor_df, location_seq_df, neighbor_seq_df, k):
  ''' Estimate a relative SWE for every location from its nearest ground stations.

  Each location is matched to its k nearest ground measurement stations in
  standardized longitude/latitude space (elevation is not part of the
  distance). For every date present in both `location_seq_df` and
  `neighbor_seq_df`, the neighbors' relative SWE values are combined with
  inverse-distance weights.

  Parameters
  ----------
  location_df : DataFrame with `location_id`, `longitude`, `latitude`;
      one row per location to estimate.
  neighbor_df : DataFrame with `location_id`, `longitude`, `latitude`;
      one row per ground station.
  location_seq_df : DataFrame with a `date` column; its unique dates are the
      dates an estimate is produced for.
  neighbor_seq_df : DataFrame with `location_id`, `date`, `relative_swe` for
      the ground stations.
  k : number of nearest stations to use; capped at the number of stations.

  Returns
  -------
  DataFrame with columns `location_id`, `date`, `neighbor_relative_swe`.
  A location/date for which none of the neighbors has a row in
  `neighbor_seq_df` produces no output row.

  Notes
  -----
  Stations at distance exactly 0 from a location are excluded before the
  weights are computed. The weights are normalized over the remaining
  neighbors, including any whose relative SWE is missing for a date.
  '''
  distance_cols = ['longitude','latitude']
  scaler = StandardScaler()
  scaler.fit(neighbor_df[distance_cols])
  X_neighbor = scaler.transform(neighbor_df[distance_cols])
  X_location = scaler.transform(location_df[distance_cols])

  location_unique_dates = location_seq_df.date.unique()

  # Builds the tree on the neighbor data
  tree = KDTree(X_neighbor, leaf_size=2)

  # Asking the tree for more neighbors than there are stations is an error,
  # so use every station when fewer than k are available.
  k = min(k, len(neighbor_df))

  # Get neighbors for location dataset
  location_dist, location_idx = tree.query(X_location, k=k)

  # The tree returns row positions. Work on plain arrays so the lookups below
  # are positional and do not depend on either frame having a 0..n-1 index.
  neighbor_ids = neighbor_df['location_id'].values
  location_ids = location_df['location_id'].values

  # Station history with the column names used in the joins below
  neighbor_swe_all = neighbor_seq_df[['location_id','date','relative_swe']].rename(
      columns={'date': 'neighbor_date', 'relative_swe': 'neighbor_relative_swe'})

  neighbor_data = []
  for pos, location_id in enumerate(location_ids):
    if pos % 1000 == 0:
      print(pos)
    # neighbors of this location with their distances
    distance_df = pd.DataFrame({'location_id': neighbor_ids[location_idx[pos]],
                                'distance': location_dist[pos]})
    distance_df = distance_df[distance_df['distance'] != 0]
    neighbors = distance_df['location_id'].unique()
    # relative swe rows of those neighbors
    neighbor_swe_hist_df = neighbor_swe_all[neighbor_swe_all['location_id'].isin(neighbors)]
    # one row per requested date for this location
    location_swe_pred_df = pd.DataFrame({'date': location_unique_dates})
    location_swe_pred_df['location_id'] = location_id
    # each date is matched to the neighbor rows of the same date
    location_swe_pred_df['neighbor_date'] = location_swe_pred_df['date']
    # inverse distance weights, normalized to sum to 1 over the neighbors
    inverse_distance = distance_df['distance']**-1
    distance_df = distance_df.assign(inverse_distance_weight=inverse_distance/inverse_distance.sum())
    # weighted contribution of every neighbor row
    lookup_df = pd.merge(neighbor_swe_hist_df, distance_df, how='inner', on='location_id')
    lookup_df['swe_contrib'] = lookup_df['neighbor_relative_swe']*lookup_df['inverse_distance_weight']
    combined_df = pd.merge(location_swe_pred_df, lookup_df[['neighbor_date','swe_contrib']], how='inner', on='neighbor_date')
    # sum the contributions per date
    combined_df = combined_df[['location_id','date','swe_contrib']].groupby(['location_id','date']).sum().reset_index()
    neighbor_data.extend(combined_df.values)

  all_locations_df = pd.DataFrame(neighbor_data, columns=['location_id','date','neighbor_relative_swe'])
  return all_locations_df

In [None]:
k = 15
## Get ground measure neighbor relative SWE for eval data
# Build df for grid cells data
location_df = (grid_elev[['cell_id', 'latitude', 'longitude', 'elevation_m']]
               .rename(columns={'cell_id': 'location_id'}))
location_seq_df = test_pred_seq[['date']]
# Build df for ground measures
neighbor_df = (ground_measures_metadata[['station_id', 'elevation_m', 'latitude', 'longitude']]
               .rename(columns={'station_id': 'location_id'}))
is_run_date = gm_recent['date'] == run_date
neighbor_seq_df = gm_recent.loc[is_run_date, ['location_id', 'date', 'relative_swe']]
# Only include ground measure stations that have a row for the run_date
# NOTE(review): a station whose relative_swe is NaN on run_date is still kept here.
neighbor_df = neighbor_df.merge(neighbor_seq_df, how='inner', on='location_id')
print(neighbor_seq_df.date.max())

In [None]:
# Neighbor-based relative SWE for every grid cell; the function prints a
# progress counter every 1000 locations.
neighbor_swe_df = get_k_neighbor_swe_data(location_df, neighbor_df, location_seq_df, neighbor_seq_df, k)

In [None]:
# Attach the neighbor feature; the left join keeps cells without an estimate (value NaN).
test_dataset = pd.merge(test_dataset, neighbor_swe_df, how='left', on=['location_id','date'])

In [None]:
# Add in the water feature (per location; the left join keeps every prediction row)
test_dataset = pd.merge(test_dataset, grid_water, how='left', on=['location_id'])

In [None]:
# Add in the land category feature (per location; supplies the lccs_* columns used by the models below)
test_dataset = pd.merge(test_dataset, grid_lccs, how='left', on=['location_id'])

In [None]:
# Add in the elevation gradient features
# NOTE(review): positional rename - assumes the parquet file has exactly these
# five columns in this order - TODO confirm.
grid_elev_grad.columns = ['location_id','east_elev_grad','south_elev_grad','east_elev_pct','south_elev_pct']
test_dataset = pd.merge(test_dataset, grid_elev_grad, how='left', on='location_id')

### Run XGBoost on transformed data

In [None]:
# Climate features shared by all three models: three raw columns plus four 3-day rolling means.
climate_cols = ['SNOD', 'WEASD', 'SNOWC'] + ['TMP_3_day','SPFH_3_day','PRES_3_day','PWAT_3_day']
# XGBoost feature list. The order presumably has to match the order used when
# the model and scaler were fitted - TODO confirm.
xgb_cols = [
            'latitude',
            'longitude',
            'elevation_m',
            'elevation_var_m',
            'snow_season_day',
            'water',
            'neighbor_relative_swe',
            'east_elev_grad',
            'south_elev_grad',
            ] \
            + modis_roll_cols + climate_cols

X = test_dataset[xgb_cols]

In [None]:
# Load the saved XGBoost regressor from Drive.
xgb_all = xgb.XGBRegressor()
xgb_all.load_model('/content/drive/MyDrive/snocast/eval/models/xgb_all.txt')

In [None]:
# Load the saved scaler object. joblib.load unpickles the file, so only load
# files from a trusted location.
scaler = load('/content/drive/MyDrive/snocast/eval/models/std_scaler.bin')

In [None]:
# Scale straight from the feature table rather than `X = scaler.transform(X)`:
# that form scales the already-scaled array again whenever the cell is re-run,
# which silently changes the predictions.
X = scaler.transform(test_dataset[xgb_cols])

In [None]:
# make predictions on new data (X is the scaled feature array)
xgb_pred = xgb_all.predict(X)

In [None]:
# Replace negative predictions with 0 (SWE presumably cannot be negative).
xgb_pred[xgb_pred < 0] = 0.0

In [None]:
# Sanity check: min, mean, std and max of the XGBoost predictions.
print(xgb_pred.min(), xgb_pred.mean(), xgb_pred.std(), xgb_pred.max())

In [None]:
# Keep the XGBoost predictions next to the features; the ensemble step reads this column.
test_dataset['xgb_swe_pred'] = xgb_pred

### Run LightGBM on Transformed Data

In [None]:
# Make sure the neighbor feature is numeric before it is passed to LightGBM.
# NOTE(review): it is presumably object-typed because get_k_neighbor_swe_data
# builds its result from a list of mixed-type rows - TODO confirm.
test_dataset['neighbor_relative_swe'] = test_dataset['neighbor_relative_swe'].astype(float)

In [None]:
# LightGBM feature list: the same numeric features as XGBoost plus the three
# land-cover columns. The features are passed unscaled.
cat_cols = ['lccs_0', 'lccs_1', 'lccs_2']
lgb_cols = [
            'latitude',
            'longitude',
            'elevation_m',
            'elevation_var_m',
            'snow_season_day',
            'water',
            'neighbor_relative_swe',
            'east_elev_grad',
            'south_elev_grad',
            ] \
            + modis_roll_cols + climate_cols + cat_cols

X_lgb = test_dataset[lgb_cols]

In [None]:
# Load the saved LightGBM booster from Drive.
lgb_reg = lgb.Booster(model_file='/content/drive/MyDrive/snocast/eval/models/lgb_all.txt')  # init model

In [None]:
# Predict with LightGBM on the unscaled feature frame.
lgb_pred = lgb_reg.predict(X_lgb)

In [None]:
# Replace negative predictions with 0 (SWE presumably cannot be negative).
lgb_pred[lgb_pred < 0] = 0.0

In [None]:
# Sanity check: min, mean, std and max of the LightGBM predictions.
print(lgb_pred.min(), lgb_pred.mean(), lgb_pred.std(), lgb_pred.max())

In [None]:
# Keep the LightGBM predictions next to the features; the ensemble step reads this column.
test_dataset['lgb_swe_pred'] = lgb_pred

### Run CatBoost on Transformed Data

In [None]:
# Missing land-cover values become 0 and both columns become integers.
# This changes test_dataset in place, after the LightGBM features were taken from it.
test_dataset['lccs_1'] = test_dataset['lccs_1'].fillna(0).astype(int)
test_dataset['lccs_2'] = test_dataset['lccs_2'].fillna(0).astype(int)

In [None]:
# CatBoost feature list: the LightGBM features plus region.
# Note that this rebinds cat_cols, which held only the three lccs columns in
# the LightGBM section.
cat_cols = ['lccs_0', 'lccs_1', 'lccs_2','region']
cb_cols = [
            'latitude',
            'longitude',
            'elevation_m',
            'elevation_var_m',
            'snow_season_day',
            'water',
            'neighbor_relative_swe',
            'east_elev_grad',
            'south_elev_grad',
            ] \
            + modis_roll_cols + climate_cols + cat_cols

X_cb = test_dataset[cb_cols]

In [None]:
# Wrap the CatBoost features in a Pool. The categorical columns are located by
# name, so the positions stay correct if the feature list above changes length
# or order (they were previously hard-coded as 20-23).
cb_dataset = cb.Pool(data=X_cb,
                     cat_features=[int(X_cb.columns.get_loc(col))
                                   for col in ('lccs_0', 'lccs_1', 'lccs_2', 'region')])

In [None]:
# Load the saved CatBoost regressor from Drive.
cb_model = cb.CatBoostRegressor()
cb_model.load_model('/content/drive/MyDrive/snocast/eval/models/cb_all.txt')

In [None]:
# NOTE(review): predicts from the X_cb DataFrame directly; the cb_dataset Pool
# built in the previous cell is not used by this call.
cb_pred = cb_model.predict(X_cb)

In [None]:
# Replace negative predictions with 0 (SWE presumably cannot be negative).
cb_pred[cb_pred < 0] = 0.0

In [None]:
# Sanity check: min, mean, std and max of the CatBoost predictions.
print(cb_pred.min(), cb_pred.mean(), cb_pred.std(), cb_pred.max())

In [None]:
# Keep the CatBoost predictions next to the features; the ensemble step reads this column.
test_dataset['cb_swe_pred'] = cb_pred

### Ensemble model predictions

In [None]:
def gb_ensemble(row):
  '''Blend the three model predictions of one row with region-specific weights.

  `row` must provide `region`, `lgb_swe_pred` and `xgb_swe_pred`, plus
  `cb_swe_pred` for every region except 'central rockies', which blends only
  LightGBM and XGBoost. Regions other than 'sierras' and 'central rockies'
  share one set of weights.
  '''
  lgb_part = row['lgb_swe_pred']
  xgb_part = row['xgb_swe_pred']
  region = row['region']

  # Two-model blend: CatBoost is left out for this region.
  if region == 'central rockies':
    return 0.80*lgb_part + 0.20*xgb_part

  if region == 'sierras':
    w_lgb, w_xgb, w_cb = 0.40, 0.25, 0.35
  else:
    w_lgb, w_xgb, w_cb = 0.70, 0.20, 0.10

  return w_lgb*lgb_part + w_xgb*xgb_part + w_cb*row['cb_swe_pred']

In [None]:
# Blend the three model predictions row by row with the region-specific weights.
test_dataset['best_swe_pred'] = test_dataset.apply(gb_ensemble, axis=1)

In [None]:
# Sanity check of the blended prediction: min, mean, std, max and median.
print(test_dataset['best_swe_pred'].min(), 
      test_dataset['best_swe_pred'].mean(), 
      test_dataset['best_swe_pred'].std(), 
      test_dataset['best_swe_pred'].max(),
      test_dataset['best_swe_pred'].median())

In [None]:
# Mean blended prediction per region.
test_dataset.groupby('region')['best_swe_pred'].mean()

In [None]:
# Convert the data to submission format: one row per cell_id, one column per date.
# pivot_table averages the values if a cell/date pair occurs more than once.
test_preds = (test_dataset[['location_id','date','best_swe_pred']]
              .rename(columns={'location_id': 'cell_id', 'best_swe_pred': 'swe_pred'})
              .pivot_table(index='cell_id', columns='date'))
# Drop the outer 'swe_pred' column level and the leftover 'date' axis name.
test_preds.columns = test_preds.columns.droplevel().rename(None)
test_preds = test_preds.reset_index(drop=False)

In [None]:
# Save the full feature and prediction table for this run_date.
test_dataset.to_parquet(f'/content/drive/MyDrive/snocast/eval/data/test_preds_{run_date}.parquet')

In [None]:
# Start from the blank template on the first run, otherwise extend the previous
# submission. The flag is tested for truthiness, the same way as in the cell
# that loads prev_submission.
if first_run:
  submission = submission_format
else:
  submission = prev_submission

# Align predictions to the submission rows by cell_id. Assigning the raw
# values by position is only correct when both tables list exactly the same
# cells in exactly the same order.
_pred_by_cell = test_preds.set_index('cell_id')[run_date]
_run_preds = submission['cell_id'].map(_pred_by_cell)
if _run_preds.isna().any():
  # Fail loudly instead of writing a submission with holes.
  raise ValueError(f'{int(_run_preds.isna().sum())} submission cells have no prediction for {run_date}')
submission[run_date] = _run_preds

In [None]:
# Spot-check ten random rows of the submission.
submission.sample(10)

In [None]:
# Write this run's submission; the next run loads it as prev_submission via its date suffix.
submission.to_csv(f'/content/drive/MyDrive/snocast/eval/submissions/submission_{run_date}.csv', index=False)