In [None]:
# This notebook takes 27 hours to run (it downloads the data and builds the train dataset)

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import geopandas as gpd
from tqdm.auto import tqdm

import os
from src.data import hrrr_week_download
from src.features import hrrr_week_features, hrrr_winter_features

from datetime import datetime
from datetime import timedelta
from pathlib import Path

In [None]:
# Filesystem locations used throughout the notebook.
# NOTE(review): several directory names contain typos ('hhhr', 'feastures',
# 'timeserias'). They are runtime paths, so correcting the spelling would also
# require renaming any data already written to disk.

# Download target for the HRRR data; one sub-folder per date is read back from here.
PATH_HRRR_DATA = 'data_train/external/hhhr_data/'
# Where the weekly HRRR feature CSVs are written and later loaded from.
PATH_WEEK_FEATURES = 'data_train/processed/hrrr_week_feastures/'
# Where the winter HRRR feature CSVs are written and later loaded from.
PATH_WINTER_FEATURES = 'data_train/processed/hrrr_winter_feastures/'
# Passed as the save path of the weekly step and as the input of the winter step.
PATH_TIMESERIES = 'data_train/processed/hrrr_timeserias/'
# Grid-cell file loaded into `grid_cells`.
PATH_GRID = 'data/input/grid_cells.geojson'
# Not referenced by any cell visible in this notebook — TODO confirm it is still needed.
PATH_GRID2 = 'data/input/grid_cell_stage2.geojson'
# Download target for MODIS data; its sub-folders are iterated for feature extraction.
PATH_MODIS = 'data_train/external/modis/'
# Output folder of the MODIS feature step; CSVs below it are loaded recursively.
PATH_MODIS_FEATURES = 'data_train/processed/modis_features/'

In [None]:
# Load the grid-cell file once; it is reused by the HRRR and MODIS feature steps
# and for deriving per-cell centroid coordinates.
grid_cells = gpd.read_file(Path(PATH_GRID))

In [None]:
# Dates for train and test: gathered from the column headers of the input tables.
train_labels = pd.read_csv('data/input/train_labels.csv')
submission = pd.read_csv('data/input/submission_format.csv')
train_features = pd.read_csv('data/input/ground_measures_train_features.csv')
test_features = pd.read_csv('data/input/ground_measures_test_features.csv')

# The positional offsets (63, 1, 1, 57) select which header columns count as dates.
# NOTE(review): presumably they skip the id column and earlier dates — confirm against the CSVs.
aaa = [
    *train_labels.columns[63:],
    *submission.columns[1:],
    *test_features.columns[1:],
    *train_features.columns[57:],
]

# Unique date strings in ascending (lexicographic) order.
dates = sorted(set(aaa))

In [None]:
# HRRR data download: fetch data for every collected date into PATH_HRRR_DATA.
# The weekly feature step below reads one sub-folder per date from that location.
hrrr_week_download.download_data(dates, output_path=PATH_HRRR_DATA)

In [None]:
# HRRR weekly features: one call per date that already lies in the past.
week_features_dir = str(Path(PATH_WEEK_FEATURES))
timeseries_dir = str(Path(PATH_TIMESERIES))

for date in tqdm(dates, desc='timestamp'):
    # The clock is re-read on every iteration, exactly as a per-date check.
    if not (pd.to_datetime(date) < datetime.now()):
        continue

    date_folder = str(Path(PATH_HRRR_DATA) / date)
    hrrr_week_features.features_for_timestamp(
        folder=date_folder,
        grid_cells=grid_cells,
        features_save_path=week_features_dir,
        save_path=timeseries_dir,
    )

In [None]:
# HRRR winter features: built from the time series saved by the weekly step
# (PATH_TIMESERIES) and written to PATH_WINTER_FEATURES.
# NOTE(review): the meaning of last_year=False is defined in
# src.features.hrrr_winter_features — confirm there before changing it.
hrrr_winter_features.winter_features(path_timeseries= PATH_TIMESERIES, 
                                     features_save_path=PATH_WINTER_FEATURES, 
                                     last_year=False)

In [None]:
# MODIS data download
from src.data import modis_downloader
from src.features import modis_features

# Distinct year prefixes (first four characters) of the collected date strings.
years = {d[:4] for d in dates}

# Ensure a 'fol<year>' folder exists under both MODIS roots before downloading.
# exist_ok=True replaces the separate exists-check, so there is no window
# between checking and creating; os.path.join gives the same result for the
# configured roots and also works if a root lacks its trailing slash.
for year in years:
    for root in (PATH_MODIS_FEATURES, PATH_MODIS):
        directory = os.path.join(root, 'fol' + year)
        os.makedirs(directory, exist_ok=True)

modis_downloader.download(PATH_MODIS, how='everything')

In [None]:
# MODIS features: process every entry found directly under PATH_MODIS.
for year_dir in os.listdir(PATH_MODIS):
    modis_features.modis_features(
        os.path.join(PATH_MODIS, year_dir),
        grid_cells,
        all_files=True,
        last_files=0,
        output_path=PATH_MODIS_FEATURES,
    )

Train dataset: join the labels with the HRRR weekly/winter, MODIS, DEM and grid-cell features.

In [None]:
# Build the long-format label table: one row per (cell_id, valid_time) pair
# with its `swe` value, combining both label files.
sub1 = pd.read_csv('data/input/labels_2020_2021.csv')
sub1 = sub1.melt(id_vars='cell_id', var_name='valid_time', value_name='swe')

sub2 = pd.read_csv('data/input/train_labels.csv')
# Fix: melt `sub2` itself. The previous code melted `sub`, which is only
# defined further down, so a fresh kernel raised NameError and a re-run
# silently melted the stale, already-filtered result instead.
sub2 = sub2.melt(id_vars='cell_id', var_name='valid_time', value_name='swe')

sub = pd.concat([sub2, sub1])
# Keep only rows that carry a label and whose date is among the collected dates.
sub = sub[sub['swe'].notna()]
sub = sub[sub['valid_time'].isin(dates)]

In [None]:
# modis: read every CSV found recursively under PATH_MODIS_FEATURES and stack them.
paths_modis = [*Path(PATH_MODIS_FEATURES).rglob('*.csv')]
df_modis = pd.concat([pd.read_csv(csv_path) for csv_path in paths_modis])

In [None]:
# weekly features: read every CSV found recursively under PATH_WEEK_FEATURES and stack them.
paths_week_f = [*Path(PATH_WEEK_FEATURES).rglob('*.csv')]
df_week = pd.concat([pd.read_csv(csv_path) for csv_path in paths_week_f])

In [None]:
# winter features: read every CSV found recursively under PATH_WINTER_FEATURES and stack them.
paths_winter_f = [*Path(PATH_WINTER_FEATURES).rglob('*.csv')]
df_winter = pd.concat([pd.read_csv(csv_path) for csv_path in paths_winter_f])

In [None]:
# grid features
df_dem = pd.read_csv('data/raw/dem_features.csv')

# Per-cell centroid coordinates: x becomes `lon`, y becomes `lat`.
cell_centroids = grid_cells.centroid
df_grid = grid_cells[['cell_id']].assign(
    lon=cell_centroids.x,
    lat=cell_centroids.y,
)

In [None]:
# Attach every feature table to the labels with left joins, so each label row is kept.
df = (
    sub.copy()
    .merge(df_week, on=['cell_id', 'valid_time'], how='left')
    .merge(df_winter, on=['cell_id', 'valid_time'], how='left')
    .merge(df_modis, on=['cell_id', 'valid_time'], how='left')
    .merge(df_dem, on=['cell_id'], how='left')
    .merge(df_grid, on=['cell_id'], how='left')
)

# Calendar features derived from valid_time; the parsed helper column is dropped again.
df = (
    df.assign(dt_date=lambda frame: pd.to_datetime(frame['valid_time'], format='%Y-%m-%d'))
    .assign(
        dayofyear=lambda frame: frame['dt_date'].dt.dayofyear,
        year=lambda frame: frame['dt_date'].dt.year,
    )
    .drop(columns=['dt_date'])
)

In [None]:
# Uncomment to rewrite the train dataset
# df.to_csv('data/raw/train_dataset.csv', index=False)