### Clean raw GFS download

In [1]:
import pandas as pd

In [2]:
# Read the raw GFS export and drop the 'system:index' and '.geo' columns,
# which are not used anywhere below.
gfs = (
    pd.read_csv('../data/raw/gfs_forecast.csv')
    .drop(columns=['system:index', '.geo'])
)

In [3]:
# The creation stamp is stored as a YYYYMMDDHH value; go through its string
# form so it can be parsed with an explicit format.
gfs['forecast_creation_dt'] = pd.to_datetime(
    gfs['forecast_creation_dt'].astype(str),
    format='%Y%m%d%H',
)

In [4]:
# Count rows per sample_idx, then tally how many samples have each row count.
(
    gfs
    .groupby('sample_idx')
    .size()
    .value_counts()
)

5    4961
4       7
3       1
dtype: int64

A handful of samples (8 of 4,969) have fewer than the expected 5 forecast rows. We'll track down why later; for now, don't worry about them too much — they are simply excluded further down.

In [5]:
# Derive timing columns:
#   forecast_dt   - creation time plus the forecast_hours offset
#   forecast_date - calendar date of forecast_dt
#   forecast_hour - hour-of-day of forecast_dt
gfs = gfs.assign(
    forecast_dt=lambda frame: (
        frame['forecast_creation_dt']
        + pd.to_timedelta(frame['forecast_hours'], unit='hour')
    ),
    forecast_date=lambda frame: frame['forecast_dt'].dt.date,
    forecast_hour=lambda frame: frame['forecast_dt'].dt.hour,
)

# Move the identifying/timing columns to the front, keeping every other
# column in its existing relative order.
front_cols = ['sample_idx', 'forecast_dt', 'forecast_date', 'forecast_hour', 'forecast_creation_dt']
remaining_cols = [name for name in gfs.columns if name not in front_cols]
gfs = gfs[front_cols + remaining_cols]

In [6]:
# For now, retain only the samples that have exactly 5 rows; samples with
# any other row count are filtered out here.
gfs = gfs.loc[
    gfs.groupby('sample_idx')['sample_idx'].transform(len).eq(5)
]

In [7]:
# Relabel the forecast valid at sample time as 'sample'.
# After sorting by forecast_dt within each sample_idx, the last row of every
# group is the latest forecast for that sample; that row gets the label.
gfs = gfs.sort_values(['sample_idx', 'forecast_dt'])

# forecast_hour holds integer hours. Cast it to object *before* writing the
# string label: assigning a str into an integer column relies on silent
# upcasting, which pandas deprecated in 2.1 (PDEP-6). Casting to object
# (rather than str) keeps the hours as integers, so the pivoted column order
# and the generated column names are unchanged.
gfs['forecast_hour'] = gfs['forecast_hour'].astype(object)
gfs.loc[gfs.groupby('sample_idx').tail(1).index, 'forecast_hour'] = 'sample'

# Pivot wide: one row per sample_idx, one column per
# (variable, forecast_hour) pair. The timing columns are dropped first so
# that only the forecast variables are spread across columns.
gfs_wide = gfs \
  .drop(columns=['forecast_creation_dt', 'forecast_hours', 'forecast_date', 'forecast_dt']) \
  .pivot(index='sample_idx', columns='forecast_hour')

# Flatten the (variable, forecast_hour) MultiIndex into '<variable>_<hour>' names.
gfs_wide.columns = gfs_wide.columns.map('{0[0]}_{0[1]}'.format)

In [8]:
# Save the wide table; its sample_idx index is written out as the first CSV column.
gfs_wide.to_csv(path_or_buf='../data/clean/gfs_forecasts.csv')