In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell execution.
%load_ext autoreload
%autoreload 2

# Rescale features

This notebook computes the mean and standard deviation (std) of various features.
We will use these values to rescale the data before training.

In [None]:
import dask
import dask_jobqueue
import dask.dataframe as dd
import dask.distributed
import os
import numpy as np
import pathlib
import pandas as pd

## Boot cluster



In [None]:
# SLURM-backed dask cluster named 'smc01-dask'. The env_extra commands are
# passed to dask_jobqueue so that each job sources the bash profile and
# activates the `smc01` conda environment.
cluster = dask_jobqueue.SLURMCluster(
    name='smc01-dask',
    env_extra=[
        'source ~/.bash_profile',
        'conda activate smc01',
    ],
)

In [None]:
# Ask the cluster to scale to 6 jobs.
cluster.scale(jobs=6)

In [None]:
# Connect a distributed client to the cluster created above.
client = dask.distributed.Client(cluster)

In [None]:
# Display the client as the cell output.
client

In [None]:
# The root data directory comes from the DATA_DIR environment variable.
# Fail with an explicit message when it is unset: pathlib.Path(None) would
# otherwise raise a TypeError that does not mention the variable.
_data_dir_env = os.getenv('DATA_DIR')
if _data_dir_env is None:
    raise RuntimeError(
        'The DATA_DIR environment variable is not set; '
        'export it before starting the notebook kernel.'
    )
DATA_DIR = pathlib.Path(_data_dir_env)

# Directory holding the dataset used by this notebook.
DATASET_DIR = DATA_DIR / 'interpolated/2021-12-20-gdps-metar/'

In [None]:
# Path object for the dataset directory.
# NOTE(review): DATASET_DIR is already a pathlib.Path (built with the `/`
# operator), so this wrapper is redundant but harmless.
dataset_path = pathlib.Path(DATASET_DIR)

In [None]:
# Training split: keep only parquet files whose name starts with one of
# these years.
TRAIN_YEARS = ('2019', '2020')

# The 4-character name prefix is compared as text rather than through int(),
# so a parquet file whose name does not start with digits is skipped instead
# of raising ValueError. sorted() makes the file order independent of the
# order in which the filesystem lists the directory.
train_files = sorted(
    f for f in dataset_path.glob('*.parquet') if f.name[:4] in TRAIN_YEARS
)

In [None]:
# Preview the first 10 selected files.
train_files[:10]

In [None]:
# Build a dask dataframe over the selected parquet files.
df = dd.read_parquet(train_files)

In [None]:
# Show the first rows of the dataframe.
df.head()

## Means and stds

In [None]:
# Per-column mean of the training dataframe; .compute() evaluates the dask
# expression and returns the result.
# NOTE(review): the std in the following cell is computed in a separate
# .compute() call; a single dask.compute(df.mean(), df.std()) would presumably
# share the parquet reads -- confirm before changing.
mean = df.mean().compute()

In [None]:
# Per-column standard deviation of the training dataframe.
std = df.std().compute()

In [None]:
# Plain dict of column name -> mean, built the same way as std_dict so the
# name matches the type. The printing loop only relies on .items().
mean_dict = dict(mean)

In [None]:
# Print one `"name": value,` line per entry of mean_dict, with the value in
# scientific notation with 4 decimals.
for k, v in mean_dict.items():
    entry = f'"{k}": {v:.4e},'
    print(entry)

In [None]:
# Plain dict of column name -> standard deviation.
std_dict = dict(std)

In [None]:
# Print one `"name": value,` line per entry of std_dict, with the value in
# scientific notation with 4 decimals.
for k, v in std_dict.items():
    entry = f'"{k}": {v:.4e},'
    print(entry)

## Other fields

### Elevation

In [None]:
# Evaluate the 'elevation' column so the cells below can analyse it locally.
elevation = df['elevation'].compute()

In [None]:
# Histogram of the elevation values.
elevation.plot.hist()

In [None]:
# One elevation per station: the first 'elevation' value of each 'station'
# group.
elevation_by_station = df.groupby('station').agg({'elevation': 'first'}).compute()

In [None]:
# Stations whose elevation is negative.
elevation_by_station[elevation_by_station['elevation'] < 0.0]

In [None]:
# Minimum of the per-station elevations.
elevation_by_station.min()

In [None]:
# Log of the elevation shifted by 44 (plus 1e-6).
# NOTE(review): 44 is a magic constant, presumably chosen from the minimum
# elevation so the argument of the log stays positive -- confirm.
np.log(elevation + 44 + 1e-6)

In [None]:
# Histogram of log(elevation + 1e-6).
# NOTE(review): no +44 shift is applied here, unlike the preceding cell, so
# np.log yields NaN for any negative elevation -- confirm which variant is
# intended.
np.log(elevation+ 1e-6).plot.hist()

In [None]:
# Mean of log(elevation + 1e-6).
# NOTE(review): without a shift, np.log yields NaN for any negative elevation;
# pandas' mean() skips NaN by default, so such stations would be silently
# left out of this statistic -- confirm that is intended.
np.log(elevation + 1e-6).mean()

In [None]:
# Standard deviation of log(elevation + 1e-6).
# NOTE(review): without a shift, np.log yields NaN for any negative elevation;
# pandas' std() skips NaN by default, so such stations would be silently
# left out of this statistic -- confirm that is intended.
np.log(elevation + 1e-6).std()

### GDPS precipitation rate (Prate)

In [None]:
# Evaluate the 'gdps_prate' column for local analysis.
gdps_prate = df['gdps_prate'].compute()

In [None]:
# Histogram of the raw gdps_prate values.
gdps_prate.plot.hist()

In [None]:
# Histogram of the cube root of gdps_prate.
# NOTE(review): a fractional power of a negative float is NaN; this only
# matters if the column can contain negative values -- confirm.
(gdps_prate**(1./3.)).plot.hist()

### 10m wind speed (si)

In [None]:
# Evaluate the 'obs_10si' column for local analysis.
obs_10si = df['obs_10si'].compute()

In [None]:
# Histogram of obs_10si restricted to the range 0-10.
obs_10si.plot.hist(range=(0,10))

In [None]:
# Mean of obs_10si.
obs_10si.mean()

In [None]:
# Standard deviation of obs_10si.
obs_10si.std()

In [None]:
# Largest obs_10si value.
obs_10si.max()

In [None]:
# Rows whose obs_10si exceeds 400.
# NOTE(review): presumably an outlier inspection; 400 looks like an ad-hoc
# threshold -- confirm.
df[df['obs_10si'] > 400].compute()

In [None]:
# 1st percentile of obs_10si.
obs_10si.quantile(q=0.01)

### Relative Humidity (Obs and GDPS)

In [None]:
# Evaluate the 'obs_2r' column for local analysis.
obs_2r = df['obs_2r'].compute()

In [None]:
# Histogram of obs_2r.
obs_2r.plot.hist()

In [None]:
# Largest obs_2r value.
obs_2r.max()

In [None]:
# 99.9th percentile of obs_2r.
obs_2r.quantile(q=0.999)

In [None]:
# Histogram of the 'gdps_2r' column, evaluated inline without keeping it in a
# variable.
df['gdps_2r'].compute().plot.hist()

### OBS Mean Sea Level Pressure

In [None]:
# Evaluate the 'obs_prmsl' column for local analysis.
obs_prmsl = df['obs_prmsl'].compute()

In [None]:
# Histogram of obs_prmsl.
obs_prmsl.plot.hist()

In [None]:
# Smallest obs_prmsl value.
obs_prmsl.min()

In [None]:
# Largest obs_prmsl value.
obs_prmsl.max()

### GDPS Albedo

In [None]:
# Evaluate the 'gdps_al' column for local analysis.
gdps_alb = df['gdps_al'].compute()

In [None]:
# Histogram of the gdps_al values.
gdps_alb.plot.hist()

### GDPS Wind direction

In [None]:
# Evaluate the 'gdps_10wdir' column for local analysis.
gdps_wdir = df['gdps_10wdir'].compute()

In [None]:
# Histogram of the gdps_10wdir values.
gdps_wdir.plot.hist()