In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them take effect immediately.
%load_ext autoreload
%autoreload 2

In [None]:
import dask
import dask.distributed
import dask.dataframe as dd
import dask_jobqueue
import numpy as np
import os
import pandas as pd
import pathlib
import seaborn as sns
import sklearn

In [None]:
# Describe the SLURM-backed Dask cluster. Two shell commands are passed to the
# job script: source the user's bash profile, then activate the `smc01` conda
# environment.
# NOTE(review): newer dask-jobqueue releases rename `env_extra` to
# `job_script_prologue` — confirm against the installed version.
cluster = dask_jobqueue.SLURMCluster(env_extra=[
    'source ~/.bash_profile',
    'conda activate smc01',
])

In [None]:
# Ask the cluster for two SLURM jobs, then connect a distributed client to it.
# The scale request is issued before the client is created; keep that order.
cluster.scale(jobs=2)  # Scale to two working nodes as configured.
client = dask.distributed.Client(cluster)

In [None]:
# Display the client's summary to check the connection to the cluster.
client

In [None]:
# Root of the data tree, taken from the DATA_DIR environment variable.
# Indexing os.environ (instead of os.getenv) fails with a KeyError that names
# the missing variable, rather than an opaque TypeError from pathlib.Path(None).
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])
# Dataset directory read by the loading cells of this notebook.
INPUT_DIR = DATA_DIR / '2021-03-17-ppdataset'

In [None]:
# CSV files located directly inside INPUT_DIR. iterdir() yields entries in
# arbitrary order, so the list is sorted to make the `csv_files[0:100]` slice
# taken when loading select the same files on every run.
csv_files = sorted(f for f in INPUT_DIR.iterdir() if f.suffix == '.csv')

In [None]:
# Names of the columns kept when reading the CSV files.
view = [
    'station',
    'latitude',
    'longitude',
    'elevation',
    'date',
    'step',
    'obs_2t',
    'gdps_2t',
]

In [None]:
# Build a Dask dataframe over the first 100 CSV files, restricted to the
# `view` columns. `date` values are converted with pd.to_datetime and `step`
# values with pd.to_timedelta while parsing.
# NOTE: the following cell rebinds `df` to the parquet files.
df = dd.read_csv(
    csv_files[:100],
    usecols=view,
    converters=dict(date=pd.to_datetime, step=pd.to_timedelta),
)

In [None]:
# Load every parquet file in INPUT_DIR as a single Dask dataframe.
# INPUT_DIR is a pathlib.Path, so the previous `INPUT_DIR + '/*.parquet'`
# raised TypeError (Path does not support `+`); the glob pattern is now joined
# with `/` and passed to Dask as a plain string.
# NOTE: this rebinds `df`, replacing the CSV-backed frame from the cell above.
df = dd.read_parquet(str(INPUT_DIR / '*.parquet'))

In [None]:
# Display the Dask dataframe; it has not been computed into memory yet.
df

In [None]:
# Materialise the Dask dataframe; from here on `df` is the computed result.
# NOTE(review): `df` is rebound in place, so this cell cannot simply be re-run
# once it has executed — presumably the result is a pandas DataFrame, which has
# no .compute(); re-run the loading cell first.
df = df.compute()

In [None]:
# Show the dtype of every column (taking head() first does not change dtypes).
df.dtypes

In [None]:
# Forecast error on 2t: magnitude first, then the signed value
# (gdps_2t minus obs_2t). Column creation order is kept as before.
df['abs_error_2t'] = (df['gdps_2t'] - df['obs_2t']).abs()
df['error_2t'] = df['gdps_2t'] - df['obs_2t']

# Step expressed in hours. The CSV loader parses `step` with pd.to_timedelta;
# for a timedelta column, dividing by 3600 merely yields a smaller timedelta,
# so it is converted to seconds first. A numeric `step` is treated as a number
# of seconds, exactly as before.
if pd.api.types.is_timedelta64_dtype(df['step']):
    df['step_hours'] = df['step'].dt.total_seconds() / 3600
else:
    df['step_hours'] = df['step'] / 3600

In [None]:
# List the column names, which now include the derived error and step columns.
df.columns

In [None]:
# Correlation of every column with the signed 2t error.
# NOTE(review): recent pandas versions no longer drop non-numeric columns
# silently in DataFrame.corr(); if `df` holds string or datetime columns this
# may raise — consider passing numeric_only=True where the installed pandas
# supports it.
df.corr()['error_2t']

In [None]:
# Model inputs: every column whose name begins with 'gdps', in frame order.
feature_cols = [
    column_name
    for column_name in df.columns
    if column_name.startswith('gdps')
]

In [None]:
# Show which columns were selected as features.
feature_cols

In [None]:
# Feature matrix for the whole frame: float64, one column per entry of
# feature_cols, in that order. Selecting the columns in a single call replaces
# the uninitialised np.empty() buffer and the per-column fill loop.
X = df[feature_cols].to_numpy(dtype=np.float64)

In [None]:
# Display the feature matrix as a quick sanity check.
X

In [None]:
# Count the rows for each distinct `date` value.
df['date'].value_counts()

In [None]:
# Date-based split: rows dated up to and including SPLIT_DATE form the
# training set, rows dated after it form the validation set. The cut-off is
# defined once so the two masks cannot drift apart.
SPLIT_DATE = '2019-01-05'
train_set = df[df['date'] <= SPLIT_DATE]
val_set = df[df['date'] > SPLIT_DATE]

In [None]:
# Feature matrices for the two splits: float64, one column per entry of
# feature_cols. Built with a single column selection instead of filling an
# uninitialised buffer column by column.
train_X = train_set[feature_cols].to_numpy(dtype=np.float64)
# `val_Y` holds the validation *features* despite its name (the targets live
# in `val_y`). The name is kept for existing cells; `val_X` is the clearer
# alias and refers to the same array.
val_X = val_set[feature_cols].to_numpy(dtype=np.float64)
val_Y = val_X


In [None]:
# Regression targets: the signed 2t error of each split, copied into
# standalone NumPy arrays.
train_y = train_set['error_2t'].to_numpy(copy=True)
val_y = val_set['error_2t'].to_numpy(copy=True)

In [None]:
# Display the training targets as a quick sanity check.
train_y

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 100 trees. Despite the variable name this is a regressor.
# random_state is pinned so that repeated runs of the notebook grow the same
# forest and produce comparable results.
classifier = RandomForestRegressor(n_estimators=100, random_state=0)

In [None]:
# Fit the forest on the training feature matrix and the error_2t targets.
classifier.fit(train_X, train_y)