In [1]:
import glob
import time 
import os 


import numpy as np
import pandas as pd


import h2o

from h2o.estimators.random_forest import H2ORandomForestEstimator
from sklearn.metrics import mean_absolute_error, mean_squared_error 

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

import matplotlib.pyplot as plt

## Load data

Read every whitespace-delimited `data/*.txt` file and concatenate them into a single DataFrame.

In [2]:
# Column order of the whitespace-delimited data files; they are read without a
# header row, so these names are assigned positionally.
col_names = ['yyyy', 'mm', 'dd', 'HH', 'LON', 'LAT', 'Pres(hpa)', 'T(Celcius)', 'SST(Celcius)', 'RH(%)', 'u(m/s)', 'v(m/s)', 'WS(m/s)', 'WD(degrees)', 'q(g/kg)', 'dq(g/kg)', 'ASTD(Celcius)', 'RiB', 'EDH(m)']

# glob.glob returns paths in arbitrary order; sorting keeps the row order of the
# concatenated frame the same on every run and every machine.
data_files = sorted(glob.glob('data/*.txt'))
if not data_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No input files matched 'data/*.txt'")

# Raw string for the regex separator: '\s' is an invalid escape in a normal string.
df = pd.concat(
    [pd.read_csv(f, header=None, names=col_names, sep=r'\s+') for f in data_files],
    ignore_index=True,
    sort=False,
)

print(df.shape)

df.head()
              

(6120516, 19)


Unnamed: 0,yyyy,mm,dd,HH,LON,LAT,Pres(hpa),T(Celcius),SST(Celcius),RH(%),u(m/s),v(m/s),WS(m/s),WD(degrees),q(g/kg),dq(g/kg),ASTD(Celcius),RiB,EDH(m)
0,2015,10,13,0,-76.7401,34.8289,1008.15,18.86,19.72,94.0,-0.06,0.81,0.82,175.97,12.63,1.26,-0.85,0.86,0.0
1,2015,10,13,1,-76.7178,34.8293,1008.2,18.78,19.72,93.98,-0.09,0.83,0.84,173.9,12.56,1.32,-0.94,1.03,0.0
2,2015,10,13,2,-76.6955,34.8297,1008.26,18.69,19.65,93.99,-0.18,0.81,0.83,167.29,12.5,1.34,-0.96,1.28,0.0
3,2015,10,13,3,-76.6732,34.83,1008.31,18.6,19.6,94.06,-0.21,0.9,0.92,166.58,12.43,1.36,-1.0,1.58,0.0
4,2015,10,13,4,-76.6509,34.8304,1008.33,18.49,19.53,93.98,-0.25,0.91,0.95,164.8,12.34,1.39,-1.04,2.14,0.0


In [3]:
# Per-column minima: a quick scan for out-of-range or placeholder values.
df.min()

yyyy             2.015000e+03
mm               1.000000e+01
dd               1.300000e+01
HH               0.000000e+00
LON             -7.680440e+01
LAT              3.482890e+01
Pres(hpa)        0.000000e+00
T(Celcius)      -2.731500e+02
SST(Celcius)    -2.731500e+02
RH(%)            1.786000e+01
u(m/s)          -1.304000e+01
v(m/s)          -1.438000e+01
WS(m/s)          0.000000e+00
WD(degrees)      0.000000e+00
q(g/kg)          2.160000e+00
dq(g/kg)        -4.090000e+00
ASTD(Celcius)   -1.748000e+01
RiB             -2.059190e+08
EDH(m)           0.000000e+00
dtype: float64

In [4]:
# Per-column maxima: companion range check, which also exposes any +inf entries.
df.max()

yyyy             2.015000e+03
mm               1.000000e+01
dd               3.000000e+01
HH               2.300000e+01
LON             -7.386330e+01
LAT              3.718000e+01
Pres(hpa)        1.030210e+03
T(Celcius)       2.645000e+01
SST(Celcius)     3.334000e+01
RH(%)            1.000300e+02
u(m/s)           1.495000e+01
v(m/s)           1.473000e+01
WS(m/s)          1.750000e+01
WD(degrees)      3.600000e+02
q(g/kg)                   inf
dq(g/kg)                  inf
ASTD(Celcius)    2.971500e+02
RiB              1.510094e+06
EDH(m)           4.000000e+01
dtype: float64

In [5]:
# Flag every column that holds at least one missing (NaN) value.
df.isna().any()

yyyy             False
mm               False
dd               False
HH               False
LON              False
LAT              False
Pres(hpa)        False
T(Celcius)       False
SST(Celcius)     False
RH(%)            False
u(m/s)           False
v(m/s)           False
WS(m/s)          False
WD(degrees)      False
q(g/kg)          False
dq(g/kg)          True
ASTD(Celcius)    False
RiB              False
EDH(m)           False
dtype: bool

In [6]:
# Treat +/-inf as missing, then drop every row with at least one missing value.
# NOTE(review): df.min() above reported 0 hPa pressure and -273.15 degC
# temperatures, which look like fill values; this step does not filter on them.
# Confirm whether any such rows survive the cleaning.
df = (
    df
    .replace(to_replace=[-np.inf, np.inf], value=np.nan)
    .dropna(axis=0, how='any')
)

In [7]:
# Re-check the per-column maxima after cleaning to confirm the inf values are gone.
# NOTE(review): only maxima are re-checked here; consider looking at df.min() as well.
df.max()

yyyy             2.015000e+03
mm               1.000000e+01
dd               3.000000e+01
HH               2.300000e+01
LON             -7.386330e+01
LAT              3.718000e+01
Pres(hpa)        1.030210e+03
T(Celcius)       2.645000e+01
SST(Celcius)     3.334000e+01
RH(%)            1.000300e+02
u(m/s)           1.495000e+01
v(m/s)           1.473000e+01
WS(m/s)          1.750000e+01
WD(degrees)      3.600000e+02
q(g/kg)          1.734000e+01
dq(g/kg)         2.231000e+01
ASTD(Celcius)    8.800000e+00
RiB              1.510094e+06
EDH(m)           4.000000e+01
dtype: float64

In [8]:
# Verify that no column has missing values left after the cleaning step.
df.isna().any()

yyyy             False
mm               False
dd               False
HH               False
LON              False
LAT              False
Pres(hpa)        False
T(Celcius)       False
SST(Celcius)     False
RH(%)            False
u(m/s)           False
v(m/s)           False
WS(m/s)          False
WD(degrees)      False
q(g/kg)          False
dq(g/kg)         False
ASTD(Celcius)    False
RiB              False
EDH(m)           False
dtype: bool

In [9]:
# Start (or attach to) a local H2O cluster.
# An integer max_mem_size is interpreted by H2O as gigabytes.
H2O_MAX_MEM_GB = 12
h2o.init(max_mem_size=H2O_MAX_MEM_GB)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_91"; Java(TM) SE Runtime Environment (build 1.8.0_91-b14); Java HotSpot(TM) 64-Bit Server VM (build 25.91-b14, mixed mode)
  Starting server from /Users/denny/anaconda3/envs/EDH_ML/h2o_jar/h2o.jar
  Ice root: /var/folders/t_/vqhq6xfs10n48trt1p70rmf80000gr/T/tmp7vdkomlv
  JVM stdout: /var/folders/t_/vqhq6xfs10n48trt1p70rmf80000gr/T/tmp7vdkomlv/h2o_denny_started_from_python.out
  JVM stderr: /var/folders/t_/vqhq6xfs10n48trt1p70rmf80000gr/T/tmp7vdkomlv/h2o_denny_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.2
H2O cluster version age:,"1 year, 3 months and 21 days !!!"
H2O cluster name:,H2O_from_python_denny_3eemcw
H2O cluster total nodes:,1
H2O cluster free memory:,10.67 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [10]:
# Upload the pandas DataFrame to the H2O cluster as an H2OFrame.
hf = h2o.H2OFrame(python_obj=df)

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [11]:
# Predictor columns used as model inputs.
x = [
    'Pres(hpa)',
    'T(Celcius)',
    'SST(Celcius)',
    'dq(g/kg)',
    'ASTD(Celcius)',
]

# Response column the model is trained to predict.
y = 'EDH(m)'

In [12]:
# Fixed seed so the random train/test partition is the same on every run.
SPLIT_SEED = 42

# Roughly 80/20 split. split_frame assigns rows probabilistically, so the
# partition sizes are approximate rather than an exact 80% / 20%.
hf_train, hf_test = hf.split_frame(ratios=[0.8], seed=SPLIT_SEED)

print(len(hf_train))
print(len(hf_test))

# Sanity check: the two partition sizes must add up to the full frame.
assert len(hf_train) + len(hf_test) == len(hf), "train/test sizes do not add up to the full frame"

4738233
1184847


True

In [15]:
# Fixed seed so the forest's random sampling is repeatable between runs
# (H2O otherwise picks a time-based seed).
RF_SEED = 42

# Distributed Random Forest regressor: 50 trees, depth capped at 10,
# at least 3 rows per leaf, and a very small minimum split improvement.
edh_rf = H2ORandomForestEstimator(
    ntrees=50,
    max_depth=10,
    min_rows=3,
    min_split_improvement=1e-10,
    seed=RF_SEED,
)

In [16]:
# Fit the random forest on the training partition using the selected
# predictor columns (x) and response column (y).
edh_rf.train(
    x=x,
    y=y,
    training_frame=hf_train,
)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [17]:
# Display the fitted model summary: training-data metrics, scoring history,
# and variable importances.
edh_rf

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1561669666203_2


ModelMetricsRegression: drf
** Reported on train data. **

MSE: 12.59301319885602
RMSE: 3.548663579272628
MAE: 1.9511054470284912
RMSLE: 0.5492918806969123
Mean Residual Deviance: 12.59301319885602
Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2019-06-27 14:26:26,0.002 sec,0.0,,,
,2019-06-27 14:26:27,0.707 sec,1.0,4.0065699,2.2324347,16.0526021
,2019-06-27 14:26:28,1.416 sec,2.0,3.9194084,2.1281792,15.3617618
,2019-06-27 14:26:28,2.134 sec,3.0,3.9214830,2.1246331,15.3780285
,2019-06-27 14:26:33,6.373 sec,6.0,3.7182449,2.0064280,13.8253454
,2019-06-27 14:26:37,11.014 sec,11.0,3.6256904,1.9810266,13.1456310
,2019-06-27 14:26:41,15.290 sec,17.0,3.5822097,1.9539424,12.8322263
,2019-06-27 14:26:46,19.587 sec,23.0,3.5810354,1.9621364,12.8238146
,2019-06-27 14:26:50,23.826 sec,29.0,3.5822101,1.9706663,12.8322289


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
dq(g/kg),1391192192.0000000,1.0,0.3447464
SST(Celcius),1162188928.0000000,0.8353906,0.2879979
ASTD(Celcius),637251520.0000000,0.4580615,0.1579151
T(Celcius),451503296.0000000,0.3245442,0.1118854
Pres(hpa),393271136.0000000,0.2826864,0.0974551




In [18]:
# Score the trained forest on the test frame and display the regression metrics.
edh_perf = edh_rf.model_performance(test_data=hf_test)
edh_perf


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 12.493330497351073
RMSE: 3.5345905699742755
MAE: 1.9410198186143126
RMSLE: 0.5474379740777879
Mean Residual Deviance: 12.493330497351073


