In [3]:
import glob
import time 
import os

import numpy as np
import pandas as pd

import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator 
from sklearn.metrics import mean_absolute_error, mean_squared_error

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials


import matplotlib.pyplot as plt


### Data reading

In [4]:
# Column layout of the input text files, in file order.
col_names = [
    'yyyy', 'mm', 'dd', 'HH', 'LON', 'LAT',
    'Pres(hpa)', 'T(Celcius)', 'SST(Celcius)', 'RH(%)',
    'u(m/s)', 'v(m/s)', 'WS(m/s)', 'WD(degrees)',
    'q(g/kg)', 'dq(g/kg)', 'ASTD(Celcius)', 'RiB', 'EDH(m)',
]

# Sort the matches so the concatenated row order does not depend on the
# order in which the operating system happens to list the directory.
data_files = sorted(glob.glob('data/*.txt'))
if not data_files:
    # pd.concat on an empty list fails with an unhelpful message; fail early instead.
    raise FileNotFoundError("No input files matched 'data/*.txt'")

# The files are read as whitespace-delimited with no header row. The raw
# string keeps the regex `\s+` from being parsed as an invalid string escape.
df = pd.concat(
    [pd.read_csv(f, names = col_names, header = None, sep = r'\s+') for f in data_files],
    ignore_index = True,
    sort = False,
)

df.head()


Unnamed: 0,yyyy,mm,dd,HH,LON,LAT,Pres(hpa),T(Celcius),SST(Celcius),RH(%),u(m/s),v(m/s),WS(m/s),WD(degrees),q(g/kg),dq(g/kg),ASTD(Celcius),RiB,EDH(m)
0,2015,10,13,0,-76.7401,34.8289,1008.15,18.86,19.72,94.0,-0.06,0.81,0.82,175.97,12.63,1.26,-0.85,0.86,0.0
1,2015,10,13,1,-76.7178,34.8293,1008.2,18.78,19.72,93.98,-0.09,0.83,0.84,173.9,12.56,1.32,-0.94,1.03,0.0
2,2015,10,13,2,-76.6955,34.8297,1008.26,18.69,19.65,93.99,-0.18,0.81,0.83,167.29,12.5,1.34,-0.96,1.28,0.0
3,2015,10,13,3,-76.6732,34.83,1008.31,18.6,19.6,94.06,-0.21,0.9,0.92,166.58,12.43,1.36,-1.0,1.58,0.0
4,2015,10,13,4,-76.6509,34.8304,1008.33,18.49,19.53,93.98,-0.25,0.91,0.95,164.8,12.34,1.39,-1.04,2.14,0.0


In [5]:
# Per-column flag: True when the column contains at least one missing (NaN) entry.
# By default pandas does not count +/-inf as missing, so infinite values are
# not reported by this check.
df.isna().any()

yyyy             False
mm               False
dd               False
HH               False
LON              False
LAT              False
Pres(hpa)        False
T(Celcius)       False
SST(Celcius)     False
RH(%)            False
u(m/s)           False
v(m/s)           False
WS(m/s)          False
WD(degrees)      False
q(g/kg)          False
dq(g/kg)          True
ASTD(Celcius)    False
RiB              False
EDH(m)           False
dtype: bool

In [6]:
# Largest value in each column - a quick upper-bound sanity check.
# Infinite entries surface here even though the NaN check does not flag them.
df.max(axis = 0)

yyyy             2.015000e+03
mm               1.000000e+01
dd               3.000000e+01
HH               2.300000e+01
LON             -7.386330e+01
LAT              3.718000e+01
Pres(hpa)        1.030210e+03
T(Celcius)       2.645000e+01
SST(Celcius)     3.334000e+01
RH(%)            1.000300e+02
u(m/s)           1.495000e+01
v(m/s)           1.473000e+01
WS(m/s)          1.750000e+01
WD(degrees)      3.600000e+02
q(g/kg)                   inf
dq(g/kg)                  inf
ASTD(Celcius)    2.971500e+02
RiB              1.510094e+06
EDH(m)           4.000000e+01
dtype: float64

In [7]:
# Smallest value in each column - a quick lower-bound sanity check.
# NOTE(review): minima such as -273.15 for the temperature columns or 0 for
# pressure are presumably missing-data placeholders - confirm with the data source.
df.min(axis = 0)

yyyy             2.015000e+03
mm               1.000000e+01
dd               1.300000e+01
HH               0.000000e+00
LON             -7.680440e+01
LAT              3.482890e+01
Pres(hpa)        0.000000e+00
T(Celcius)      -2.731500e+02
SST(Celcius)    -2.731500e+02
RH(%)            1.786000e+01
u(m/s)          -1.304000e+01
v(m/s)          -1.438000e+01
WS(m/s)          0.000000e+00
WD(degrees)      0.000000e+00
q(g/kg)          2.160000e+00
dq(g/kg)        -4.090000e+00
ASTD(Celcius)   -1.748000e+01
RiB             -2.059190e+08
EDH(m)           0.000000e+00
dtype: float64

In [8]:
# Turn +/-inf into NaN, then drop every row that has a NaN in any column,
# so only rows with finite values everywhere remain.
# NOTE(review): this does not explicitly filter placeholder values
# (e.g. -273.15 degree temperatures, 0 pressure); verify they are gone afterwards.
df = (
    df
    .replace(to_replace = [np.inf, -np.inf], value = np.nan)
    .dropna(axis = 0, how = 'any')
)

In [9]:
# Re-check the per-column maxima after cleaning: no column should report inf any more.
df.max(axis = 0)

yyyy             2.015000e+03
mm               1.000000e+01
dd               3.000000e+01
HH               2.300000e+01
LON             -7.386330e+01
LAT              3.718000e+01
Pres(hpa)        1.030210e+03
T(Celcius)       2.645000e+01
SST(Celcius)     3.334000e+01
RH(%)            1.000300e+02
u(m/s)           1.495000e+01
v(m/s)           1.473000e+01
WS(m/s)          1.750000e+01
WD(degrees)      3.600000e+02
q(g/kg)          1.734000e+01
dq(g/kg)         2.231000e+01
ASTD(Celcius)    8.800000e+00
RiB              1.510094e+06
EDH(m)           4.000000e+01
dtype: float64

In [10]:
# (rows, columns) of the cleaned frame; used further down to compare against the H2O frame.
df.shape 

(5923080, 19)

In [11]:
# Connect to a running local H2O instance, or start one if none is found.
# max_mem_size sets the maximum memory for a newly started H2O server.
# NOTE(review): a bare integer is presumably interpreted as gigabytes (12 -> 12 GB) -
# confirm against the documentation of the installed H2O version.
h2o.init(max_mem_size = 12)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_91"; Java(TM) SE Runtime Environment (build 1.8.0_91-b14); Java HotSpot(TM) 64-Bit Server VM (build 25.91-b14, mixed mode)
  Starting server from /Users/denny/anaconda3/envs/EDH_ML/h2o_jar/h2o.jar
  Ice root: /var/folders/t_/vqhq6xfs10n48trt1p70rmf80000gr/T/tmpkml9b8g_
  JVM stdout: /var/folders/t_/vqhq6xfs10n48trt1p70rmf80000gr/T/tmpkml9b8g_/h2o_denny_started_from_python.out
  JVM stderr: /var/folders/t_/vqhq6xfs10n48trt1p70rmf80000gr/T/tmpkml9b8g_/h2o_denny_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.2
H2O cluster version age:,"1 year, 3 months and 19 days !!!"
H2O cluster name:,H2O_from_python_denny_npw6tm
H2O cluster total nodes:,1
H2O cluster free memory:,10.67 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [12]:
# Upload the cleaned pandas frame to the H2O cluster as an H2OFrame.
hf = h2o.H2OFrame(python_obj = df)

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [13]:
# (rows, columns) of the H2O frame; should match df.shape if the upload kept every row and column.
hf.shape

(5923080, 19)

In [14]:
# Predictor columns fed to the model.
x = [
    'Pres(hpa)',
    'T(Celcius)',
    'SST(Celcius)',
    'RH(%)',
    'WS(m/s)',
    'WD(degrees)',
]

# Response column to be predicted.
y = 'EDH(m)'

# Echo both so the chosen features/target are visible in the notebook output.
print(x, y, sep = '\n')

['Pres(hpa)', 'T(Celcius)', 'SST(Celcius)', 'RH(%)', 'WS(m/s)', 'WD(degrees)']
EDH(m)


In [15]:
# Random train/test split with a target ratio of 80/20. H2O assigns rows
# probabilistically, so the achieved ratio is approximate. Fixing the seed
# makes the partition (and every metric computed from it) repeatable across runs.
SPLIT_SEED = 42
hf_train, hf_test = hf.split_frame(ratios = [0.8], seed = SPLIT_SEED)

print(hf_train.nrow)
print(hf_test.nrow)

# Sanity check: the two partitions together account for every row of hf.
hf_train.nrow + hf_test.nrow == hf.nrow

4738266
1184814


True

In [16]:
# Random forest configuration. The seed pins the stochastic parts of the
# algorithm so repeated runs on the same cluster setup build the same model.
edh_rf = H2ORandomForestEstimator(
    ntrees = 20,                    # number of trees in the forest
    max_depth = 10,                 # maximum depth of each tree
    min_rows = 5,                   # minimum number of observations per leaf
    min_split_improvement = 1e-10,  # minimum error reduction required to make a split
    seed = 42,                      # fixed for reproducibility
)

In [17]:
# Fit the forest on the training partition, using the predictor list `x`
# and the response column `y`.
edh_rf.train(
    x = x,
    y = y,
    training_frame = hf_train,
)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [18]:
# Display the fitted model: training metrics, scoring history and variable importances.
edh_rf

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1561490888893_1


ModelMetricsRegression: drf
** Reported on train data. **

MSE: 7.795450672344167
RMSE: 2.7920334296609286
MAE: 0.994238888604682
RMSLE: 0.41377810199471055
Mean Residual Deviance: 7.795450672344167
Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2019-06-25 12:29:42,0.023 sec,0.0,,,
,2019-06-25 12:29:44,2.086 sec,1.0,3.1195381,1.3516106,9.7315179
,2019-06-25 12:29:45,3.327 sec,2.0,3.0480158,1.2434506,9.2904005
,2019-06-25 12:29:46,4.248 sec,3.0,3.0036040,1.1863052,9.0216372
,2019-06-25 12:29:51,8.984 sec,9.0,2.8839042,1.0749838,8.3169036
,2019-06-25 12:29:55,13.028 sec,14.0,2.8268121,1.0231503,7.9908664
,2019-06-25 12:29:59,17.152 sec,19.0,2.8064962,0.9999674,7.8764207
,2019-06-25 12:30:00,18.101 sec,20.0,2.7920334,0.9942389,7.7954507


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
RH(%),639347456.0000000,1.0,0.3256280
WS(m/s),613181056.0000000,0.9590733,0.3123011
SST(Celcius),391238624.0000000,0.6119343,0.1992629
T(Celcius),168101792.0000000,0.2629271,0.0856164
Pres(hpa),87259520.0000000,0.1364822,0.0444424
WD(degrees),64300572.0000000,0.1005722,0.0327491




In [19]:
# Score the fitted forest on the held-out test frame and display the resulting metrics.
edh_perf = edh_rf.model_performance(test_data = hf_test)
edh_perf


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 7.516051936706857
RMSE: 2.741541890379729
MAE: 0.9545735060972741
RMSLE: 0.4103869161309699
Mean Residual Deviance: 7.516051936706857


