In [1]:
import glob 
import time 
import os

import numpy as np
import pandas as pd

import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator


from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error 

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

import matplotlib.pyplot as plt 


## Data reading

In [2]:
# Column names assigned to the whitespace-delimited text files, which are read
# with header=None (no header row is taken from the files themselves).
col_names = ['yyyy', 'mm', 'dd', 'HH', 'LON', 'LAT', 'Pres(hpa)', 'T(Celcius)', 'SST(Celcius)', 'RH(%)', 'u(m/s)', 'v(m/s)', 'WS(m/s)', 'WD(degrees)', 'q(g/kg)', 'dq(g/kg)', 'ASTD(Celcius)', 'RiB', 'EDH(m)']

# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting
# makes the row order of the concatenated frame deterministic across runs.
data_files = sorted(glob.glob('data/*.txt'))
if not data_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No input files matched 'data/*.txt'")

# r'\s+' is a raw string: '\s' in a plain string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
df = pd.concat(
    [pd.read_csv(f, header=None, names=col_names, sep=r'\s+') for f in data_files],
    ignore_index=True,
    sort=False,
)

print(df.shape)


(6120516, 19)


In [3]:
# Preview the first five rows of the concatenated frame.
df.head(n=5)

Unnamed: 0,yyyy,mm,dd,HH,LON,LAT,Pres(hpa),T(Celcius),SST(Celcius),RH(%),u(m/s),v(m/s),WS(m/s),WD(degrees),q(g/kg),dq(g/kg),ASTD(Celcius),RiB,EDH(m)
0,2015,10,13,0,-76.7401,34.8289,1008.15,18.86,19.72,94.0,-0.06,0.81,0.82,175.97,12.63,1.26,-0.85,0.86,0.0
1,2015,10,13,1,-76.7178,34.8293,1008.2,18.78,19.72,93.98,-0.09,0.83,0.84,173.9,12.56,1.32,-0.94,1.03,0.0
2,2015,10,13,2,-76.6955,34.8297,1008.26,18.69,19.65,93.99,-0.18,0.81,0.83,167.29,12.5,1.34,-0.96,1.28,0.0
3,2015,10,13,3,-76.6732,34.83,1008.31,18.6,19.6,94.06,-0.21,0.9,0.92,166.58,12.43,1.36,-1.0,1.58,0.0
4,2015,10,13,4,-76.6509,34.8304,1008.33,18.49,19.53,93.98,-0.25,0.91,0.95,164.8,12.34,1.39,-1.04,2.14,0.0


In [6]:
# Alphabetically ordered listing of everything in the input directory.
sorted(os.listdir(path="data"))

['NRL_COAMPS_CASPER_EAST_2015101300.txt',
 'NRL_COAMPS_CASPER_EAST_2015101312.txt',
 'NRL_COAMPS_CASPER_EAST_2015101400.txt',
 'NRL_COAMPS_CASPER_EAST_2015101412.txt',
 'NRL_COAMPS_CASPER_EAST_2015101500.txt',
 'NRL_COAMPS_CASPER_EAST_2015101512.txt',
 'NRL_COAMPS_CASPER_EAST_2015101600.txt',
 'NRL_COAMPS_CASPER_EAST_2015101612.txt',
 'NRL_COAMPS_CASPER_EAST_2015101700.txt',
 'NRL_COAMPS_CASPER_EAST_2015101712.txt',
 'NRL_COAMPS_CASPER_EAST_2015101800.txt',
 'NRL_COAMPS_CASPER_EAST_2015102012.txt',
 'NRL_COAMPS_CASPER_EAST_2015102100.txt',
 'NRL_COAMPS_CASPER_EAST_2015102200.txt',
 'NRL_COAMPS_CASPER_EAST_2015102212.txt',
 'NRL_COAMPS_CASPER_EAST_2015102300.txt',
 'NRL_COAMPS_CASPER_EAST_2015102312.txt',
 'NRL_COAMPS_CASPER_EAST_2015102400.txt',
 'NRL_COAMPS_CASPER_EAST_2015102412.txt',
 'NRL_COAMPS_CASPER_EAST_2015102500.txt',
 'NRL_COAMPS_CASPER_EAST_2015102512.txt',
 'NRL_COAMPS_CASPER_EAST_2015102600.txt',
 'NRL_COAMPS_CASPER_EAST_2015102612.txt',
 'NRL_COAMPS_CASPER_EAST_201510270

### Checking whether there are any NaN values

In [7]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

yyyy             False
mm               False
dd               False
HH               False
LON              False
LAT              False
Pres(hpa)        False
T(Celcius)       False
SST(Celcius)     False
RH(%)            False
u(m/s)           False
v(m/s)           False
WS(m/s)          False
WD(degrees)      False
q(g/kg)          False
dq(g/kg)          True
ASTD(Celcius)    False
RiB              False
EDH(m)           False
dtype: bool

##### Checking the maximum in the data

In [None]:
# Column-wise maximum.
df.max(axis=0)

#### Checking the minimum

In [None]:
# Column-wise minimum.
df.min(axis=0)

In [None]:
# Turn +/-infinity into NaN so that infinite entries are treated as missing
# values from here on, then re-inspect the column-wise maxima.
infinite_values = [np.inf, -np.inf]
df = df.replace(to_replace=infinite_values, value=np.nan)
df.max(axis=0)

In [None]:
# Column-wise minimum of the current frame.
df.min(axis=0)

In [None]:
# Per-column flag: True if the column currently holds at least one NaN.
df.isna().any()

In [None]:
# Discard every row that has a NaN in any column, then confirm that no
# column reports missing values afterwards.
df = df.dropna(axis=0, how="any")
df.isna().any()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

## Modeling

##### We are going to try the H2O random forest model

In [52]:
# Connect to a local H2O server, starting one if none is running.
# NOTE(review): the integer 12 is presumably interpreted as gigabytes (the
# cluster summary below reports ~10.67 Gb free) — confirm for the installed
# H2O version.
h2o.init(max_mem_size = 12)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_91"; Java(TM) SE Runtime Environment (build 1.8.0_91-b14); Java HotSpot(TM) 64-Bit Server VM (build 25.91-b14, mixed mode)
  Starting server from /Users/denny/anaconda3/envs/EDH_ML/h2o_jar/h2o.jar
  Ice root: /var/folders/t_/vqhq6xfs10n48trt1p70rmf80000gr/T/tmp24qpo7wv
  JVM stdout: /var/folders/t_/vqhq6xfs10n48trt1p70rmf80000gr/T/tmp24qpo7wv/h2o_denny_started_from_python.out
  JVM stderr: /var/folders/t_/vqhq6xfs10n48trt1p70rmf80000gr/T/tmp24qpo7wv/h2o_denny_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.2
H2O cluster version age:,"1 year, 3 months and 14 days !!!"
H2O cluster name:,H2O_from_python_denny_3pg9fi
H2O cluster total nodes:,1
H2O cluster free memory:,10.67 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [53]:
# Upload the pandas DataFrame to the H2O cluster as an H2OFrame.
hf = h2o.H2OFrame(python_obj=df)

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [54]:
# Preview the first ten rows of the H2O frame.
hf.head(rows=10)

yyyy,mm,dd,HH,LON,LAT,Pres(hpa),T(Celcius),SST(Celcius),RH(%),u(m/s),v(m/s),WS(m/s),WD(degrees),q(g/kg),dq(g/kg),ASTD(Celcius),RiB,EDH(m)
2015,10,13,0,-76.7401,34.8289,1008.15,18.86,19.72,94.0,-0.06,0.81,0.82,175.97,12.63,1.26,-0.85,0.86,0
2015,10,13,1,-76.7178,34.8293,1008.2,18.78,19.72,93.98,-0.09,0.83,0.84,173.9,12.56,1.32,-0.94,1.03,0
2015,10,13,2,-76.6955,34.8297,1008.26,18.69,19.65,93.99,-0.18,0.81,0.83,167.29,12.5,1.34,-0.96,1.28,0
2015,10,13,3,-76.6732,34.83,1008.31,18.6,19.6,94.06,-0.21,0.9,0.92,166.58,12.43,1.36,-1.0,1.58,0
2015,10,13,4,-76.6509,34.8304,1008.33,18.49,19.53,93.98,-0.25,0.91,0.95,164.8,12.34,1.39,-1.04,2.14,0
2015,10,13,5,-76.6286,34.8308,1008.33,18.4,19.46,93.85,-0.31,0.94,0.99,161.74,12.25,1.41,-1.05,2.44,0
2015,10,13,6,-76.6063,34.8312,1008.36,18.29,19.34,93.82,-0.38,0.93,1.0,157.91,12.16,1.4,-1.05,2.76,0
2015,10,13,7,-76.584,34.8315,1008.33,18.1,19.18,94.64,-0.45,1.13,1.22,158.47,12.12,1.3,-1.08,3.47,0
2015,10,13,8,-76.5616,34.8319,1008.33,17.9,18.96,95.45,-0.49,1.23,1.33,158.23,12.07,1.17,-1.05,3.15,0
2015,10,13,9,-76.5394,34.8322,1008.32,17.73,18.96,96.2,-0.54,1.37,1.47,158.48,12.04,1.2,-1.22,2.9,0




In [62]:
# Response column, and every other column of the H2O frame as a predictor.
y = "EDH(m)"
x = hf.columns
x.remove(y)

print("x =", x)
print(" ")
print("y =", y)

x = ['yyyy', 'mm', 'dd', 'HH', 'LON', 'LAT', 'Pres(hpa)', 'T(Celcius)', 'SST(Celcius)', 'RH(%)', 'u(m/s)', 'v(m/s)', 'WS(m/s)', 'WD(degrees)', 'q(g/kg)', 'dq(g/kg)', 'ASTD(Celcius)', 'RiB']
 
y = EDH(m)


In [68]:
# 80/20 train/test split. The split is random, so a fixed seed is supplied to
# keep the partition (and every metric computed from it) reproducible across
# kernel restarts.
hf_train, hf_test = hf.split_frame(ratios=[0.8], seed=42)

print(hf_train.shape); print(hf_test.shape)

# Sanity check: the two partitions together must account for every row.
# An alternative to "len(hf_train)" is "hf_train.nrow", i.e. the same check can
# be written as "hf_train.nrow + hf_test.nrow == hf.nrow".
len(hf_train) + len(hf_test) == len(hf)

(4738661, 19)
(1184419, 19)


True

## H2O random forest estimator

In [69]:
# Random forest with default hyperparameters. Tree building is stochastic, so
# a seed is set to make repeated runs comparable.
# NOTE(review): exact reproducibility may also depend on the cluster
# configuration — confirm against the H2O documentation.
edh_rf = H2ORandomForestEstimator(seed=42)
edh_rf.train(x=x, y=y, training_frame=hf_train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [70]:
# Display the trained model's summary.
edh_rf

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1561057022593_1


ModelMetricsRegression: drf
** Reported on train data. **

MSE: 0.6507754532804956
RMSE: 0.8067065471907957
MAE: 0.21109308211086183
RMSLE: 0.12272010737284662
Mean Residual Deviance: 0.6507754532804956
Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2019-06-20 13:17:46,0.020 sec,0.0,,,
,2019-06-20 13:17:52,6.287 sec,1.0,1.3307232,0.2515093,1.7708241
,2019-06-20 13:17:58,11.542 sec,2.0,1.2797343,0.2500328,1.6377200
,2019-06-20 13:18:03,16.850 sec,3.0,1.2441654,0.2463009,1.5479476
,2019-06-20 13:18:07,21.360 sec,4.0,1.2073606,0.2492171,1.4577196
---,---,---,---,---,---,---
,2019-06-20 13:21:58,4 min 11.673 sec,46.0,0.8098622,0.2114291,0.6558767
,2019-06-20 13:22:04,4 min 17.660 sec,47.0,0.8084904,0.2112027,0.6536567
,2019-06-20 13:22:10,4 min 23.432 sec,48.0,0.8070725,0.2108019,0.6513660



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
dq(g/kg),1443772288.0000000,1.0,0.2225349
WS(m/s),1052775488.0000000,0.7291839,0.1622689
LON,834307584.0000000,0.5778665,0.1285955
RH(%),698458112.0000000,0.4837730,0.1076564
SST(Celcius),563569664.0000000,0.3903453,0.0868655
ASTD(Celcius),458429312.0000000,0.3175219,0.0706597
T(Celcius),215081040.0000000,0.1489716,0.0331514
Pres(hpa),199517200.0000000,0.1381916,0.0307525
u(m/s),197932672.0000000,0.1370941,0.0305082




In [8]:
# Score the model on the held-out frame and display the resulting metrics.
# `edh_rf` and `hf_test` must already exist in the current kernel session,
# i.e. the training and split cells have to be run first.
edh_perf = edh_rf.model_performance(test_data=hf_test)
edh_perf

NameError: name 'edh_rf' is not defined