In [15]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import h2o
from h2o.automl import H2OAutoML, get_leaderboard
h2o.init(ip="localhost")

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,1 day 20 hours 40 mins
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.1.2
H2O cluster version age:,18 days
H2O cluster name:,H2O_from_python_danielmurphy_oifm0x
H2O cluster total nodes:,1
H2O cluster free memory:,3.862 Gb
H2O cluster total cores:,16
H2O cluster allowed cores:,16


In [8]:
df = pd.read_csv (r'founder_V0.3_founder.csv')
df.columns = df.columns.str.replace(' ','_')

#Select features for model
y = "Success"
df[y] = df[y].astype('category')

#Set Factors
x_factor = ["Gender", "Headquarters_Location_"]
df[x_factor] = df[x_factor].astype('category')

#Set Numerics
x_numeric=df.columns[pd.Series(df.columns).str.contains('Number').tolist()].tolist() #Get all columns with "Number" in the name
x_numeric.extend(['Founded_Date']) #add any other necessary columns
df[x_numeric] = df[x_numeric].apply(pd.to_numeric)

#get all features together
x = x_numeric + x_factor
x_y = x+[y] # combining predictors and y
df=df[x_y]
data_h2o = h2o.H2OFrame(df)


train, test = data_h2o.split_frame(ratios = [.7], seed = 1234)


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [9]:
aml = H2OAutoML(max_models=3, seed=1)
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [10]:
lb = aml.leaderboard

In [11]:
# Optionally edd extra model information to the leaderboard
lb = get_leaderboard(aml, extra_columns='ALL')
# Print all rows (instead of default 10 rows)
lb.head(rows=lb.nrows)

model_id,mean_residual_deviance,rmse,mse,mae,rmsle,training_time_ms,predict_time_per_row_ms
StackedEnsemble_AllModels_AutoML_20200404_142022,0.157696,0.39711,0.157696,0.315901,0.278413,561,0.037159
XGBoost_1_AutoML_20200404_142022,0.157948,0.397427,0.157948,0.317664,0.278616,2015,0.011023
XGBoost_3_AutoML_20200404_142022,0.159309,0.399136,0.159309,0.325794,0.280022,951,0.005066
XGBoost_2_AutoML_20200404_142022,0.160951,0.401186,0.160951,0.319354,0.281391,2906,0.017631




In [14]:
aml.leader

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_AutoML_20200404_142022

No model summary for this model

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 0.13570929335058163
RMSE: 0.3683874228995632
MAE: 0.29209821587308893
RMSLE: 0.2593399993585393
R^2: 0.43084560413489803
Mean Residual Deviance: 0.13570929335058163
Null degrees of freedom: 12904
Residual degrees of freedom: 12901
Null deviance: 3077.070902751219
Residual deviance: 1751.3284306892558
AIC: 10858.41837441366

ModelMetricsRegressionGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.15769641572925394
RMSE: 0.3971100801153931
MAE: 0.31590123273624976
RMSLE: 0.2784127654323142
R^2: 0.33863329468076686
Mean Residual Deviance: 0.15769641572925394
Null degrees of freedom: 12904
Residual degrees of freedom: 12901
Null deviance: 3077.4548876755243
Residual deviance: 2035.072244986022
AIC: 12796.190797913616




In [22]:
preds = aml.predict(test_data= test)

preds.head(15)
test.concat(preds)

stackedensemble prediction progress: |████████████████████████████████████| 100%


Number_of_News_Articles,Number_of_Founded_Organizations,Number_of_Portfolio_Companies,Number_of_Investments_x,Number_of_Partner_Investments,Number_of_Lead_Investments_x,Number_of_Exits_x,Number_of_Events_x,Number_of_Founders,Founded_Date,Gender,Headquarters_Location_,Success,predict
14445,7,11,21,0,9,6,2,4,2003,1,California,1,0.899756
128,1,6,7,0,0,1,5,2,2006,1,California,1,0.987523
3669,1,73,101,0,3,17,19,2,1999,1,California,1,0.950903
90,1,0,0,0,0,0,0,2,1999,1,China,1,1.01592
48,1,3,3,0,0,2,0,3,2014,1,California,1,0.834022
451,1,2,3,0,0,1,2,1,2014,1,New York,1,0.541971
1330,3,1,1,0,0,0,12,1,2016,0,New York,0,0.277839
436,1,0,0,0,0,0,0,1,2000,1,Massachusetts,1,0.987908
234,1,3,3,0,0,0,0,5,2013,1,Nevada,0,0.320035
0,3,0,0,0,0,0,0,5,2013,1,Nevada,0,0.0514043


