In [1]:
#Importing necessary packages
import h2o
from h2o.automl import H2OAutoML
import random, os, sys
from datetime import datetime
import pandas as pd
import logging
import csv
import optparse
import time
import json
from distutils.util import strtobool
import matplotlib.pyplot as plt

Set up some parameters for the analysis.  

In [2]:
# Defining variables to be used
data_path=None
all_variables=None
test_path=None
target=None
nthreads=1 
min_mem_size=6 
run_time=333
classification=False
scale=False
max_models=9    
model_path=None
balance_y=False 
balance_threshold=0.2
name=None 
server_path=None  
analysis=0 

The next sections contains helper functions for automating analysis.             

In [3]:
# Defining helper functions to guide me through the tasks

def alphabet(n):
  alpha='0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'    
  str=''
  r=len(alpha)-1   
  while len(str)<n:
    i=random.randint(0,r)
    str+=alpha[i]   
  return str
  
  
def set_meta_data(run_id,analysis,target,run_time,classification,scale,model,balance,balance_threshold,name,nthreads,min_mem_size):
  m_data={}
  m_data['run_id'] =run_id
  m_data['start_time'] = time.time()
  m_data['target']=target
  m_data['max_models']=model
  m_data['run_time']=run_time
  m_data['scale']=scale
  m_data['classification']=classification
  m_data['scale']=False
  m_data['balance']=balance
  m_data['balance_threshold']=balance_threshold
  m_data['project'] =name
  m_data['end_time'] = time.time()
  m_data['execution_time'] = 0.0
  m_data['nthreads'] = nthreads
  m_data['min_mem_size'] = min_mem_size
  m_data['analysis'] = analysis
  return m_data


def dict_to_json(dct,n):
  j = json.dumps(dct, indent=4)
  f = open(n, 'w')
  print(j, file=f)
  f.close()
  
  
def stackedensemble(mod):
    coef_norm=None
    try:
      metalearner = h2o.get_model(mod.metalearner()['name'])
      coef_norm=metalearner.coef_norm()
    except:
      pass        
    return coef_norm

def stackedensemble_df(df):
    bm_algo={ 'GBM': None,'GLM': None,'DRF': None,'XRT': None,'Dee': None}
    for index, row in df.iterrows():
      if len(row['model_id'])>3:
        key=row['model_id'][0:3]
        if key in bm_algo:
          if bm_algo[key] is None:
                bm_algo[key]=row['model_id']
    bm=list(bm_algo.values()) 
    bm=list(filter(None.__ne__, bm))             
    return bm

def se_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['auc']=modl.auc()   
    d['roc']=modl.roc()
    d['mse']=modl.mse()   
    d['null_degrees_of_freedom']=modl.null_degrees_of_freedom()
    d['null_deviance']=modl.null_deviance()
    d['residual_degrees_of_freedom']=modl.residual_degrees_of_freedom()   
    d['residual_deviance']=modl.residual_deviance()
    d['rmse']=modl.rmse()
    return d

def get_model_by_algo(algo,models_dict):
    mod=None
    mod_id=None    
    for m in list(models_dict.keys()):
        if m[0:3]==algo:
            mod_id=m
            mod=h2o.get_model(m)      
    return mod,mod_id     
    
    
def gbm_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    return d
    
    
def dl_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    return d
    
    
def drf_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    d['roc']=modl.roc()      
    return d
    
def xrt_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    d['roc']=modl.roc()      
    return d
    
    
def glm_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['coef']=modl.coef()  
    d['coef_norm']=modl.coef_norm()      
    return d
    
def model_performance_stats(perf):
    d={}
    try:    
      d['mse']=perf.mse()
    except:
      pass      
    try:    
      d['rmse']=perf.rmse() 
    except:
      pass      
    try:    
      d['null_degrees_of_freedom']=perf.null_degrees_of_freedom()
    except:
      pass      
    try:    
      d['residual_degrees_of_freedom']=perf.residual_degrees_of_freedom()
    except:
      pass      
    try:    
      d['residual_deviance']=perf.residual_deviance() 
    except:
      pass      
    try:    
      d['null_deviance']=perf.null_deviance() 
    except:
      pass      
    try:    
      d['aic']=perf.aic() 
    except:
      pass      
    try:
      d['logloss']=perf.logloss() 
    except:
      pass    
    try:
      d['auc']=perf.auc()
    except:
      pass  
    try:
      d['gini']=perf.gini()
    except:
      pass    
    return d
    
def impute_missing_values(df, x, scal=False):
    # determine column types
    ints, reals, enums = [], [], []
    for key, val in df.types.items():
        if key in x:
            if val == 'enum':
                enums.append(key)
            elif val == 'int':
                ints.append(key)            
            else: 
                reals.append(key)    
    _ = df[reals].impute(method='mean')
    _ = df[ints].impute(method='median')
    if scal:
        df[reals] = df[reals].scale()
        df[ints] = df[ints].scale()    
    return


def get_independent_variables(df, targ):
    C = [name for name in df.columns if name != targ]
    # determine column types
    ints, reals, enums = [], [], []
    for key, val in df.types.items():
        if key in C:
            if val == 'enum':
                enums.append(key)
            elif val == 'int':
                ints.append(key)            
            else: 
                reals.append(key)    
    x=ints+enums+reals
    return x
    
def get_all_variables_csv(i):
    ivd={}
    try:
      iv = pd.read_csv(i,header=None)
    except:
      sys.exit(1)    
    col=iv.values.tolist()[0]
    dt=iv.values.tolist()[1]
    i=0
    for c in col:
      ivd[c.strip()]=dt[i].strip()
      i+=1        
    return ivd
    
    

def check_all_variables(df,dct,y=None):     
    targ=list(dct.keys())     
    for key, val in df.types.items():
        if key in targ:
          if dct[key] not in ['real','int','enum']:                      
            targ.remove(key)  
    for key, val in df.types.items():
        if key in targ:            
          if dct[key] != val:
            print('convert ',key,' ',dct[key],' ',val)
            if dct[key]=='enum':
                try:
                  df[key] = df[key].asfactor() 
                except:
                  targ.remove(key)                 
            if dct[key]=='int': 
                try:                
                  df[key] = df[key].asnumeric() 
                except:
                  targ.remove(key)                  
            if dct[key]=='real':
                try:                
                  df[key] = df[key].asnumeric()  
                except:
                  targ.remove(key)                  
    if y is None:
      y=df.columns[-1] 
    if y in targ:
      targ.remove(y)
    else:
      y=targ.pop()            
    return targ    
    
def predictions(mod,data,run_id):
    test = h2o.import_file(data)
    mod_perf=mod_best.model_performance(test)
              
    stats_test={}
    stats_test=model_performance_stats(mod_perf)

    n=run_id+'_test_stats.json'
    dict_to_json(stats_test,n) 

    try:    
      cf=mod_perf.confusion_matrix(metrics=["f1","f2","f0point5","accuracy","precision","recall","specificity","absolute_mcc","min_per_class_accuracy","mean_per_class_accuracy"])
      cf_df=cf[0].table.as_data_frame()
      cf_df.to_csv(run_id+'_test_confusion_matrix.csv')
    except:
      pass

    predictions = mod_best.predict(test)
    predictions_df=test.cbind(predictions).as_data_frame() 
    predictions_df.to_csv(run_id+'_predictions.csv')
    return

def predictions_test(mod,test,run_id):
    mod_perf=mod_best.model_performance(test)          
    stats_test={}
    stats_test=model_performance_stats(mod_perf)
    n=run_id+'_test_stats.json'
    dict_to_json(stats_test,n) 
    try:
      cf=mod_perf.confusion_matrix(metrics=["f1","f2","f0point5","accuracy","precision","recall","specificity","absolute_mcc","min_per_class_accuracy","mean_per_class_accuracy"])
      cf_df=cf[0].table.as_data_frame()
      cf_df.to_csv(run_id+'_test_confusion_matrix.csv')
    except:
      pass
    predictions = mod_best.predict(test)    
    predictions_df=test.cbind(predictions).as_data_frame() 
    predictions_df.to_csv(run_id+'_predictions.csv')
    return predictions

def check_X(x,df):
    for name in x:
        if name not in df.columns:
          x.remove(name)  
    return x    
    
    
def get_stacked_ensemble(lst):
    se=None
    for model in model_set:
      if 'BestOfFamily' in model:
        se=model
    if se is None:     
      for model in model_set:
        if 'AllModels'in model:
          se=model           
    return se       
    
def get_variables_types(df):
    d={}
    for key, val in df.types.items():
        d[key]=val           
    return d    
    
#  End Functions

In [4]:
# path to the dataset
data_path='D:/NEU/Courses/BDIA/Assignments/Assignment 2/Data/Forbes Top 2000/Forbes Top2000 2017.csv'

In [5]:
data_path = os.path.join(os.path.abspath(os.curdir),data_path)

In [6]:
all_variables=None

In [7]:
run_id=alphabet(9)
# run_id to std out
print (run_id) 

erv7AqH8C


In [8]:
server_path=os.path.abspath(os.curdir)
os.chdir(server_path) 
run_dir = os.path.join(server_path,run_id)
os.mkdir(run_dir)
os.chdir(run_dir) 

In [9]:
# 65535 Highest port no
port_no=random.randint(5555,55555)
#h2o.init(strict_version_check=False,min_mem_size_GB=min_mem_size,port=port_no)
h2o.init(port=port_no)

Checking whether there is an H2O instance running at http://localhost:6836..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)
  Starting server from C:\Users\Dhruv Patel\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\DHRUVP~1\AppData\Local\Temp\tmpgom1zssm
  JVM stdout: C:\Users\DHRUVP~1\AppData\Local\Temp\tmpgom1zssm\h2o_Dhruv_Patel_started_from_python.out
  JVM stderr: C:\Users\DHRUVP~1\AppData\Local\Temp\tmpgom1zssm\h2o_Dhruv_Patel_started_from_python.err
  Server is running at http://127.0.0.1:6836
Connecting to H2O server at http://127.0.0.1:6836... successful.


0,1
H2O cluster uptime:,04 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.8
H2O cluster version age:,"28 days, 22 hours and 39 minutes"
H2O cluster name:,H2O_from_python_Dhruv_Patel_il0kwv
H2O cluster total nodes:,1
H2O cluster free memory:,3.528 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [10]:
# meta data
meta_data = set_meta_data(run_id,analysis,target,run_time,classification,scale,max_models,balance_y,balance_threshold,name,nthreads,min_mem_size)
print(meta_data)  

{'run_id': 'erv7AqH8C', 'start_time': 1540064011.284734, 'target': None, 'max_models': 9, 'run_time': 333, 'scale': False, 'classification': False, 'balance': False, 'balance_threshold': 0.2, 'project': None, 'end_time': 1540064011.284734, 'execution_time': 0.0, 'nthreads': 1, 'min_mem_size': 6, 'analysis': 0}


In [11]:
print(data_path)

D:/NEU/Courses/BDIA/Assignments/Assignment 2/Data/Forbes Top 2000/Forbes Top2000 2017.csv


In [12]:
df = h2o.import_file(data_path)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [13]:
df.head()

Rank,Company,Country,Sales,Profits,Assets,Market Value,Sector,Industry
1,ICBC,China,151.4,42.0,3473.2,229.8,Financials,Major Banks
2,China Construction Bank,China,134.2,35.0,3016.6,200.5,Financials,Regional Banks
3,Berkshire Hathaway,United States,222.9,24.1,620.9,409.9,Financials,Investment Services
4,JPMorgan Chase,United States,102.5,24.2,2513.0,306.6,Financials,Major Banks
5,Wells Fargo,United States,97.6,21.9,1943.4,274.4,Financials,Major Banks
6,Agricultural Bank of China,China,115.7,27.8,2816.0,149.2,Financials,Regional Banks
7,Bank of America,United States,92.2,16.6,2196.8,231.9,Financials,Major Banks
8,Bank of China,China,113.1,24.9,2611.5,141.3,Financials,Major Banks
9,Apple,United States,217.5,45.2,331.1,752.0,Information Technology,Computer Hardware
10,Toyota Motor,Japan,249.9,17.1,412.5,171.9,Consumer Discretionary,Auto & Truck Manufacturers




In [14]:
df.describe()

Rows:2000
Cols:9




Unnamed: 0,Rank,Company,Country,Sales,Profits,Assets,Market Value,Sector,Industry
type,int,string,enum,real,real,real,real,enum,enum
mins,1.0,,,0.001,-13.0,0.001,0.072,,
mean,1000.5,,,17.664510499999995,1.2407126000000002,84.5335465,24.417844500000005,,
maxs,2000.0,,,485.3,45.2,3473.2,752.0,,
sigma,577.4945887192364,,,29.320115640008332,2.918742258183496,260.9844885736006,44.76139951633159,,
zeros,0,0,,0,1,0,0,,
missing,0,0,0,0,0,0,0,197,491
0,1.0,ICBC,China,151.4,42.0,3473.2,229.8,Financials,Major Banks
1,2.0,China Construction Bank,China,134.2,35.0,3016.6,200.5,Financials,Regional Banks
2,3.0,Berkshire Hathaway,United States,222.9,24.1,620.9,409.9,Financials,Investment Services


describe()[source]
Generate an in-depth description of this H2OFrame.

The description is a tabular print of the type, min, max, sigma, number of zeros, and number of missing elements for each H2OVec in this H2OFrame.

Returns:	None (print to stdout) 

In [15]:
# dependent variable
# assign target and inputs for classification or regression
if target==None:
  target=df.columns[-5]   
y = target

In [16]:
#Taking 'Profits' as the dependent variable y
print(y)

Profits


In [17]:
print(all_variables)

None


In [18]:
if all_variables is not None:
  ivd=get_all_variables_csv(all_variables)
  print(ivd)    
  X=check_all_variables(df,ivd,y)
  print(X)

In [19]:
df.describe()

Rows:2000
Cols:9




Unnamed: 0,Rank,Company,Country,Sales,Profits,Assets,Market Value,Sector,Industry
type,int,string,enum,real,real,real,real,enum,enum
mins,1.0,,,0.001,-13.0,0.001,0.072,,
mean,1000.5,,,17.664510499999995,1.2407126000000002,84.5335465,24.417844500000005,,
maxs,2000.0,,,485.3,45.2,3473.2,752.0,,
sigma,577.4945887192364,,,29.320115640008332,2.918742258183496,260.9844885736006,44.76139951633159,,
zeros,0,0,,0,1,0,0,,
missing,0,0,0,0,0,0,0,197,491
0,1.0,ICBC,China,151.4,42.0,3473.2,229.8,Financials,Major Banks
1,2.0,China Construction Bank,China,134.2,35.0,3016.6,200.5,Financials,Regional Banks
2,3.0,Berkshire Hathaway,United States,222.9,24.1,620.9,409.9,Financials,Investment Services


In [20]:
# independent variables


X = []  
if all_variables is None:
  X=get_independent_variables(df, target)  
else: 
  ivd=get_all_variables_csv(all_variables)    
  X=check_all_variables(df, ivd)


X=check_X(X,df)


# Add independent variables

meta_data['X']=X  


# impute missing values

_=impute_missing_values(df,X, scale)

In [21]:
print(X)

['Rank', 'Country', 'Sector', 'Industry', 'Company', 'Sales', 'Assets', 'Market Value']


In [22]:
if analysis == 3:
  classification=False
elif analysis == 2:
  classification=True
elif analysis == 1:
  classification=True

In [23]:
print(classification)

False


In [24]:
# Force target to be factors
# Only 'int' or 'string' are allowed for asfactor(), got Target (Total orders):real 

if classification:
    df[y] = df[y].asfactor()

In [25]:
def check_y(y,df):
  ok=False
  C = [name for name in df.columns if name == y]
  for key, val in df.types.items():
    if key in C:
      if val in ['real','int','enum']:        
        ok=True         
  return ok, val   

In [26]:
ok,val=check_y(y,df)

In [27]:
print(val)

enum


In [28]:
print(ok)

True


In [29]:
if val=='enum':
    print(df[y].levels())

[]


In [30]:
df.describe()

Rows:2000
Cols:9




Unnamed: 0,Rank,Company,Country,Sales,Profits,Assets,Market Value,Sector,Industry
type,int,string,enum,real,real,real,real,enum,enum
mins,1.0,,,0.001,-13.0,0.001,0.072,,
mean,1000.5,,,17.664510499999995,1.2407126000000002,84.5335465,24.417844500000005,,
maxs,2000.0,,,485.3,45.2,3473.2,752.0,,
sigma,577.4945887192364,,,29.320115640008332,2.918742258183496,260.9844885736006,44.76139951633159,,
zeros,0,0,,0,1,0,0,,
missing,0,0,0,0,0,0,0,197,491
0,1.0,ICBC,China,151.4,42.0,3473.2,229.8,Financials,Major Banks
1,2.0,China Construction Bank,China,134.2,35.0,3016.6,200.5,Financials,Regional Banks
2,3.0,Berkshire Hathaway,United States,222.9,24.1,620.9,409.9,Financials,Investment Services


In [31]:
allV=get_variables_types(df)
allV

{'Assets': 'real',
 'Company': 'string',
 'Country': 'enum',
 'Industry': 'enum',
 'Market Value': 'real',
 'Profits': 'real',
 'Rank': 'int',
 'Sales': 'real',
 'Sector': 'enum'}

In [32]:
meta_data['variables']=allV

In [33]:
# split into training and test for showing how to predict
train, test = df.split_frame([0.9])

In [34]:
# Set up AutoML

aml = H2OAutoML(max_runtime_secs=run_time,project_name = name)

In [35]:
model_start_time = time.time()

In [36]:
aml.train(x=X,y=y,training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [37]:
meta_data['model_execution_time'] = time.time() - model_start_time

In [38]:
# get leaderboard
aml_leaderboard_df=aml.leaderboard.as_data_frame()

In [39]:
aml_leaderboard_df

Unnamed: 0,model_id,mean_residual_deviance,rmse,mse,mae,rmsle
0,GBM_grid_0_AutoML_20181020_153333_model_0,1.506223,1.227283,1.506223,0.558948,
1,StackedEnsemble_BestOfFamily_0_AutoML_20181020...,1.547755,1.244088,1.547755,0.554031,
2,StackedEnsemble_AllModels_0_AutoML_20181020_15...,1.571258,1.253498,1.571258,0.571155,
3,GBM_grid_0_AutoML_20181020_153333_model_75,1.578576,1.256414,1.578576,0.568941,
4,GBM_grid_0_AutoML_20181020_153333_model_35,1.689829,1.299934,1.689829,0.591543,
5,GBM_grid_0_AutoML_20181020_153333_model_10,1.767443,1.329452,1.767443,0.594771,
6,XRT_0_AutoML_20181020_153333,1.771997,1.331164,1.771997,0.579552,
7,GBM_grid_0_AutoML_20181020_153333_model_77,1.881134,1.371544,1.881134,0.600690,
8,GBM_grid_0_AutoML_20181020_153333_model_41,1.888916,1.374379,1.888916,0.614970,
9,GBM_grid_0_AutoML_20181020_153333_model_59,1.889375,1.374545,1.889375,0.609318,


In [40]:
# Start best model as first model

model_set=aml_leaderboard_df['model_id']
mod_best=h2o.get_model(model_set[0])

In [41]:
mod_best._id

'GBM_grid_0_AutoML_20181020_153333_model_0'

In [42]:
# Get stacked ensemble  
se=get_stacked_ensemble(model_set)

In [43]:
print(se)

StackedEnsemble_BestOfFamily_0_AutoML_20181020_153333


In [44]:
if se is not None:
  mod_best=h2o.get_model(se)

In [45]:
dir(mod_best)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_bc',
 '_bcin',
 '_check_targets',
 '_compute_algo',
 '_estimator_type',
 '_future',
 '_get_metrics',
 '_have_mojo',
 '_have_pojo',
 '_id',
 '_is_xvalidated',
 '_job',
 '_keyify_if_h2oframe',
 '_make_model',
 '_metrics_class',
 '_model_json',
 '_parms',
 '_plot',
 '_requires_training_frame',
 '_resolve_model',
 '_verify_training_frame_params',
 '_xval_keys',
 'actual_params',
 'aic',
 'algo',
 'auc',
 'base_models',
 'biases',
 'catoffsets',
 'coef',
 'coef_norm',
 'cross_validation_fold_assignment',
 'cross_validation_holdout_predictions',
 'cross_validation_metrics_summary',
 'cross_validation_models',


In [46]:
mod_best._id

'StackedEnsemble_BestOfFamily_0_AutoML_20181020_153333'

In [47]:
mod_best._get_metrics

<function h2o.model.model_base.ModelBase._get_metrics>

In [48]:
type(mod_best)

h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator

In [49]:
mods=mod_best.coef_norm
print(mods)

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_0_AutoML_20181020_153333
No model summary for this model


ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 0.0992155241536476
RMSE: 0.3149849586149275
MAE: 0.1962688861094046
RMSLE: NaN
R^2: 0.9871742703359363
Mean Residual Deviance: 0.0992155241536476
Null degrees of freedom: 1460
Residual degrees of freedom: 1458
Null deviance: 11301.803841587578
Residual deviance: 144.95388078847913
AIC: 778.5551894024537

ModelMetricsRegressionGLM: stackedensemble
** Reported on validation data. **

MSE: 3.7823683235304633
RMSE: 1.9448311812418226
MAE: 0.6349648692732026
RMSLE: NaN
R^2: 0.6708481512246025
Mean Residual Deviance: 3.7823683235304633
Null degrees of freedom: 347
Residual degrees of freedom: 345
Null deviance: 4006.4948578993635
Residual deviance: 1316.2641765886012
AIC: 1458.5431423210332

ModelMetricsRegressionGLM: stackedensemble
** Reported on cross-

In [50]:
bm=stackedensemble_df(aml_leaderboard_df)

In [51]:
bm

['GBM_grid_0_AutoML_20181020_153333_model_0',
 'GLM_grid_0_AutoML_20181020_153333_model_0',
 'DRF_0_AutoML_20181020_153333',
 'XRT_0_AutoML_20181020_153333',
 'DeepLearning_0_AutoML_20181020_153333']

In [52]:
aml_leaderboard_df

Unnamed: 0,model_id,mean_residual_deviance,rmse,mse,mae,rmsle
0,GBM_grid_0_AutoML_20181020_153333_model_0,1.506223,1.227283,1.506223,0.558948,
1,StackedEnsemble_BestOfFamily_0_AutoML_20181020...,1.547755,1.244088,1.547755,0.554031,
2,StackedEnsemble_AllModels_0_AutoML_20181020_15...,1.571258,1.253498,1.571258,0.571155,
3,GBM_grid_0_AutoML_20181020_153333_model_75,1.578576,1.256414,1.578576,0.568941,
4,GBM_grid_0_AutoML_20181020_153333_model_35,1.689829,1.299934,1.689829,0.591543,
5,GBM_grid_0_AutoML_20181020_153333_model_10,1.767443,1.329452,1.767443,0.594771,
6,XRT_0_AutoML_20181020_153333,1.771997,1.331164,1.771997,0.579552,
7,GBM_grid_0_AutoML_20181020_153333_model_77,1.881134,1.371544,1.881134,0.600690,
8,GBM_grid_0_AutoML_20181020_153333_model_41,1.888916,1.374379,1.888916,0.614970,
9,GBM_grid_0_AutoML_20181020_153333_model_59,1.889375,1.374545,1.889375,0.609318,


In [53]:
#  Get best_models and coef_norm()
best_models={}
best_models=stackedensemble(mod_best)
bm=[]
if best_models is not None: 
  if 'Intercept' in best_models.keys():
    del best_models['Intercept']
  bm=list(best_models.keys())
else:
  best_models={}
  bm=stackedensemble_df(aml_leaderboard_df)   
  for b in bm:   
    best_models[b]=None

if mod_best.model_id not in bm:
    bm.append(mod_best.model_id)

In [54]:
bm

['GBM_grid_0_AutoML_20181020_153333_model_0',
 'XRT_0_AutoML_20181020_153333',
 'DRF_0_AutoML_20181020_153333',
 'GLM_grid_0_AutoML_20181020_153333_model_0',
 'DeepLearning_0_AutoML_20181020_153333',
 'StackedEnsemble_BestOfFamily_0_AutoML_20181020_153333']

In [55]:
# Best of Family leaderboard

aml_leaderboard_df=aml_leaderboard_df.loc[aml_leaderboard_df['model_id'].isin(bm)]


In [56]:
aml_leaderboard_df

Unnamed: 0,model_id,mean_residual_deviance,rmse,mse,mae,rmsle
0,GBM_grid_0_AutoML_20181020_153333_model_0,1.506223,1.227283,1.506223,0.558948,
1,StackedEnsemble_BestOfFamily_0_AutoML_20181020...,1.547755,1.244088,1.547755,0.554031,
6,XRT_0_AutoML_20181020_153333,1.771997,1.331164,1.771997,0.579552,
17,DRF_0_AutoML_20181020_153333,2.066301,1.437463,2.066301,0.619731,
51,GLM_grid_0_AutoML_20181020_153333_model_0,2.832115,1.682889,2.832115,0.730447,
59,DeepLearning_0_AutoML_20181020_153333,3.036828,1.74265,3.036828,0.841905,


In [57]:
# save leaderboard
leaderboard_stats=run_id+'_leaderboard.csv'
aml_leaderboard_df.to_csv(leaderboard_stats)

In [58]:
top=aml_leaderboard_df.iloc[0]['model_id']
print(top)

GBM_grid_0_AutoML_20181020_153333_model_0


In [59]:
mod_best=h2o.get_model(top)
print(mod_best._id)
print(mod_best.algo)

GBM_grid_0_AutoML_20181020_153333_model_0
gbm


In [60]:
meta_data['mod_best']=mod_best._id
meta_data['mod_best_algo']=mod_best.algo

In [61]:
meta_data['models']=bm

In [62]:
models_path=os.path.join(run_dir,'models')
for mod in bm:
  try:   
    m=h2o.get_model(mod) 
    h2o.save_model(m, path = models_path)
  except:    
    pass    

In [63]:
print(models_path)

D:\NEU\Courses\BDIA\Assignments\Assignment 2\erv7AqH8C\models


In [64]:
# GBM
 
mod,mod_id=get_model_by_algo("GBM",best_models)
if mod is not None:
    try:     
        sh_df=mod.scoring_history()
        sh_df.to_csv(run_id+'_gbm_scoring_history.csv') 
    except:
        pass   
    try:     
        stats_gbm={}
        stats_gbm=gbm_stats(mod)
        n=run_id+'_gbm_stats.json'
        dict_to_json(stats_gbm,n)
        print(stats_gbm)
    except:
        pass        

{'algo': 'gbm', 'model_id': 'GBM_grid_0_AutoML_20181020_153333_model_0', 'varimp': [('Rank', 26231.212890625, 1.0, 0.4682802846474105), ('Market Value', 13607.345703125, 0.5187463408521322, 0.2429186841540391), ('Assets', 5675.7646484375, 0.2163744647303751, 0.10132389593437115), ('Industry', 4073.401611328125, 0.15528834401645047, 0.07271846993844842), ('Country', 2522.34326171875, 0.09615808739900976, 0.04502893653835886), ('Sales', 2490.11083984375, 0.09492930617530508, 0.044453522517153045), ('Sector', 1415.873291015625, 0.05397666119821917, 0.025276206270218912)]}


In [65]:
# DeepLearning

mod,mod_id=get_model_by_algo("Dee",best_models)


In [66]:
if mod is not None:
    try:    
        sh_df=mod.scoring_history()
        sh_df.to_csv(run_id+'_dl_scoring_history.csv') 
    except:
        pass 
    try:
        stats_dl={}
        stats_dl=dl_stats(mod)
        n=run_id+'_dl_stats.json'
        dict_to_json(stats_dl,n)
        print(stats_dl)
    except:
        pass    
    try:
        cf=mod.confusion_matrix()    
        cf_df.to_csv(run_id+'_dl_confusion_matrix.csv')
    except:
        pass       

{'algo': 'deeplearning', 'model_id': 'DeepLearning_0_AutoML_20181020_153333', 'varimp': [('Country.Russia', 1.0, 1.0, 0.009733006528549224), ('Rank', 0.9645623564720154, 0.9645623564720154, 0.009388091712734949), ('Country.Peru', 0.9462348818778992, 0.9462348818778992, 0.009209710282858595), ('Country.United Kingdom', 0.9207746386528015, 0.9207746386528015, 0.00896190556933027), ('Country.Netherlands', 0.9130305051803589, 0.9130305051803589, 0.00888653186768503), ('Country.Indonesia', 0.8933472633361816, 0.8933472633361816, 0.008694954746312638), ('Industry.Drug Retail', 0.8813813328742981, 0.8813813328742981, 0.00857849026700696), ('Country.Mexico', 0.8637257218360901, 0.8637257218360901, 0.008406648089506556), ('Country.Vietnam', 0.859435498714447, 0.859435498714447, 0.00836489131985467), ('Industry.Department Stores', 0.852350115776062, 0.852350115776062, 0.008295929241458097), ('Industry.Computer Hardware', 0.8421946167945862, 0.8421946167945862, 0.008197085703570719), ('Industry.H




In [67]:
# DRF

mod,mod_id=get_model_by_algo("DRF",best_models)
if mod is not None:
    try:     
         sh_df=mod.scoring_history()
         sh_df.to_csv(run_id+'_drf_scoring_history.csv') 
    except:
         pass  
    try: 
         stats_drf={}
         stats_drf=drf_stats(mod)
         n=run_id+'_drf_stats.json'
         dict_to_json(stats_drf,n)
         print(stats_drf)
    except:
         pass     

In [68]:
# XRT

mod,mod_id=get_model_by_algo("XRT",best_models)
if mod is not None:
    try:     
         sh_df=mod.scoring_history()
         sh_df.to_csv(run_id+'_xrt_scoring_history.csv')
    except:
         pass     
    try:        
         stats_xrt={}
         stats_xrt=xrt_stats(mod)
         n=run_id+'_xrt_stats.json'
         dict_to_json(stats_xrt,n)
         print(stats_xrt)
    except:
         pass     

In [69]:
# GLM

mod,mod_id=get_model_by_algo("GLM",best_models)
if mod is not None:
    try:     
         stats_glm={}
         stats_glm=glm_stats(mod)
         n=run_id+'_glm_stats.json'
         dict_to_json(stats_glm,n)
         print(stats_glm)
    except:
         pass     

{'algo': 'glm', 'model_id': 'GLM_grid_0_AutoML_20181020_153333_model_0', 'coef': {'Intercept': 0.21542927435308945, 'Industry.Advertising': 0.0062168927852227045, 'Industry.Aerospace & Defense': 0.02054984461515262, 'Industry.Air Courier': 0.006455569033592708, 'Industry.Airline': 0.16476440177143503, 'Industry.Aluminum': -0.015445558253727752, 'Industry.Apparel/Accessories': 0.060137574087125785, 'Industry.Apparel/Footwear Retail': 0.03419597378674742, 'Industry.Auto & Truck Manufacturers': 0.20029339387943526, 'Industry.Auto & Truck Parts': 0.145268031933045, 'Industry.Beverages': -0.03215760603439055, 'Industry.Biotechs': 0.21770782800858693, 'Industry.Broadcasting & Cable': 0.0670610395741292, 'Industry.Business & Personal Services': 0.06541606529218305, 'Industry.Business Products & Supplies': -0.027470487953940238, 'Industry.Casinos & Gaming': 0.014560809878560064, 'Industry.Communications Equipment': 0.04673231373524576, 'Industry.Computer & Electronics Retail': 0.01110717017132




In [70]:
predictions_df=predictions_test(mod_best,test,run_id)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [71]:
predictions_df.head()

predict
18.4581
10.4422
8.78674
9.11098
7.82644
7.93705
9.4822
8.56284
11.0072
4.02624




In [72]:
predictions_df.describe()

Rows:191
Cols:1




Unnamed: 0,predict
type,real
mins,-6.235224977098063
mean,1.255693375256239
maxs,18.45809537725048
sigma,2.4333045350872533
zeros,0
missing,0
0,18.45809537725048
1,10.442222556864488
2,8.786742923207692


In [73]:
# Update and save meta data

meta_data['end_time'] = time.time()
meta_data['execution_time'] = meta_data['end_time'] - meta_data['start_time']
  
n=run_id+'_meta_data.json'
dict_to_json(meta_data,n)    


In [74]:
meta_data

{'X': ['Rank',
  'Country',
  'Sector',
  'Industry',
  'Company',
  'Sales',
  'Assets',
  'Market Value'],
 'analysis': 0,
 'balance': False,
 'balance_threshold': 0.2,
 'classification': False,
 'end_time': 1540064359.5927272,
 'execution_time': 348.30799317359924,
 'max_models': 9,
 'min_mem_size': 6,
 'mod_best': 'GBM_grid_0_AutoML_20181020_153333_model_0',
 'mod_best_algo': 'gbm',
 'model_execution_time': 344.58145689964294,
 'models': ['GBM_grid_0_AutoML_20181020_153333_model_0',
  'XRT_0_AutoML_20181020_153333',
  'DRF_0_AutoML_20181020_153333',
  'GLM_grid_0_AutoML_20181020_153333_model_0',
  'DeepLearning_0_AutoML_20181020_153333',
  'StackedEnsemble_BestOfFamily_0_AutoML_20181020_153333'],
 'nthreads': 1,
 'project': None,
 'run_id': 'erv7AqH8C',
 'run_time': 333,
 'scale': False,
 'start_time': 1540064011.284734,
 'target': None,
 'variables': {'Assets': 'real',
  'Company': 'string',
  'Country': 'enum',
  'Industry': 'enum',
  'Market Value': 'real',
  'Profits': 'real',


In [75]:
# Clean up
os.chdir(server_path)

In [76]:
h2o.cluster().shutdown()

H2O session _sid_8e61 closed.
