In [6]:
import h2o
from h2o.automl import H2OAutoML

# Connect to a local H2O cluster; h2o.init() first checks whether an
# instance is already running on localhost and attaches to it if so.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,8 hours 40 mins
H2O_cluster_timezone:,America/Asuncion
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.1
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_matiaslopez_wk0ngf
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,15.21 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [7]:
import pandas as pd
# Name of the response (target) column; it is passed as `y` to AutoML below.
y = "Protestas"
# Load the training table. index_col=False stops pandas from using the
# first CSV column as the index.
balanced_df = pd.read_csv('../balanced_output_35.csv', index_col=False)
# AutoML reference: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
# Unused alternative, kept for reference: a random 80/20 train/test split.
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(balanced_df, test_size=0.2, random_state=42)

## Random undersampling

In [8]:
# Random undersampling of the negative class (rows where `y` is False).
#
# All positive rows are kept. The number of negative rows kept equals
# NEGATIVE_SHARE of the total row count of the loaded frame, so the resulting
# class ratio depends on how imbalanced the input is.
#
# NOTE(review): this cell rebinds `balanced_df`. Re-running it without first
# re-running the load cell samples again from the already reduced frame.
UNDERSAMPLE = True     # set to False to train on the CSV exactly as loaded
NEGATIVE_SHARE = .65   # negatives kept, as a fraction of ALL loaded rows
SAMPLING_SEED = 42     # fixed seed so every run trains on the same rows

if UNDERSAMPLE:
	balanced_df_true = balanced_df[balanced_df[y] == True]
	balanced_df_false = balanced_df[balanced_df[y] == False]
	n_cmp = len(balanced_df_true)
	n_no_cmp = len(balanced_df_false)
	total = n_cmp + n_no_cmp
	# DataFrame.sample raises ValueError when asked for more rows than exist
	# (sampling without replacement), so cap the request at the rows available.
	n_sample = min(int(total * NEGATIVE_SHARE), n_no_cmp)
	sample_false = balanced_df_false.sample(n=n_sample, random_state=SAMPLING_SEED)
	balanced_df = pd.concat([balanced_df_true, sample_false])

## Training

In [9]:
# Upload the pandas training table to the H2O cluster.
train = h2o.H2OFrame.from_python(balanced_df)
# Binary classification needs the response stored as a factor (categorical).
train[y] = train[y].asfactor()
# Predictors are every column except the response.
x = train.columns
x.remove(y)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [10]:
# Run AutoML capped at 40 models (max_models=40), with a fixed seed
aml = H2OAutoML(max_models=40, seed=23)
aml.train(x=x, y=y, training_frame=train)

# View the AutoML leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

AutoML progress: |
00:01:51.888: _train param, Dropping bad and constant columns: [contracts.investmentProjects.id_36, contracts.investmentProjects.id_37, parties.details.legalEntityTypeDetail buyer_15, parties.details.legalEntityTypeDetail buyer_14, parties.details.legalEntityTypeDetail buyer_13, parties.details.legalEntityTypeDetail buyer_12, parties.details.legalEntityTypeDetail buyer_11, parties.details.legalEntityTypeDetail buyer_10, parties.details.EntityType payee_1, parties.details.EntityType payee_2, parties.details.EntityType payee_3, parties.details.EntityType payee_4, planning.items.classification.id.n1_1_56, planning.items.classification.id.n1_1_57, contracts.investmentProjects.id_102, contracts.investmentProjects.id_100, Monto faltante, parties.details.legalEntityTypeDetail notifiedSupplier_18, tender.status_active, contracts.investmentProjects.id_17, tender.statusDetails_En Convocatoria (Abierta), parties.details.EntityType candidate_2, parties.details.EntityType candida

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_1_AutoML_2_20230803_00151,0.983771,0.11134,0.976239,0.0549523,0.170705,0.0291401
StackedEnsemble_BestOfFamily_1_AutoML_2_20230803_00151,0.983336,0.112825,0.975743,0.0554224,0.171617,0.0294525
GBM_grid_1_AutoML_2_20230803_00151_model_7,0.982328,0.11741,0.97475,0.0568376,0.173019,0.0299356
GBM_grid_1_AutoML_2_20230803_00151_model_5,0.982037,0.117296,0.974222,0.0560288,0.173492,0.0300995
GBM_1_AutoML_2_20230803_00151,0.981763,0.118986,0.973831,0.0565484,0.174997,0.0306238
GBM_4_AutoML_2_20230803_00151,0.981478,0.119157,0.973639,0.0564148,0.175565,0.0308232
GBM_grid_1_AutoML_2_20230803_00151_model_4,0.981133,0.122226,0.973295,0.0587552,0.176445,0.0311327
GBM_grid_1_AutoML_2_20230803_00151_model_1,0.98083,0.123478,0.972367,0.0576496,0.178569,0.0318868
XGBoost_grid_1_AutoML_2_20230803_00151_model_5,0.980686,0.125428,0.972837,0.0565642,0.176564,0.0311749
GBM_3_AutoML_2_20230803_00151,0.980493,0.123107,0.972223,0.0577649,0.17892,0.0320123


In [11]:
# Persist the AutoML leader under train2021/smoteModelv7 and keep the returned
# path. force=True overwrites a previously saved model of the same name.
model_path = h2o.save_model(model=aml.leader, path="train2021/smoteModelv7", force=True)

In [12]:
# Show where the leader model was written.
print(model_path)

/Users/matiaslopez/Documents/tesis/tesis-model/Training/train2021/smoteModelv7/StackedEnsemble_AllModels_1_AutoML_2_20230803_00151


In [13]:
# Reload the model from disk to confirm the saved artifact can be restored.
# NOTE(review): `saved_model` is not referenced by the cells visible here; the
# evaluation below uses the in-memory AutoML models instead.
saved_model = h2o.load_model(model_path)

## Testing 2022 Data

In [14]:
# Hold-out data: read the table and keep only the rows dated 2022.
df_2022 = (
	pd.read_csv('../df_dummizado_from_2021.csv', index_col=False)
	.loc[lambda frame: frame['date.year'] == 2022]
)
# Upload to H2O and make the response a factor, as was done for training.
test_2022 = h2o.H2OFrame.from_python(df_2022)
test_2022[y] = test_2022[y].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [25]:
def get_performance(model, test: "h2o.H2OFrame") -> "tuple[float, float]":
	"""Return the per-class recall of ``model`` on ``test``.

	The confusion matrix comes from
	``model.model_performance(test).confusion_matrix().to_list()`` and is read
	as ``[[TN, FP], [FN, TP]]``: rows are the actual class, columns the
	predicted class, negative class first.

	NOTE(review): no threshold is passed to ``confusion_matrix()``, so H2O
	picks one itself (presumably its default max-F1 threshold computed on
	``test``) -- confirm this is the intended operating point.

	Args:
		model: a fitted H2O binary classifier.
		test: labelled frame to score the model on.

	Returns:
		``(perf_t, perf_f)``: recall of the positive class and recall of the
		negative class. A value is ``nan`` when ``test`` contains no rows of
		that class, so an empty class is reported instead of raising
		``ZeroDivisionError``.
	"""
	cm = model.model_performance(test).confusion_matrix().to_list()
	tn, fp = cm[0][0], cm[0][1]
	fn, tp = cm[1][0], cm[1][1]
	n_pos = fn + tp
	n_neg = tn + fp
	perf_t = tp / n_pos if n_pos else float("nan")
	perf_f = tn / n_neg if n_neg else float("nan")
	return perf_t, perf_f

In [29]:
# Pick the leaderboard model with the highest balanced accuracy on the 2022 data.
#
# Balanced accuracy is the mean of the two per-class recalls returned by
# get_performance. Ranking on one scalar makes the winner independent of
# leaderboard order. Requiring a candidate to beat the incumbent on BOTH
# recalls at once would keep the first model unless a later one dominated it,
# and would never select a model that trades some negative recall for a large
# gain in positive recall.
#
# NOTE(review): the model is selected on the same frame that is later used to
# report its performance, which makes the reported numbers optimistic.
# Consider selecting on a separate validation split.
best = ''
best_score_t = 0
best_score_f = 0
best_balanced = -1.0  # below any attainable value, so the first scored model is accepted
df_leaderboard = aml.leaderboard.as_data_frame()
model_ids = df_leaderboard['model_id']
for model_id in model_ids:
	model = h2o.get_model(model_id)
	score_t, score_f = get_performance(model, test_2022)
	print(model_id, score_t, score_f)
	balanced = (score_t + score_f) / 2
	# A nan score (class absent from the frame) compares False and is skipped.
	if balanced > best_balanced:
		best_balanced = balanced
		best_score_t = score_t
		best_score_f = score_f
		best = model_id

print(best, best_score_t, best_score_f)

StackedEnsemble_AllModels_1_AutoML_2_20230803_00151 0.4707655213984328 0.9492660452159005
StackedEnsemble_BestOfFamily_1_AutoML_2_20230803_00151 0.5006027727546715 0.9294439087110254
GBM_grid_1_AutoML_2_20230803_00151_model_7 0.47377938517179025 0.9139612129004607
GBM_grid_1_AutoML_2_20230803_00151_model_5 0.48884870403857744 0.9214079074252652
GBM_1_AutoML_2_20230803_00151 0.47106690777576854 0.9199614271938283
GBM_4_AutoML_2_20230803_00151 0.5301386377335744 0.8835851280402871
GBM_grid_1_AutoML_2_20230803_00151_model_4 0.4804098854731766 0.9248366013071896
GBM_grid_1_AutoML_2_20230803_00151_model_1 0.4590114526823388 0.9245151612557592
XGBoost_grid_1_AutoML_2_20230803_00151_model_5 0.5819770946353224 0.8502089360334297
GBM_3_AutoML_2_20230803_00151 0.5295358649789029 0.8836387013821922
XGBoost_grid_1_AutoML_2_20230803_00151_model_8 0.47649186256781195 0.9187828136719168
GBM_grid_1_AutoML_2_20230803_00151_model_6 0.47528631705846897 0.9051216114861245
GBM_2_AutoML_2_20230803_00151 0.4

In [30]:
# Fetch the model chosen from the 2022 comparison and show its confusion matrix.
# NOTE(review): the id is hard-coded and carries the AutoML run stamp
# (AutoML_2_20230803_00151). h2o.get_model presumably only finds it while that
# run's models are still in the connected cluster; after a fresh AutoML run the
# id must be updated (or taken from the `best` variable of the selection cell).
best_model = h2o.get_model('DeepLearning_grid_1_AutoML_2_20230803_00151_model_3')
best_model.model_performance(test_2022).confusion_matrix()

Unnamed: 0,False,True,Error,Rate
False,15442.0,3224.0,0.1727,(3224.0/18666.0)
True,1076.0,2242.0,0.3243,(1076.0/3318.0)
Total,16518.0,5466.0,0.1956,(4300.0/21984.0)


In [21]:
# Same confusion matrix as nested lists: [[actual False row], [actual True row]].
# NOTE(review): this cell's execution count (In [21]) predates the cells around
# it, and its recorded output does not match the table shown for In [30].
# Re-run it to refresh the output.
best_model.model_performance(test_2022).confusion_matrix().to_list()

[[4702, 13964], [240, 3078]]

In [31]:
# Save the selected model under train2021/smoteModelv8, separate from the
# AutoML leader saved earlier (smoteModelv7). This rebinds `model_path`.
model_path = h2o.save_model(model=best_model, path="train2021/smoteModelv8", force=True)