In [48]:
import h2o
from h2o.automl import H2OAutoML

# Attach this session to a local H2O cluster. The recorded output below shows
# it connecting to an instance that was already running on localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,6 hours 30 mins
H2O_cluster_timezone:,America/Asuncion
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,3 months and 3 days
H2O_cluster_name:,H2O_from_python_davidnunez_ai0opn
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.969 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [49]:
import pandas as pd
# Name of the response (target) column; later cells treat it as a True/False label.
y = "Protestas"
# Training table read from CSV. The next cell re-samples it and overwrites this name.
balanced_df = pd.read_csv('../balanced_output_35.csv', index_col=False)
# H2O AutoML reference: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
# Disabled alternative (scikit-learn hold-out split), kept for reference only:
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(balanced_df, test_size=0.2, random_state=42)

In [50]:
# Random undersampling of the negative class (rows where y is False).
# Every positive row is kept; the negative rows are replaced by a random subset.
APPLY_UNDERSAMPLING = True  # set to False to train on balanced_df exactly as loaded
UNDERSAMPLING_SEED = 24  # fixed seed so that a re-run draws the same rows
if APPLY_UNDERSAMPLING:
	balanced_df_true = balanced_df[balanced_df[y] == True]
	balanced_df_false = balanced_df[balanced_df[y] == False]
	n_cmp = len(balanced_df_true)
	n_no_cmp = len(balanced_df_false)
	total = n_cmp + n_no_cmp
	# Ask for as many negatives as 65% of the current row count, but never more
	# than exist: DataFrame.sample raises ValueError when asked for a larger
	# sample than the population (which happens if this cell is run twice,
	# because it overwrites its own input).
	# NOTE(review): this yields a 65/35 negative/positive split only when the
	# positives already make up about 35% of the input -- confirm the intended ratio.
	n_sample_false = min(int(total * .65), n_no_cmp)
	sample_false = balanced_df_false.sample(n_sample_false, random_state=UNDERSAMPLING_SEED)
	balanced_df = pd.concat([balanced_df_true, sample_false])

In [51]:
# Upload the pandas training table to the H2O cluster.
train = h2o.H2OFrame.from_python(balanced_df)
# H2O needs a factor (categorical) response to run binary classification.
train[y] = train[y].asfactor()
# Predictors are all columns of the frame except the response.
x = train.columns
x.remove(y)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [52]:
# Train AutoML, capped at 10 base models, with a fixed seed.
aml = H2OAutoML(
	max_models=10,
	seed=24,
)
aml.train(training_frame=train, x=x, y=y)

# Show the full AutoML leaderboard; head() would otherwise stop at 10 rows.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

AutoML progress: |
00:40:34.781: AutoML: XGBoost is not available; skipping it.
00:40:34.787: _train param, Dropping bad and constant columns: [contracts.investmentProjects.id_30, contracts.investmentProjects.id_37, parties.details.legalEntityTypeDetail buyer_15, parties.details.legalEntityTypeDetail buyer_14, parties.details.legalEntityTypeDetail buyer_13, parties.details.legalEntityTypeDetail buyer_12, parties.details.legalEntityTypeDetail buyer_11, parties.details.legalEntityTypeDetail buyer_10, parties.details.EntityType payee_1, parties.details.EntityType payee_2, parties.details.EntityType payee_3, parties.details.EntityType payee_4, planning.items.classification.id.n1_1_56, planning.items.classification.id.n1_1_57, contracts.investmentProjects.id_102, contracts.investmentProjects.id_100, Monto faltante, parties.details.legalEntityTypeDetail notifiedSupplier_18, tender.status_active, contracts.investmentProjects.id_17, tender.statusDetails_En Convocatoria (Abierta), parties.detai

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_1_AutoML_8_20230801_04034,0.98325,0.115039,0.975306,0.0569815,0.173944,0.0302565
StackedEnsemble_BestOfFamily_1_AutoML_8_20230801_04034,0.982634,0.118152,0.974342,0.0575817,0.176399,0.0311165
GBM_grid_1_AutoML_8_20230801_04034_model_1,0.981488,0.119958,0.97351,0.0572734,0.176002,0.0309766
GBM_1_AutoML_8_20230801_04034,0.981437,0.120108,0.973444,0.057507,0.175737,0.0308834
GBM_4_AutoML_8_20230801_04034,0.980872,0.12076,0.972984,0.0566863,0.176507,0.0311547
GBM_3_AutoML_8_20230801_04034,0.980623,0.123439,0.972266,0.0590592,0.179362,0.0321708
GBM_2_AutoML_8_20230801_04034,0.980446,0.126258,0.971434,0.0617359,0.182138,0.0331744
DRF_1_AutoML_8_20230801_04034,0.978869,0.178253,0.967184,0.0674985,0.211321,0.0446565
GBM_5_AutoML_8_20230801_04034,0.976576,0.148378,0.964633,0.0705403,0.19869,0.0394777
XRT_1_AutoML_8_20230801_04034,0.966639,0.311313,0.943214,0.086777,0.292208,0.0853857


In [43]:
# Persist the AutoML leader under a path relative to the kernel's working
# directory; save_model returns the full path of the written model.
# force=True presumably overwrites an existing file of the same name -- confirm in the H2O docs.
model_path = h2o.save_model(model=aml.leader, path="train2021/smoteModelv7", force=True)

In [44]:
# Show where the model was written.
# NOTE(review): the printed value is an absolute local path; clear this output before sharing.
print(model_path)

/Users/davidnunez/Desktop/tesis/tesis-model/Training/train2021/smoteModelv7/StackedEnsemble_AllModels_1_AutoML_7_20230731_234018


In [37]:
# Reload the persisted model from disk.
# NOTE(review): this cell's execution count is lower than that of the cell that
# defines model_path, so it was last run out of order; saved_model is also not
# used by any of the visible evaluation cells -- confirm it is still needed.
saved_model = h2o.load_model(model_path)

## Testing 2022 Data

In [53]:
# Read the dummy-encoded table and keep only the rows whose 'date.year' is 2022.
df_2022 = pd.read_csv('../df_dummizado_from_2021.csv', index_col=False).loc[
	lambda frame: frame['date.year'] == 2022
]
# Upload to H2O and make the response a factor, mirroring the training frame.
test_2022 = h2o.H2OFrame.from_python(df_2022)
test_2022[y] = test_2022[y].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [90]:
def get_performance(model, test: "h2o.H2OFrame") -> float:
	"""Return the share of row 1 of the confusion matrix that falls in column 1.

	The confusion matrix comes from ``model.model_performance(test)`` with
	default arguments. Assuming H2O's usual layout (rows = actual class,
	columns = predicted class -- verify), this is the recall of the positive
	class: TP / (FN + TP).

	Args:
		model: Object exposing ``model_performance(frame)``, e.g. an H2O model.
		test: Frame to score; it must contain the response column.

	Returns:
		The ratio described above as a float, or ``0.0`` when row 1 is empty
		(no positive rows in ``test``) instead of raising ZeroDivisionError.
	"""
	# NOTE(review): with no arguments H2O picks the decision threshold itself
	# (max-F1 on the frame being scored, per the H2O docs) -- confirm this is
	# acceptable for a held-out test frame.
	cm = model.model_performance(test).confusion_matrix().to_list()
	false_negatives, true_positives = cm[1][0], cm[1][1]
	actual_positives = false_negatives + true_positives
	if actual_positives == 0:
		return 0.0
	return true_positives / actual_positives

In [92]:
# Score every leaderboard model on the 2022 frame and keep the first one with
# the strictly highest score (best stays '' / 0 if nothing scores above zero).
# NOTE(review): this picks the winner on the same frame used to report its
# score, so the printed number is an optimistic estimate -- confirm intent.
df_leaderboard = aml.leaderboard.as_data_frame()
model_ids = df_leaderboard['model_id']

best, best_score = '', 0
for model_id in model_ids:
	model = h2o.get_model(model_id)
	score = get_performance(model, test_2022)
	if score > best_score:
		best, best_score = model_id, score

print(best, best_score)

StackedEnsemble_BestOfFamily_1_AutoML_8_20230801_04034 0.554249547920434
