In [42]:
PARAMETERS = {
	'BALANCED_DATASET_CSV_PATH': '../balanced_output_35.csv',
	'TARGET_COLUMN': 'Protestas',
	'USE_RANDOM_UNDERSAMPLING': True,
	'NO_COMPLAINTS_%': 0.65,
	'SAVE_LEADER_MODEL': True,
	'SAVED_MODEL_PATH': 'train2022/smoteModelv1',
	'TEST_DATA_CSV': '../test_2023.csv',
	
}
import h2o
from h2o.automl import H2OAutoML
aml = H2OAutoML(max_models=5, seed=23, stopping_metric="AUCPR", sort_metric="AUCPR")

In [43]:

# Start the H2O cluster (locally)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,17 hours 0 mins
H2O_cluster_timezone:,America/Asuncion
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.1
H2O_cluster_version_age:,1 month and 19 days
H2O_cluster_name:,H2O_from_python_davidnunez_w6en3c
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.799 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [44]:
import pandas as pd
y = PARAMETERS['TARGET_COLUMN']
balanced_df = pd.read_csv(PARAMETERS['BALANCED_DATASET_CSV_PATH'], index_col=False)
# https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(balanced_df, test_size=0.2, random_state=42)

## Random undersampling

In [45]:
balanced_df_true = balanced_df[balanced_df[y] == True]
balanced_df_false = balanced_df[balanced_df[y] == False]
n_cmp = len(balanced_df_true)
n_no_cmp = len(balanced_df_false)
n_cmp / (n_cmp + n_no_cmp)

0.2592568871972698

In [46]:
# Random undersampling
if PARAMETERS['USE_RANDOM_UNDERSAMPLING']:
	balanced_df_true = balanced_df[balanced_df[y] == True]
	balanced_df_false = balanced_df[balanced_df[y] == False]
	n_cmp = len(balanced_df_true)
	n_no_cmp = len(balanced_df_false)
	total = n_cmp + n_no_cmp
	sample_false = balanced_df_false.sample(int(total*PARAMETERS['NO_COMPLAINTS_%'])) #Sample 70% no complaints 30 % complaints
	#TODO: Analisis de sensibilidad de este parametro 
	balanced_df = pd.concat([balanced_df_true, sample_false])

## Training

In [47]:
train = h2o.H2OFrame.from_python(balanced_df)
# Identify predictors and response
x = train.columns
x.remove(y)
# For binary classification, response should be a factor
train[y] = train[y].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [48]:
# Run AutoML for 10 base models
aml.train(x=x, y=y, training_frame=train)

# View the AutoML Leader
# board
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

AutoML progress: |
08:15:36.142: AutoML: XGBoost is not available; skipping it.
08:15:36.147: _train param, Dropping bad and constant columns: [tender.procurementMethod_open, parties.details.legalEntityTypeDetail buyer_15, parties.details.legalEntityTypeDetail buyer_14, parties.details.legalEntityTypeDetail buyer_13, parties.details.legalEntityTypeDetail buyer_12, parties.details.legalEntityTypeDetail buyer_11, parties.details.legalEntityTypeDetail buyer_10, parties.details.EntityType payee_1, parties.details.EntityType payee_2, parties.details.EntityType payee_3, parties.details.EntityType payee_4, planning.items.classification.id.n1_1_57, planning.items.classification.id.n1_1_58, Monto faltante, tender.status_active, tender.statusDetails_En Convocatoria (Abierta), parties.details.EntityType candidate_2, parties.details.EntityType candidate_1, parties.details.EntityType candidate_4, parties.details.legalEntityTypeDetail candidate_1, awards.statusDetails_2, parties.details.EntityType c

model_id,aucpr,auc,logloss,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_1_AutoML_2_20230809_81536,0.967501,0.979775,0.139971,0.07027,0.197084,0.0388419
StackedEnsemble_BestOfFamily_1_AutoML_2_20230809_81536,0.967467,0.979748,0.139988,0.0704329,0.197097,0.0388473
GBM_1_AutoML_2_20230809_81536,0.964775,0.978218,0.151889,0.0713721,0.203226,0.0413009
DRF_1_AutoML_2_20230809_81536,0.961728,0.976461,0.182656,0.0734592,0.218233,0.0476255
GBM_3_AutoML_2_20230809_81536,0.956875,0.973056,0.170645,0.0822657,0.216012,0.0466613
GBM_2_AutoML_2_20230809_81536,0.952556,0.970542,0.18021,0.0871124,0.222772,0.0496273
GLM_1_AutoML_2_20230809_81536,0.929896,0.953085,0.216744,0.110594,0.247147,0.0610819


In [49]:
if PARAMETERS['SAVE_LEADER_MODEL']:
	model_path = h2o.save_model(model=aml.leader, path=PARAMETERS['SAVED_MODEL_PATH'], force=True)
	print(model_path)

/Users/davidnunez/Desktop/tesis/tesis-model/Training/train2022/smoteModelv1/StackedEnsemble_AllModels_1_AutoML_2_20230809_81536


## Testing 2023 Data

In [50]:
df_test = pd.read_csv(PARAMETERS['TEST_DATA_CSV'], index_col=False)
test_frame = h2o.H2OFrame.from_python(df_test)
test_frame[y] = test_frame[y].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [51]:
def get_performance(model, test: h2o.H2OFrame):
	cm = model.model_performance(test).confusion_matrix().to_list()
	perf_t = cm[1][1] / (cm[1][0] + cm[1][1])
	perf_f = cm[0][0] / (cm[0][1] + cm[0][0])
	return perf_t, perf_f

In [52]:
best = ''
best_score_t = 0
best_score_f = 0
df_leaderboard = aml.leaderboard.as_data_frame()
model_ids = df_leaderboard['model_id']
for model_id in model_ids:
	model = h2o.get_model(model_id)
	score_t, score_f = get_performance(model, test_frame)
	print(model_id, score_t, score_f)
	if score_t > best_score_t and score_f > best_score_f:
		best_score_t = score_t
		best_score_f = score_f
		best = model_id

print(best, best_score_t, best_score_f)

StackedEnsemble_AllModels_1_AutoML_2_20230809_81536 0.7373595505617978 0.9048099987375331
StackedEnsemble_BestOfFamily_1_AutoML_2_20230809_81536 0.7373595505617978 0.9048099987375331
GBM_1_AutoML_2_20230809_81536 0.7510533707865169 0.9001388713546269
DRF_1_AutoML_2_20230809_81536 0.7131320224719101 0.8891554096704961
GBM_3_AutoML_2_20230809_81536 0.7120786516853933 0.9231157682110844
GBM_2_AutoML_2_20230809_81536 0.7808988764044944 0.8809493750789041
GLM_1_AutoML_2_20230809_81536 0.7103230337078652 0.8954677439717208
StackedEnsemble_AllModels_1_AutoML_2_20230809_81536 0.7373595505617978 0.9048099987375331
