In [27]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,4 hours 16 mins
H2O_cluster_timezone:,America/Asuncion
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.1
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_matiaslopez_sbpkvc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,15.09 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [28]:
import pandas as pd
y = "Protestas"
balanced_df = pd.read_csv('../balanced_output_35.csv', index_col=False)
# https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(balanced_df, test_size=0.2, random_state=42)

## Random undersampling

In [29]:
# Random undersampling
if True:
	balanced_df_true = balanced_df[balanced_df[y] == True]
	balanced_df_false = balanced_df[balanced_df[y] == False]
	n_cmp = len(balanced_df_true)
	n_no_cmp = len(balanced_df_false)
	total = n_cmp + n_no_cmp
	sample_false = balanced_df_false.sample(int(total*.65)) #Sample 70% no complaints 30 % complaints
	balanced_df = pd.concat([balanced_df_true, sample_false])

## Training

In [30]:
train = h2o.H2OFrame.from_python(balanced_df)
# Identify predictors and response
x = train.columns
x.remove(y)
# For binary classification, response should be a factor
train[y] = train[y].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [31]:
# Run AutoML for 10 base models
aml = H2OAutoML(max_models=42, seed=23, stopping_metric="RMSE", sort_metric="RMSE")
aml.train(x=x, y=y, training_frame=train)

# View the AutoML Leader
# board
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

AutoML progress: |
15:04:44.992: _train param, Dropping bad and constant columns: [contracts.investmentProjects.id_37, parties.details.legalEntityTypeDetail buyer_15, parties.details.legalEntityTypeDetail buyer_14, parties.details.legalEntityTypeDetail buyer_13, parties.details.legalEntityTypeDetail buyer_12, parties.details.legalEntityTypeDetail buyer_11, parties.details.legalEntityTypeDetail buyer_10, parties.details.EntityType payee_1, parties.details.EntityType payee_2, parties.details.EntityType payee_3, parties.details.EntityType payee_4, planning.items.classification.id.n1_1_56, planning.items.classification.id.n1_1_57, contracts.investmentProjects.id_102, contracts.investmentProjects.id_100, Monto faltante, parties.details.legalEntityTypeDetail notifiedSupplier_18, tender.status_active, contracts.investmentProjects.id_17, tender.statusDetails_En Convocatoria (Abierta), parties.details.EntityType candidate_2, parties.details.EntityType candidate_1, parties.details.EntityType can

model_id,rmse,auc,logloss,aucpr,mean_per_class_error,mse
StackedEnsemble_AllModels_1_AutoML_2_20230803_150444,0.170806,0.983556,0.111544,0.976213,0.0560647,0.0291745
StackedEnsemble_BestOfFamily_1_AutoML_2_20230803_150444,0.171583,0.983267,0.112723,0.975831,0.0564435,0.0294408
GBM_grid_1_AutoML_2_20230803_150444_model_8,0.173465,0.982336,0.11878,0.974915,0.0579127,0.03009
GBM_grid_1_AutoML_2_20230803_150444_model_7,0.173857,0.98225,0.119337,0.974899,0.0570869,0.0302262
GBM_grid_1_AutoML_2_20230803_150444_model_5,0.174133,0.981957,0.117933,0.974272,0.0568691,0.0303222
XGBoost_grid_1_AutoML_2_20230803_150444_model_5,0.175801,0.980532,0.126829,0.972944,0.0587378,0.030906
GBM_1_AutoML_2_20230803_150444,0.17686,0.981048,0.121488,0.973018,0.0578978,0.0312794
GBM_4_AutoML_2_20230803_150444,0.17772,0.981122,0.121367,0.972978,0.0584578,0.0315844
GBM_grid_1_AutoML_2_20230803_150444_model_4,0.177743,0.980646,0.12507,0.97253,0.0573703,0.0315925
GBM_grid_1_AutoML_2_20230803_150444_model_1,0.179352,0.980849,0.124077,0.972135,0.0577016,0.0321671


In [32]:
model_path = h2o.save_model(model=aml.leader, path="train2021/smoteModelv7", force=True)

In [33]:
print(model_path)

/Users/matiaslopez/Documents/tesis/tesis-model/Training/train2021/smoteModelv7/StackedEnsemble_AllModels_1_AutoML_2_20230803_150444


In [34]:
saved_model = h2o.load_model(model_path)

## Testing 2022 Data

In [35]:
df_2022 = pd.read_csv('../df_dummizado_from_2021.csv', index_col=False)
df_2022 = df_2022[df_2022['date.year'] == 2022]
test_2022 = h2o.H2OFrame.from_python(df_2022)
test_2022[y] = test_2022[y].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [36]:
def get_performance(model, test: h2o.H2OFrame):
	cm = model.model_performance(test).confusion_matrix().to_list()
	perf_t = cm[1][1] / (cm[1][0] + cm[1][1])
	perf_f = cm[0][0] / (cm[0][1] + cm[0][0])
	return perf_t, perf_f

In [37]:
best = ''
best_score_t = 0
best_score_f = 0
df_leaderboard = aml.leaderboard.as_data_frame()
model_ids = df_leaderboard['model_id']
for model_id in model_ids:
	model = h2o.get_model(model_id)
	score_t, score_f = get_performance(model, test_2022)
	print(model_id, score_t, score_f)
	if score_t > best_score_t and score_f > best_score_f:
		best_score_t = score_t
		best_score_f = score_f
		best = model_id

print(best, best_score_t, best_score_f)

StackedEnsemble_AllModels_1_AutoML_2_20230803_150444 0.5241109101868595 0.9201221472195436
StackedEnsemble_BestOfFamily_1_AutoML_2_20230803_150444 0.48523206751054854 0.9347476695596272
GBM_grid_1_AutoML_2_20230803_150444_model_8 0.5033152501506932 0.8979427836708453
GBM_grid_1_AutoML_2_20230803_150444_model_7 0.498794454490657 0.9097289188899603
GBM_grid_1_AutoML_2_20230803_150444_model_5 0.4804098854731766 0.9063537983499411
XGBoost_grid_1_AutoML_2_20230803_150444_model_5 0.5334538878842676 0.8992821172184721
GBM_1_AutoML_2_20230803_150444 0.4843279083785413 0.924461587913854
GBM_4_AutoML_2_20230803_150444 0.47166968053044 0.8957998499946427
GBM_grid_1_AutoML_2_20230803_150444_model_4 0.5216998191681737 0.9093003321547198
GBM_grid_1_AutoML_2_20230803_150444_model_1 0.5012055455093429 0.9208186006643094
XGBoost_grid_1_AutoML_2_20230803_150444_model_7 0.4972875226039783 0.8816564877317047
XGBoost_grid_1_AutoML_2_20230803_150444_model_1 0.5443037974683544 0.8750133933354762
XGBoost_grid

In [57]:
best_model = h2o.get_model('DeepLearning_grid_1_AutoML_2_20230803_150444_model_3')
best_model.model_performance(test_2022).confusion_matrix()

Unnamed: 0,False,True,Error,Rate
False,16280.0,2386.0,0.1278,(2386.0/18666.0)
True,1418.0,1900.0,0.4274,(1418.0/3318.0)
Total,17698.0,4286.0,0.173,(3804.0/21984.0)


In [53]:
best_model.model_performance(test_2022).confusion_matrix().to_list()

[[15527, 3139], [1102, 2216]]

In [58]:
model_path = h2o.save_model(model=best_model, path="train2021/smoteModelv8", force=True)

In [55]:
saved_model = h2o.load_model("/Users/matiaslopez/Documents/tesis/tesis-model/Training/train2021/smoteModelv8/DeepLearning_grid_1_AutoML_2_20230803_00151_model_3")

In [56]:
saved_model

Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,776,Input,10.0,,,,,,,,,
,2,50,RectifierDropout,50.0,0.0,0.0,0.2046173,0.3415112,0.0,0.1868293,0.7766852,0.5849996,0.4228494
,3,2,Softmax,,0.0,0.0,0.0001095,0.0001066,0.0,-0.0248836,0.684427,-0.4636504,1.7344742

Unnamed: 0,False,True,Error,Rate
False,6079.0,984.0,0.1393,(984.0/7063.0)
True,311.0,2561.0,0.1083,(311.0/2872.0)
Total,6390.0,3545.0,0.1303,(1295.0/9935.0)

metric,threshold,value,idx
max f1,0.9999525,0.7981923,1.0
max f2,0.9999525,0.8517927,1.0
max f0point5,0.9999525,0.7509383,1.0
max accuracy,0.9999525,0.8696527,1.0
max precision,0.9999968,0.7258501,0.0
max recall,0.7956351,1.0,324.0
max specificity,0.9999968,0.871018,0.0
max absolute_mcc,0.9999525,0.711991,1.0
max min_per_class_accuracy,0.9999525,0.8606824,1.0
max mean_per_class_accuracy,0.9999525,0.8761978,1.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0663312,1.0,3.3490274,3.3490274,0.9681335,1.0,0.9681335,1.0,0.2221448,0.2221448,234.9027394,234.9027394,0.2191716
2,0.100151,1.0,3.366603,3.3549625,0.9732143,1.0,0.9698492,1.0,0.1138579,0.3360028,236.6603039,235.4962487,0.3317553
3,0.1500755,1.0,3.3128012,3.340937,0.9576613,1.0,0.9657948,1.0,0.16539,0.5013928,231.2801156,234.0936987,0.494172
4,0.2,1.0,2.9850082,3.2520891,0.8629032,1.0,0.9401107,1.0,0.1490251,0.6504178,198.5008199,225.2089136,0.6335695
5,0.3000503,0.9999897,1.7713926,2.7583581,0.5120724,0.9999981,0.7973834,0.9999994,0.1772284,0.8276462,77.1392632,175.8358064,0.7421302
6,0.4,0.9998072,0.7838207,2.2649721,0.2265861,0.9999295,0.6547559,0.9999819,0.0783426,0.9059889,-21.6179342,126.4972145,0.7117371
7,0.5000503,0.9990347,0.3236533,1.8765521,0.0935614,0.9994853,0.5424718,0.9998825,0.0323816,0.9383705,-67.6346729,87.6552064,0.6165525
8,0.6,0.9970138,0.2647572,1.6080548,0.0765358,0.9981582,0.4648549,0.9995953,0.0264624,0.9648329,-73.52428,60.8054782,0.5131834
9,0.6999497,0.9926217,0.1567641,1.400817,0.0453172,0.9951046,0.4049468,0.998954,0.0156685,0.9805014,-84.3235868,40.0816988,0.3946314
10,0.8,0.9825852,0.0800433,1.2356372,0.0231388,0.988424,0.3571968,0.9976371,0.0080084,0.9885097,-91.9956718,23.5637187,0.2651627

Unnamed: 0,False,True,Error,Rate
False,44372.0,1906.0,0.0412,(1906.0/46278.0)
True,3566.0,14892.0,0.1932,(3566.0/18458.0)
Total,47938.0,16798.0,0.0845,(5472.0/64736.0)

metric,threshold,value,idx
max f1,0.5301278,0.8447924,138.0
max f2,0.1655192,0.8536157,261.0
max f0point5,0.8513016,0.8938131,56.0
max accuracy,0.5807504,0.9158891,125.0
max precision,0.9999724,0.9894619,0.0
max recall,0.0001435,1.0,399.0
max specificity,0.9999724,0.9981633,0.0
max absolute_mcc,0.5807504,0.7890772,125.0
max min_per_class_accuracy,0.1991079,0.8830859,245.0
max mean_per_class_accuracy,0.3372647,0.8889838,194.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.031976,1.0,3.464848,3.464848,0.9879227,1.0,0.9879227,1.0,0.1107921,0.1107921,246.4847993,246.4847993,0.1102519
2,0.0408583,1.0,3.4828076,3.4687523,0.9930435,1.0,0.9890359,1.0,0.0309351,0.1417272,248.2807596,246.8752254,0.1411005
3,0.050312,0.9999999,3.4842826,3.4716705,0.9934641,0.9999999,0.989868,1.0,0.0329396,0.1746668,248.4282636,247.1670459,0.1739537
4,0.1000216,0.9999703,3.4799588,3.4757897,0.9922312,0.9999941,0.9910425,0.9999971,0.1729873,0.3476541,247.9958768,247.5789652,0.3464008
5,0.1500093,0.9983749,3.4248361,3.4588104,0.9765142,0.9995619,0.9862012,0.9998521,0.1711995,0.5188536,242.4836073,245.8810373,0.5159581
6,0.2000124,0.9445512,3.1431583,3.3798973,0.8962002,0.9839603,0.963701,0.9958791,0.1571676,0.6760212,214.3158262,237.9897345,0.6658652
7,0.3000031,0.3013967,1.7950521,2.85167,0.5118183,0.614745,0.8130889,0.8688475,0.1794886,0.8555098,79.5052059,185.1669986,0.7770708
8,0.4000093,0.1171714,0.56449,2.2798529,0.1609515,0.1836912,0.6500483,0.6975518,0.0564525,0.9119623,-43.5510012,127.9852905,0.7161457
9,0.5,0.0780173,0.2313574,1.8701918,0.0659663,0.0929947,0.5332427,0.5766516,0.0231336,0.9350959,-76.8642551,87.0191787,0.6086341
10,0.6000062,0.0626149,0.1321838,1.5805089,0.0376892,0.0710135,0.4506462,0.4923742,0.0132192,0.9483151,-86.7816164,58.050888,0.4872321

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.9209713,0.0032745,0.9233086,0.9165058,0.9187456,0.924384,0.9219124
auc,0.9461826,0.0072538,0.9502347,0.934201,0.9445795,0.9520635,0.9498344
err,0.0790287,0.0032745,0.0766914,0.0834942,0.0812544,0.075616,0.0780876
err_count,1023.2,42.3816,993.0,1081.0,1052.0,979.0,1011.0
f0point5,0.8772144,0.0055595,0.8830195,0.8733522,0.8697264,0.8786504,0.8813233
f1,0.8557422,0.0067242,0.8595672,0.8459455,0.8529494,0.863592,0.8566567
f2,0.8353434,0.0103304,0.8373285,0.8202067,0.8368075,0.8490411,0.8333333
lift_top_group,3.4208758,0.0841019,3.2912242,3.4258459,3.4797962,3.3997915,3.5077214
logloss,0.2958856,0.0467591,0.2792144,0.3471147,0.2531977,0.2552963,0.3446047
max_per_class_error,0.1777,0.0129435,0.1768689,0.1960997,0.1736186,0.1603901,0.1815226

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
,2023-08-03 02:44:35,0.000 sec,,0.0,0,0.0,,,,,,,
,2023-08-03 02:44:36,10 min 41.937 sec,81666 obs/sec,1.542851,1,99878.0,0.3316991,0.4373544,0.4646342,0.943769,0.9062317,3.1133357,0.083845
,2023-08-03 02:44:42,10 min 47.370 sec,105863 obs/sec,10.7979486,7,699016.0,0.2507148,0.2529652,0.6941405,0.9445303,0.9223803,3.3594754,0.0777051
,2023-08-03 02:44:47,10 min 52.593 sec,110386 obs/sec,20.0734213,13,1299473.0,0.2641551,0.2817792,0.6604684,0.9530862,0.9303122,3.355484,0.0765979
,2023-08-03 02:44:52,10 min 58.082 sec,111082 obs/sec,29.3407378,19,1899402.0,0.292954,0.5223259,0.5823997,0.9292028,0.9120363,3.4246692,0.0804227
,2023-08-03 02:44:58,11 min 3.491 sec,111418 obs/sec,38.6184967,25,2500007.0,0.5593798,1.0820155,-0.5225643,0.9452855,0.8902426,3.3262133,0.0847509
,2023-08-03 02:45:03,11 min 9.016 sec,111074 obs/sec,47.8845619,31,3099855.0,0.2440471,0.223826,0.7101926,0.9526548,0.9334248,3.4246692,0.0726724
,2023-08-03 02:45:09,11 min 14.429 sec,111308 obs/sec,57.1673566,37,3700786.0,0.5365901,7.0881896,-0.40103,0.6517616,0.5434082,3.3900766,0.2317061
,2023-08-03 02:45:14,11 min 20.142 sec,110547 obs/sec,66.4331284,43,4300615.0,0.2603179,0.2723971,0.6702612,0.9451884,0.9286324,3.4246692,0.0721691
,2023-08-03 02:45:17,11 min 22.956 sec,110418 obs/sec,71.0597658,46,4600125.0,0.8214488,4.5515114,-2.2833949,0.8944822,0.7028412,3.3490274,0.1303473

variable,relative_importance,scaled_importance,percentage
Oferente Unico.False,1.0,1.0,0.0038094
date.yearmonth,0.9809990,0.9809990,0.0037371
planning.items.classification.id.n1_1_20,0.8592893,0.8592893,0.0032734
tender.procurementMethodDetails q1,0.8206642,0.8206642,0.0031263
tender.submissionMethodDetails q1,0.8163281,0.8163281,0.0031097
date.year,0.7813047,0.7813047,0.0029763
tender.submissionMethodDetails q4,0.7730215,0.7730215,0.0029448
Enmiendas del contrato.True,0.7627531,0.7627531,0.0029057
tender.items.classification.id.n1_1_31,0.7532040,0.7532040,0.0028693
planning.items.classification.id.n1_1_28,0.7524646,0.7524646,0.0028665
