# ABOUT: 
- this notebook evaluates the generated node2vec embeddings (here: the card_id / state_id embeddings) as features for predicting the target
- findings: 
    - using card_id embeddings appears to cause overfitting
        
- details:       
    - i.e. compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5-fold cross-validated (the evaluation cells set cv = 5)

In [3]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [4]:
# Run configuration for this notebook.
# Number of embedding columns (labelled 0 .. embedding_size - 1) used as features.
embedding_size = 16
# Saved embedding model that is loaded further down.
node2vec_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_state_id.zip"
# Id column whose embeddings and count features are evaluated.
target_id_column = "state_id"

### prepare data

In [5]:
# Read only card_id and the id column under evaluation from the id-columns file.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns = pd.read_csv(
    filepath_or_buffer=path,
    usecols=["card_id", target_id_column],
)
id_columns.head()

Unnamed: 0,card_id,state_id
0,C_ID_4e6213e9bc,state_id_16
1,C_ID_4e6213e9bc,state_id_16
2,C_ID_4e6213e9bc,state_id_16
3,C_ID_4e6213e9bc,state_id_16
4,C_ID_4e6213e9bc,state_id_16


In [6]:
# Per-card statistics of the target id column:
#   nunique_<id>            - number of distinct ids recorded for the card
#   count_<id>              - number of non-null id entries for the card
#   nunique_count_frac_<id> - nunique divided by count
id_features = (
    id_columns.groupby("card_id")[target_id_column]
    .agg(**{
        f"nunique_{target_id_column}": "nunique",
        f"count_{target_id_column}": "count",
    })
    .reset_index()
)
id_features[f"nunique_count_frac_{target_id_column}"] = (
    id_features[f"nunique_{target_id_column}"]
    / id_features[f"count_{target_id_column}"]
)
id_features

Unnamed: 0,card_id,nunique_state_id,count_state_id,nunique_count_frac_state_id
0,C_ID_00007093c1,4,151,0.026490
1,C_ID_0001238066,6,149,0.040268
2,C_ID_0001506ef0,2,68,0.029412
3,C_ID_0001793786,5,247,0.020243
4,C_ID_000183fdda,7,155,0.045161
...,...,...,...,...
325535,C_ID_ffff1d9928,3,16,0.187500
325536,C_ID_ffff579d3a,2,115,0.017391
325537,C_ID_ffff756266,2,25,0.080000
325538,C_ID_ffff828181,7,198,0.035354


In [7]:
# Load the card id, the baseline feature (feature_2) and the regression
# target from the training file; train_path is not defined in this notebook
# (presumably supplied by `from config import *`).
train_file = pd.read_csv(
    train_path,
    usecols=["card_id", "target", "feature_2"],
)
train_file.head()

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [8]:
# Load the saved embedding model (stored as a nodevectors GGVec instance).
node2vec = nodevectors.GGVec.load(node2vec_path)
# Build a frame with one row per node: the node name ends up in the "index"
# column and the embedding components in the integer-labelled columns.
node2vec_embeddings = pd.DataFrame.from_dict(
    node2vec.model, orient="index"
).reset_index()
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,C_ID_00007093c1,-0.198011,-0.200073,-0.259103,-0.186811,0.184979,-0.206529,-0.198618,-0.164513,-0.148675,0.092239,-0.219476,-0.005181,0.211745,-0.133512,0.015083,-0.075834
1,C_ID_0001238066,-0.095775,-0.130892,0.161676,-0.031171,-0.184186,-0.069007,0.068906,0.064137,0.055863,0.120300,-0.120665,0.326637,-0.116559,-0.088582,-0.111082,-0.051032
2,C_ID_0001506ef0,-0.067357,0.063012,0.073156,0.190059,-0.055308,-0.084450,0.177312,0.129409,0.048204,-0.123289,-0.223923,0.193429,0.157419,0.173367,0.114183,-0.100341
3,C_ID_0001793786,-0.091667,0.164125,-0.212477,-0.154005,0.082932,-0.176332,-0.320626,0.179887,-0.129675,-0.218853,0.302923,0.003706,0.132039,-0.112196,0.114383,-0.233117
4,C_ID_000183fdda,-0.274693,-0.102580,-0.170121,0.185044,0.160792,-0.288384,-0.313028,-0.253584,0.007690,0.109456,0.052742,0.310754,-0.020866,-0.117002,-0.099671,-0.197784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325560,state_id_5,0.003637,0.002488,0.000009,-0.001326,0.001948,-0.002380,-0.002264,-0.000496,-0.006189,-0.003233,0.002490,-0.000322,0.001762,0.002575,0.001253,0.002274
325561,state_id_6,-0.005294,0.001997,-0.002733,-0.003261,0.000969,0.004640,0.001451,0.004580,-0.001288,0.003467,-0.000642,0.000017,0.002266,-0.000940,0.001863,0.002127
325562,state_id_7,0.000171,0.000574,0.000320,-0.000410,0.001525,-0.002373,-0.002824,-0.000046,0.000115,0.001886,-0.001919,0.001293,-0.001731,0.000250,0.000482,-0.000991
325563,state_id_8,0.000828,0.001154,-0.000731,0.001326,0.003050,0.001467,0.001677,-0.000464,-0.000966,-0.000155,-0.000779,-0.002275,-0.002300,0.003041,0.000591,-0.001166


In [9]:
# Attach to every row of id_columns the embedding of its id (e.g. state_id),
# then average those embeddings per card_id to get one vector per card.
node2vec_embeddings = id_columns.merge(
    node2vec_embeddings, how="left", left_on=target_id_column, right_on="index"
)
# Drop both string key columns before averaging: "index" merely duplicates the
# id column, and leaving the id column in place makes groupby().mean() raise a
# TypeError on pandas >= 2.0 (older versions silently discarded it).
node2vec_embeddings = node2vec_embeddings.drop(["index", target_id_column], axis=1)
node2vec_embeddings = node2vec_embeddings.groupby("card_id").mean().reset_index()

In [10]:
# Display the per-card averaged embeddings for inspection.
node2vec_embeddings

Unnamed: 0,card_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,C_ID_00007093c1,0.000887,-0.004353,0.001450,0.000672,-0.000899,-0.001993,0.000300,0.000025,-0.001621,-0.001792,-0.002184,0.001050,-0.003217,-0.001746,-0.000112,-0.001320
1,C_ID_0001238066,0.000399,0.000254,0.000215,0.001230,0.001542,-0.002723,-0.001674,-0.000857,0.000622,-0.001019,-0.000736,0.001268,-0.000606,-0.000994,0.003473,-0.002193
2,C_ID_0001506ef0,0.001752,0.002050,-0.000413,0.000807,0.001328,0.001364,0.000930,-0.001062,-0.000722,-0.000924,-0.001738,-0.002353,0.001464,-0.000562,0.000946,-0.003014
3,C_ID_0001793786,0.000943,0.000851,0.000550,-0.001585,0.000303,-0.000594,-0.001788,0.000407,-0.001283,0.000470,-0.000421,-0.002006,-0.000491,-0.001073,0.001020,0.000038
4,C_ID_000183fdda,0.000887,0.000668,0.001087,0.000540,0.000932,-0.003484,0.001642,-0.000225,0.000039,0.001458,-0.000422,-0.001173,-0.003032,-0.002487,0.001279,-0.000034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325535,C_ID_ffff1d9928,0.001723,0.000721,0.001266,-0.001118,-0.001526,-0.000056,-0.002250,0.002928,0.001505,-0.001476,-0.001526,0.000203,-0.001953,-0.002124,0.000392,-0.000863
325536,C_ID_ffff579d3a,0.000207,-0.000266,-0.000173,0.001962,0.001657,-0.003321,-0.001440,-0.001773,0.000573,-0.001670,-0.000786,0.002138,-0.000158,-0.001989,0.003812,-0.002251
325537,C_ID_ffff756266,0.000383,-0.000121,-0.000171,0.001271,0.001404,-0.003050,-0.001363,-0.001489,0.000245,-0.001435,-0.000839,0.001607,-0.000087,-0.001490,0.003489,-0.002016
325538,C_ID_ffff828181,0.000418,0.000739,0.001048,0.000034,0.000536,0.000340,-0.003162,0.000495,-0.000806,0.001215,0.000227,-0.002193,-0.001219,-0.004359,0.000810,0.000392


In [11]:
# merge id embeddings with train.csv 
dataset = train_file.merge(node2vec_embeddings, on = "card_id", how = "left")
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,C_ID_92a2005557,2,-0.820283,0.000203,-0.000261,-0.000165,0.001962,0.001658,-0.003313,-0.001447,-0.001757,0.000592,-0.001656,-0.00079,0.002132,-0.000174,-0.001981,0.003815,-0.00226
1,C_ID_3d0044924f,1,0.392913,0.00035,-0.000138,-0.000135,0.001354,0.001443,-0.003044,-0.001324,-0.001542,0.000277,-0.001417,-0.000845,0.001617,-7.1e-05,-0.001565,0.003474,-0.002135
2,C_ID_d639edf6cd,2,0.688056,0.003322,0.002236,-8e-06,-0.001025,0.001921,-0.002467,-0.002188,-0.000613,-0.005568,-0.003089,0.00219,-9.7e-05,0.001586,0.002156,0.001488,0.001859
3,C_ID_186d6a6901,3,0.142495,0.00188,0.000348,0.000315,-0.001574,0.000598,-0.001668,-0.00167,-9.6e-05,0.001137,-0.000105,-0.001362,-0.001692,-0.001231,-0.000695,-0.000913,9.5e-05
4,C_ID_cdbd2c0db2,3,-0.159749,0.001939,0.000342,0.000468,-0.001556,0.000479,-0.001504,-0.001855,0.000162,0.001434,-5.8e-05,-0.001378,-0.001639,-0.001495,-0.000835,-0.001255,0.000188


In [12]:
# merge id features 
dataset = dataset.merge(id_features, on = "card_id", how = "left")
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,9,10,11,12,13,14,15,nunique_state_id,count_state_id,nunique_count_frac_state_id
0,C_ID_92a2005557,2,-0.820283,0.000203,-0.000261,-0.000165,0.001962,0.001658,-0.003313,-0.001447,...,-0.001656,-0.00079,0.002132,-0.000174,-0.001981,0.003815,-0.00226,3,283,0.010601
1,C_ID_3d0044924f,1,0.392913,0.00035,-0.000138,-0.000135,0.001354,0.001443,-0.003044,-0.001324,...,-0.001417,-0.000845,0.001617,-7.1e-05,-0.001565,0.003474,-0.002135,3,356,0.008427
2,C_ID_d639edf6cd,2,0.688056,0.003322,0.002236,-8e-06,-0.001025,0.001921,-0.002467,-0.002188,...,-0.003089,0.00219,-9.7e-05,0.001586,0.002156,0.001488,0.001859,2,44,0.045455
3,C_ID_186d6a6901,3,0.142495,0.00188,0.000348,0.000315,-0.001574,0.000598,-0.001668,-0.00167,...,-0.000105,-0.001362,-0.001692,-0.001231,-0.000695,-0.000913,9.5e-05,5,84,0.059524
4,C_ID_cdbd2c0db2,3,-0.159749,0.001939,0.000342,0.000468,-0.001556,0.000479,-0.001504,-0.001855,...,-5.8e-05,-0.001378,-0.001639,-0.001495,-0.000835,-0.001255,0.000188,7,169,0.04142


In [13]:
# Display the assembled modelling table (train rows + embeddings + id features).
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,9,10,11,12,13,14,15,nunique_state_id,count_state_id,nunique_count_frac_state_id
0,C_ID_92a2005557,2,-0.820283,0.000203,-0.000261,-0.000165,0.001962,0.001658,-0.003313,-0.001447,...,-0.001656,-0.000790,0.002132,-0.000174,-0.001981,0.003815,-0.002260,3,283,0.010601
1,C_ID_3d0044924f,1,0.392913,0.000350,-0.000138,-0.000135,0.001354,0.001443,-0.003044,-0.001324,...,-0.001417,-0.000845,0.001617,-0.000071,-0.001565,0.003474,-0.002135,3,356,0.008427
2,C_ID_d639edf6cd,2,0.688056,0.003322,0.002236,-0.000008,-0.001025,0.001921,-0.002467,-0.002188,...,-0.003089,0.002190,-0.000097,0.001586,0.002156,0.001488,0.001859,2,44,0.045455
3,C_ID_186d6a6901,3,0.142495,0.001880,0.000348,0.000315,-0.001574,0.000598,-0.001668,-0.001670,...,-0.000105,-0.001362,-0.001692,-0.001231,-0.000695,-0.000913,0.000095,5,84,0.059524
4,C_ID_cdbd2c0db2,3,-0.159749,0.001939,0.000342,0.000468,-0.001556,0.000479,-0.001504,-0.001855,...,-0.000058,-0.001378,-0.001639,-0.001495,-0.000835,-0.001255,0.000188,7,169,0.041420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,0.000173,0.000317,0.000172,0.000305,0.001564,-0.002658,-0.002409,...,0.000831,-0.001590,0.001551,-0.001268,-0.000429,0.001480,-0.001378,2,47,0.042553
201913,C_ID_1314773c0b,1,0.312917,0.001017,0.000778,0.001140,-0.002088,0.000609,-0.000411,0.000648,...,0.001159,-0.001260,-0.002891,0.001263,0.000557,-0.000082,-0.003861,1,48,0.020833
201914,C_ID_7666735b3d,3,0.093494,0.000953,0.001033,-0.000503,0.000215,0.002176,0.000436,0.000668,...,-0.000024,-0.000925,-0.001785,-0.001555,0.002394,0.000835,-0.001015,5,90,0.055556
201915,C_ID_73f5a0efd0,2,-4.676589,0.000177,-0.000291,-0.000175,0.001991,0.001655,-0.003329,-0.001432,...,-0.001656,-0.000815,0.002160,-0.000175,-0.002029,0.003834,-0.002291,1,31,0.032258


### evaluate on baseline dataset 

In [14]:
# define columns for training
baseline_feature_names = ["feature_2"]
embedding_feature_names = list(range(embedding_size))
id_feature_feature_names = [f"nunique_{target_id_column}",f"count_{target_id_column}",f"nunique_count_frac_{target_id_column}"]
categorical_feature_names = ["feature_2"]
target_col = "target"
results = {}

In [15]:
# Baseline experiment: the model only sees the baseline feature(s).
features = baseline_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# Hyper-parameters of the histogram gradient boosting regressor.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # boolean mask flagging which columns of X are categorical
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [16]:
# Cross-validate the baseline model; train scores are kept so that train and
# validation performance can be compared for signs of overfitting.
scores = cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=cv,
    scoring=scoring,
    # use the shared `verbose` setting, as the other evaluation cells do,
    # instead of a hard-coded level of 10
    verbose=verbose,
    return_train_score=True,
)
results["baseline"] = scores

[CV] START .....................................................................
Binning 0.001 GB of training data: 0.003 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34904, val loss: 8.13150, in 0.012s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34900, val loss: 8.13149, in 0.007s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34896, val loss: 8.13148, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34892, val loss: 8.13147, in 0.006s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34888, val loss: 8.13147, in 0.007s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34884, val loss: 8.13146, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34880, val loss: 8.13146, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34877, val loss: 8.13145, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34873, val loss

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34835, val loss: 8.13141, in 0.006s
[22/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34832, val loss: 8.13141, in 0.006s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34829, val loss: 8.13141, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34827, val loss: 8.13141, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34824, val loss: 8.13141, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34821, val loss: 8.13141, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34819, val loss: 8.13141, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34816, val loss: 8.13141, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34814, val loss: 8.13141, in 0.006s
Fit 29 trees in 0.213 s, (87 total leaves)
Time spent computing histograms: 0.015s
Time spent finding best splits:  0.003s
Time spent applying splits:      0.027s
Time spent predict

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


Binning 0.001 GB of training data: 0.001 s
Binning 0.000 GB of validation data: 0.000 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46241, val loss: 7.24254, in 0.006s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46237, val loss: 7.24251, in 0.006s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46233, val loss: 7.24248, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46229, val loss: 7.24246, in 0.006s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46225, val loss: 7.24243, in 0.006s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46221, val loss: 7.24241, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46217, val loss: 7.24238, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46214, val loss: 7.24236, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46210, val loss: 7.24233, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.4620

[89/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46070, val loss: 7.24162, in 0.006s
[90/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46069, val loss: 7.24162, in 0.005s
[91/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46068, val loss: 7.24162, in 0.005s
[92/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46068, val loss: 7.24162, in 0.006s
[93/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46067, val loss: 7.24162, in 0.006s
[94/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46066, val loss: 7.24161, in 0.006s
[95/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46066, val loss: 7.24161, in 0.005s
[96/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46065, val loss: 7.24161, in 0.006s
[97/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46064, val loss: 7.24161, in 0.006s
[98/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46064, val loss: 7.24161, in 0.006s
[99/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46063, val loss: 7.24161

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.51011, val loss: 6.62373, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51009, val loss: 6.62372, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51007, val loss: 6.62371, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51004, val loss: 6.62370, in 0.005s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51002, val loss: 6.62368, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51000, val loss: 6.62367, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50998, val loss: 6.62366, in 0.005s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50996, val loss: 6.62365, in 0.005s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50994, val loss: 6.62364, in 0.006s
[35/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50992, val loss: 6.62363, in 0.006s
[36/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50990, val loss: 6.62362, in 0.00

[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43369, val loss: 6.54884, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43366, val loss: 6.54879, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43364, val loss: 6.54875, in 0.007s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43361, val loss: 6.54870, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43358, val loss: 6.54866, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43356, val loss: 6.54862, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43354, val loss: 6.54858, in 0.006s
[17/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43351, val loss: 6.54854, in 0.006s
[18/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43349, val loss: 6.54850, in 0.006s
[19/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43347, val loss: 6.54846, in 0.006s
[20/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43345, val loss: 6.54842

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.3s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.43338, val loss: 6.54830, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43336, val loss: 6.54827, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43334, val loss: 6.54823, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43332, val loss: 6.54819, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43330, val loss: 6.54816, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43328, val loss: 6.54812, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43327, val loss: 6.54809, in 0.005s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43325, val loss: 6.54806, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43323, val loss: 6.54802, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43321, val loss: 6.54799, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43319, val loss: 6.54796, in 0.00

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s finished


### evaluate on baseline with embeddings

In [17]:
# Experiment: baseline feature(s) plus the per-card averaged embedding columns.
features = baseline_feature_names + embedding_feature_names
# The embedding columns are labelled with ints while "feature_2" is a str.
# scikit-learn >= 1.2 raises a TypeError for frames whose column names mix
# str and non-str labels, so cast every label to str before fitting.
X, y = dataset[features].rename(columns=str), dataset[target_col]

# Cross-validation settings.
cv = 5
scoring=('r2', 'neg_root_mean_squared_error')
verbose = 1

# Hyper-parameters of the histogram gradient boosting regressor.
model_params = {
    "learning_rate":0.01,
    "max_iter":100,
    # boolean mask flagging which columns of X are categorical; computed after
    # the rename, where "feature_2" is unchanged because it already is a str
    "categorical_features" : X.columns.isin(categorical_feature_names),
    "l2_regularization":0.005,
    "early_stopping":True,
    "n_iter_no_change":5,
    "verbose":1,
    "random_state":0,
    "max_depth":5,
    "max_leaf_nodes":20
}
model = HistGradientBoostingRegressor(**model_params)

In [18]:
# Cross-validate the embedding-augmented model and record its train and
# validation scores under its own key.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    verbose=verbose,
    return_train_score=True,
)
results["baseline_with_embeddings"] = scores

Binning 0.020 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.338 s
Binning 0.002 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34867, val loss: 8.13121, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34825, val loss: 8.13094, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34785, val loss: 8.13066, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34746, val loss: 8.13040, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34707, val loss: 8.13014, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34667, val loss: 8.12989, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34629, val loss: 8.12963, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34592, val loss: 8.12939, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34555, val loss: 8.12915, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34519, val loss: 8.12893, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32298, val loss: 8.12119, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32272, val loss: 8.12109, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32240, val loss: 8.12105, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32222, val loss: 8.12097, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32194, val loss: 8.12090, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32165, val loss: 8.12085, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32144, val loss: 8.12077, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32119, val loss: 8.12071, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32102, val loss: 8.12064, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32078, val loss: 8.12059, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32054, val lo



Binning 0.020 GB of training data: 



0.312 s
Binning 0.002 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38694, val loss: 7.81579, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38650, val loss: 7.81551, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38608, val loss: 7.81527, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38566, val loss: 7.81504, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38524, val loss: 7.81479, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38482, val loss: 7.81458, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38445, val loss: 7.81435, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38406, val loss: 7.81414, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38369, val loss: 7.81392, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38330, val loss: 7.81368, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36070, val loss: 7.80628, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36043, val loss: 7.80627, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36017, val loss: 7.80623, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35995, val loss: 7.80612, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35970, val loss: 7.80608, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35949, val loss: 7.80598, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35923, val loss: 7.80592, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35895, val loss: 7.80585, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35871, val loss: 7.80577, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35846, val loss: 7.80573, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35819, val lo



Binning 0.020 GB of training data: 



0.325 s
Binning 0.002 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46202, val loss: 7.24228, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46159, val loss: 7.24200, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46118, val loss: 7.24172, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46078, val loss: 7.24152, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46039, val loss: 7.24136, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45999, val loss: 7.24111, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45961, val loss: 7.24092, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45923, val loss: 7.24072, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45885, val loss: 7.24054, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45847, val loss: 7.24036, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43740, val loss: 7.23424, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43716, val loss: 7.23427, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43690, val loss: 7.23422, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43663, val loss: 7.23422, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43637, val loss: 7.23418, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43609, val loss: 7.23415, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43591, val loss: 7.23409, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43565, val loss: 7.23404, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43544, val loss: 7.23405, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43522, val loss: 7.23408, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43494, val lo



Binning 0.020 GB of training data: 



0.356 s
Binning 0.002 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51049, val loss: 6.62408, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51010, val loss: 6.62395, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50970, val loss: 6.62385, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50923, val loss: 6.62376, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50877, val loss: 6.62367, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50831, val loss: 6.62358, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50788, val loss: 6.62351, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50752, val loss: 6.62338, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50709, val loss: 6.62332, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50672, val loss: 6.62318, in 



Binning 0.020 GB of training data: 



0.346 s
Binning 0.002 GB of validation data: 0.006 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43353, val loss: 6.54928, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43309, val loss: 6.54923, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43265, val loss: 6.54917, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43222, val loss: 6.54911, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43180, val loss: 6.54908, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43138, val loss: 6.54904, in 0.013s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43097, val loss: 6.54901, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43057, val loss: 6.54897, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43016, val loss: 6.54895, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42975, val loss: 6.54891, in 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.6s finished


### evaluate on baseline with id features

In [19]:
# Experiment: baseline feature(s) plus the per-card id statistics.
features = baseline_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# Hyper-parameters of the histogram gradient boosting regressor.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # boolean mask flagging which columns of X are categorical
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [20]:
# Cross-validate the model that uses the id statistics and record its train
# and validation scores under its own key.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    verbose=verbose,
    return_train_score=True,
)
results["baseline_with_id_features"] = scores

Binning 0.005 GB of training data: 0.036 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34859, val loss: 8.13122, in 0.011s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34811, val loss: 8.13095, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34764, val loss: 8.13068, in 0.010s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34718, val loss: 8.13042, in 0.010s
[5/100] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 20 leaves, max depth = 5, train loss: 7.34672, val loss: 8.13015, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34628, val loss: 8.12990, in 0.010s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34585, val loss: 8.12965, in 0.010s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34544, val loss: 8.12944, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34504, val loss: 8.12922, in 0.010s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34464, val loss: 8.12901, in 0.010s
[11/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34426, val loss: 8.12880, in 0.010s
[12/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34389, val loss: 8.12860, in 0.010s
[13/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34352, val loss: 8.12840, in 0.010s
[14/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34313, val loss: 8.12820, in 0.010s
[15/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34277, val loss: 8.12799, 

[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32618, val loss: 8.12152, in 0.010s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32605, val loss: 8.12154, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32591, val loss: 8.12152, in 0.010s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32581, val loss: 8.12150, in 0.010s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32567, val loss: 8.12148, in 0.010s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32555, val loss: 8.12150, in 0.010s
[100/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32542, val loss: 8.12149, in 0.010s
Fit 100 trees in 1.093 s, (2000 total leaves)
Time spent computing histograms: 0.137s
Time spent finding best splits:  0.051s
Time spent applying splits:      0.235s
Time spent predicting:           0.027s
Binning 0.005 GB of training data: 0.049 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 le

[79/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36647, val loss: 7.80661, in 0.009s
[80/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36634, val loss: 7.80659, in 0.009s
[81/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36621, val loss: 7.80659, in 0.009s
[82/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36607, val loss: 7.80658, in 0.009s
[83/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36595, val loss: 7.80658, in 0.010s
[84/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36582, val loss: 7.80657, in 0.009s
[85/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36570, val loss: 7.80652, in 0.010s
[86/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36555, val loss: 7.80650, in 0.009s
[87/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36542, val loss: 7.80648, in 0.010s
[88/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36527, val loss: 7.80646, in 0.009s
[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36513, val lo

[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44120, val loss: 7.23436, in 0.010s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44101, val loss: 7.23434, in 0.010s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44084, val loss: 7.23433, in 0.009s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44067, val loss: 7.23431, in 0.011s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44050, val loss: 7.23429, in 0.010s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44033, val loss: 7.23426, in 0.010s
[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44017, val loss: 7.23422, in 0.010s
[71/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44000, val loss: 7.23423, in 0.010s
[72/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43984, val loss: 7.23420, in 0.009s
[73/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43968, val loss: 7.23416, in 0.010s
[74/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43952, val lo

[59/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49364, val loss: 6.61601, in 0.010s
[60/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49345, val loss: 6.61597, in 0.010s
[61/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49325, val loss: 6.61595, in 0.010s
[62/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49309, val loss: 6.61588, in 0.010s
[63/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49290, val loss: 6.61586, in 0.010s
[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49274, val loss: 6.61580, in 0.009s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49256, val loss: 6.61578, in 0.010s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49240, val loss: 6.61572, in 0.010s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49225, val loss: 6.61570, in 0.010s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49209, val loss: 6.61568, in 0.009s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49193, val lo

[43/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42008, val loss: 6.54010, in 0.010s
[44/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41986, val loss: 6.53995, in 0.009s
[45/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41964, val loss: 6.53984, in 0.010s
[46/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41941, val loss: 6.53971, in 0.010s
[47/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41921, val loss: 6.53958, in 0.010s
[48/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41901, val loss: 6.53944, in 0.010s
[49/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41880, val loss: 6.53934, in 0.010s
[50/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41861, val loss: 6.53921, in 0.009s
[51/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41840, val loss: 6.53911, in 0.010s
[52/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41820, val loss: 6.53900, in 0.010s
[53/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41802, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.2s finished


### evaluate on baseline with embeddings and id features

In [21]:
# Feature set for this run: the baseline columns plus the embedding columns
# plus the id-derived columns (all three name lists are defined in earlier cells).
features = (
    baseline_feature_names
    + embedding_feature_names
    + id_feature_feature_names
)
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings consumed by the cross_validate call in the next cell.
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

# Hyper-parameters for the gradient boosting regressor.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # Boolean mask over X's columns flagging the ones listed as categorical.
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [22]:
# Cross-validate the model on (X, y) with the settings prepared above and keep
# both train and test scores so make_output can average each of them later.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    verbose=verbose,
    return_train_score=True,
)
results["baseline_with_embeddings_and_id_features"] = scores

Binning 0.023 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.360 s
Binning 0.003 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34833, val loss: 8.13105, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34760, val loss: 8.13060, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34689, val loss: 8.13014, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34618, val loss: 8.12972, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34550, val loss: 8.12930, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34482, val loss: 8.12890, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34416, val loss: 8.12844, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34348, val loss: 8.12797, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34284, val loss: 8.12764, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34221, val loss: 8.12720, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30588, val loss: 8.10954, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30557, val loss: 8.10929, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30526, val loss: 8.10907, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30492, val loss: 8.10902, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30462, val loss: 8.10883, in 0.015s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30425, val loss: 8.10873, in 0.013s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30394, val loss: 8.10843, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30364, val loss: 8.10819, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30335, val loss: 8.10801, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30308, val loss: 8.10790, in 0.014s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30279, val lo



Binning 0.023 GB of training data: 



0.366 s
Binning 0.003 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38643, val loss: 7.81533, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38550, val loss: 7.81463, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38460, val loss: 7.81395, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38371, val loss: 7.81328, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38283, val loss: 7.81265, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38199, val loss: 7.81196, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38115, val loss: 7.81137, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38033, val loss: 7.81070, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.37952, val loss: 7.81011, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.37874, val loss: 7.80950, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33918, val loss: 7.79041, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33880, val loss: 7.79037, in 0.014s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33852, val loss: 7.79031, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33815, val loss: 7.79027, in 0.013s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33784, val loss: 7.79024, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33744, val loss: 7.79015, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33713, val loss: 7.78999, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33678, val loss: 7.78996, in 0.013s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33640, val loss: 7.78988, in 0.014s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33606, val loss: 7.78985, in 0.013s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33573, val lo



Binning 0.023 GB of training data: 



0.383 s
Binning 0.003 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46159, val loss: 7.24219, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46070, val loss: 7.24179, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45982, val loss: 7.24139, in 0.014s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45897, val loss: 7.24102, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45813, val loss: 7.24065, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45730, val loss: 7.24031, in 0.013s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45642, val loss: 7.23989, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45566, val loss: 7.23961, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45481, val loss: 7.23922, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45397, val loss: 7.23896, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41340, val loss: 7.22647, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41309, val loss: 7.22640, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41276, val loss: 7.22642, in 0.013s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41238, val loss: 7.22616, in 0.013s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41206, val loss: 7.22619, in 0.014s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41175, val loss: 7.22616, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41142, val loss: 7.22612, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41111, val loss: 7.22605, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41080, val loss: 7.22605, in 0.013s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41051, val loss: 7.22609, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41015, val lo



Binning 0.023 GB of training data: 



0.351 s
Binning 0.003 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51006, val loss: 6.62381, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50926, val loss: 6.62346, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50844, val loss: 6.62304, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50767, val loss: 6.62271, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50685, val loss: 6.62243, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50606, val loss: 6.62201, in 0.013s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50533, val loss: 6.62167, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50456, val loss: 6.62125, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50379, val loss: 6.62101, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50303, val loss: 6.62077, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46550, val loss: 6.61184, in 0.014s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46522, val loss: 6.61183, in 0.015s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46493, val loss: 6.61185, in 0.016s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46462, val loss: 6.61190, in 0.015s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46433, val loss: 6.61190, in 0.015s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46398, val loss: 6.61193, in 0.015s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46356, val loss: 6.61189, in 0.014s
Fit 95 trees in 1.754 s, (1900 total leaves)
Time spent computing histograms: 0.293s
Time spent finding best splits:  0.076s
Time spent applying splits:      0.234s
Time spent predicting:           0.026s




Binning 0.023 GB of training data: 



0.415 s
Binning 0.003 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43325, val loss: 6.54897, in 0.016s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43232, val loss: 6.54860, in 0.014s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43158, val loss: 6.54815, in 0.014s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43087, val loss: 6.54781, in 0.014s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43016, val loss: 6.54739, in 0.014s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42946, val loss: 6.54700, in 0.013s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42859, val loss: 6.54666, in 0.016s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42791, val loss: 6.54629, in 0.014s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42706, val loss: 6.54597, in 0.014s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42641, val loss: 6.54561, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38922, val loss: 6.53317, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38885, val loss: 6.53314, in 0.014s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38852, val loss: 6.53317, in 0.013s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38829, val loss: 6.53312, in 0.015s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38795, val loss: 6.53313, in 0.016s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38760, val loss: 6.53317, in 0.015s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38726, val loss: 6.53315, in 0.015s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38688, val loss: 6.53313, in 0.014s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38658, val loss: 6.53311, in 0.013s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38628, val loss: 6.53309, in 0.014s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38592, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   11.0s finished


In [23]:
from numpy import mean
import csv

def make_output(key):
    """Summarise the cross-validation scores stored under ``results[key]``.

    Args:
        key: Name of the experiment whose scores should be summarised.

    Returns:
        A dict holding the module-level ``target_id_column``, the experiment
        name, and the mean train/test r2 and RMSE across the stored scores.
        The RMSE entries are sign-flipped because the scores are stored as
        negated RMSE values.
    """
    fold_scores = results[key]
    return {
        "id_column": target_id_column,
        "type": key,
        "train_r2": mean(fold_scores["train_r2"]),
        "test_r2": mean(fold_scores["test_r2"]),
        "train_root_mean_squared_error": -1 * mean(fold_scores["train_neg_root_mean_squared_error"]),
        "test_root_mean_squared_error": -1 * mean(fold_scores["test_neg_root_mean_squared_error"]),
    }
def save(output, path=None, *, write_header=False):
    """Append one result row to a CSV file.

    Args:
        output: Mapping of column name to value (for example the dict
            returned by ``make_output``). Its key order defines the column
            order of the written row.
        path: Destination CSV file. When omitted, the row goes to the
            node2vec results file this function has always written to, so
            existing ``save(output)`` calls behave as before.
        write_header: When true, write a header row first if the file is
            still empty. Off by default so the on-disk format of the
            existing results file does not change.
    """
    if path is None:
        path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\results\node2vec_embeddings.csv"
    # newline='' leaves line-ending handling to the csv module; the explicit
    # encoding keeps the file independent of the platform's default codec.
    with open(path, 'a', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(output.keys()))
        # Append mode starts at the end of the file, so offset 0 means the
        # file holds no rows yet and a header can safely be emitted.
        if write_header and csv_file.tell() == 0:
            writer.writeheader()
        writer.writerow(output)

In [24]:
# Print the averaged metrics for each of the four evaluated feature sets.
for experiment_name in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    print(make_output(experiment_name))

{'id_column': 'state_id', 'type': 'baseline', 'train_r2': 0.0001655841013616044, 'test_r2': 0.00013871442920936338, 'train_root_mean_squared_error': 3.8501609494819595, 'test_root_mean_squared_error': 3.85005310803518}
{'id_column': 'state_id', 'type': 'baseline_with_embeddings', 'train_r2': 0.0030211551294739847, 'test_r2': 0.0010558355422211508, 'train_root_mean_squared_error': 3.8446557899299294, 'test_root_mean_squared_error': 3.848289922201822}
{'id_column': 'state_id', 'type': 'baseline_with_id_features', 'train_r2': 0.003002786182226136, 'test_r2': 0.002147849297674931, 'train_root_mean_squared_error': 3.844694054524519, 'test_root_mean_squared_error': 3.8461801692448554}
{'id_column': 'state_id', 'type': 'baseline_with_embeddings_and_id_features', 'train_r2': 0.006251146653342188, 'test_r2': 0.0033328601879277107, 'train_root_mean_squared_error': 3.838425402046103, 'test_root_mean_squared_error': 3.843895647667759}


### save

In [25]:
# Append one summary row per evaluated feature set to the results CSV.
for experiment_name in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    save(make_output(experiment_name))