# ABOUT: 
- this notebook evaluates the generated node2vec embeddings as model features (one id column at a time, selected via `target_id_column`)
- findings: 
    - using the card_id embeddings appears to cause overfitting
        
- details:       
    - i.e. compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5-fold cross-validated (`cv = 5` in every evaluation cell)

In [1]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [2]:
# Number of embedding columns (labelled 0..embedding_size-1) used as features.
# Presumably has to match the dimensionality of the saved model -- TODO confirm.
embedding_size = 8

# Id column whose embeddings are evaluated in this notebook.
target_id_column = "state_id"

# Location of the saved embedding model on disk (machine-specific absolute path).
node2vec_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_state_id.zip"

### prepare data

In [3]:
# Processed id table (machine-specific absolute path).
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"

# Only the card id and the id column under evaluation are read from the file.
id_columns = pd.read_csv(
    path,
    usecols=["card_id", target_id_column],
)
# Preview the first rows.
id_columns.head(5)

Unnamed: 0,card_id,state_id
0,C_ID_4e6213e9bc,state_id_16
1,C_ID_4e6213e9bc,state_id_16
2,C_ID_4e6213e9bc,state_id_16
3,C_ID_4e6213e9bc,state_id_16
4,C_ID_4e6213e9bc,state_id_16


In [4]:
# Per card_id, compute three count features for the id column under evaluation:
#   nunique_<id>            - number of distinct id values seen for the card
#   count_<id>              - number of (non-null) id values seen for the card
#   nunique_count_frac_<id> - ratio of the two
id_features = (
    id_columns.groupby("card_id")[target_id_column]
    .agg(["nunique", "count"])
    .reset_index()
)
id_features.columns = [
    "card_id",
    f"nunique_{target_id_column}",
    f"count_{target_id_column}",
]
id_features[f"nunique_count_frac_{target_id_column}"] = id_features[
    f"nunique_{target_id_column}"
].div(id_features[f"count_{target_id_column}"])
id_features

Unnamed: 0,card_id,nunique_state_id,count_state_id,nunique_count_frac_state_id
0,C_ID_00007093c1,4,151,0.026490
1,C_ID_0001238066,6,149,0.040268
2,C_ID_0001506ef0,2,68,0.029412
3,C_ID_0001793786,5,247,0.020243
4,C_ID_000183fdda,7,155,0.045161
...,...,...,...,...
325535,C_ID_ffff1d9928,3,16,0.187500
325536,C_ID_ffff579d3a,2,115,0.017391
325537,C_ID_ffff756266,2,25,0.080000
325538,C_ID_ffff828181,7,198,0.035354


In [5]:
# Read the training table, keeping the card id, the baseline feature and the target.
train_file = pd.read_csv(
    train_path,
    usecols=["card_id", "feature_2", "target"],
)
train_file.head(5)

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [6]:
# Load the trained graph-embedding model from disk (stored as a nodevectors GGVec).
node2vec = nodevectors.GGVec.load(node2vec_path)
# Turn the model's node -> vector mapping into a table: one row per node,
# with the node name moved into an "index" column by reset_index().
node2vec_embeddings = pd.DataFrame.from_dict(
    node2vec.model, orient="index"
).reset_index()
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7
0,C_ID_00007093c1,-0.158865,0.044397,0.085267,0.053340,0.018979,0.097660,0.093443,-0.139136
1,C_ID_0001238066,-0.150473,-0.068141,-0.088837,-0.096943,0.088486,0.294565,0.213416,0.159755
2,C_ID_0001506ef0,-0.105324,0.262319,-0.024576,0.032032,-0.329527,0.009933,0.224257,-0.162728
3,C_ID_0001793786,0.232132,0.063254,0.139306,-0.245536,-0.042158,0.251522,-0.046345,-0.132429
4,C_ID_000183fdda,0.069649,-0.184890,0.156165,-0.338577,0.116683,0.143579,-0.059252,0.105837
...,...,...,...,...,...,...,...,...,...
325560,state_id_5,-0.001164,-0.000787,-0.001845,-0.000411,-0.000649,0.002814,-0.001124,0.001244
325561,state_id_6,-0.002305,-0.000151,0.002066,-0.000758,-0.001173,-0.002576,0.000802,0.001303
325562,state_id_7,0.000520,-0.000457,-0.000698,0.001114,-0.001228,-0.001035,-0.000111,-0.001668
325563,state_id_8,-0.000788,0.001114,0.000678,-0.000008,-0.000071,0.002317,0.001977,0.000181


In [7]:
# Attach to every row of id_columns the embedding of its id value
# (e.g. the state_id embedding), then average those vectors per card_id so
# each card ends up with a single embedding row.
node2vec_embeddings = id_columns.merge(
    node2vec_embeddings,
    how="left",
    left_on=target_id_column,
    right_on="index",
)
# "index" duplicates the id value that was used as the join key.
node2vec_embeddings = node2vec_embeddings.drop("index", axis=1)
# The merged frame still carries the string id column. pandas >= 2.0 raises a
# TypeError when asked to average it, while older versions dropped it
# silently. numeric_only=True makes that drop explicit, so the result is the
# same (card_id + embedding columns) on every pandas version.
node2vec_embeddings = (
    node2vec_embeddings.groupby("card_id").mean(numeric_only=True).reset_index()
)

In [8]:
# Display the per-card_id averaged embeddings produced by the previous cell.
node2vec_embeddings

Unnamed: 0,card_id,0,1,2,3,4,5,6,7
0,C_ID_00007093c1,0.001941,-0.001361,-0.000859,-0.000760,0.000669,0.000588,0.000016,-0.000562
1,C_ID_0001238066,-0.000771,0.000067,-0.000118,-0.001081,-0.000910,0.000823,-0.001856,0.000130
2,C_ID_0001506ef0,0.000867,-0.001532,-0.001234,-0.001165,0.000316,-0.000211,-0.001421,0.001033
3,C_ID_0001793786,0.000161,0.000441,-0.001334,0.000565,-0.000216,0.001035,-0.000692,0.000353
4,C_ID_000183fdda,0.000495,-0.000010,-0.001706,-0.000936,-0.000147,0.001555,-0.000034,0.000742
...,...,...,...,...,...,...,...,...,...
325535,C_ID_ffff1d9928,-0.000396,-0.000253,0.000611,0.001111,-0.000224,0.000858,0.000280,-0.000430
325536,C_ID_ffff579d3a,-0.000805,-0.000218,0.000087,-0.001746,-0.001281,0.000825,-0.002213,-0.000213
325537,C_ID_ffff756266,-0.000715,-0.000180,-0.000106,-0.001235,-0.001190,0.000878,-0.002118,-0.000091
325538,C_ID_ffff828181,0.000538,0.000789,-0.001864,-0.001080,0.000060,0.000597,0.000578,-0.000350


In [9]:
# Left-join the averaged embeddings onto the training rows by card_id, so
# every training row is kept even when no embedding row matches.
dataset = pd.merge(
    train_file,
    node2vec_embeddings,
    how="left",
    on="card_id",
)
dataset.head(5)

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7
0,C_ID_92a2005557,2,-0.820283,-0.000804,-0.000211,9e-05,-0.001741,-0.001276,0.00082,-0.002207,-0.000209
1,C_ID_3d0044924f,1,0.392913,-0.000774,-0.000159,-0.000123,-0.001281,-0.001154,0.000881,-0.002165,-0.000103
2,C_ID_d639edf6cd,2,0.688056,-0.001131,-0.000735,-0.001668,-0.000533,-0.000707,0.002631,-0.001223,0.001111
3,C_ID_186d6a6901,3,0.142495,-0.000186,-0.001148,-0.002002,9.1e-05,-0.001075,0.002303,-0.000968,-0.000499
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000135,-0.00121,-0.001951,0.000139,-0.001088,0.002244,-0.000745,-0.00064


In [10]:
# Left-join the per-card count features (nunique / count / fraction) by card_id.
dataset = pd.merge(
    dataset,
    id_features,
    how="left",
    on="card_id",
)
dataset.head(5)

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,nunique_state_id,count_state_id,nunique_count_frac_state_id
0,C_ID_92a2005557,2,-0.820283,-0.000804,-0.000211,9e-05,-0.001741,-0.001276,0.00082,-0.002207,-0.000209,3,283,0.010601
1,C_ID_3d0044924f,1,0.392913,-0.000774,-0.000159,-0.000123,-0.001281,-0.001154,0.000881,-0.002165,-0.000103,3,356,0.008427
2,C_ID_d639edf6cd,2,0.688056,-0.001131,-0.000735,-0.001668,-0.000533,-0.000707,0.002631,-0.001223,0.001111,2,44,0.045455
3,C_ID_186d6a6901,3,0.142495,-0.000186,-0.001148,-0.002002,9.1e-05,-0.001075,0.002303,-0.000968,-0.000499,5,84,0.059524
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000135,-0.00121,-0.001951,0.000139,-0.001088,0.002244,-0.000745,-0.00064,7,169,0.04142


In [11]:
# Display the assembled dataset: training rows plus the merged embedding and id-feature columns.
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,nunique_state_id,count_state_id,nunique_count_frac_state_id
0,C_ID_92a2005557,2,-0.820283,-0.000804,-0.000211,0.000090,-0.001741,-0.001276,0.000820,-0.002207,-0.000209,3,283,0.010601
1,C_ID_3d0044924f,1,0.392913,-0.000774,-0.000159,-0.000123,-0.001281,-0.001154,0.000881,-0.002165,-0.000103,3,356,0.008427
2,C_ID_d639edf6cd,2,0.688056,-0.001131,-0.000735,-0.001668,-0.000533,-0.000707,0.002631,-0.001223,0.001111,2,44,0.045455
3,C_ID_186d6a6901,3,0.142495,-0.000186,-0.001148,-0.002002,0.000091,-0.001075,0.002303,-0.000968,-0.000499,5,84,0.059524
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000135,-0.001210,-0.001951,0.000139,-0.001088,0.002244,-0.000745,-0.000640,7,169,0.041420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,0.000126,-0.000384,-0.000459,0.000259,-0.001245,-0.000486,-0.000740,-0.001239,2,47,0.042553
201913,C_ID_1314773c0b,1,0.312917,-0.002049,0.000856,-0.002551,0.001703,0.000952,0.001607,-0.002882,0.000670,1,48,0.020833
201914,C_ID_7666735b3d,3,0.093494,-0.000534,0.000689,0.000078,0.000462,-0.000302,0.001770,0.000914,0.000160,5,90,0.055556
201915,C_ID_73f5a0efd0,2,-4.676589,-0.000802,-0.000213,0.000104,-0.001758,-0.001286,0.000808,-0.002222,-0.000226,1,31,0.032258


### evaluate on baseline dataset 

In [12]:
# Column groups used to build the feature sets compared below.
target_col = "target"
categorical_feature_names = ["feature_2"]
baseline_feature_names = ["feature_2"]
# Embedding columns are labelled with the integers 0..embedding_size-1.
embedding_feature_names = [*range(embedding_size)]
# Per-card count features derived from the id column under evaluation.
id_feature_feature_names = [
    f"{prefix}_{target_id_column}"
    for prefix in ("nunique", "count", "nunique_count_frac")
]
# Cross-validation scores, keyed by experiment name.
results = dict()

In [13]:
# Baseline experiment: the model only sees the baseline feature(s).
features = baseline_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings.
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

# Histogram gradient boosting configuration; the categorical mask flags
# which columns of X are listed in categorical_feature_names.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [14]:
# Cross-validate the baseline model, keeping train scores as well so that
# train and test performance can be compared. verbose is fixed at 10 here.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=10,
)
results["baseline"] = scores

[CV] START .....................................................................
Binning 0.001 GB of training data: 0.004 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34904, val loss: 8.13150, in 0.009s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34900, val loss: 8.13149, in 0.009s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34896, val loss: 8.13148, in 0.009s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34892, val loss: 8.13147, in 0.008s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34888, val loss: 8.13147, in 0.008s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34884, val loss: 8.13146, in 0.009s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34880, val loss: 8.13146, in 0.008s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34877, val loss: 8.13145, in 0.007s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34873, val loss

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34843, val loss: 8.13141, in 0.008s
[19/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34841, val loss: 8.13141, in 0.007s
[20/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34838, val loss: 8.13141, in 0.006s
[21/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34835, val loss: 8.13141, in 0.006s
[22/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34832, val loss: 8.13141, in 0.008s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34829, val loss: 8.13141, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34827, val loss: 8.13141, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34824, val loss: 8.13141, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34821, val loss: 8.13141, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34819, val loss: 8.13141, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34816, val loss: 8.13141, in 0.00

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.38682, val loss: 7.81603, in 0.006s
Fit 17 trees in 0.135 s, (51 total leaves)
Time spent computing histograms: 0.009s
Time spent finding best splits:  0.002s
Time spent applying splits:      0.015s
Time spent predicting:           0.003s
[CV] END  neg_root_mean_squared_error: (train=-3.855, test=-3.833) r2: (train=0.000, test=0.000) total time=   0.1s
[CV] START .....................................................................
Binning 0.001 GB of training data: 0.002 s
Binning 0.000 GB of validation data: 0.000 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46241, val loss: 7.24254, in 0.006s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46237, val loss: 7.24251, in 0.006s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46233, val loss: 7.24248, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46229, val loss: 7.24246, in 0.006s
[5/100] 1 tree, 3 leaves, max de

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.46167, val loss: 7.24207, in 0.007s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46164, val loss: 7.24205, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46162, val loss: 7.24204, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46159, val loss: 7.24202, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46157, val loss: 7.24201, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46154, val loss: 7.24199, in 0.005s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46152, val loss: 7.24198, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46150, val loss: 7.24197, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46147, val loss: 7.24195, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46145, val loss: 7.24194, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46143, val loss: 7.24193, in 0.00

[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51065, val loss: 6.62405, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51061, val loss: 6.62403, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51058, val loss: 6.62401, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51055, val loss: 6.62399, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51052, val loss: 6.62397, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51048, val loss: 6.62395, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51045, val loss: 6.62393, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51042, val loss: 6.62392, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51040, val loss: 6.62390, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51037, val loss: 6.62388, in 0.006s
[17/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51034, val loss: 6.62386, i

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.51016, val loss: 6.62376, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51013, val loss: 6.62375, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51011, val loss: 6.62373, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51009, val loss: 6.62372, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51007, val loss: 6.62371, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51004, val loss: 6.62370, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51002, val loss: 6.62368, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51000, val loss: 6.62367, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50998, val loss: 6.62366, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50996, val loss: 6.62365, in 0.006s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50994, val loss: 6.62364, in 0.00

[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43374, val loss: 6.54893, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43371, val loss: 6.54888, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43369, val loss: 6.54884, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43366, val loss: 6.54879, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43364, val loss: 6.54875, in 0.005s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43361, val loss: 6.54870, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43358, val loss: 6.54866, in 0.005s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43356, val loss: 6.54862, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43354, val loss: 6.54858, in 0.006s
[17/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43351, val loss: 6.54854, in 0.006s
[18/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43349, val loss: 6.54850, 

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.4s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.43334, val loss: 6.54823, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43332, val loss: 6.54819, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43330, val loss: 6.54816, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43328, val loss: 6.54812, in 0.005s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43327, val loss: 6.54809, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43325, val loss: 6.54806, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43323, val loss: 6.54802, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43321, val loss: 6.54799, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43319, val loss: 6.54796, in 0.006s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43318, val loss: 6.54793, in 0.006s
[35/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43316, val loss: 6.54790, in 0.00

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.3s finished


### evaluate on baseline with embeddings

In [15]:
# Experiment: baseline feature(s) plus the per-card averaged embedding columns.
features = baseline_feature_names + embedding_feature_names
# The embedding columns are labelled with ints (0..embedding_size-1) while
# "feature_2" is a str. scikit-learn >= 1.2 raises a TypeError for DataFrames
# whose column names mix types, so every label of the selected frame is cast
# to str. Only the labels change; the data and column order stay the same.
X, y = dataset[features].rename(columns=str), dataset[target_col]

# Cross-validation settings.
cv = 5
scoring=('r2', 'neg_root_mean_squared_error')
verbose = 1

# Histogram gradient boosting configuration; the categorical mask flags
# which columns of X are listed in categorical_feature_names.
model_params = {
    "learning_rate":0.01,
    "max_iter":100,
    "categorical_features" : X.columns.isin(categorical_feature_names),
    "l2_regularization":0.005,
    "early_stopping":True,
    "n_iter_no_change":5,
    "verbose":1,
    "random_state":0,
    "max_depth":5,
    "max_leaf_nodes":20
}
model = HistGradientBoostingRegressor(**model_params)

In [16]:
# Cross-validate the baseline + embeddings model, keeping train scores as
# well so that train and test performance can be compared.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=verbose,
)
results["baseline_with_embeddings"] = scores

Binning 0.010 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.156 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34873, val loss: 8.13127, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34839, val loss: 8.13104, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34805, val loss: 8.13083, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34772, val loss: 8.13061, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34739, val loss: 8.13040, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34707, val loss: 8.13019, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34676, val loss: 8.12998, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34646, val loss: 8.12979, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34616, val loss: 8.12960, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34586, val loss: 8.12940, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32623, val loss: 8.12249, in 0.010s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32606, val loss: 8.12243, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32587, val loss: 8.12244, in 0.010s
[92/100] 1 tree, 19 leaves, max depth = 5, train loss: 7.32573, val loss: 8.12241, in 0.010s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32555, val loss: 8.12243, in 0.010s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32534, val loss: 8.12233, in 0.010s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32516, val loss: 8.12231, in 0.011s
[96/100] 1 tree, 19 leaves, max depth = 5, train loss: 7.32501, val loss: 8.12224, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32483, val loss: 8.12224, in 0.010s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32461, val loss: 8.12212, in 0.010s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32445, val lo



Binning 0.010 GB of training data: 



0.158 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38700, val loss: 7.81577, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38663, val loss: 7.81549, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38627, val loss: 7.81522, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38587, val loss: 7.81491, in 0.010s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38549, val loss: 7.81461, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38512, val loss: 7.81434, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38474, val loss: 7.81414, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38441, val loss: 7.81386, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38405, val loss: 7.81364, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38368, val loss: 7.81334, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36475, val loss: 7.80301, in 0.010s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36454, val loss: 7.80291, in 0.010s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36437, val loss: 7.80282, in 0.010s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36420, val loss: 7.80268, in 0.010s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36403, val loss: 7.80264, in 0.010s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36382, val loss: 7.80253, in 0.010s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36365, val loss: 7.80249, in 0.010s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36344, val loss: 7.80238, in 0.010s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36328, val loss: 7.80231, in 0.010s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36308, val loss: 7.80224, in 0.010s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36285, val lo



Binning 0.010 GB of training data: 



0.167 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46209, val loss: 7.24234, in 0.011s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46174, val loss: 7.24214, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46139, val loss: 7.24194, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46105, val loss: 7.24175, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46071, val loss: 7.24156, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46037, val loss: 7.24137, in 0.010s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46005, val loss: 7.24120, in 0.010s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45972, val loss: 7.24104, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45940, val loss: 7.24087, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45909, val loss: 7.24072, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44139, val loss: 7.23440, in 0.010s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44122, val loss: 7.23438, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44105, val loss: 7.23429, in 0.010s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44088, val loss: 7.23430, in 0.010s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44071, val loss: 7.23430, in 0.010s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44047, val loss: 7.23428, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44028, val loss: 7.23428, in 0.010s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44008, val loss: 7.23424, in 0.010s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43988, val loss: 7.23420, in 0.010s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43966, val loss: 7.23414, in 0.010s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43947, val lo



Binning 0.010 GB of training data: 



0.147 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51044, val loss: 6.62405, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50998, val loss: 6.62391, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50957, val loss: 6.62380, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50920, val loss: 6.62371, in 0.010s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50880, val loss: 6.62362, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50837, val loss: 6.62339, in 0.010s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50801, val loss: 6.62330, in 0.010s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50758, val loss: 6.62312, in 0.010s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50723, val loss: 6.62304, in 0.010s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50682, val loss: 6.62294, in 



Binning 0.010 GB of training data: 



0.148 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43359, val loss: 6.54914, in 0.011s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43321, val loss: 6.54896, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43284, val loss: 6.54878, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43248, val loss: 6.54866, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43213, val loss: 6.54854, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43177, val loss: 6.54838, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43143, val loss: 6.54827, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43110, val loss: 6.54819, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43077, val loss: 6.54806, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43040, val loss: 6.54796, in 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.7s finished


### evaluate on baseline with id features

In [17]:
# Experiment: baseline feature(s) plus the per-card id count features.
features = baseline_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings.
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

# Histogram gradient boosting configuration; the categorical mask flags
# which columns of X are listed in categorical_feature_names.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [18]:
# Cross-validate the baseline + id-features model, keeping train scores as
# well so that train and test performance can be compared.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=verbose,
)
results["baseline_with_id_features"] = scores

Binning 0.005 GB of training data: 0.036 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34859, val loss: 8.13122, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34811, val loss: 8.13095, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34764, val loss: 8.13068, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34718, val loss: 8.13042, in 0.010s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34672, val loss: 8.13015, in 0.011s
[6/100] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 20 leaves, max depth = 5, train loss: 7.34628, val loss: 8.12990, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34585, val loss: 8.12965, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34544, val loss: 8.12944, in 0.010s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34504, val loss: 8.12922, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34464, val loss: 8.12901, in 0.011s
[11/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34426, val loss: 8.12880, in 0.011s
[12/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34389, val loss: 8.12860, in 0.011s
[13/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34352, val loss: 8.12840, in 0.011s
[14/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34313, val loss: 8.12820, in 0.011s
[15/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34277, val loss: 8.12799, in 0.010s
[16/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34242, val loss: 8.12779,

[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32605, val loss: 8.12154, in 0.010s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32591, val loss: 8.12152, in 0.009s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32581, val loss: 8.12150, in 0.010s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32567, val loss: 8.12148, in 0.010s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32555, val loss: 8.12150, in 0.010s
[100/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32542, val loss: 8.12149, in 0.010s
Fit 100 trees in 1.122 s, (2000 total leaves)
Time spent computing histograms: 0.143s
Time spent finding best splits:  0.052s
Time spent applying splits:      0.243s
Time spent predicting:           0.027s
Binning 0.005 GB of training data: 0.048 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38687, val loss: 7.81576, in 0.011s
[2/100] 1 tree, 20 lea

[80/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36634, val loss: 7.80659, in 0.012s
[81/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36621, val loss: 7.80659, in 0.013s
[82/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36607, val loss: 7.80658, in 0.010s
[83/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36595, val loss: 7.80658, in 0.013s
[84/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36582, val loss: 7.80657, in 0.013s
[85/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36570, val loss: 7.80652, in 0.011s
[86/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36555, val loss: 7.80650, in 0.012s
[87/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36542, val loss: 7.80648, in 0.013s
[88/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36527, val loss: 7.80646, in 0.014s
[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36513, val loss: 7.80646, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36498, val lo

[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44101, val loss: 7.23434, in 0.010s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44084, val loss: 7.23433, in 0.009s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44067, val loss: 7.23431, in 0.010s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44050, val loss: 7.23429, in 0.010s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44033, val loss: 7.23426, in 0.010s
[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44017, val loss: 7.23422, in 0.010s
[71/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44000, val loss: 7.23423, in 0.010s
[72/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43984, val loss: 7.23420, in 0.010s
[73/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43968, val loss: 7.23416, in 0.011s
[74/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43952, val loss: 7.23413, in 0.010s
[75/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43936, val lo

[60/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49345, val loss: 6.61597, in 0.010s
[61/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49325, val loss: 6.61595, in 0.011s
[62/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49309, val loss: 6.61588, in 0.010s
[63/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49290, val loss: 6.61586, in 0.013s
[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49274, val loss: 6.61580, in 0.013s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49256, val loss: 6.61578, in 0.013s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49240, val loss: 6.61572, in 0.011s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49225, val loss: 6.61570, in 0.012s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49209, val loss: 6.61568, in 0.010s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49193, val loss: 6.61565, in 0.011s
[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49177, val lo

[45/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41964, val loss: 6.53984, in 0.010s
[46/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41941, val loss: 6.53971, in 0.010s
[47/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41921, val loss: 6.53958, in 0.011s
[48/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41901, val loss: 6.53944, in 0.011s
[49/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41880, val loss: 6.53934, in 0.010s
[50/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41861, val loss: 6.53921, in 0.010s
[51/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41840, val loss: 6.53911, in 0.011s
[52/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41820, val loss: 6.53900, in 0.011s
[53/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41802, val loss: 6.53889, in 0.010s
[54/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41784, val loss: 6.53878, in 0.010s
[55/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41764, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.8s finished


### evaluate on baseline with embeddings and id features

In [19]:
# Feature set for this experiment: baseline columns, node2vec embedding
# columns and the per-card id aggregate columns, concatenated in that order.
features = baseline_feature_names + embedding_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings consumed by the cross_validate call in the next cell.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# Hyper-parameters for the histogram gradient boosting regressor.
# `categorical_features` is a boolean mask over X's columns marking the ones
# listed in `categorical_feature_names`.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [20]:
# Cross-validate the model on the combined feature set. Train-fold scores are
# requested as well, because make_output() reads both the train_* and test_*
# entries of the returned dict.
scores = cross_validate(
    estimator=model,
    X=X,
    y=y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=verbose,
)

# Keep the raw per-fold scores under this experiment's name.
results["baseline_with_embeddings_and_id_features"] = scores

Binning 0.014 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.195 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34842, val loss: 8.13094, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34780, val loss: 8.13046, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34716, val loss: 8.12991, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34656, val loss: 8.12945, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34596, val loss: 8.12897, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34538, val loss: 8.12853, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34481, val loss: 8.12803, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34422, val loss: 8.12753, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34367, val loss: 8.12704, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34313, val loss: 8.12661, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31190, val loss: 8.10945, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31168, val loss: 8.10930, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31144, val loss: 8.10915, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31123, val loss: 8.10900, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31103, val loss: 8.10886, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31073, val loss: 8.10884, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31053, val loss: 8.10872, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31029, val loss: 8.10862, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31004, val loss: 8.10851, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30978, val loss: 8.10844, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30957, val lo



Binning 0.014 GB of training data: 



0.196 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38662, val loss: 7.81556, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38587, val loss: 7.81502, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38514, val loss: 7.81450, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38443, val loss: 7.81403, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38373, val loss: 7.81358, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38304, val loss: 7.81309, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38237, val loss: 7.81267, in 0.014s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38170, val loss: 7.81221, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38105, val loss: 7.81181, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38041, val loss: 7.81135, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34664, val loss: 7.79321, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34637, val loss: 7.79309, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34612, val loss: 7.79296, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34582, val loss: 7.79290, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34554, val loss: 7.79286, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34529, val loss: 7.79272, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34499, val loss: 7.79248, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34471, val loss: 7.79220, in 0.010s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34446, val loss: 7.79199, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34417, val loss: 7.79196, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34391, val lo



Binning 0.014 GB of training data: 



0.204 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46171, val loss: 7.24213, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46099, val loss: 7.24172, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46028, val loss: 7.24132, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45958, val loss: 7.24092, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45888, val loss: 7.24053, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45821, val loss: 7.24017, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45755, val loss: 7.23980, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45689, val loss: 7.23943, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45626, val loss: 7.23907, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45562, val loss: 7.23873, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42103, val loss: 7.22559, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42067, val loss: 7.22540, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42032, val loss: 7.22529, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41998, val loss: 7.22510, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41964, val loss: 7.22497, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41931, val loss: 7.22481, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41899, val loss: 7.22463, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41873, val loss: 7.22459, in 0.014s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41842, val loss: 7.22446, in 0.014s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41811, val loss: 7.22442, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41780, val lo



Binning 0.014 GB of training data: 



0.186 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51017, val loss: 6.62379, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50947, val loss: 6.62333, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50878, val loss: 6.62289, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50808, val loss: 6.62249, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50741, val loss: 6.62207, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50673, val loss: 6.62167, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50607, val loss: 6.62128, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50546, val loss: 6.62100, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50484, val loss: 6.62067, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50425, val loss: 6.62043, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47076, val loss: 6.60969, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47049, val loss: 6.60962, in 0.013s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47017, val loss: 6.60957, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46990, val loss: 6.60957, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46959, val loss: 6.60954, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46926, val loss: 6.60957, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46903, val loss: 6.60956, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46870, val loss: 6.60964, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46849, val loss: 6.60963, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46819, val loss: 6.60965, in 0.010s
Fit 98 trees in 1.371 s, (1960 total leaves)
Time spent computing hist



Binning 0.014 GB of training data: 



0.186 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43318, val loss: 6.54889, in 0.011s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43240, val loss: 6.54849, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43163, val loss: 6.54807, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43098, val loss: 6.54768, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43024, val loss: 6.54730, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42951, val loss: 6.54691, in 0.010s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42889, val loss: 6.54656, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42827, val loss: 6.54619, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42757, val loss: 6.54584, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42697, val loss: 6.54545, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39421, val loss: 6.53330, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39388, val loss: 6.53326, in 0.010s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39357, val loss: 6.53322, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39329, val loss: 6.53313, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39298, val loss: 6.53302, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39272, val loss: 6.53292, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39240, val loss: 6.53286, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39210, val loss: 6.53283, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39178, val loss: 6.53276, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39152, val loss: 6.53266, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39127, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.1s finished


In [21]:
from numpy import mean
import csv

def make_output(key):
    """Build a one-row summary of the cross-validation scores for ``key``.

    Reads the per-fold scores stored in the module-level ``results[key]`` and
    averages them across folds. The two RMSE entries are stored negated
    (``*_neg_root_mean_squared_error``), so their means are multiplied by -1
    to report positive RMSE values.

    Args:
        key: Name of the experiment; used both as the lookup key into
            ``results`` and as the value of the ``"type"`` field.

    Returns:
        A dict with the id column name, experiment type, mean train/test r2,
        mean train/test RMSE and the embedding dimension. The key order is
        significant: ``save`` uses it as the CSV column order.
    """

    def fold_mean(metric):
        # Average one metric over all folds of this experiment.
        return mean(results[key][metric])

    return {
        "id_column": target_id_column,
        "type": key,
        "train_r2": fold_mean("train_r2"),
        "test_r2": fold_mean("test_r2"),
        "train_root_mean_squared_error": -1 * fold_mean("train_neg_root_mean_squared_error"),
        "test_root_mean_squared_error": -1 * fold_mean("test_neg_root_mean_squared_error"),
        "embedding_dimension": embedding_size,
    }
def save(output, path=None, *, write_header=False):
    """Append one result row to a CSV file.

    The row's columns follow the key order of ``output``. The file is opened
    in append mode, so earlier rows are kept and the file is created if it
    does not exist yet.

    Args:
        output: Mapping of column name to value, e.g. the dict returned by
            ``make_output``.
        path: Destination CSV file. Defaults to the project's shared
            ``node2vec_embeddings.csv`` results file, which is what this
            function always wrote to before the parameter existed.
        write_header: When True, a header row with the column names is
            written first, but only if the file is missing or empty, so
            repeated calls never duplicate the header. Defaults to False,
            which keeps the original header-less behaviour.
    """
    import os  # local import: only needed for the empty-file check below

    if path is None:
        path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\results\node2vec_embeddings.csv"

    # Decide before opening: opening in append mode creates the file, which
    # would make the existence check meaningless afterwards.
    needs_header = write_header and (
        not os.path.exists(path) or os.path.getsize(path) == 0
    )

    # newline='' lets the csv module control line endings itself.
    with open(path, 'a', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(output.keys()))
        if needs_header:
            writer.writeheader()
        writer.writerow(output)

In [22]:
# Print the fold-averaged summary for each evaluated feature set.
for experiment_name in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    print(make_output(experiment_name))

{'id_column': 'state_id', 'type': 'baseline', 'train_r2': 0.0001655841013616044, 'test_r2': 0.00013871442920936338, 'train_root_mean_squared_error': 3.8501609494819595, 'test_root_mean_squared_error': 3.85005310803518, 'embedding_dimension': 8}
{'id_column': 'state_id', 'type': 'baseline_with_embeddings', 'train_r2': 0.0029340331745365766, 'test_r2': 0.0010612199213982042, 'train_root_mean_squared_error': 3.844825741443015, 'test_root_mean_squared_error': 3.848278470713627, 'embedding_dimension': 8}
{'id_column': 'state_id', 'type': 'baseline_with_id_features', 'train_r2': 0.003002786182226136, 'test_r2': 0.002147849297674931, 'train_root_mean_squared_error': 3.844694054524519, 'test_root_mean_squared_error': 3.8461801692448554, 'embedding_dimension': 8}
{'id_column': 'state_id', 'type': 'baseline_with_embeddings_and_id_features', 'train_r2': 0.005459606829093544, 'test_r2': 0.0032127648445988257, 'train_root_mean_squared_error': 3.839954197986007, 'test_root_mean_squared_error': 3.844

### save

In [23]:
# Append one summary row per evaluated feature set to the results CSV.
for experiment_name in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    save(make_output(experiment_name))