# ABOUT: 
- this notebook evaluates the generated node2vec embeddings (here the `merchant_id` embeddings, averaged per `card_id`) as features for predicting `target`
- findings: 
    - using card_id embeddings appears to cause overfitting
        
- details:       
    - i.e., compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5-fold cross-validated (every experiment below sets `cv = 5`)

In [1]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [2]:
# Settings for this run:
#   embedding_size   - number of embedding columns (labelled 0 .. embedding_size - 1)
#   target_id_column - the id column whose embeddings are evaluated
#   node2vec_path    - location of the trained embedding model archive
embedding_size = 8
target_id_column = "merchant_id"
node2vec_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_merchant_id.zip"

### prepare data

In [3]:
# Read only the two id columns needed to link every card to the target id.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns = pd.read_csv(
    path,
    usecols=["card_id", target_id_column],
)
# preview the first five rows
id_columns.head(5)

Unnamed: 0,card_id,merchant_id
0,C_ID_4e6213e9bc,M_ID_e020e9b302
1,C_ID_4e6213e9bc,M_ID_86ec983688
2,C_ID_4e6213e9bc,M_ID_979ed661fc
3,C_ID_4e6213e9bc,M_ID_e6d5ae8ea6
4,C_ID_4e6213e9bc,M_ID_e020e9b302


In [4]:
# Per-card statistics of the target id column:
#   nunique_<id>            - number of distinct ids seen for the card
#   count_<id>              - number of non-null ids seen for the card
#   nunique_count_frac_<id> - nunique divided by count
nunique_name = f"nunique_{target_id_column}"
count_name = f"count_{target_id_column}"
frac_name = f"nunique_count_frac_{target_id_column}"

id_features = id_columns.groupby("card_id").agg(["nunique", "count"]).reset_index()
# agg() yields a two-level (column, statistic) header; replace it with flat names
id_features.columns = ["card_id", nunique_name, count_name]
id_features[frac_name] = id_features[nunique_name] / id_features[count_name]
id_features

Unnamed: 0,card_id,nunique_merchant_id,count_merchant_id,nunique_count_frac_merchant_id
0,C_ID_00007093c1,31,151,0.205298
1,C_ID_0001238066,90,149,0.604027
2,C_ID_0001506ef0,29,68,0.426471
3,C_ID_0001793786,150,247,0.607287
4,C_ID_000183fdda,84,155,0.541935
...,...,...,...,...
325535,C_ID_ffff1d9928,12,16,0.750000
325536,C_ID_ffff579d3a,63,115,0.547826
325537,C_ID_ffff756266,14,25,0.560000
325538,C_ID_ffff828181,97,198,0.489899


In [5]:
# Read the training rows: the card key, the prediction target and the single
# feature used by the baseline model.
# NOTE(review): train_path is not defined in this notebook - presumably it comes
# from `from config import *`.
train_file = pd.read_csv(
    train_path,
    usecols=["card_id", "target", "feature_2"],
)
train_file.head(5)

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [6]:
# Restore the trained embedding model from disk.
node2vec = nodevectors.GGVec.load(node2vec_path)
# Build a table with one row per node: the keys of node2vec.model become the row
# index, and reset_index() then moves them into a regular column named "index".
node2vec_embeddings = pd.DataFrame.from_dict(
    node2vec.model, orient="index"
).reset_index()
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7
0,C_ID_00007093c1,0.150694,-0.007826,0.107475,-0.532680,0.089988,0.576864,-0.292914,0.150284
1,C_ID_0001238066,0.162716,-0.305601,0.481354,-0.327146,0.442401,-0.339237,0.359859,0.010117
2,C_ID_0001506ef0,0.241464,0.126929,-0.098902,0.178153,0.041189,0.385079,0.054022,0.696743
3,C_ID_0001793786,0.426334,-0.023610,0.081554,-0.468118,-0.479091,0.163840,-0.119669,-0.631075
4,C_ID_000183fdda,-0.204253,-0.128048,0.027742,0.022795,-0.456723,0.035700,-0.519274,0.307047
...,...,...,...,...,...,...,...,...,...
660168,M_ID_fffeeb852d,-0.334127,-0.349824,0.502450,-0.110790,-0.075758,0.279072,0.444894,0.490928
660169,M_ID_fffef87522,-0.171834,-0.367770,0.228110,-0.052386,-0.303810,-0.208953,0.307866,-0.242859
660170,M_ID_ffff0af8e7,-0.293179,-0.121529,0.008474,-0.367563,0.370339,-0.393717,0.582979,0.130355
660171,M_ID_ffff655e2c,0.390756,0.264945,0.309071,0.551439,-0.050322,-0.161866,0.110679,-0.123692


In [7]:
# Attach the embedding of the target id (e.g. the merchant_id embedding) to every
# (card_id, target id) row, then average the vectors per card_id so that each card
# ends up with a single aggregated embedding.
node2vec_embeddings = id_columns.merge(
    node2vec_embeddings,
    how="left",
    left_on=target_id_column,
    right_on="index",
)
# Drop BOTH id-string columns before averaging. Older pandas silently discarded the
# non-numeric target id column inside mean(); pandas >= 2.0 raises a TypeError instead.
node2vec_embeddings = node2vec_embeddings.drop(["index", target_id_column], axis=1)
node2vec_embeddings = node2vec_embeddings.groupby("card_id").mean().reset_index()

In [8]:
# display the embeddings after they were averaged per card_id
node2vec_embeddings

Unnamed: 0,card_id,0,1,2,3,4,5,6,7
0,C_ID_00007093c1,0.327767,-0.149240,-0.118518,-0.257957,0.085486,0.392529,-0.091746,-0.121668
1,C_ID_0001238066,-0.182204,-0.081253,0.023536,-0.097288,0.185248,0.036607,0.156894,0.082290
2,C_ID_0001506ef0,0.165273,0.100381,-0.378504,0.177755,0.098096,0.244406,-0.004749,0.169208
3,C_ID_0001793786,0.149577,0.083781,0.084787,-0.072578,-0.257452,0.204353,-0.002614,-0.096046
4,C_ID_000183fdda,-0.202921,0.152154,-0.220229,0.075419,-0.280709,0.187872,-0.197388,0.199256
...,...,...,...,...,...,...,...,...,...
325535,C_ID_ffff1d9928,0.164811,0.113642,0.035816,0.007593,-0.245958,-0.274752,-0.076468,0.065786
325536,C_ID_ffff579d3a,0.008263,0.234693,0.210772,-0.065836,0.182873,-0.022825,-0.101637,-0.020084
325537,C_ID_ffff756266,-0.255450,0.011155,0.040053,-0.170268,0.243391,-0.150612,0.205210,0.035069
325538,C_ID_ffff828181,0.226619,-0.158167,-0.032019,0.204196,-0.212057,0.122855,0.202833,-0.184587


In [9]:
# Left-join the per-card embeddings onto the training rows; every training row is
# kept, whether or not an embedding exists for its card.
dataset = pd.merge(train_file, node2vec_embeddings, how="left", on="card_id")
dataset.head(5)

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7
0,C_ID_92a2005557,2,-0.820283,-0.174056,0.088346,0.015092,-0.031286,0.150027,0.092678,0.052955,-0.004793
1,C_ID_3d0044924f,1,0.392913,-0.136972,0.12077,0.017582,-0.003471,0.140931,0.103379,0.022354,-0.036342
2,C_ID_d639edf6cd,2,0.688056,0.013226,0.134787,0.052386,0.030661,0.001913,0.04793,-0.112947,-0.000865
3,C_ID_186d6a6901,3,0.142495,0.091077,0.107889,-0.196212,-0.121584,-0.208922,0.091168,0.015861,0.042788
4,C_ID_cdbd2c0db2,3,-0.159749,0.121907,0.096852,-0.26702,-0.175529,-0.312636,0.100114,0.056525,0.04746


In [10]:
# Left-join the per-card id statistics (nunique / count / fraction) onto the dataset.
dataset = pd.merge(dataset, id_features, how="left", on="card_id")
dataset.head(5)

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,nunique_merchant_id,count_merchant_id,nunique_count_frac_merchant_id
0,C_ID_92a2005557,2,-0.820283,-0.174056,0.088346,0.015092,-0.031286,0.150027,0.092678,0.052955,-0.004793,117,283,0.413428
1,C_ID_3d0044924f,1,0.392913,-0.136972,0.12077,0.017582,-0.003471,0.140931,0.103379,0.022354,-0.036342,148,356,0.41573
2,C_ID_d639edf6cd,2,0.688056,0.013226,0.134787,0.052386,0.030661,0.001913,0.04793,-0.112947,-0.000865,14,44,0.318182
3,C_ID_186d6a6901,3,0.142495,0.091077,0.107889,-0.196212,-0.121584,-0.208922,0.091168,0.015861,0.042788,57,84,0.678571
4,C_ID_cdbd2c0db2,3,-0.159749,0.121907,0.096852,-0.26702,-0.175529,-0.312636,0.100114,0.056525,0.04746,102,169,0.60355


In [11]:
# display the assembled dataset (train rows + embeddings + id features)
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,nunique_merchant_id,count_merchant_id,nunique_count_frac_merchant_id
0,C_ID_92a2005557,2,-0.820283,-0.174056,0.088346,0.015092,-0.031286,0.150027,0.092678,0.052955,-0.004793,117,283,0.413428
1,C_ID_3d0044924f,1,0.392913,-0.136972,0.120770,0.017582,-0.003471,0.140931,0.103379,0.022354,-0.036342,148,356,0.415730
2,C_ID_d639edf6cd,2,0.688056,0.013226,0.134787,0.052386,0.030661,0.001913,0.047930,-0.112947,-0.000865,14,44,0.318182
3,C_ID_186d6a6901,3,0.142495,0.091077,0.107889,-0.196212,-0.121584,-0.208922,0.091168,0.015861,0.042788,57,84,0.678571
4,C_ID_cdbd2c0db2,3,-0.159749,0.121907,0.096852,-0.267020,-0.175529,-0.312636,0.100114,0.056525,0.047460,102,169,0.603550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,-0.226834,-0.235258,-0.087438,0.368510,-0.073050,0.146362,-0.290680,-0.192877,16,47,0.340426
201913,C_ID_1314773c0b,1,0.312917,-0.205269,0.303093,0.143437,0.260230,-0.276191,0.046141,0.349340,-0.189564,29,48,0.604167
201914,C_ID_7666735b3d,3,0.093494,-0.045739,-0.073491,-0.095705,0.325634,-0.036683,0.221453,-0.220529,0.018364,55,90,0.611111
201915,C_ID_73f5a0efd0,2,-4.676589,-0.192939,0.134115,-0.003117,-0.010153,0.120634,0.120620,0.014390,-0.023801,24,31,0.774194


### evaluate on baseline dataset 

In [13]:
# Column groups from which the feature sets of the experiments are assembled.
target_col = "target"
categorical_feature_names = ["feature_2"]
baseline_feature_names = ["feature_2"]
# the embedding columns are labelled with the integers 0 .. embedding_size - 1
embedding_feature_names = [*range(embedding_size)]
id_feature_feature_names = [
    f"{prefix}_{target_id_column}"
    for prefix in ("nunique", "count", "nunique_count_frac")
]
# cross-validation scores of every experiment, keyed by experiment name
results = {}

In [14]:
# Baseline experiment: the model is given only the baseline feature(s).
features = baseline_feature_names
X = dataset[features]
y = dataset[target_col]

# evaluation settings
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

# estimator hyper-parameters
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # boolean mask over the columns of X marking the categorical ones
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [15]:
# Cross-validate the baseline model and keep both train and test scores so that
# over-fitting can be compared across experiments.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=10,  # deliberately hard-coded here instead of using the `verbose` variable
)
results["baseline"] = scores

[CV] START .....................................................................
Binning 0.001 GB of training data: 0.002 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34904, val loss: 8.13150, in 0.008s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34900, val loss: 8.13149, in 0.008s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34896, val loss: 8.13148, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34892, val loss: 8.13147, in 0.006s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34888, val loss: 8.13147, in 0.007s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34884, val loss: 8.13146, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34880, val loss: 8.13146, in 0.007s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34877, val loss: 8.13145, in 0.008s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34873, val loss

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34838, val loss: 8.13141, in 0.009s
[21/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34835, val loss: 8.13141, in 0.007s
[22/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34832, val loss: 8.13141, in 0.007s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34829, val loss: 8.13141, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34827, val loss: 8.13141, in 0.007s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34824, val loss: 8.13141, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34821, val loss: 8.13141, in 0.007s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34819, val loss: 8.13141, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34816, val loss: 8.13141, in 0.008s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34814, val loss: 8.13141, in 0.007s
Fit 29 trees in 0.236 s, (87 total leaves)
Time spent computing histograms: 0.018s
Time s

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] END  neg_root_mean_squared_error: (train=-3.855, test=-3.833) r2: (train=0.000, test=0.000) total time=   0.1s
[CV] START .....................................................................
Binning 0.001 GB of training data: 0.001 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46241, val loss: 7.24254, in 0.007s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46237, val loss: 7.24251, in 0.007s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46233, val loss: 7.24248, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46229, val loss: 7.24246, in 0.007s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46225, val loss: 7.24243, in 0.006s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46221, val loss: 7.24241, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46217, val loss: 7.24238, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, trai

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.46170, val loss: 7.24208, in 0.007s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46167, val loss: 7.24207, in 0.007s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46164, val loss: 7.24205, in 0.007s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46162, val loss: 7.24204, in 0.007s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46159, val loss: 7.24202, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46157, val loss: 7.24201, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46154, val loss: 7.24199, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46152, val loss: 7.24198, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46150, val loss: 7.24197, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46147, val loss: 7.24195, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46145, val loss: 7.24194, in 0.00

[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51068, val loss: 6.62408, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51065, val loss: 6.62405, in 0.005s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51061, val loss: 6.62403, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51058, val loss: 6.62401, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51055, val loss: 6.62399, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51052, val loss: 6.62397, in 0.005s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51048, val loss: 6.62395, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51045, val loss: 6.62393, in 0.007s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51042, val loss: 6.62392, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51040, val loss: 6.62390, in 0.007s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51037, val loss: 6.62388, in

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.6s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.51021, val loss: 6.62379, in 0.007s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51018, val loss: 6.62377, in 0.007s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51016, val loss: 6.62376, in 0.007s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51013, val loss: 6.62375, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51011, val loss: 6.62373, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51009, val loss: 6.62372, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51007, val loss: 6.62371, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51004, val loss: 6.62370, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51002, val loss: 6.62368, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51000, val loss: 6.62367, in 0.008s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50998, val loss: 6.62366, in 0.00

[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43380, val loss: 6.54903, in 0.007s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43377, val loss: 6.54898, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43374, val loss: 6.54893, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43371, val loss: 6.54888, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43369, val loss: 6.54884, in 0.007s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43366, val loss: 6.54879, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43364, val loss: 6.54875, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43361, val loss: 6.54870, in 0.007s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43358, val loss: 6.54866, in 0.007s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43356, val loss: 6.54862, in 0.007s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43354, val loss: 6.54858, in

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.7s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.43340, val loss: 6.54834, in 0.007s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43338, val loss: 6.54830, in 0.005s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43336, val loss: 6.54827, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43334, val loss: 6.54823, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43332, val loss: 6.54819, in 0.007s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43330, val loss: 6.54816, in 0.007s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43328, val loss: 6.54812, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43327, val loss: 6.54809, in 0.007s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43325, val loss: 6.54806, in 0.007s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43323, val loss: 6.54802, in 0.007s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43321, val loss: 6.54799, in 0.00

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.8s finished


### evaluate on baseline with embeddings

In [16]:
# Experiment: baseline feature(s) plus the per-card embedding columns.
features = baseline_feature_names + embedding_feature_names
# The embedding columns carry integer labels while "feature_2" is a string label.
# scikit-learn >= 1.2 raises a TypeError for a DataFrame whose column labels mix
# types, so every label is cast to str. Only the labels of X change; the values and
# the column order stay the same.
X = dataset[features].rename(columns=str)
y = dataset[target_col]

# evaluation settings
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# estimator hyper-parameters
model_params = {
    "learning_rate": 0.01,
    "max_iter": 100,
    # boolean mask over the columns of X marking the categorical ones
    "categorical_features": X.columns.isin(categorical_feature_names),
    "l2_regularization": 0.005,
    "early_stopping": True,
    "n_iter_no_change": 5,
    "verbose": 1,
    "random_state": 0,
    "max_depth": 5,
    "max_leaf_nodes": 20,
}
model = HistGradientBoostingRegressor(**model_params)

In [17]:
# Cross-validate the model that uses baseline + embedding features; train scores
# are kept alongside test scores to expose over-fitting.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=verbose,
)
results["baseline_with_embeddings"] = scores

Binning 0.010 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.181 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34882, val loss: 8.13152, in 0.016s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34857, val loss: 8.13153, in 0.016s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34832, val loss: 8.13154, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34808, val loss: 8.13156, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34784, val loss: 8.13157, in 0.014s
Fit 5 trees in 0.309 s, (100 total leaves)
Time spent computing histograms: 0.014s
Time spent finding best splits:  0.004s
Time spent applying splits:      0.016s
Time spent predicting:           0.001s
Binning 0.010 GB of training data: 



0.162 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38711, val loss: 7.81601, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38687, val loss: 7.81599, in 0.014s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38662, val loss: 7.81594, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38637, val loss: 7.81591, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38613, val loss: 7.81588, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38589, val loss: 7.81586, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38565, val loss: 7.81583, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38542, val loss: 7.81582, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38517, val loss: 7.81578, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38494, val loss: 7.81575, in 



Binning 0.010 GB of training data: 



0.164 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46219, val loss: 7.24256, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46193, val loss: 7.24255, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46168, val loss: 7.24260, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46142, val loss: 7.24258, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46118, val loss: 7.24260, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46093, val loss: 7.24261, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46071, val loss: 7.24259, in 0.012s
Fit 7 trees in 0.320 s, (140 total leaves)
Time spent computing histograms: 0.015s
Time spent finding best splits:  0.005s
Time spent applying splits:      0.018s
Time spent predicting:           0.002s
Binning 0.010 GB of training data: 



0.166 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51063, val loss: 6.62424, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51037, val loss: 6.62426, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51011, val loss: 6.62428, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50986, val loss: 6.62431, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50961, val loss: 6.62431, in 0.012s
Fit 5 trees in 0.297 s, (100 total leaves)
Time spent computing histograms: 0.010s
Time spent finding best splits:  0.003s
Time spent applying splits:      0.013s
Time spent predicting:           0.001s
Binning 0.010 GB of training data: 



0.168 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43371, val loss: 6.54930, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43344, val loss: 6.54927, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43318, val loss: 6.54923, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43292, val loss: 6.54920, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43268, val loss: 6.54916, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43243, val loss: 6.54924, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43218, val loss: 6.54921, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43194, val loss: 6.54928, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43171, val loss: 6.54936, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43149, val loss: 6.54935, in 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.7s finished


### evaluate on baseline with id features

In [18]:
# Experiment: baseline feature(s) plus the per-card id statistics.
features = baseline_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# evaluation settings
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

# estimator hyper-parameters
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # boolean mask over the columns of X marking the categorical ones
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [19]:
# Cross-validate the model that uses baseline + id-statistic features; train scores
# are kept alongside test scores to expose over-fitting.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=verbose,
)
results["baseline_with_id_features"] = scores

Binning 0.005 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.049 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34852, val loss: 8.13119, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34796, val loss: 8.13088, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34742, val loss: 8.13058, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34688, val loss: 8.13028, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34636, val loss: 8.12999, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34585, val loss: 8.12972, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34533, val loss: 8.12945, in 0.014s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34484, val loss: 8.12919, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34435, val loss: 8.12893, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34387, val loss: 8.12869, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32048, val loss: 8.12117, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32029, val loss: 8.12118, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32012, val loss: 8.12111, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31995, val loss: 8.12110, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31979, val loss: 8.12105, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31961, val loss: 8.12108, in 0.010s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31944, val loss: 8.12108, in 0.010s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31929, val loss: 8.12110, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31914, val loss: 8.12104, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31897, val loss: 8.12102, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31882, val lo

[74/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36331, val loss: 7.80226, in 0.010s
[75/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36313, val loss: 7.80220, in 0.011s
[76/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36294, val loss: 7.80218, in 0.011s
[77/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36277, val loss: 7.80209, in 0.011s
[78/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36259, val loss: 7.80203, in 0.010s
[79/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36242, val loss: 7.80196, in 0.010s
[80/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36224, val loss: 7.80191, in 0.010s
[81/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36206, val loss: 7.80192, in 0.011s
[82/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36190, val loss: 7.80189, in 0.010s
[83/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36175, val loss: 7.80181, in 0.010s
[84/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36158, val lo

[59/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43803, val loss: 7.22872, in 0.011s
[60/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43779, val loss: 7.22860, in 0.009s
[61/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43751, val loss: 7.22857, in 0.010s
[62/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43729, val loss: 7.22849, in 0.010s
[63/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43706, val loss: 7.22839, in 0.011s
[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43679, val loss: 7.22829, in 0.010s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43657, val loss: 7.22823, in 0.010s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43633, val loss: 7.22814, in 0.010s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43609, val loss: 7.22811, in 0.011s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43585, val loss: 7.22798, in 0.010s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43559, val lo

[62/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48890, val loss: 6.61350, in 0.010s
[63/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48869, val loss: 6.61349, in 0.011s
[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48849, val loss: 6.61343, in 0.011s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48830, val loss: 6.61339, in 0.010s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48808, val loss: 6.61326, in 0.012s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48789, val loss: 6.61326, in 0.011s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48767, val loss: 6.61314, in 0.012s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48749, val loss: 6.61313, in 0.012s
[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48728, val loss: 6.61311, in 0.011s
[71/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48704, val loss: 6.61301, in 0.011s
[72/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48685, val lo

[47/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41642, val loss: 6.53810, in 0.010s
[48/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41615, val loss: 6.53798, in 0.010s
[49/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41590, val loss: 6.53787, in 0.010s
[50/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41562, val loss: 6.53777, in 0.010s
[51/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41537, val loss: 6.53769, in 0.010s
[52/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41513, val loss: 6.53762, in 0.010s
[53/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41490, val loss: 6.53748, in 0.010s
[54/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41464, val loss: 6.53739, in 0.010s
[55/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41441, val loss: 6.53727, in 0.011s
[56/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41419, val loss: 6.53724, in 0.011s
[57/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41396, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.7s finished


### evaluate on baseline with embeddings and id features

In [20]:
# Experiment: baseline feature(s) plus embedding columns plus per-card id statistics.
features = baseline_feature_names + embedding_feature_names + id_feature_feature_names
# The embedding columns carry integer labels while the remaining columns use string
# labels. scikit-learn >= 1.2 raises a TypeError for a DataFrame whose column labels
# mix types, so every label is cast to str. Only the labels of X change; the values
# and the column order stay the same.
X = dataset[features].rename(columns=str)
y = dataset[target_col]

# evaluation settings
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# estimator hyper-parameters
model_params = {
    "learning_rate": 0.01,
    "max_iter": 100,
    # boolean mask over the columns of X marking the categorical ones
    "categorical_features": X.columns.isin(categorical_feature_names),
    "l2_regularization": 0.005,
    "early_stopping": True,
    "n_iter_no_change": 5,
    "verbose": 1,
    "random_state": 0,
    "max_depth": 5,
    "max_leaf_nodes": 20,
}
model = HistGradientBoostingRegressor(**model_params)

In [21]:
# Cross-validate the model that uses baseline + embedding + id-statistic features;
# train scores are kept alongside test scores to expose over-fitting.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=verbose,
)
results["baseline_with_embeddings_and_id_features"] = scores

Binning 0.014 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.219 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34844, val loss: 8.13121, in 0.016s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34781, val loss: 8.13092, in 0.016s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34719, val loss: 8.13063, in 0.015s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34658, val loss: 8.13036, in 0.017s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34599, val loss: 8.13009, in 0.014s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34541, val loss: 8.12983, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34479, val loss: 8.12962, in 0.014s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34423, val loss: 8.12937, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34363, val loss: 8.12918, in 0.014s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34305, val loss: 8.12899, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31465, val loss: 8.12143, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31429, val loss: 8.12122, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31402, val loss: 8.12122, in 0.013s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31368, val loss: 8.12102, in 0.013s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31345, val loss: 8.12100, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31310, val loss: 8.12078, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31286, val loss: 8.12073, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31253, val loss: 8.12055, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31229, val loss: 8.12054, in 0.013s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31197, val loss: 8.12035, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31173, val lo



Binning 0.014 GB of training data: 



0.243 s
Binning 0.002 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38671, val loss: 7.81566, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38606, val loss: 7.81528, in 0.014s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38543, val loss: 7.81491, in 0.014s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38480, val loss: 7.81450, in 0.022s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38419, val loss: 7.81415, in 0.015s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38359, val loss: 7.81378, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38298, val loss: 7.81343, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38241, val loss: 7.81314, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38184, val loss: 7.81277, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38127, val loss: 7.81244, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35218, val loss: 7.80226, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35197, val loss: 7.80226, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35170, val loss: 7.80222, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35143, val loss: 7.80221, in 0.013s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35121, val loss: 7.80221, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35095, val loss: 7.80217, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35073, val loss: 7.80208, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35047, val loss: 7.80207, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35025, val loss: 7.80202, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35001, val loss: 7.80199, in 0.013s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34978, val lo



Binning 0.014 GB of training data: 



0.205 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46179, val loss: 7.24222, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46112, val loss: 7.24190, in 0.014s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46048, val loss: 7.24157, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45986, val loss: 7.24127, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45924, val loss: 7.24095, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45862, val loss: 7.24069, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45800, val loss: 7.24038, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45737, val loss: 7.24013, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45679, val loss: 7.23984, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45618, val loss: 7.23960, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42610, val loss: 7.23133, in 0.019s
Fit 89 trees in 1.412 s, (1780 total leaves)
Time spent computing histograms: 0.205s
Time spent finding best splits:  0.065s
Time spent applying splits:      0.247s
Time spent predicting:           0.024s




Binning 0.014 GB of training data: 



0.234 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51027, val loss: 6.62383, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50959, val loss: 6.62355, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50898, val loss: 6.62318, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50839, val loss: 6.62282, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50780, val loss: 6.62250, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50718, val loss: 6.62222, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50658, val loss: 6.62191, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50601, val loss: 6.62162, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50544, val loss: 6.62134, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50489, val loss: 6.62106, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47516, val loss: 6.61245, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47489, val loss: 6.61240, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47460, val loss: 6.61245, in 0.014s
Fit 91 trees in 1.473 s, (1820 total leaves)
Time spent computing histograms: 0.224s
Time spent finding best splits:  0.072s
Time spent applying splits:      0.248s
Time spent predicting:           0.025s




Binning 0.014 GB of training data: 



0.216 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43339, val loss: 6.54895, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43280, val loss: 6.54859, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43223, val loss: 6.54821, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43161, val loss: 6.54793, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43106, val loss: 6.54756, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43051, val loss: 6.54723, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42993, val loss: 6.54696, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42940, val loss: 6.54662, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42889, val loss: 6.54631, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42831, val loss: 6.54604, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40100, val loss: 6.53550, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40078, val loss: 6.53551, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40053, val loss: 6.53539, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40030, val loss: 6.53536, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40008, val loss: 6.53534, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39982, val loss: 6.53534, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39958, val loss: 6.53532, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39936, val loss: 6.53521, in 0.013s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39913, val loss: 6.53519, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39892, val loss: 6.53510, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39867, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.9s finished


In [22]:
from numpy import mean
import csv

def make_output(key):
    """Build a one-row summary of the cross-validation scores stored under ``results[key]``.

    Each metric is averaged over the per-fold values. The RMSE entries are
    stored negated in the score dict, so their mean is multiplied by -1 to
    report a positive error. The row also records the module-level
    ``target_id_column`` and ``embedding_size``.

    Parameters
    ----------
    key : str
        Name of the experiment in the module-level ``results`` dict.

    Returns
    -------
    dict
        Summary row; key order is the column order used when the row is saved.
    """
    fold_scores = results[key]
    return {
        "id_column": target_id_column,
        "type": key,
        "train_r2": mean(fold_scores["train_r2"]),
        "test_r2": mean(fold_scores["test_r2"]),
        "train_root_mean_squared_error": -1 * mean(fold_scores["train_neg_root_mean_squared_error"]),
        "test_root_mean_squared_error": -1 * mean(fold_scores["test_neg_root_mean_squared_error"]),
        "embedding_dimension": embedding_size,
    }
def save(output, path=None):
    """Append one result row to the results CSV, writing a header if the file is new.

    Parameters
    ----------
    output : dict
        Mapping of column name to value (e.g. the dict returned by
        ``make_output``). Its key order defines the column order.
    path : str, optional
        Destination CSV file. When omitted, the project's default results
        file is used, so existing ``save(output)`` calls behave as before.

    Notes
    -----
    The file is opened in append mode so repeated runs accumulate rows.
    A header row is written only when the file does not exist yet or is
    empty; an existing non-empty file is appended to unchanged.
    """
    import os  # local import: only needed for the "is the file new?" check

    if path is None:
        path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\results\node2vec_embeddings.csv"

    # Without this check a freshly created file would contain data rows only,
    # leaving the columns unnamed for whoever reads the CSV later.
    needs_header = not os.path.isfile(path) or os.path.getsize(path) == 0

    # newline='' lets the csv module control line endings itself.
    with open(path, 'a', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(output.keys()))
        if needs_header:
            writer.writeheader()
        writer.writerow(output)

In [23]:
# Print the averaged scores of every evaluated feature set, one summary dict per line.
for result_key in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    print(make_output(result_key))

{'id_column': 'merchant_id', 'type': 'baseline', 'train_r2': 0.0001655841013616044, 'test_r2': 0.00013871442920936338, 'train_root_mean_squared_error': 3.8501609494819595, 'test_root_mean_squared_error': 3.85005310803518, 'embedding_dimension': 8}
{'id_column': 'merchant_id', 'type': 'baseline_with_embeddings', 'train_r2': 0.0004541714752166248, 'test_r2': 0.0001242292641201237, 'train_root_mean_squared_error': 3.8496047057897385, 'test_root_mean_squared_error': 3.8500820248684122, 'embedding_dimension': 8}
{'id_column': 'merchant_id', 'type': 'baseline_with_id_features', 'train_r2': 0.0036681666974903715, 'test_r2': 0.002314528415001793, 'train_root_mean_squared_error': 3.843410644738846, 'test_root_mean_squared_error': 3.8458591635475896, 'embedding_dimension': 8}
{'id_column': 'merchant_id', 'type': 'baseline_with_embeddings_and_id_features', 'train_r2': 0.004626224302509541, 'test_r2': 0.002320280318707768, 'train_root_mean_squared_error': 3.841562592569548, 'test_root_mean_squared

### save

In [24]:
# Append one summary row per evaluated feature set to the results CSV, in this order.
for result_key in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    save(make_output(result_key))