# ABOUT: 
- this notebook evaluates the generated node2vec embeddings as model features (the embedding file to evaluate is selected in the configuration cell)
- findings: 
    - using the card_id embeddings appears to cause overfitting
        
- details:       
    - i.e., compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5-fold cross-validated (the evaluation cells set `cv = 5`)

In [1]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [2]:
# Id column whose embeddings are evaluated in this run.
target_id_column = "merchant_category_id"
# Saved embedding model. The file name is derived from target_id_column so the two
# cannot drift apart (presumably every embedding file follows this naming -- verify).
node2vec_path = rf"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_{target_id_column}.zip"
# Number of embedding dimensions; later used to build the embedding column labels.
embedding_size = 8

### prepare data

In [3]:
# Pre-processed id table; only card_id and the id column under evaluation are read.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns = pd.read_csv(
    path,
    usecols=["card_id", target_id_column],
)
# Preview the first rows.
id_columns.head()

Unnamed: 0,card_id,merchant_category_id
0,C_ID_4e6213e9bc,merchant_category_id_80
1,C_ID_4e6213e9bc,merchant_category_id_367
2,C_ID_4e6213e9bc,merchant_category_id_80
3,C_ID_4e6213e9bc,merchant_category_id_560
4,C_ID_4e6213e9bc,merchant_category_id_80


In [4]:
# Per-card statistics of the id column under evaluation:
#   nunique_<col>            - number of distinct ids for the card
#   count_<col>              - number of non-null ids for the card
#   nunique_count_frac_<col> - nunique / count
id_features = (
    id_columns.groupby("card_id")[target_id_column]
    .agg(["nunique", "count"])
    # Rename before reset_index so that card_id keeps its name.
    .rename(columns=lambda stat: f"{stat}_{target_id_column}")
    .reset_index()
)
id_features[f"nunique_count_frac_{target_id_column}"] = id_features[
    f"nunique_{target_id_column}"
].div(id_features[f"count_{target_id_column}"])
id_features

Unnamed: 0,card_id,nunique_merchant_category_id,count_merchant_category_id,nunique_count_frac_merchant_category_id
0,C_ID_00007093c1,19,151,0.125828
1,C_ID_0001238066,35,149,0.234899
2,C_ID_0001506ef0,20,68,0.294118
3,C_ID_0001793786,57,247,0.230769
4,C_ID_000183fdda,38,155,0.245161
...,...,...,...,...
325535,C_ID_ffff1d9928,9,16,0.562500
325536,C_ID_ffff579d3a,28,115,0.243478
325537,C_ID_ffff756266,14,25,0.560000
325538,C_ID_ffff828181,45,198,0.227273


In [5]:
# Read the training table: the join key, the baseline feature and the target.
# train_path is expected to come from the star import of config.
train_file = pd.read_csv(
    train_path,
    usecols=["card_id", "target", "feature_2"],
)
# Preview the first rows.
train_file.head()

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [6]:
# Load the saved embedding model from disk.
# NOTE(review): the loader is nodevectors.GGVec although the file and variables are
# named "node2vec" -- confirm which embedder actually produced the file.
node2vec = nodevectors.GGVec.load(node2vec_path)
# Build a frame with one row per node; reset_index() moves the node name into an
# "index" column and leaves the vector components in the integer-labelled columns.
node2vec_embeddings = pd.DataFrame.from_dict(node2vec.model, orient="index").reset_index()
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7
0,C_ID_00007093c1,-0.076224,-0.211655,-0.295419,-0.309996,-0.190238,-0.230680,-0.205324,0.102346
1,C_ID_0001238066,0.176778,-0.013434,-0.118517,-0.418732,0.440417,-0.060409,0.200751,0.046039
2,C_ID_0001506ef0,-0.182617,0.036822,-0.186016,-0.587992,0.543554,0.086865,0.262915,-0.186468
3,C_ID_0001793786,0.102586,0.158642,-0.592433,-0.034929,0.484206,0.685533,0.496278,0.109312
4,C_ID_000183fdda,0.236352,-0.037076,-0.384824,-0.291995,0.631856,0.485006,0.000942,0.093809
...,...,...,...,...,...,...,...,...,...
325866,merchant_category_id_885,0.000509,-0.000139,0.000712,-0.001165,0.000556,0.000092,-0.000792,-0.000475
325867,merchant_category_id_889,0.005656,-0.007558,-0.001554,-0.004508,-0.002740,0.014832,0.010686,0.009888
325868,merchant_category_id_891,-0.000036,0.000081,-0.000076,-0.001399,0.001202,0.000289,0.000608,-0.000652
325869,merchant_category_id_9,-0.000048,-0.000809,0.000678,0.000842,-0.001528,-0.000567,-0.000193,0.000038


In [7]:
# Attach the embedding of each row's target id (node name held in "index") and
# average the vectors per card_id, giving one embedding row per card.
# NOTE: this cell rebinds node2vec_embeddings, so re-running it without first
# re-running the loading cell fails because the "index" column is gone.
node2vec_embeddings = id_columns.merge(
    node2vec_embeddings,
    how="left",
    left_on=target_id_column,
    right_on="index",
)
# Drop both string key columns before averaging: older pandas silently discarded the
# non-numeric id column in mean(), but pandas >= 2.0 raises a TypeError on it.
node2vec_embeddings = node2vec_embeddings.drop(columns=["index", target_id_column])
node2vec_embeddings = node2vec_embeddings.groupby("card_id").mean().reset_index()

In [8]:
# Display the embeddings after they have been averaged per card_id.
node2vec_embeddings

Unnamed: 0,card_id,0,1,2,3,4,5,6,7
0,C_ID_00007093c1,-0.000098,0.000051,0.000216,-0.000663,0.000153,-0.000116,-0.000222,-0.000467
1,C_ID_0001238066,-0.000220,0.000072,0.000156,-0.000804,0.000412,-0.000207,-0.000021,-0.000536
2,C_ID_0001506ef0,-0.000387,-0.000243,0.000020,-0.000769,0.000180,-0.000155,-0.000164,-0.000392
3,C_ID_0001793786,-0.001343,-0.000101,0.000423,-0.000409,0.001026,0.000809,-0.000893,0.000983
4,C_ID_000183fdda,-0.000168,0.000088,0.000036,-0.000960,0.000782,-0.000239,-0.000225,-0.000644
...,...,...,...,...,...,...,...,...,...
325535,C_ID_ffff1d9928,-0.000328,-0.000309,0.000007,-0.000696,0.000088,-0.000197,-0.000134,-0.000389
325536,C_ID_ffff579d3a,-0.000295,0.000009,0.000030,-0.000760,0.000534,-0.000391,-0.000238,-0.000558
325537,C_ID_ffff756266,-0.000195,-0.000054,0.000196,-0.000808,0.000545,-0.000158,-0.000429,-0.000577
325538,C_ID_ffff828181,0.000168,0.000165,-0.000070,-0.001346,0.000281,-0.000455,-0.000150,-0.001027


In [9]:
# Left-join the per-card embeddings onto the training rows; every training row is
# kept, and cards without an embedding row get NaN in the embedding columns.
dataset = pd.merge(
    train_file,
    node2vec_embeddings,
    how="left",
    on="card_id",
)
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7
0,C_ID_92a2005557,2,-0.820283,-9.5e-05,7.1e-05,4.7e-05,-0.000888,0.000738,-0.00027,-0.000182,-0.000626
1,C_ID_3d0044924f,1,0.392913,0.000183,0.00017,0.000276,-0.00077,0.00058,-0.000187,-0.000189,-0.000804
2,C_ID_d639edf6cd,2,0.688056,-0.000538,-0.000301,8.2e-05,-0.000495,-0.000286,-0.00024,5e-06,-0.000247
3,C_ID_186d6a6901,3,0.142495,-0.000193,-1.3e-05,9.1e-05,-0.000833,0.000633,-0.000363,-0.000218,-0.000604
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000123,-6.1e-05,6.6e-05,-0.001194,0.000469,-0.000262,-0.000125,-0.000456


In [10]:
# Left-join the per-card nunique / count / fraction statistics onto the dataset.
dataset = pd.merge(
    dataset,
    id_features,
    how="left",
    on="card_id",
)
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,nunique_merchant_category_id,count_merchant_category_id,nunique_count_frac_merchant_category_id
0,C_ID_92a2005557,2,-0.820283,-9.5e-05,7.1e-05,4.7e-05,-0.000888,0.000738,-0.00027,-0.000182,-0.000626,46,283,0.162544
1,C_ID_3d0044924f,1,0.392913,0.000183,0.00017,0.000276,-0.00077,0.00058,-0.000187,-0.000189,-0.000804,58,356,0.162921
2,C_ID_d639edf6cd,2,0.688056,-0.000538,-0.000301,8.2e-05,-0.000495,-0.000286,-0.00024,5e-06,-0.000247,9,44,0.204545
3,C_ID_186d6a6901,3,0.142495,-0.000193,-1.3e-05,9.1e-05,-0.000833,0.000633,-0.000363,-0.000218,-0.000604,28,84,0.333333
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000123,-6.1e-05,6.6e-05,-0.001194,0.000469,-0.000262,-0.000125,-0.000456,37,169,0.218935


In [11]:
# Display the assembled modelling table (train columns, embeddings, id statistics).
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,nunique_merchant_category_id,count_merchant_category_id,nunique_count_frac_merchant_category_id
0,C_ID_92a2005557,2,-0.820283,-0.000095,0.000071,0.000047,-0.000888,0.000738,-0.000270,-0.000182,-0.000626,46,283,0.162544
1,C_ID_3d0044924f,1,0.392913,0.000183,0.000170,0.000276,-0.000770,0.000580,-0.000187,-0.000189,-0.000804,58,356,0.162921
2,C_ID_d639edf6cd,2,0.688056,-0.000538,-0.000301,0.000082,-0.000495,-0.000286,-0.000240,0.000005,-0.000247,9,44,0.204545
3,C_ID_186d6a6901,3,0.142495,-0.000193,-0.000013,0.000091,-0.000833,0.000633,-0.000363,-0.000218,-0.000604,28,84,0.333333
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000123,-0.000061,0.000066,-0.001194,0.000469,-0.000262,-0.000125,-0.000456,37,169,0.218935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,-0.000446,-0.000584,0.000031,-0.000770,-0.000026,-0.000176,0.000033,-0.000426,11,47,0.234043
201913,C_ID_1314773c0b,1,0.312917,-0.000244,0.000152,0.000003,-0.000971,0.000857,-0.000343,-0.000293,-0.000615,19,48,0.395833
201914,C_ID_7666735b3d,3,0.093494,-0.000125,-0.000109,-0.000057,-0.000841,0.000617,-0.000243,-0.000194,-0.000581,26,90,0.288889
201915,C_ID_73f5a0efd0,2,-4.676589,-0.000267,-0.000075,0.000085,-0.000754,0.000457,-0.000138,-0.000147,-0.000411,14,31,0.451613


### evaluate on baseline dataset 

In [12]:
# Feature groups used by the evaluation runs below.
baseline_feature_names = ["feature_2"]
# Embedding columns carry the integer labels 0 .. embedding_size - 1.
embedding_feature_names = [*range(embedding_size)]
id_feature_feature_names = [
    f"{stat}_{target_id_column}"
    for stat in ("nunique", "count", "nunique_count_frac")
]
# Columns the model should treat as categorical.
categorical_feature_names = ["feature_2"]
target_col = "target"
# Collects the cross_validate output of every run, keyed by run name.
results = {}

In [13]:
# Baseline run: only the baseline feature list is used as predictors.
features = baseline_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings.
# NOTE(review): the baseline cross_validate call passes a literal verbose=10
# instead of this `verbose` variable.
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

# Estimator hyper-parameters; categorical_features is a boolean mask over X's columns.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [14]:
# Cross-validate the baseline model, keeping train scores so that train and
# validation performance can be compared, and store the result under "baseline".
# NOTE(review): verbose is the literal 10 here, while the other runs pass the
# `verbose` variable -- confirm whether the extra per-fold logging is intended.
results["baseline"] = scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=10,
)

[CV] START .....................................................................
Binning 0.001 GB of training data: 0.002 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34904, val loss: 8.13150, in 0.008s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34900, val loss: 8.13149, in 0.007s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34896, val loss: 8.13148, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34892, val loss: 8.13147, in 0.007s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34888, val loss: 8.13147, in 0.006s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34884, val loss: 8.13146, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34880, val loss: 8.13146, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34877, val loss: 8.13145, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34873, val loss

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34827, val loss: 8.13141, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34824, val loss: 8.13141, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34821, val loss: 8.13141, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34819, val loss: 8.13141, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34816, val loss: 8.13141, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34814, val loss: 8.13141, in 0.006s
Fit 29 trees in 0.205 s, (87 total leaves)
Time spent computing histograms: 0.016s
Time spent finding best splits:  0.004s
Time spent applying splits:      0.029s
Time spent predicting:           0.005s
[CV] END  neg_root_mean_squared_error: (train=-3.854, test=-3.836) r2: (train=0.000, test=0.000) total time=   0.1s
[CV] START .....................................................................
Binning 0.001 GB of training data: 0.002 s
Binning 0.000 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.38682, val loss: 7.81603, in 0.007s
Fit 17 trees in 0.134 s, (51 total leaves)
Time spent computing histograms: 0.010s
Time spent finding best splits:  0.002s
Time spent applying splits:      0.016s
Time spent predicting:           0.004s
[CV] END  neg_root_mean_squared_error: (train=-3.855, test=-3.833) r2: (train=0.000, test=0.000) total time=   0.1s
[CV] START .....................................................................
Binning 0.001 GB of training data: 0.001 s
Binning 0.000 GB of validation data: 0.000 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46241, val loss: 7.24254, in 0.007s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46237, val loss: 7.24251, in 0.006s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46233, val loss: 7.24248, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46229, val loss: 7.24246, in 0.007s
[5/100] 1 tree, 3 leaves, max de

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.46170, val loss: 7.24208, in 0.006s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46167, val loss: 7.24207, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46164, val loss: 7.24205, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46162, val loss: 7.24204, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46159, val loss: 7.24202, in 0.005s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46157, val loss: 7.24201, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46154, val loss: 7.24199, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46152, val loss: 7.24198, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46150, val loss: 7.24197, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46147, val loss: 7.24195, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46145, val loss: 7.24194, in 0.00

[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51068, val loss: 6.62408, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51065, val loss: 6.62405, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51061, val loss: 6.62403, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51058, val loss: 6.62401, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51055, val loss: 6.62399, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51052, val loss: 6.62397, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51048, val loss: 6.62395, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51045, val loss: 6.62393, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51042, val loss: 6.62392, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51040, val loss: 6.62390, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51037, val loss: 6.62388, in

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.51021, val loss: 6.62379, in 0.006s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51018, val loss: 6.62377, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51016, val loss: 6.62376, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51013, val loss: 6.62375, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51011, val loss: 6.62373, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51009, val loss: 6.62372, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51007, val loss: 6.62371, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51004, val loss: 6.62370, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51002, val loss: 6.62368, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51000, val loss: 6.62367, in 0.007s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50998, val loss: 6.62366, in 0.00

[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43380, val loss: 6.54903, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43377, val loss: 6.54898, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43374, val loss: 6.54893, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43371, val loss: 6.54888, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43369, val loss: 6.54884, in 0.007s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43366, val loss: 6.54879, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43364, val loss: 6.54875, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43361, val loss: 6.54870, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43358, val loss: 6.54866, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43356, val loss: 6.54862, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43354, val loss: 6.54858, in

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.4s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.43336, val loss: 6.54827, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43334, val loss: 6.54823, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43332, val loss: 6.54819, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43330, val loss: 6.54816, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43328, val loss: 6.54812, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43327, val loss: 6.54809, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43325, val loss: 6.54806, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43323, val loss: 6.54802, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43321, val loss: 6.54799, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43319, val loss: 6.54796, in 0.007s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43318, val loss: 6.54793, in 0.00

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s finished


### evaluate on baseline with embeddings

In [15]:
# Second run: baseline feature plus the embedding columns.
features = baseline_feature_names + embedding_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings (same values as the baseline run).
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

# Estimator hyper-parameters; categorical_features is a boolean mask over X's columns.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [16]:
# Cross-validate the model with embedding features, keeping train scores, and
# store the result under "baseline_with_embeddings".
results["baseline_with_embeddings"] = scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=verbose,
)

Binning 0.010 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.173 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34866, val loss: 8.13127, in 0.011s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34821, val loss: 8.13096, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34779, val loss: 8.13071, in 0.010s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34736, val loss: 8.13041, in 0.010s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34696, val loss: 8.13017, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34654, val loss: 8.13005, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34613, val loss: 8.12980, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34575, val loss: 8.12962, in 0.010s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34533, val loss: 8.12941, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34495, val loss: 8.12931, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32095, val loss: 8.12175, in 0.010s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32069, val loss: 8.12160, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32042, val loss: 8.12151, in 0.013s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32018, val loss: 8.12161, in 0.010s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31996, val loss: 8.12153, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31975, val loss: 8.12154, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31949, val loss: 8.12145, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31927, val loss: 8.12151, in 0.010s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31906, val loss: 8.12141, in 0.010s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31883, val loss: 8.12128, in 0.010s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31862, val lo



Binning 0.010 GB of training data: 



0.160 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38692, val loss: 7.81577, in 0.011s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38646, val loss: 7.81551, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38601, val loss: 7.81529, in 0.010s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38556, val loss: 7.81497, in 0.010s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38511, val loss: 7.81462, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38468, val loss: 7.81442, in 0.010s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38426, val loss: 7.81408, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38386, val loss: 7.81384, in 0.010s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38343, val loss: 7.81354, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38305, val loss: 7.81325, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35635, val loss: 7.79989, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35607, val loss: 7.79964, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35583, val loss: 7.79951, in 0.010s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35554, val loss: 7.79948, in 0.010s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35523, val loss: 7.79923, in 0.010s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35498, val loss: 7.79906, in 0.010s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35473, val loss: 7.79900, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35443, val loss: 7.79875, in 0.010s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35419, val loss: 7.79863, in 0.010s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35392, val loss: 7.79862, in 0.010s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35365, val lo



Binning 0.010 GB of training data: 



0.174 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46182, val loss: 7.24235, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46120, val loss: 7.24216, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46059, val loss: 7.24200, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46000, val loss: 7.24182, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45941, val loss: 7.24168, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45884, val loss: 7.24151, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45828, val loss: 7.24138, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45772, val loss: 7.24123, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45719, val loss: 7.24109, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45665, val loss: 7.24087, in 



Binning 0.010 GB of training data: 



0.181 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51042, val loss: 6.62399, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50994, val loss: 6.62377, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50948, val loss: 6.62355, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50902, val loss: 6.62341, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50855, val loss: 6.62325, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50808, val loss: 6.62314, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50764, val loss: 6.62300, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50720, val loss: 6.62289, in 0.010s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50677, val loss: 6.62277, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50635, val loss: 6.62267, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47988, val loss: 6.61767, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47960, val loss: 6.61766, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47934, val loss: 6.61760, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47907, val loss: 6.61757, in 0.010s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47882, val loss: 6.61763, in 0.010s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47856, val loss: 6.61758, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47829, val loss: 6.61760, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47804, val loss: 6.61766, in 0.013s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47780, val loss: 6.61761, in 0.011s
Fit 97 trees in 1.343 s, (1940 total leaves)
Time spent computing histograms: 0.189s
Time spent finding best splits:  0.058s
Time spent applying splits:      0.222



Binning 0.010 GB of training data: 



0.171 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43348, val loss: 6.54901, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43299, val loss: 6.54870, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43251, val loss: 6.54839, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43204, val loss: 6.54814, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43163, val loss: 6.54795, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43119, val loss: 6.54783, in 0.010s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43075, val loss: 6.54765, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43033, val loss: 6.54749, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42994, val loss: 6.54734, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42952, val loss: 6.54710, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40462, val loss: 6.54165, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40438, val loss: 6.54174, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40411, val loss: 6.54166, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40384, val loss: 6.54160, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40357, val loss: 6.54165, in 0.010s
Fit 93 trees in 1.332 s, (1860 total leaves)
Time spent computing histograms: 0.192s
Time spent finding best splits:  0.063s
Time spent applying splits:      0.223s
Time spent predicting:           0.025s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.3s finished


### evaluate on baseline with id features

In [17]:
# Third run: baseline feature plus the per-card id statistics.
features = baseline_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings (same values as the baseline run).
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

# Estimator hyper-parameters; categorical_features is a boolean mask over X's columns.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [18]:
# Cross-validate the model with id-statistic features, keeping train scores, and
# store the result under "baseline_with_id_features".
results["baseline_with_id_features"] = scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    verbose=verbose,
)

Binning 0.005 GB of training data: 0.041 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34856, val loss: 8.13124, in 0.011s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34804, val loss: 8.13099, in 0.011s
[3/100] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 20 leaves, max depth = 5, train loss: 7.34754, val loss: 8.13074, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34704, val loss: 8.13049, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34656, val loss: 8.13025, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34609, val loss: 8.13001, in 0.010s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34562, val loss: 8.12978, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34516, val loss: 8.12956, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34472, val loss: 8.12927, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34429, val loss: 8.12899, in 0.011s
[11/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34385, val loss: 8.12879, in 0.010s
[12/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34344, val loss: 8.12857, in 0.010s
[13/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34303, val loss: 8.12838, in

[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32419, val loss: 8.12183, in 0.009s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32406, val loss: 8.12183, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32389, val loss: 8.12182, in 0.009s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32374, val loss: 8.12177, in 0.010s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32362, val loss: 8.12174, in 0.010s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32349, val loss: 8.12173, in 0.010s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32336, val loss: 8.12172, in 0.010s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32324, val loss: 8.12165, in 0.010s
[100/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32308, val loss: 8.12167, in 0.009s
Fit 100 trees in 1.100 s, (2000 total leaves)
Time spent computing histograms: 0.133s
Time spent finding best splits:  0.052s
Time spent applying splits:      0.2

[77/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36407, val loss: 7.80414, in 0.010s
[78/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36392, val loss: 7.80413, in 0.010s
[79/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36377, val loss: 7.80412, in 0.010s
[80/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36360, val loss: 7.80403, in 0.009s
[81/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36347, val loss: 7.80399, in 0.010s
[82/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36331, val loss: 7.80391, in 0.009s
[83/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36316, val loss: 7.80387, in 0.010s
[84/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36303, val loss: 7.80383, in 0.009s
[85/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36287, val loss: 7.80378, in 0.010s
[86/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36273, val loss: 7.80377, in 0.009s
[87/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36254, val lo

[62/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44063, val loss: 7.23427, in 0.010s
[63/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44042, val loss: 7.23424, in 0.010s
[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44024, val loss: 7.23426, in 0.009s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44002, val loss: 7.23422, in 0.009s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43984, val loss: 7.23423, in 0.009s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43965, val loss: 7.23421, in 0.010s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43945, val loss: 7.23420, in 0.009s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43927, val loss: 7.23421, in 0.010s
[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43908, val loss: 7.23417, in 0.010s
[71/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43888, val loss: 7.23415, in 0.010s
[72/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43870, val lo

[63/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49090, val loss: 6.61480, in 0.010s
[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49072, val loss: 6.61472, in 0.010s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49051, val loss: 6.61468, in 0.010s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49034, val loss: 6.61462, in 0.010s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49016, val loss: 6.61458, in 0.010s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48998, val loss: 6.61453, in 0.010s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48979, val loss: 6.61450, in 0.010s
[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48960, val loss: 6.61446, in 0.009s
[71/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48943, val loss: 6.61442, in 0.010s
[72/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48926, val loss: 6.61439, in 0.010s
[73/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48909, val lo

[48/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41900, val loss: 6.53972, in 0.010s
[49/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41879, val loss: 6.53963, in 0.010s
[50/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41855, val loss: 6.53954, in 0.010s
[51/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41834, val loss: 6.53947, in 0.010s
[52/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41812, val loss: 6.53937, in 0.010s
[53/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41792, val loss: 6.53927, in 0.010s
[54/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41771, val loss: 6.53920, in 0.010s
[55/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41749, val loss: 6.53914, in 0.010s
[56/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41730, val loss: 6.53904, in 0.011s
[57/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41709, val loss: 6.53897, in 0.011s
[58/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41688, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.2s finished


### evaluate on baseline with embeddings and id features

In [19]:
# Feature set for this experiment: baseline columns, embedding columns and the
# id-derived feature columns, concatenated in that order.
features = baseline_feature_names + embedding_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Settings consumed by the cross_validate call in the following cell.
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

# Hyper-parameters of the histogram gradient boosting regressor.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # boolean mask over X's columns: True where the column name is listed
    # in categorical_feature_names
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [20]:
# Cross-validate the model on the selected features, keeping the training-fold
# scores alongside the test-fold scores, then file the outcome under the
# experiment's name in `results`.
scores = cross_validate(
    model,
    X,
    y,
    cv=cv,
    scoring=scoring,
    verbose=verbose,
    return_train_score=True,
)
results["baseline_with_embeddings_and_id_features"] = scores

Binning 0.014 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.222 s
Binning 0.002 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34832, val loss: 8.13120, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34756, val loss: 8.13090, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34683, val loss: 8.13058, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34611, val loss: 8.13028, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34540, val loss: 8.12998, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34470, val loss: 8.12969, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34401, val loss: 8.12939, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34334, val loss: 8.12908, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34267, val loss: 8.12879, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34202, val loss: 8.12853, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30573, val loss: 8.11540, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30537, val loss: 8.11531, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30507, val loss: 8.11528, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30479, val loss: 8.11518, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30450, val loss: 8.11506, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30411, val loss: 8.11488, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30376, val loss: 8.11479, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30346, val loss: 8.11472, in 0.013s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30308, val loss: 8.11455, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30279, val loss: 8.11448, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30251, val lo



Binning 0.014 GB of training data: 



0.202 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38653, val loss: 7.81550, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38570, val loss: 7.81503, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38489, val loss: 7.81453, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38409, val loss: 7.81408, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38331, val loss: 7.81360, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38254, val loss: 7.81318, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38179, val loss: 7.81272, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38105, val loss: 7.81232, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38032, val loss: 7.81190, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.37961, val loss: 7.81150, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34137, val loss: 7.79693, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34100, val loss: 7.79699, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34066, val loss: 7.79701, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34033, val loss: 7.79702, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34002, val loss: 7.79692, in 0.011s
Fit 93 trees in 1.376 s, (1860 total leaves)
Time spent computing histograms: 0.202s
Time spent finding best splits:  0.067s
Time spent applying splits:      0.214s
Time spent predicting:           0.024s




Binning 0.014 GB of training data: 



0.197 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46161, val loss: 7.24203, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46079, val loss: 7.24163, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45998, val loss: 7.24111, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45918, val loss: 7.24062, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45839, val loss: 7.24014, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45763, val loss: 7.23966, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45690, val loss: 7.23931, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45608, val loss: 7.23883, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45538, val loss: 7.23850, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45466, val loss: 7.23811, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41424, val loss: 7.22321, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41382, val loss: 7.22325, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41354, val loss: 7.22313, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41316, val loss: 7.22305, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41279, val loss: 7.22310, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41240, val loss: 7.22297, in 0.013s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41205, val loss: 7.22293, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41168, val loss: 7.22288, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41133, val loss: 7.22281, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41092, val loss: 7.22278, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41059, val lo



Binning 0.014 GB of training data: 



0.213 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51006, val loss: 6.62378, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50930, val loss: 6.62340, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50849, val loss: 6.62298, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50776, val loss: 6.62261, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50697, val loss: 6.62224, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50624, val loss: 6.62189, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50553, val loss: 6.62155, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50477, val loss: 6.62117, in 0.010s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50407, val loss: 6.62086, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50334, val loss: 6.62048, in 

[88/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46554, val loss: 6.60733, in 0.012s
[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46514, val loss: 6.60724, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46478, val loss: 6.60721, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46439, val loss: 6.60710, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46405, val loss: 6.60698, in 0.013s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46365, val loss: 6.60694, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46329, val loss: 6.60683, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46292, val loss: 6.60678, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46255, val loss: 6.60678, in 0.013s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46221, val loss: 6.60682, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46188, val lo



Binning 0.014 GB of training data: 



0.209 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43319, val loss: 6.54875, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43243, val loss: 6.54817, in 0.011s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43171, val loss: 6.54763, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43089, val loss: 6.54711, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43021, val loss: 6.54658, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42941, val loss: 6.54610, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42875, val loss: 6.54555, in 0.010s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42804, val loss: 6.54505, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42738, val loss: 6.54464, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42663, val loss: 6.54420, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38988, val loss: 6.52518, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38951, val loss: 6.52504, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38914, val loss: 6.52493, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38885, val loss: 6.52490, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38853, val loss: 6.52485, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38821, val loss: 6.52466, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38785, val loss: 6.52437, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38751, val loss: 6.52433, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38718, val loss: 6.52422, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38687, val loss: 6.52416, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38657, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.0s finished


In [21]:
from numpy import mean
import csv

def make_output(key):
    """Build a flat summary row for the experiment stored under `key`.

    Looks up `results[key]` and averages each recorded score series with
    `mean`. The two ``neg_root_mean_squared_error`` series are multiplied by
    -1 so the row carries positive RMSE values. The module-level
    `target_id_column` and `embedding_size` are copied into the row as
    ``id_column`` and ``embedding_dimension``.

    Returns a dict whose key order is: id_column, type, train_r2, test_r2,
    train_root_mean_squared_error, test_root_mean_squared_error,
    embedding_dimension.
    """
    def fold_average(metric):
        # mean of the values recorded for `metric` in this experiment
        return mean(results[key][metric])

    return {
        "id_column": target_id_column,
        "type": key,
        "train_r2": fold_average("train_r2"),
        "test_r2": fold_average("test_r2"),
        "train_root_mean_squared_error": -1 * fold_average("train_neg_root_mean_squared_error"),
        "test_root_mean_squared_error": -1 * fold_average("test_neg_root_mean_squared_error"),
        "embedding_dimension": embedding_size,
    }
def save(output, path=None):
    """Append one result row to a CSV file.

    Parameters
    ----------
    output : dict
        The row to write. Its keys, in insertion order, define the column
        order of the written line.
    path : str or path-like, optional
        Destination CSV file. When omitted, the notebook's original
        hard-coded results file (node2vec_embeddings.csv) is used, so
        existing ``save(output)`` calls behave exactly as before.

    Notes
    -----
    The file is opened in append mode and no header row is written; every
    call adds exactly one data line. Re-running a cell that calls this
    function therefore appends duplicate rows.
    """
    if path is None:
        # Original default location, kept for backward compatibility.
        path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\results\node2vec_embeddings.csv"
    # newline='' lets the csv module control line endings itself.
    with open(path, 'a', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(output.keys()))
        writer.writerow(output)

In [22]:
# Print the summary row of each experiment, one dict per line.
for experiment in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    print(make_output(experiment))

{'id_column': 'merchant_category_id', 'type': 'baseline', 'train_r2': 0.0001655841013616044, 'test_r2': 0.00013871442920936338, 'train_root_mean_squared_error': 3.8501609494819595, 'test_root_mean_squared_error': 3.85005310803518, 'embedding_dimension': 8}
{'id_column': 'merchant_category_id', 'type': 'baseline_with_embeddings', 'train_r2': 0.0038666843976260523, 'test_r2': 0.0011043321049071908, 'train_root_mean_squared_error': 3.8430282363569077, 'test_root_mean_squared_error': 3.8481920739051945, 'embedding_dimension': 8}
{'id_column': 'merchant_category_id', 'type': 'baseline_with_id_features', 'train_r2': 0.0032413793418800685, 'test_r2': 0.00209346390784948, 'train_root_mean_squared_error': 3.8442337668305235, 'test_root_mean_squared_error': 3.846283973458732, 'embedding_dimension': 8}
{'id_column': 'merchant_category_id', 'type': 'baseline_with_embeddings_and_id_features', 'train_r2': 0.0061884888754821345, 'test_r2': 0.0034083228466227798, 'train_root_mean_squared_error': 3.838

### save

In [23]:
# Write the summary row of each experiment to the results file, in this order.
for experiment in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    save(make_output(experiment))