# ABOUT: 
- this notebook evaluates each of the generated node2vec embeddings as model features
- findings: 
    - using card_id embeddings appears to cause overfitting
        
- details:       
    - i.e., compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5-fold cross-validated (cv = 5 in every run below)

In [1]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [2]:
# ID column whose embeddings are evaluated in this run.
target_id_column = "merchant_category_id"
# Derive the model file name from target_id_column so the loaded model cannot
# drift out of step with the column being evaluated.
# NOTE(review): assumes every saved model follows the
# node2vec_card_id_<id column>.zip naming seen here -- confirm.
node2vec_path = rf"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_{target_id_column}.zip"
# Number of embedding dimensions; the embedding columns are labelled
# 0 .. embedding_size - 1.
embedding_size = 16

### prepare data

In [3]:
# Read only the two columns this notebook needs from the processed ID-column
# CSV: the card identifier and the ID column under evaluation.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns = pd.read_csv(path, usecols=["card_id", target_id_column])
# Preview the first five rows.
id_columns.head(5)

Unnamed: 0,card_id,merchant_category_id
0,C_ID_4e6213e9bc,merchant_category_id_80
1,C_ID_4e6213e9bc,merchant_category_id_367
2,C_ID_4e6213e9bc,merchant_category_id_80
3,C_ID_4e6213e9bc,merchant_category_id_560
4,C_ID_4e6213e9bc,merchant_category_id_80


In [4]:
# Per-card summary of the target ID column: number of distinct values
# (nunique), number of non-null rows (count), and the ratio nunique / count.
id_features = (
    id_columns.groupby("card_id")[target_id_column]
    .agg(["nunique", "count"])
    .add_suffix(f"_{target_id_column}")
    .reset_index()
)
# Share of a card's rows that carry a distinct value of the ID column.
id_features[f"nunique_count_frac_{target_id_column}"] = id_features[
    f"nunique_{target_id_column}"
].div(id_features[f"count_{target_id_column}"])
id_features

Unnamed: 0,card_id,nunique_merchant_category_id,count_merchant_category_id,nunique_count_frac_merchant_category_id
0,C_ID_00007093c1,19,151,0.125828
1,C_ID_0001238066,35,149,0.234899
2,C_ID_0001506ef0,20,68,0.294118
3,C_ID_0001793786,57,247,0.230769
4,C_ID_000183fdda,38,155,0.245161
...,...,...,...,...
325535,C_ID_ffff1d9928,9,16,0.562500
325536,C_ID_ffff579d3a,28,115,0.243478
325537,C_ID_ffff756266,14,25,0.560000
325538,C_ID_ffff828181,45,198,0.227273


In [5]:
# Pull the card identifier, the regression target and the baseline feature
# from the training CSV.
# NOTE(review): train_path is not defined in the visible cells -- presumably
# it comes from `from config import *`; confirm.
train_file = pd.read_csv(train_path, usecols=["card_id", "target", "feature_2"])
train_file.head(5)

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [6]:
# Restore the saved embedding model from disk.
# NOTE(review): the model is loaded through GGVec although the variable and
# file are named node2vec -- confirm which algorithm produced the file.
node2vec = nodevectors.GGVec.load(node2vec_path)
# Build one row per key of node2vec.model with one column per embedding
# dimension; reset_index() moves the key into a regular "index" column.
node2vec_embeddings = pd.DataFrame.from_dict(
    node2vec.model, orient="index"
).reset_index()
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,C_ID_00007093c1,-0.357941,0.307006,0.009354,-0.246673,0.075446,-0.317019,0.055850,0.238421,0.539864,-0.106298,0.139172,-0.479250,-0.150935,0.413590,0.233879,0.213350
1,C_ID_0001238066,-0.368396,-0.367825,0.026838,-0.307736,0.083546,-0.219076,0.044009,0.494800,0.291377,-0.048323,-0.020847,-0.113235,-0.298803,0.534310,0.175696,0.092672
2,C_ID_0001506ef0,-0.254740,-0.589928,0.169768,-0.175632,0.585489,-0.264163,-0.296803,0.275318,0.261759,0.011973,0.244776,0.039688,-0.343730,0.360554,0.444197,0.496964
3,C_ID_0001793786,0.191584,-0.151448,-0.014445,0.366293,0.120945,-0.083885,-0.488117,-0.183988,-0.259406,0.052649,0.310510,0.151063,-0.176174,0.332875,0.707484,0.696419
4,C_ID_000183fdda,0.164014,-0.017319,0.132596,-0.035931,0.355319,-0.541536,-0.616958,0.110934,0.045973,-0.333951,0.149723,0.029574,0.221357,0.120723,0.560751,0.443785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325866,merchant_category_id_885,-0.000002,0.000566,-0.000450,0.000353,-0.000237,-0.000571,0.000721,0.000700,0.001303,-0.000349,0.000322,0.000766,-0.000849,-0.001186,-0.000548,-0.001014
325867,merchant_category_id_889,-0.030407,-0.000800,-0.008912,0.017210,-0.056317,-0.004105,-0.023937,0.013748,0.002963,0.013833,-0.007967,-0.005426,-0.017560,-0.013585,0.007870,0.058390
325868,merchant_category_id_891,-0.000157,-0.000650,0.000238,-0.000246,0.000851,-0.000066,-0.000058,0.000807,0.001308,-0.000010,0.000288,-0.000454,-0.000094,-0.000353,0.000180,0.000968
325869,merchant_category_id_9,-0.000066,0.000996,0.000412,-0.000345,-0.001026,0.000874,0.002293,-0.000502,-0.000730,0.000589,-0.000745,0.000489,-0.000409,-0.001897,-0.000862,-0.000763


In [7]:
# Attach the embedding of each row's target ID (e.g. a city_id or
# merchant_category_id embedding), then average the embedding dimensions per
# card so that every card_id ends up with a single vector.
node2vec_embeddings = id_columns.merge(
    node2vec_embeddings,
    how="left",
    left_on=target_id_column,
    right_on="index",
)
# Drop BOTH string key columns before averaging. pandas >= 2.0 no longer
# silently discards non-numeric columns in GroupBy.mean() and raises TypeError
# instead, so leaving target_id_column in the frame would break this cell.
node2vec_embeddings = node2vec_embeddings.drop(columns=["index", target_id_column])
node2vec_embeddings = node2vec_embeddings.groupby("card_id").mean().reset_index()

In [8]:
# Display the per-card averaged embeddings produced by the previous cell.
node2vec_embeddings

Unnamed: 0,card_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,C_ID_00007093c1,-0.000182,-0.000331,-0.000112,-0.000530,0.000485,-0.000137,0.000165,0.001039,0.001338,0.000006,0.000432,-0.000190,0.000015,-0.000495,0.000433,0.000373
1,C_ID_0001238066,-0.000090,-0.000419,-0.000085,-0.000389,0.000448,-0.000190,0.000107,0.000981,0.001250,-0.000016,0.000366,-0.000222,0.000042,-0.000417,0.000352,0.000376
2,C_ID_0001506ef0,0.000040,-0.000520,-0.000005,-0.000416,0.000757,-0.000489,-0.000088,0.000919,0.001537,-0.000048,0.000587,-0.000267,-0.000317,-0.000070,0.000444,0.000580
3,C_ID_0001793786,-0.000920,0.001111,0.002873,-0.001826,0.001236,0.001108,-0.003143,-0.000041,-0.002702,0.000700,-0.000136,0.000097,0.001699,-0.002352,-0.000167,0.001499
4,C_ID_000183fdda,-0.000085,-0.000479,-0.000055,-0.000433,0.000497,-0.000266,0.000040,0.000922,0.001213,0.000253,0.000238,-0.000280,-0.000081,-0.000252,0.000251,0.000423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325535,C_ID_ffff1d9928,0.000053,-0.000579,0.000053,-0.000538,0.000819,-0.000427,-0.000194,0.001005,0.001682,-0.000097,0.000626,-0.000391,-0.000055,-0.000107,0.000582,0.000770
325536,C_ID_ffff579d3a,-0.000175,-0.000530,-0.000122,-0.000452,0.000605,-0.000304,0.000094,0.000943,0.001199,0.000055,0.000374,-0.000118,-0.000122,-0.000259,0.000171,0.000453
325537,C_ID_ffff756266,-0.000209,-0.000431,-0.000292,-0.000442,0.000582,-0.000293,0.000186,0.000842,0.000952,0.000258,0.000186,-0.000147,0.000031,-0.000613,0.000222,0.000356
325538,C_ID_ffff828181,-0.000142,-0.001184,-0.000144,-0.001894,-0.000581,0.000640,-0.000404,0.000098,0.001331,-0.001651,-0.000396,0.002603,0.000028,-0.000583,-0.001141,0.000744


In [9]:
# Left-join the per-card embeddings onto the training rows, so every row of
# train_file is kept whether or not its card has an embedding.
dataset = pd.merge(train_file, node2vec_embeddings, how="left", on="card_id")
dataset.head(5)

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,C_ID_92a2005557,2,-0.820283,-5.8e-05,-0.000386,-6.7e-05,-0.000314,0.000582,-0.000209,9.5e-05,0.000788,0.001004,0.000213,0.000281,-2.5e-05,-0.00018,-0.000414,0.000323,0.000568
1,C_ID_3d0044924f,1,0.392913,-4.1e-05,-0.000446,-0.000248,-0.000185,0.000413,-0.000276,0.000164,0.000741,0.00099,0.000247,-3.1e-05,6.6e-05,-0.000231,-0.000569,0.000271,0.00062
2,C_ID_d639edf6cd,2,0.688056,0.000117,-0.000671,0.000142,-0.000462,0.001062,-0.00064,-0.00023,0.001086,0.001825,-0.000377,0.000946,-0.000368,-0.00021,0.000115,0.000683,0.000699
3,C_ID_186d6a6901,3,0.142495,-5.9e-05,-0.000435,-0.000205,-0.000434,0.00072,-0.000168,2.3e-05,0.000732,0.001041,0.000245,0.000119,-4.1e-05,-0.000102,-0.000413,0.00029,0.000584
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000201,-0.000511,-0.00013,-0.000302,0.000483,-0.000345,1e-05,0.000953,0.000787,0.000499,0.000144,-0.000292,-1.6e-05,-0.000318,0.000522,0.00081


In [10]:
# Left-join the per-card ID statistics (nunique / count / ratio) onto the
# modelling table.
dataset = pd.merge(dataset, id_features, how="left", on="card_id")
dataset.head(5)

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,9,10,11,12,13,14,15,nunique_merchant_category_id,count_merchant_category_id,nunique_count_frac_merchant_category_id
0,C_ID_92a2005557,2,-0.820283,-5.8e-05,-0.000386,-6.7e-05,-0.000314,0.000582,-0.000209,9.5e-05,...,0.000213,0.000281,-2.5e-05,-0.00018,-0.000414,0.000323,0.000568,46,283,0.162544
1,C_ID_3d0044924f,1,0.392913,-4.1e-05,-0.000446,-0.000248,-0.000185,0.000413,-0.000276,0.000164,...,0.000247,-3.1e-05,6.6e-05,-0.000231,-0.000569,0.000271,0.00062,58,356,0.162921
2,C_ID_d639edf6cd,2,0.688056,0.000117,-0.000671,0.000142,-0.000462,0.001062,-0.00064,-0.00023,...,-0.000377,0.000946,-0.000368,-0.00021,0.000115,0.000683,0.000699,9,44,0.204545
3,C_ID_186d6a6901,3,0.142495,-5.9e-05,-0.000435,-0.000205,-0.000434,0.00072,-0.000168,2.3e-05,...,0.000245,0.000119,-4.1e-05,-0.000102,-0.000413,0.00029,0.000584,28,84,0.333333
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000201,-0.000511,-0.00013,-0.000302,0.000483,-0.000345,1e-05,...,0.000499,0.000144,-0.000292,-1.6e-05,-0.000318,0.000522,0.00081,37,169,0.218935


In [11]:
# Display the merged modelling table: train columns, embedding columns and
# the per-card ID statistics.
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,9,10,11,12,13,14,15,nunique_merchant_category_id,count_merchant_category_id,nunique_count_frac_merchant_category_id
0,C_ID_92a2005557,2,-0.820283,-0.000058,-0.000386,-0.000067,-0.000314,0.000582,-0.000209,0.000095,...,0.000213,0.000281,-0.000025,-0.000180,-0.000414,0.000323,0.000568,46,283,0.162544
1,C_ID_3d0044924f,1,0.392913,-0.000041,-0.000446,-0.000248,-0.000185,0.000413,-0.000276,0.000164,...,0.000247,-0.000031,0.000066,-0.000231,-0.000569,0.000271,0.000620,58,356,0.162921
2,C_ID_d639edf6cd,2,0.688056,0.000117,-0.000671,0.000142,-0.000462,0.001062,-0.000640,-0.000230,...,-0.000377,0.000946,-0.000368,-0.000210,0.000115,0.000683,0.000699,9,44,0.204545
3,C_ID_186d6a6901,3,0.142495,-0.000059,-0.000435,-0.000205,-0.000434,0.000720,-0.000168,0.000023,...,0.000245,0.000119,-0.000041,-0.000102,-0.000413,0.000290,0.000584,28,84,0.333333
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000201,-0.000511,-0.000130,-0.000302,0.000483,-0.000345,0.000010,...,0.000499,0.000144,-0.000292,-0.000016,-0.000318,0.000522,0.000810,37,169,0.218935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,0.000132,-0.000701,0.000098,-0.000465,0.000876,-0.000495,-0.000313,...,-0.000266,0.000588,-0.000476,-0.000090,-0.000101,0.000645,0.001145,11,47,0.234043
201913,C_ID_1314773c0b,1,0.312917,-0.000177,-0.000506,-0.000217,-0.000417,0.000521,-0.000194,0.000122,...,0.000166,0.000186,-0.000061,-0.000134,-0.000225,0.000090,0.000318,19,48,0.395833
201914,C_ID_7666735b3d,3,0.093494,-0.000049,-0.000590,-0.000059,-0.000504,0.000523,-0.000314,-0.000043,...,0.000043,0.000248,-0.000279,-0.000032,-0.000287,0.000401,0.000577,26,90,0.288889
201915,C_ID_73f5a0efd0,2,-4.676589,-0.000060,-0.000411,-0.000042,-0.000367,0.000744,-0.000386,-0.000009,...,0.000123,0.000355,-0.000293,-0.000251,-0.000235,0.000492,0.000585,14,31,0.451613


### evaluate on baseline dataset 

In [12]:
# Column groups used to assemble the feature sets for each experiment.
baseline_feature_names = ["feature_2"]
# Embedding columns are labelled with the integers 0 .. embedding_size - 1.
embedding_feature_names = [*range(embedding_size)]
# Names of the per-card ID statistic columns for the current target ID column.
id_feature_feature_names = [
    f"{stat}_{target_id_column}"
    for stat in ("nunique", "count", "nunique_count_frac")
]
# Columns the model is told to treat as categorical.
categorical_feature_names = ["feature_2"]
target_col = "target"
# Cross-validation scores, keyed by experiment name.
results = dict()

In [13]:
# Baseline experiment: feature_2 is the only predictor.
features = baseline_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings shared with the cross_validate call.
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # Boolean mask over X's columns marking which ones are categorical.
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [14]:
# Cross-validate the baseline model, keeping both train and test scores so
# that over-fitting shows up as a train/test gap.
# NOTE(review): verbosity is hard-coded to 10 here, whereas the other
# experiments pass the `verbose` variable -- confirm this is intentional.
scores = cross_validate(
    model,
    X,
    y,
    cv=cv,
    scoring=scoring,
    verbose=10,
    return_train_score=True,
)
results["baseline"] = scores

[CV] START .....................................................................
Binning 0.001 GB of training data: 0.003 s
Binning 0.000 GB of validation data: 0.000 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34904, val loss: 8.13150, in 0.012s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34900, val loss: 8.13149, in 0.007s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34896, val loss: 8.13148, in 0.007s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34892, val loss: 8.13147, in 0.006s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34888, val loss: 8.13147, in 0.007s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34884, val loss: 8.13146, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34880, val loss: 8.13146, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34877, val loss: 8.13145, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34873, val loss

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34832, val loss: 8.13141, in 0.006s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34829, val loss: 8.13141, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34827, val loss: 8.13141, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34824, val loss: 8.13141, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34821, val loss: 8.13141, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34819, val loss: 8.13141, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34816, val loss: 8.13141, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34814, val loss: 8.13141, in 0.006s
Fit 29 trees in 0.216 s, (87 total leaves)
Time spent computing histograms: 0.016s
Time spent finding best splits:  0.004s
Time spent applying splits:      0.027s
Time spent predicting:           0.006s
[CV] END  neg_root_mean_squared_error: (train=-3.854, test=-3.836) r2:

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] END  neg_root_mean_squared_error: (train=-3.855, test=-3.833) r2: (train=0.000, test=0.000) total time=   0.1s
[CV] START .....................................................................
Binning 0.001 GB of training data: 0.002 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46241, val loss: 7.24254, in 0.006s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46237, val loss: 7.24251, in 0.007s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46233, val loss: 7.24248, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46229, val loss: 7.24246, in 0.006s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46225, val loss: 7.24243, in 0.006s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46221, val loss: 7.24241, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46217, val loss: 7.24238, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, trai

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.46167, val loss: 7.24207, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46164, val loss: 7.24205, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46162, val loss: 7.24204, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46159, val loss: 7.24202, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46157, val loss: 7.24201, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46154, val loss: 7.24199, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46152, val loss: 7.24198, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46150, val loss: 7.24197, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46147, val loss: 7.24195, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46145, val loss: 7.24194, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46143, val loss: 7.24193, in 0.00

[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51065, val loss: 6.62405, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51061, val loss: 6.62403, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51058, val loss: 6.62401, in 0.005s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51055, val loss: 6.62399, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51052, val loss: 6.62397, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51048, val loss: 6.62395, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51045, val loss: 6.62393, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51042, val loss: 6.62392, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51040, val loss: 6.62390, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51037, val loss: 6.62388, in 0.005s
[17/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51034, val loss: 6.62386, i

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.51016, val loss: 6.62376, in 0.007s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51013, val loss: 6.62375, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51011, val loss: 6.62373, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51009, val loss: 6.62372, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51007, val loss: 6.62371, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51004, val loss: 6.62370, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51002, val loss: 6.62368, in 0.005s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51000, val loss: 6.62367, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50998, val loss: 6.62366, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50996, val loss: 6.62365, in 0.006s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50994, val loss: 6.62364, in 0.00

[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43374, val loss: 6.54893, in 0.005s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43371, val loss: 6.54888, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43369, val loss: 6.54884, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43366, val loss: 6.54879, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43364, val loss: 6.54875, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43361, val loss: 6.54870, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43358, val loss: 6.54866, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43356, val loss: 6.54862, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43354, val loss: 6.54858, in 0.006s
[17/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43351, val loss: 6.54854, in 0.006s
[18/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43349, val loss: 6.54850, 

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.4s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.43338, val loss: 6.54830, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43336, val loss: 6.54827, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43334, val loss: 6.54823, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43332, val loss: 6.54819, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43330, val loss: 6.54816, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43328, val loss: 6.54812, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43327, val loss: 6.54809, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43325, val loss: 6.54806, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43323, val loss: 6.54802, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43321, val loss: 6.54799, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43319, val loss: 6.54796, in 0.00

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s finished


### evaluate on baseline with embeddings

In [15]:
# Experiment: baseline feature plus the averaged embedding dimensions.
features = baseline_feature_names + embedding_feature_names
# The embedding columns carry integer labels while feature_2 is a string.
# scikit-learn >= 1.2 raises TypeError for mixed-type column names, so cast
# every label of X to str ("feature_2", "0", "1", ...). `dataset` itself is
# left untouched.
X = dataset[features].rename(columns=str)
y = dataset[target_col]

# Cross-validation settings shared with the cross_validate call.
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

model_params = {
    "learning_rate": 0.01,
    "max_iter": 100,
    # Boolean mask over X's (now all-string) columns marking the categoricals.
    "categorical_features": X.columns.isin(categorical_feature_names),
    "l2_regularization": 0.005,
    "early_stopping": True,
    "n_iter_no_change": 5,
    "verbose": 1,
    "random_state": 0,
    "max_depth": 5,
    "max_leaf_nodes": 20,
}
model = HistGradientBoostingRegressor(**model_params)

In [16]:
# Cross-validate the baseline + embeddings model, keeping both train and test
# scores so that over-fitting shows up as a train/test gap.
scores = cross_validate(
    model,
    X,
    y,
    cv=cv,
    scoring=scoring,
    verbose=verbose,
    return_train_score=True,
)
results["baseline_with_embeddings"] = scores

Binning 0.020 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.343 s
Binning 0.002 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34853, val loss: 8.13144, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34800, val loss: 8.13141, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34748, val loss: 8.13127, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34694, val loss: 8.13124, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34644, val loss: 8.13111, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34595, val loss: 8.13097, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34543, val loss: 8.13095, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34496, val loss: 8.13084, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34449, val loss: 8.13072, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34404, val loss: 8.13060, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31500, val loss: 8.12343, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31465, val loss: 8.12340, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31436, val loss: 8.12326, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31390, val loss: 8.12321, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31358, val loss: 8.12317, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31329, val loss: 8.12314, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31283, val loss: 8.12310, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31252, val loss: 8.12307, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31218, val loss: 8.12313, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31184, val loss: 8.12311, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31149, val lo



Binning 0.020 GB of training data: 



0.349 s
Binning 0.002 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38679, val loss: 7.81581, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38621, val loss: 7.81556, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38565, val loss: 7.81533, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38510, val loss: 7.81512, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38456, val loss: 7.81491, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38402, val loss: 7.81471, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38351, val loss: 7.81453, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38299, val loss: 7.81432, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38252, val loss: 7.81412, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38204, val loss: 7.81395, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35233, val loss: 7.80511, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35204, val loss: 7.80497, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35175, val loss: 7.80494, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35145, val loss: 7.80493, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35115, val loss: 7.80483, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35083, val loss: 7.80475, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35050, val loss: 7.80473, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35022, val loss: 7.80472, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34989, val loss: 7.80471, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34957, val loss: 7.80468, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34921, val lo



Binning 0.020 GB of training data: 



0.349 s
Binning 0.002 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46184, val loss: 7.24235, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46125, val loss: 7.24218, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46067, val loss: 7.24200, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46009, val loss: 7.24181, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45953, val loss: 7.24163, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45900, val loss: 7.24150, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45844, val loss: 7.24134, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45789, val loss: 7.24121, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45736, val loss: 7.24101, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45682, val loss: 7.24081, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42366, val loss: 7.23618, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42331, val loss: 7.23613, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42284, val loss: 7.23610, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42250, val loss: 7.23613, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42206, val loss: 7.23612, in 0.012s
Fit 93 trees in 1.544 s, (1860 total leaves)
Time spent computing histograms: 0.218s
Time spent finding best splits:  0.065s
Time spent applying splits:      0.192s
Time spent predicting:           0.025s




Binning 0.020 GB of training data: 



0.347 s
Binning 0.002 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51030, val loss: 6.62412, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50970, val loss: 6.62405, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50912, val loss: 6.62395, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50853, val loss: 6.62386, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50797, val loss: 6.62377, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50740, val loss: 6.62372, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50684, val loss: 6.62366, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50631, val loss: 6.62356, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50578, val loss: 6.62347, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50523, val loss: 6.62336, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47061, val loss: 6.61670, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47015, val loss: 6.61669, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46984, val loss: 6.61670, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46936, val loss: 6.61661, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46907, val loss: 6.61661, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46864, val loss: 6.61661, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46833, val loss: 6.61654, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46800, val loss: 6.61646, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46769, val loss: 6.61637, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46741, val loss: 6.61637, in 0.011s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46700, val lo



Binning 0.020 GB of training data: 



0.322 s
Binning 0.002 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43344, val loss: 6.54924, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43291, val loss: 6.54915, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43239, val loss: 6.54907, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43189, val loss: 6.54899, in 0.011s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43139, val loss: 6.54892, in 0.011s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43086, val loss: 6.54890, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43039, val loss: 6.54883, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42986, val loss: 6.54869, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42936, val loss: 6.54862, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42888, val loss: 6.54849, in 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.2s finished


### evaluate on baseline with id features

In [17]:
# Experiment: baseline feature plus the per-card ID statistics.
features = baseline_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings shared with the cross_validate call.
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1

model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # Boolean mask over X's columns marking which ones are categorical.
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [18]:
# Cross-validate the baseline + ID-statistics model, keeping both train and
# test scores so that over-fitting shows up as a train/test gap.
scores = cross_validate(
    model,
    X,
    y,
    cv=cv,
    scoring=scoring,
    verbose=verbose,
    return_train_score=True,
)
results["baseline_with_id_features"] = scores

Binning 0.005 GB of training data: 0.039 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34856, val loss: 8.13124, in 0.011s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34804, val loss: 8.13099, in 0.010s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34754, val loss: 8.13074, in 0.010s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34704, val loss: 8.13049, in 0.010s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34656, val loss: 8.13025, in 0.010s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34609, val loss: 8.13001, in 0.009s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34562, val loss: 8.12978, in 0.010s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34516, val loss: 8.12956, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34472, val loss: 8.12927, in 0.010s
[10/100] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 20 leaves, max depth = 5, train loss: 7.34429, val loss: 8.12899, in 0.010s
[11/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34385, val loss: 8.12879, in 0.009s
[12/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34344, val loss: 8.12857, in 0.010s
[13/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34303, val loss: 8.12838, in 0.010s
[14/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34262, val loss: 8.12818, in 0.010s
[15/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34223, val loss: 8.12797, in 0.010s
[16/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34184, val loss: 8.12778, in 0.009s
[17/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34145, val loss: 8.12764, in 0.010s
[18/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34109, val loss: 8.12744, in 0.010s
[19/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34071, val loss: 8.12727, in 0.010s
[20/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34034, val loss: 8.127

[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32324, val loss: 8.12165, in 0.010s
[100/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32308, val loss: 8.12167, in 0.009s
Fit 100 trees in 1.036 s, (2000 total leaves)
Time spent computing histograms: 0.125s
Time spent finding best splits:  0.048s
Time spent applying splits:      0.213s
Time spent predicting:           0.025s
Binning 0.005 GB of training data: 0.048 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38682, val loss: 7.81567, in 0.010s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38628, val loss: 7.81530, in 0.010s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38575, val loss: 7.81492, in 0.009s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38523, val loss: 7.81457, in 0.010s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38472, val loss: 7.81421, in 0.009s
[6/100] 1 tree, 20 leaves,

[84/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36303, val loss: 7.80383, in 0.009s
[85/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36287, val loss: 7.80378, in 0.009s
[86/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36273, val loss: 7.80377, in 0.009s
[87/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36254, val loss: 7.80377, in 0.009s
[88/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36242, val loss: 7.80374, in 0.009s
[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36229, val loss: 7.80373, in 0.009s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36210, val loss: 7.80373, in 0.009s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36197, val loss: 7.80373, in 0.009s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36183, val loss: 7.80374, in 0.009s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36171, val loss: 7.80371, in 0.009s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36159, val lo

[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43927, val loss: 7.23421, in 0.009s
[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43908, val loss: 7.23417, in 0.009s
[71/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43888, val loss: 7.23415, in 0.009s
[72/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43870, val loss: 7.23415, in 0.010s
[73/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43850, val loss: 7.23414, in 0.010s
[74/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43831, val loss: 7.23414, in 0.010s
[75/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43814, val loss: 7.23410, in 0.010s
[76/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43795, val loss: 7.23410, in 0.010s
[77/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43779, val loss: 7.23408, in 0.010s
[78/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43761, val loss: 7.23407, in 0.009s
[79/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43745, val lo

[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48960, val loss: 6.61446, in 0.009s
[71/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48943, val loss: 6.61442, in 0.010s
[72/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48926, val loss: 6.61439, in 0.010s
[73/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48909, val loss: 6.61434, in 0.010s
[74/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48893, val loss: 6.61431, in 0.010s
[75/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48875, val loss: 6.61430, in 0.009s
[76/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48859, val loss: 6.61427, in 0.009s
[77/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48842, val loss: 6.61423, in 0.009s
[78/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48827, val loss: 6.61422, in 0.009s
[79/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48812, val loss: 6.61422, in 0.010s
[80/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48795, val lo

[55/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41749, val loss: 6.53914, in 0.009s
[56/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41730, val loss: 6.53904, in 0.009s
[57/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41709, val loss: 6.53897, in 0.010s
[58/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41688, val loss: 6.53887, in 0.009s
[59/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41670, val loss: 6.53881, in 0.009s
[60/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41650, val loss: 6.53876, in 0.009s
[61/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41631, val loss: 6.53866, in 0.009s
[62/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41612, val loss: 6.53862, in 0.009s
[63/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41594, val loss: 6.53855, in 0.009s
[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41577, val loss: 6.53851, in 0.009s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41558, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.0s finished


### evaluate on baseline with embeddings and id features

In [19]:
# Feature set for this experiment: baseline columns, the embedding columns,
# and the id-feature columns together.
features = baseline_feature_names + embedding_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Settings read by the cross_validate call in the following cell.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# Estimator hyper-parameters. `categorical_features` is a per-column flag
# marking which columns of X appear in categorical_feature_names.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [20]:
# Cross-validate the model on the current X/y, keeping the train-side scores
# as well so train and test metrics can be compared later.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    verbose=verbose,
    return_train_score=True,
)
results["baseline_with_embeddings_and_id_features"] = scores

Binning 0.023 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.375 s
Binning 0.003 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34820, val loss: 8.13106, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34733, val loss: 8.13064, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34649, val loss: 8.13015, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34565, val loss: 8.12964, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34484, val loss: 8.12917, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34403, val loss: 8.12869, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34325, val loss: 8.12822, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34247, val loss: 8.12786, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34170, val loss: 8.12743, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34095, val loss: 8.12701, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30014, val loss: 8.10956, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29974, val loss: 8.10940, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29940, val loss: 8.10929, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29900, val loss: 8.10909, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29865, val loss: 8.10898, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29826, val loss: 8.10878, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29792, val loss: 8.10871, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29754, val loss: 8.10850, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29717, val loss: 8.10826, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29680, val loss: 8.10815, in 0.013s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.29642, val lo



Binning 0.023 GB of training data: 



0.381 s
Binning 0.003 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38647, val loss: 7.81538, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38559, val loss: 7.81472, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38472, val loss: 7.81415, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38387, val loss: 7.81352, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38304, val loss: 7.81297, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38222, val loss: 7.81239, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38142, val loss: 7.81187, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38063, val loss: 7.81143, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.37987, val loss: 7.81111, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.37910, val loss: 7.81071, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33719, val loss: 7.79010, in 0.011s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33677, val loss: 7.78994, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33638, val loss: 7.78983, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33598, val loss: 7.78941, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33558, val loss: 7.78928, in 0.011s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33518, val loss: 7.78911, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33476, val loss: 7.78880, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33437, val loss: 7.78869, in 0.011s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33399, val loss: 7.78840, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33359, val loss: 7.78827, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.33321, val lo



Binning 0.023 GB of training data: 



0.380 s
Binning 0.003 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46149, val loss: 7.24211, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46056, val loss: 7.24168, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45964, val loss: 7.24115, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45874, val loss: 7.24070, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45785, val loss: 7.24029, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45697, val loss: 7.23976, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45612, val loss: 7.23941, in 0.011s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45529, val loss: 7.23906, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45447, val loss: 7.23874, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45367, val loss: 7.23844, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41055, val loss: 7.22699, in 0.014s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41013, val loss: 7.22706, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40973, val loss: 7.22695, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40938, val loss: 7.22693, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40897, val loss: 7.22694, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40853, val loss: 7.22685, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40806, val loss: 7.22683, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40770, val loss: 7.22681, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40730, val loss: 7.22680, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40684, val loss: 7.22678, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40645, val lo



Binning 0.023 GB of training data: 



0.384 s
Binning 0.003 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51002, val loss: 6.62370, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50917, val loss: 6.62318, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50833, val loss: 6.62268, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50751, val loss: 6.62225, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50669, val loss: 6.62177, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50587, val loss: 6.62133, in 0.011s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50507, val loss: 6.62090, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50429, val loss: 6.62043, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50352, val loss: 6.62004, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50276, val loss: 6.61965, in 

[88/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45993, val loss: 6.60309, in 0.012s
[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45943, val loss: 6.60303, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45902, val loss: 6.60300, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45861, val loss: 6.60296, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45819, val loss: 6.60296, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45779, val loss: 6.60296, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45735, val loss: 6.60283, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45696, val loss: 6.60278, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45649, val loss: 6.60274, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45614, val loss: 6.60247, in 0.011s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45575, val lo



Binning 0.023 GB of training data: 



0.399 s
Binning 0.003 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43313, val loss: 6.54887, in 0.015s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43231, val loss: 6.54848, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43147, val loss: 6.54812, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43066, val loss: 6.54767, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42984, val loss: 6.54731, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42907, val loss: 6.54693, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42826, val loss: 6.54653, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42752, val loss: 6.54618, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42673, val loss: 6.54584, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42600, val loss: 6.54542, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38457, val loss: 6.52946, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38416, val loss: 6.52940, in 0.013s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38377, val loss: 6.52923, in 0.013s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38336, val loss: 6.52916, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38298, val loss: 6.52906, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38253, val loss: 6.52886, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38215, val loss: 6.52874, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38176, val loss: 6.52868, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38140, val loss: 6.52865, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38104, val loss: 6.52855, in 0.013s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38066, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.6s finished


In [21]:
from numpy import mean
import csv

def make_output(key):
    """Summarise the cross-validation scores stored under ``results[key]``.

    Args:
        key: Name of the experiment whose entry in the module-level
            ``results`` mapping is summarised.

    Returns:
        dict: A flat record holding the module-level ``target_id_column``,
        the experiment name, and the mean train/test R2 and RMSE over the
        stored score arrays. The stored RMSE scores are negated
        ("neg_root_mean_squared_error"), so their mean is sign-flipped to
        report a positive error.
    """
    cv_scores = results[key]
    return {
        "id_column": target_id_column,
        "type": key,
        "train_r2": mean(cv_scores["train_r2"]),
        "test_r2": mean(cv_scores["test_r2"]),
        "train_root_mean_squared_error": -mean(cv_scores["train_neg_root_mean_squared_error"]),
        "test_root_mean_squared_error": -mean(cv_scores["test_neg_root_mean_squared_error"]),
    }
def save(output, path=None):
    """Append one result record as a row of the results CSV.

    Args:
        output: Flat mapping of column name to value (as built by
            ``make_output``). Its key order defines the column order.
        path: Destination CSV file. When omitted, the notebook's original
            hard-coded results file is used.

    A header row is written first whenever the target file is new or empty,
    so the CSV is self-describing; files that already contain rows are only
    appended to.
    """
    if path is None:
        path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\results\node2vec_embeddings.csv"
    with open(path, 'a', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(output.keys()))
        # In append mode the stream is positioned at end-of-file on open, so
        # position 0 means the file has no content yet and needs a header.
        if csv_file.tell() == 0:
            writer.writeheader()
        writer.writerow(output)

In [22]:
# Show the averaged train/test metrics for each evaluated feature-set variant.
for _variant in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    print(make_output(_variant))

{'id_column': 'merchant_category_id', 'type': 'baseline', 'train_r2': 0.0001655841013616044, 'test_r2': 0.00013871442920936338, 'train_root_mean_squared_error': 3.8501609494819595, 'test_root_mean_squared_error': 3.85005310803518}
{'id_column': 'merchant_category_id', 'type': 'baseline_with_embeddings', 'train_r2': 0.004496752884389221, 'test_r2': 0.001508908360015293, 'train_root_mean_squared_error': 3.8418078020600257, 'test_root_mean_squared_error': 3.84741997509622}
{'id_column': 'merchant_category_id', 'type': 'baseline_with_id_features', 'train_r2': 0.0032413793418800685, 'test_r2': 0.00209346390784948, 'train_root_mean_squared_error': 3.8442337668305235, 'test_root_mean_squared_error': 3.846283973458732}
{'id_column': 'merchant_category_id', 'type': 'baseline_with_embeddings_and_id_features', 'train_r2': 0.006939490015696337, 'test_r2': 0.003562334385953947, 'train_root_mean_squared_error': 3.837096047525718, 'test_root_mean_squared_error': 3.8434533005164098}


### save

In [24]:
# Append one summary row per evaluated feature-set variant to the results CSV.
for _variant in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    save(make_output(_variant))