# ABOUT: 
- this code evaluates each of the generated node2vec embeddings as input features for a regression model
- findings: 
    - using card_id embeddings appears to cause overfitting
        
- details:       
    - i.e. compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5-fold cross-validated

In [1]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [2]:
# --- run configuration -------------------------------------------------------
# Number of embedding dimensions; later used to build the list of embedding
# column names (0 .. embedding_size - 1).
embedding_size = 8

# ID column whose node embeddings are evaluated in this run.
target_id_column = "city_id"

# Location of the saved embedding model that is loaded further down.
node2vec_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_city_id.zip"

### prepare data

In [3]:
# Read only the two ID columns this notebook needs: the card identifier and
# the ID column selected through `target_id_column`.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns = pd.read_csv(
    path,
    usecols=["card_id", target_id_column],
)
# Preview the first rows.
id_columns.head()

Unnamed: 0,card_id,city_id
0,C_ID_4e6213e9bc,city_id_88
1,C_ID_4e6213e9bc,city_id_88
2,C_ID_4e6213e9bc,city_id_88
3,C_ID_4e6213e9bc,city_id_88
4,C_ID_4e6213e9bc,city_id_88


In [4]:
# Per card_id, summarise the target ID column (whatever `target_id_column`
# names) with three features:
#   nunique_<col>            - number of distinct IDs seen for the card
#   count_<col>              - number of non-null rows for the card
#   nunique_count_frac_<col> - ratio of the two
id_features = (
    id_columns
    .groupby("card_id")
    .agg(["nunique", "count"])
    .reset_index()
)
# Flatten the two-level column index produced by the multi-function agg.
id_features.columns = [
    "card_id",
    f"nunique_{target_id_column}",
    f"count_{target_id_column}",
]
id_features[f"nunique_count_frac_{target_id_column}"] = (
    id_features[f"nunique_{target_id_column}"]
    .div(id_features[f"count_{target_id_column}"])
)
id_features

Unnamed: 0,card_id,nunique_city_id,count_city_id,nunique_count_frac_city_id
0,C_ID_00007093c1,5,151,0.033113
1,C_ID_0001238066,19,149,0.127517
2,C_ID_0001506ef0,3,68,0.044118
3,C_ID_0001793786,11,247,0.044534
4,C_ID_000183fdda,10,155,0.064516
...,...,...,...,...
325535,C_ID_ffff1d9928,3,16,0.187500
325536,C_ID_ffff579d3a,7,115,0.060870
325537,C_ID_ffff756266,2,25,0.080000
325538,C_ID_ffff828181,12,198,0.060606


In [5]:
# Read the training table, keeping just the join key (card_id), the regression
# label (target) and the one raw feature used by the baseline (feature_2).
# `train_path` is not defined in this notebook - presumably it comes from
# the wildcard `config` import.
train_file = pd.read_csv(
    train_path,
    usecols=["card_id", "target", "feature_2"],
)
train_file.head()

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [6]:
# Restore the previously trained embedding model from disk.
node2vec = nodevectors.GGVec.load(node2vec_path)

# Turn the model's key -> vector mapping into a table: one row per key, one
# column per embedding dimension. reset_index() moves the keys into a regular
# column named "index" so they can be used as a merge key later.
node2vec_embeddings = pd.DataFrame.from_dict(
    node2vec.model, orient="index"
).reset_index()
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7
0,C_ID_00007093c1,0.146330,-0.234454,0.009972,0.195243,0.003528,-0.218631,0.179014,-0.057203
1,C_ID_0001238066,-0.243911,-0.113360,-0.342723,0.234269,-0.041521,0.018436,-0.039582,0.178394
2,C_ID_0001506ef0,-0.177661,-0.035629,0.261848,-0.236009,0.106220,0.241713,-0.057614,-0.236450
3,C_ID_0001793786,0.126791,0.106247,0.294735,0.028365,-0.179646,-0.211818,0.190864,0.219684
4,C_ID_000183fdda,0.272787,-0.046990,-0.131139,-0.035169,0.217946,-0.223275,0.049937,-0.144918
...,...,...,...,...,...,...,...,...,...
325843,city_id_92,-0.089351,0.014399,0.085515,-0.099516,0.082379,0.099564,-0.030618,0.010404
325844,city_id_94,-0.000731,0.000320,0.000803,-0.001293,-0.000404,-0.000607,-0.001366,0.000437
325845,city_id_96,-0.001952,-0.000102,-0.000547,0.000076,0.000544,0.000828,-0.000258,-0.001114
325846,city_id_97,-0.000281,-0.000326,0.000565,0.000583,-0.002336,0.001084,-0.000623,-0.000802


In [7]:
# Build one embedding vector per card_id:
#   1. attach to every (card_id, <target id>) row the embedding of its target-ID
#      node (left join, so rows without a matching embedding get NaNs);
#   2. average the embedding dimensions over all rows of the same card.
node2vec_embeddings = id_columns.merge(
    node2vec_embeddings,
    how="left",
    left_on=target_id_column,
    right_on="index",
)
# "index" only duplicates the merge key, so it is no longer needed.
node2vec_embeddings = node2vec_embeddings.drop("index", axis=1)
# numeric_only=True: the frame can still contain the string-valued target ID
# column. Older pandas silently skipped it when averaging, but pandas >= 2.0
# raises TypeError on non-numeric columns unless told to ignore them.
node2vec_embeddings = (
    node2vec_embeddings
    .groupby("card_id")
    .mean(numeric_only=True)
    .reset_index()
)

In [8]:
# Display the per-card averaged embeddings (one row per card_id).
node2vec_embeddings

Unnamed: 0,card_id,0,1,2,3,4,5,6,7
0,C_ID_00007093c1,-0.001086,-0.000568,0.000211,-0.000070,-0.001097,0.000505,-0.001578,-0.000753
1,C_ID_0001238066,-0.001173,-0.000181,-0.000125,-0.000525,-0.000627,0.000172,-0.000779,-0.000930
2,C_ID_0001506ef0,-0.002752,-0.000721,0.001342,0.000701,-0.001617,0.000871,-0.001400,-0.000263
3,C_ID_0001793786,0.002905,-0.004503,0.000628,0.000239,0.000812,0.001749,-0.001212,0.000331
4,C_ID_000183fdda,-0.001930,-0.000990,-0.000296,-0.000287,-0.000792,0.002173,-0.000050,-0.000092
...,...,...,...,...,...,...,...,...,...
325535,C_ID_ffff1d9928,0.000457,0.001614,0.000011,0.000318,-0.001566,0.001710,0.000491,-0.001576
325536,C_ID_ffff579d3a,-0.000042,-0.000264,0.000143,0.000191,-0.001195,-0.000355,-0.000781,-0.000701
325537,C_ID_ffff756266,-0.001972,-0.001017,-0.000897,-0.000746,-0.000267,-0.000886,-0.001517,-0.000489
325538,C_ID_ffff828181,-0.001868,-0.000070,-0.000520,0.000010,0.000243,0.000638,-0.000323,-0.000988


In [9]:
# Left-join the per-card embedding vectors onto the training rows so that
# every training card is kept, even when it has no embedding.
dataset = pd.merge(
    train_file,
    node2vec_embeddings,
    how="left",
    on="card_id",
)
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7
0,C_ID_92a2005557,2,-0.820283,-0.001113,-0.000785,-0.000458,-0.000573,0.001213,0.000171,-0.001758,0.000555
1,C_ID_3d0044924f,1,0.392913,-0.001195,-0.000665,-0.000405,-0.000619,0.00105,0.000175,-0.001689,0.000469
2,C_ID_d639edf6cd,2,0.688056,-0.000662,-0.001357,-0.000292,-0.000161,-0.000763,0.000133,0.000339,-0.000884
3,C_ID_186d6a6901,3,0.142495,-0.000428,-0.001294,-0.000256,-0.000475,-0.000261,0.001616,0.000372,-6.2e-05
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000226,-0.001444,-0.000314,-0.000317,-0.00039,0.001629,0.000598,-0.000242


In [10]:
# Add the per-card nunique / count / ratio features, again as a left join on
# card_id so no training row is dropped.
dataset = pd.merge(
    dataset,
    id_features,
    how="left",
    on="card_id",
)
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,nunique_city_id,count_city_id,nunique_count_frac_city_id
0,C_ID_92a2005557,2,-0.820283,-0.001113,-0.000785,-0.000458,-0.000573,0.001213,0.000171,-0.001758,0.000555,9,283,0.031802
1,C_ID_3d0044924f,1,0.392913,-0.001195,-0.000665,-0.000405,-0.000619,0.00105,0.000175,-0.001689,0.000469,9,356,0.025281
2,C_ID_d639edf6cd,2,0.688056,-0.000662,-0.001357,-0.000292,-0.000161,-0.000763,0.000133,0.000339,-0.000884,5,44,0.113636
3,C_ID_186d6a6901,3,0.142495,-0.000428,-0.001294,-0.000256,-0.000475,-0.000261,0.001616,0.000372,-6.2e-05,7,84,0.083333
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000226,-0.001444,-0.000314,-0.000317,-0.00039,0.001629,0.000598,-0.000242,7,169,0.04142


In [11]:
# Display the assembled modelling table (train rows + embeddings + id features).
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,nunique_city_id,count_city_id,nunique_count_frac_city_id
0,C_ID_92a2005557,2,-0.820283,-0.001113,-0.000785,-0.000458,-0.000573,0.001213,0.000171,-0.001758,0.000555,9,283,0.031802
1,C_ID_3d0044924f,1,0.392913,-0.001195,-0.000665,-0.000405,-0.000619,0.001050,0.000175,-0.001689,0.000469,9,356,0.025281
2,C_ID_d639edf6cd,2,0.688056,-0.000662,-0.001357,-0.000292,-0.000161,-0.000763,0.000133,0.000339,-0.000884,5,44,0.113636
3,C_ID_186d6a6901,3,0.142495,-0.000428,-0.001294,-0.000256,-0.000475,-0.000261,0.001616,0.000372,-0.000062,7,84,0.083333
4,C_ID_cdbd2c0db2,3,-0.159749,-0.000226,-0.001444,-0.000314,-0.000317,-0.000390,0.001629,0.000598,-0.000242,7,169,0.041420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,-0.000794,-0.000956,0.000485,-0.000028,-0.000279,0.000809,-0.001207,-0.000338,5,47,0.106383
201913,C_ID_1314773c0b,1,0.312917,-0.000033,0.001951,-0.000019,-0.001803,-0.000634,0.000875,0.000648,0.000020,3,48,0.062500
201914,C_ID_7666735b3d,3,0.093494,-0.001109,-0.000773,-0.000683,0.000323,-0.002090,-0.000708,-0.001432,0.000694,10,90,0.111111
201915,C_ID_73f5a0efd0,2,-4.676589,-0.001156,-0.000807,-0.000494,-0.000605,0.001244,0.000190,-0.001775,0.000584,2,31,0.064516


### evaluate on baseline dataset 

In [12]:
# Column groups used to assemble the different feature sets below.
target_col = "target"

# The only raw feature; it is also the only one treated as categorical.
baseline_feature_names = ["feature_2"]
categorical_feature_names = ["feature_2"]

# Embedding dimensions are stored under integer column labels 0..embedding_size-1.
embedding_feature_names = [dim for dim in range(embedding_size)]

# Aggregated per-card statistics of the target ID column.
id_feature_feature_names = [
    f"{stat}_{target_id_column}"
    for stat in ("nunique", "count", "nunique_count_frac")
]

# Cross-validation results, keyed by experiment name.
results = {}

In [13]:
# Baseline experiment: the model only sees the columns in baseline_feature_names.
features = baseline_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# Estimator hyper-parameters. `categorical_features` is a boolean mask over
# X's columns flagging the ones listed in categorical_feature_names.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [14]:
# Cross-validate the baseline model and keep both train and test scores so
# that overfitting can be judged by comparing the two.
scores = cross_validate(estimator = model, 
                        X = X, 
                        y = y, 
                        cv=cv,
                        scoring=scoring,
                        # Use the shared `verbose` setting (as the other
                        # evaluation cells do) instead of a hard-coded level.
                        verbose = verbose,
                        return_train_score=True)
# Store under a descriptive key for later comparison between experiments.
results["baseline"] = scores

[CV] START .....................................................................
Binning 0.001 GB of training data: 0.002 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34904, val loss: 8.13150, in 0.007s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34900, val loss: 8.13149, in 0.006s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34896, val loss: 8.13148, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34892, val loss: 8.13147, in 0.006s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34888, val loss: 8.13147, in 0.006s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34884, val loss: 8.13146, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34880, val loss: 8.13146, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34877, val loss: 8.13145, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34873, val loss

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34821, val loss: 8.13141, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34819, val loss: 8.13141, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34816, val loss: 8.13141, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34814, val loss: 8.13141, in 0.006s
Fit 29 trees in 0.191 s, (87 total leaves)
Time spent computing histograms: 0.015s
Time spent finding best splits:  0.003s
Time spent applying splits:      0.024s
Time spent predicting:           0.005s
[CV] END  neg_root_mean_squared_error: (train=-3.854, test=-3.836) r2: (train=0.000, test=0.000) total time=   0.1s
[CV] START .....................................................................
Binning 0.001 GB of training data: 0.001 s
Binning 0.000 GB of validation data: 0.000 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.38733, val loss: 7.81605, in 0.007s
[2/100] 1 tree, 3 leaves, max

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.38682, val loss: 7.81603, in 0.006s
Fit 17 trees in 0.130 s, (51 total leaves)
Time spent computing histograms: 0.009s
Time spent finding best splits:  0.002s
Time spent applying splits:      0.012s
Time spent predicting:           0.003s
[CV] END  neg_root_mean_squared_error: (train=-3.855, test=-3.833) r2: (train=0.000, test=0.000) total time=   0.1s
[CV] START .....................................................................
Binning 0.001 GB of training data: 0.001 s
Binning 0.000 GB of validation data: 0.000 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46241, val loss: 7.24254, in 0.006s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46237, val loss: 7.24251, in 0.006s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46233, val loss: 7.24248, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46229, val loss: 7.24246, in 0.006s
[5/100] 1 tree, 3 leaves, max de

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.46162, val loss: 7.24204, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46159, val loss: 7.24202, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46157, val loss: 7.24201, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46154, val loss: 7.24199, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46152, val loss: 7.24198, in 0.005s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46150, val loss: 7.24197, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46147, val loss: 7.24195, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46145, val loss: 7.24194, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46143, val loss: 7.24193, in 0.006s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46141, val loss: 7.24192, in 0.006s
[35/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46139, val loss: 7.24191, in 0.00

[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51058, val loss: 6.62401, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51055, val loss: 6.62399, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51052, val loss: 6.62397, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51048, val loss: 6.62395, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51045, val loss: 6.62393, in 0.005s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51042, val loss: 6.62392, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51040, val loss: 6.62390, in 0.005s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51037, val loss: 6.62388, in 0.006s
[17/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51034, val loss: 6.62386, in 0.006s
[18/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51031, val loss: 6.62385, in 0.006s
[19/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51028, val loss: 6.62383,

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.3s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.51016, val loss: 6.62376, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51013, val loss: 6.62375, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51011, val loss: 6.62373, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51009, val loss: 6.62372, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51007, val loss: 6.62371, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51004, val loss: 6.62370, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51002, val loss: 6.62368, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51000, val loss: 6.62367, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50998, val loss: 6.62366, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50996, val loss: 6.62365, in 0.006s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50994, val loss: 6.62364, in 0.00

[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43374, val loss: 6.54893, in 0.007s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43371, val loss: 6.54888, in 0.007s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43369, val loss: 6.54884, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43366, val loss: 6.54879, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43364, val loss: 6.54875, in 0.007s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43361, val loss: 6.54870, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43358, val loss: 6.54866, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43356, val loss: 6.54862, in 0.007s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43354, val loss: 6.54858, in 0.007s
[17/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43351, val loss: 6.54854, in 0.007s
[18/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43349, val loss: 6.54850, 

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.3s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.43340, val loss: 6.54834, in 0.007s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43338, val loss: 6.54830, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43336, val loss: 6.54827, in 0.006s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43334, val loss: 6.54823, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43332, val loss: 6.54819, in 0.006s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43330, val loss: 6.54816, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43328, val loss: 6.54812, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43327, val loss: 6.54809, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43325, val loss: 6.54806, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43323, val loss: 6.54802, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43321, val loss: 6.54799, in 0.00

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s finished


### evaluate on baseline with embeddings

In [15]:
# Experiment: baseline feature(s) plus the per-card embedding dimensions.
features = baseline_feature_names + embedding_feature_names
# The embedding columns carry integer labels while the baseline column has a
# string label. scikit-learn (>= 1.2) refuses DataFrames whose column names mix
# types, so convert every label to str on this local copy only; `dataset`
# itself is left untouched.
X = dataset[features].rename(columns=str)
y = dataset[target_col]

# Cross-validation settings.
cv = 5
scoring=('r2', 'neg_root_mean_squared_error')
verbose = 1

# Estimator hyper-parameters. `categorical_features` is a boolean mask over
# X's columns flagging the ones listed in categorical_feature_names.
model_params = {
    "learning_rate":0.01,
    "max_iter":100,
    "categorical_features" : X.columns.isin(categorical_feature_names),
    "l2_regularization":0.005,
    "early_stopping":True,
    "n_iter_no_change":5,
    "verbose":1,
    "random_state":0,
    "max_depth":5,
    "max_leaf_nodes":20
}
model = HistGradientBoostingRegressor(**model_params)

In [16]:
# Cross-validate the embedding-augmented model; train scores are returned as
# well so they can be compared against the test scores. The result is kept in
# `scores` and also filed in `results` under the experiment name.
results["baseline_with_embeddings"] = scores = cross_validate(
    model,
    X,
    y,
    cv=cv,
    scoring=scoring,
    verbose=verbose,
    return_train_score=True,
)

Binning 0.010 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.183 s
Binning 0.001 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34879, val loss: 8.13138, in 0.015s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34850, val loss: 8.13126, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34822, val loss: 8.13113, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34794, val loss: 8.13102, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34766, val loss: 8.13092, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34739, val loss: 8.13085, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34710, val loss: 8.13072, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34683, val loss: 8.13063, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34655, val loss: 8.13050, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34629, val loss: 8.13038, in 



Binning 0.010 GB of training data: 



0.201 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38707, val loss: 7.81584, in 0.019s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38677, val loss: 7.81563, in 0.015s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38651, val loss: 7.81544, in 0.017s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38622, val loss: 7.81528, in 0.014s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38594, val loss: 7.81509, in 0.016s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38566, val loss: 7.81496, in 0.017s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38537, val loss: 7.81478, in 0.015s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38510, val loss: 7.81466, in 0.017s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38480, val loss: 7.81458, in 0.015s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38454, val loss: 7.81444, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36860, val loss: 7.80942, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36845, val loss: 7.80944, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36827, val loss: 7.80941, in 0.011s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36813, val loss: 7.80936, in 0.011s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36796, val loss: 7.80936, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36780, val loss: 7.80936, in 0.011s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36762, val loss: 7.80930, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36746, val loss: 7.80928, in 0.010s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36730, val loss: 7.80925, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36715, val loss: 7.80928, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36700, val lo



Binning 0.010 GB of training data: 



0.170 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46217, val loss: 7.24240, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46190, val loss: 7.24225, in 0.015s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46162, val loss: 7.24209, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46135, val loss: 7.24195, in 0.014s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46108, val loss: 7.24186, in 0.015s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46081, val loss: 7.24172, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46054, val loss: 7.24158, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46029, val loss: 7.24146, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46003, val loss: 7.24138, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45975, val loss: 7.24126, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44380, val loss: 7.23710, in 0.014s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44365, val loss: 7.23709, in 0.013s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44346, val loss: 7.23705, in 0.013s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44330, val loss: 7.23706, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44311, val loss: 7.23703, in 0.014s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44290, val loss: 7.23705, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44274, val loss: 7.23705, in 0.015s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44257, val loss: 7.23702, in 0.014s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44243, val loss: 7.23702, in 0.013s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44225, val loss: 7.23700, in 0.014s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44211, val lo



Binning 0.010 GB of training data: 



0.159 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51055, val loss: 6.62415, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51021, val loss: 6.62409, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50987, val loss: 6.62403, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50954, val loss: 6.62399, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50922, val loss: 6.62392, in 0.012s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50890, val loss: 6.62388, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50861, val loss: 6.62382, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50831, val loss: 6.62377, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50803, val loss: 6.62371, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50775, val loss: 6.62366, in 



Binning 0.010 GB of training data: 



0.160 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43367, val loss: 6.54929, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43337, val loss: 6.54926, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43307, val loss: 6.54923, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43279, val loss: 6.54922, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43250, val loss: 6.54920, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43223, val loss: 6.54918, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43195, val loss: 6.54918, in 0.012s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43168, val loss: 6.54915, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43143, val loss: 6.54914, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43116, val loss: 6.54910, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41415, val loss: 6.54727, in 0.012s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41398, val loss: 6.54727, in 0.011s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41382, val loss: 6.54728, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41366, val loss: 6.54728, in 0.011s
Fit 92 trees in 1.378 s, (1840 total leaves)
Time spent computing histograms: 0.201s
Time spent finding best splits:  0.064s
Time spent applying splits:      0.261s
Time spent predicting:           0.026s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.8s finished


### evaluate on baseline with id features

In [17]:
# Experiment: baseline feature(s) plus the aggregated per-card ID statistics.
features = baseline_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# Estimator hyper-parameters. `categorical_features` is a boolean mask over
# X's columns flagging the ones listed in categorical_feature_names.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [18]:
# Cross-validate the model that uses the ID-statistics features; train scores
# are returned alongside test scores. The result is kept in `scores` and also
# filed in `results` under the experiment name.
results["baseline_with_id_features"] = scores = cross_validate(
    model,
    X,
    y,
    cv=cv,
    scoring=scoring,
    verbose=verbose,
    return_train_score=True,
)

Binning 0.005 GB of training data: 0.038 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34860, val loss: 8.13119, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34813, val loss: 8.13089, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34767, val loss: 8.13057, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34721, val loss: 8.13028, in 0.012s
[5/100] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 20 leaves, max depth = 5, train loss: 7.34677, val loss: 8.12998, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34633, val loss: 8.12970, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34591, val loss: 8.12942, in 0.014s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34548, val loss: 8.12918, in 0.014s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34507, val loss: 8.12893, in 0.011s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34467, val loss: 8.12869, in 0.014s
[11/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34427, val loss: 8.12844, in 0.013s
[12/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34387, val loss: 8.12823, in 0.012s
[13/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34349, val loss: 8.12799, in 0.012s
[14/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34312, val loss: 8.12776, in 0.012s
[15/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34274, val loss: 8.12758, 

[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32586, val loss: 8.12154, in 0.010s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32575, val loss: 8.12152, in 0.011s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32563, val loss: 8.12145, in 0.013s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32552, val loss: 8.12141, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32541, val loss: 8.12139, in 0.013s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32530, val loss: 8.12135, in 0.011s
[100/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32519, val loss: 8.12128, in 0.012s
Fit 100 trees in 1.243 s, (2000 total leaves)
Time spent computing histograms: 0.162s
Time spent finding best splits:  0.059s
Time spent applying splits:      0.294s
Time spent predicting:           0.030s
Binning 0.005 GB of training data: 0.048 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 le

[79/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36609, val loss: 7.80665, in 0.011s
[80/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36593, val loss: 7.80661, in 0.011s
[81/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36581, val loss: 7.80661, in 0.010s
[82/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36568, val loss: 7.80659, in 0.011s
[83/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36554, val loss: 7.80657, in 0.012s
[84/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36541, val loss: 7.80653, in 0.009s
[85/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36528, val loss: 7.80647, in 0.011s
[86/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36513, val loss: 7.80641, in 0.011s
[87/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36500, val loss: 7.80637, in 0.012s
[88/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36488, val loss: 7.80633, in 0.010s
[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36475, val lo

[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44074, val loss: 7.23460, in 0.011s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44056, val loss: 7.23458, in 0.011s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44040, val loss: 7.23456, in 0.010s
[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44023, val loss: 7.23453, in 0.011s
[71/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44003, val loss: 7.23448, in 0.011s
[72/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43987, val loss: 7.23446, in 0.010s
[73/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43970, val loss: 7.23443, in 0.011s
[74/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43951, val loss: 7.23439, in 0.011s
[75/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43935, val loss: 7.23435, in 0.011s
[76/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43917, val loss: 7.23432, in 0.011s
[77/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43900, val lo

[58/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49310, val loss: 6.61532, in 0.011s
[59/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49291, val loss: 6.61526, in 0.011s
[60/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49273, val loss: 6.61520, in 0.010s
[61/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49255, val loss: 6.61514, in 0.011s
[62/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49236, val loss: 6.61508, in 0.011s
[63/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49218, val loss: 6.61502, in 0.011s
[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49201, val loss: 6.61500, in 0.011s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49183, val loss: 6.61495, in 0.011s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49166, val loss: 6.61490, in 0.011s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49149, val loss: 6.61487, in 0.011s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49132, val lo

[45/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42028, val loss: 6.53950, in 0.010s
[46/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42008, val loss: 6.53936, in 0.010s
[47/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41986, val loss: 6.53922, in 0.010s
[48/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41965, val loss: 6.53910, in 0.010s
[49/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41945, val loss: 6.53899, in 0.010s
[50/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41925, val loss: 6.53885, in 0.009s
[51/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41906, val loss: 6.53874, in 0.010s
[52/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41886, val loss: 6.53865, in 0.009s
[53/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41866, val loss: 6.53853, in 0.010s
[54/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41848, val loss: 6.53843, in 0.010s
[55/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41829, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.0s finished


### evaluate on baseline with embeddings and id features

In [19]:
# Columns for this run: the baseline, embedding and id-feature name lists
# concatenated; the target column is read from the same frame.
features = baseline_feature_names + embedding_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Settings consumed by the cross_validate call in the next cell.
cv = 5
verbose = 1
scoring = ('r2', 'neg_root_mean_squared_error')

# Hyper-parameters for the histogram gradient boosting regressor.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # One flag per column of X: is the column named in categorical_feature_names?
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [20]:
# Cross-validate the configured model on X/y, keeping the train-side scores
# as well, then file the raw score dict under this experiment's key.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    verbose=verbose,
    return_train_score=True,
)
results["baseline_with_embeddings_and_id_features"] = scores

Binning 0.014 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.232 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34845, val loss: 8.13118, in 0.033s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34783, val loss: 8.13086, in 0.021s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34721, val loss: 8.13052, in 0.015s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34660, val loss: 8.13019, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34599, val loss: 8.12989, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34541, val loss: 8.12956, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34482, val loss: 8.12928, in 0.014s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34427, val loss: 8.12896, in 0.014s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34370, val loss: 8.12869, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34316, val loss: 8.12839, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31659, val loss: 8.11583, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31637, val loss: 8.11577, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31610, val loss: 8.11575, in 0.013s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31587, val loss: 8.11562, in 0.013s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31566, val loss: 8.11555, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31541, val loss: 8.11556, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31520, val loss: 8.11545, in 0.014s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31498, val loss: 8.11541, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31476, val loss: 8.11529, in 0.014s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31452, val loss: 8.11522, in 0.014s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31429, val lo



Binning 0.014 GB of training data: 



0.210 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38676, val loss: 7.81570, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38616, val loss: 7.81535, in 0.014s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38557, val loss: 7.81501, in 0.014s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38499, val loss: 7.81464, in 0.014s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38441, val loss: 7.81431, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38385, val loss: 7.81396, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38331, val loss: 7.81365, in 0.014s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38277, val loss: 7.81328, in 0.014s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38225, val loss: 7.81299, in 0.014s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38173, val loss: 7.81264, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35444, val loss: 7.79947, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35422, val loss: 7.79941, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35400, val loss: 7.79929, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35376, val loss: 7.79921, in 0.013s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35352, val loss: 7.79916, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35332, val loss: 7.79910, in 0.013s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35310, val loss: 7.79899, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35287, val loss: 7.79888, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35267, val loss: 7.79879, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35240, val loss: 7.79869, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.35221, val lo



Binning 0.014 GB of training data: 



0.198 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46178, val loss: 7.24217, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46112, val loss: 7.24179, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46047, val loss: 7.24141, in 0.011s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45984, val loss: 7.24105, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45922, val loss: 7.24070, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45861, val loss: 7.24036, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45802, val loss: 7.24002, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45744, val loss: 7.23969, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45686, val loss: 7.23937, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45630, val loss: 7.23911, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42769, val loss: 7.22987, in 0.014s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42747, val loss: 7.22979, in 0.013s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42720, val loss: 7.22977, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42697, val loss: 7.22972, in 0.013s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42671, val loss: 7.22967, in 0.012s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42648, val loss: 7.22961, in 0.014s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42623, val loss: 7.22959, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42602, val loss: 7.22954, in 0.013s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42573, val loss: 7.22954, in 0.014s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42550, val loss: 7.22947, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42526, val lo



Binning 0.014 GB of training data: 



0.224 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51029, val loss: 6.62373, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50968, val loss: 6.62338, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50910, val loss: 6.62306, in 0.015s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50851, val loss: 6.62262, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50794, val loss: 6.62229, in 0.015s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50738, val loss: 6.62194, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50682, val loss: 6.62161, in 0.015s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50627, val loss: 6.62121, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50575, val loss: 6.62092, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50521, val loss: 6.62053, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47832, val loss: 6.60984, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47810, val loss: 6.60984, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47786, val loss: 6.60976, in 0.013s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47763, val loss: 6.60969, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47739, val loss: 6.60957, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47717, val loss: 6.60952, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47692, val loss: 6.60947, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47671, val loss: 6.60945, in 0.013s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47648, val loss: 6.60940, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47624, val loss: 6.60935, in 0.013s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47602, val lo



Binning 0.014 GB of training data: 



0.197 s
Binning 0.002 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43341, val loss: 6.54895, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43285, val loss: 6.54857, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43230, val loss: 6.54826, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43175, val loss: 6.54790, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43122, val loss: 6.54760, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43070, val loss: 6.54730, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43018, val loss: 6.54697, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42968, val loss: 6.54667, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42919, val loss: 6.54639, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42868, val loss: 6.54605, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40323, val loss: 6.53318, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40301, val loss: 6.53311, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40277, val loss: 6.53305, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40255, val loss: 6.53302, in 0.012s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40232, val loss: 6.53294, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40210, val loss: 6.53288, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40186, val loss: 6.53288, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40163, val loss: 6.53284, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40141, val loss: 6.53276, in 0.012s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40118, val loss: 6.53276, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.40097, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.2s finished


In [21]:
from numpy import mean
import csv

def make_output(key):
    """Build a one-row summary of the scores stored under ``results[key]``.

    Each of the four score sequences (train/test r2 and train/test negated
    RMSE) is reduced to its mean; the RMSE means are multiplied by -1 so the
    reported values are positive. The row also records the module-level
    ``target_id_column`` and ``embedding_size``.

    Args:
        key: Name of the experiment whose scores were stored in ``results``.

    Returns:
        A dict whose keys, in insertion order, are ``id_column``, ``type``,
        ``train_r2``, ``test_r2``, ``train_root_mean_squared_error``,
        ``test_root_mean_squared_error`` and ``embedding_dimension``.
    """
    fold_scores = results[key]
    return {
        "id_column": target_id_column,
        "type": key,
        "train_r2": mean(fold_scores["train_r2"]),
        "test_r2": mean(fold_scores["test_r2"]),
        "train_root_mean_squared_error": -1 * mean(
            fold_scores["train_neg_root_mean_squared_error"]
        ),
        "test_root_mean_squared_error": -1 * mean(
            fold_scores["test_neg_root_mean_squared_error"]
        ),
        "embedding_dimension": embedding_size,
    }
def save(output, path=None):
    """Append one result row to a CSV file.

    The row is written without a header line; the column order is the key
    order of ``output``. The file is opened in append mode, so existing
    content is kept.

    Args:
        output: Mapping of column name to value, e.g. from ``make_output``.
        path: Destination CSV file. When omitted, the project's
            ``node2vec_embeddings.csv`` results file is used, so existing
            ``save(output)`` calls behave as before.
    """
    if path is None:
        path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\results\node2vec_embeddings.csv"
    # newline='' lets the csv module control line endings itself.
    with open(path, 'a', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(output.keys()))
        writer.writerow(output)

In [22]:
# Print the summary row of every experiment, in the order they were run.
for result_key in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    print(make_output(result_key))

{'id_column': 'city_id', 'type': 'baseline', 'train_r2': 0.0001655841013616044, 'test_r2': 0.00013871442920936338, 'train_root_mean_squared_error': 3.8501609494819595, 'test_root_mean_squared_error': 3.85005310803518, 'embedding_dimension': 8}
{'id_column': 'city_id', 'type': 'baseline_with_embeddings', 'train_r2': 0.002235665534918074, 'test_r2': 0.000721306088301743, 'train_root_mean_squared_error': 3.8461733695027363, 'test_root_mean_squared_error': 3.8489315463932634, 'embedding_dimension': 8}
{'id_column': 'city_id', 'type': 'baseline_with_id_features', 'train_r2': 0.003026003008263678, 'test_r2': 0.0020015141675692183, 'train_root_mean_squared_error': 3.844649062660546, 'test_root_mean_squared_error': 3.8464626448809325, 'embedding_dimension': 8}
{'id_column': 'city_id', 'type': 'baseline_with_embeddings_and_id_features', 'train_r2': 0.004486630853329765, 'test_r2': 0.0025666350139227577, 'train_root_mean_squared_error': 3.8418316272748774, 'test_root_mean_squared_error': 3.84537

### save

In [23]:
# Append the summary row of every experiment to the results CSV, in run order.
for result_key in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    save(make_output(result_key))