# ABOUT: 
- this code evaluates each of the generated node2vec embeddings as input features for a regression model
- findings: 
    - using the card_id embeddings appears to cause overfitting
        
- details:       
    - i.e., compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5-fold cross-validated (the cells below use cv = 5)

In [1]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [2]:
# Id column whose embeddings are evaluated in this run.
target_id_column = "city_id"
# Path to the saved embedding model. The file name is derived from
# `target_id_column` so that switching the id column cannot silently leave the
# notebook pointing at another column's embeddings.
# NOTE(review): assumes every saved model follows the
# "node2vec_card_id_<id column>.zip" naming seen here - confirm for other ids.
node2vec_path = rf"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_{target_id_column}.zip"
# Number of embedding dimensions; the embedding column labels 0..embedding_size-1
# are built from this value further down.
embedding_size = 16

### prepare data

In [4]:
# Read only the card key and the id column under evaluation from the processed
# id table; a card_id can appear on many rows of this file.
wanted_columns = ["card_id", target_id_column]
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns = pd.read_csv(path, usecols=wanted_columns)
id_columns.head()

Unnamed: 0,card_id,city_id
0,C_ID_4e6213e9bc,city_id_88
1,C_ID_4e6213e9bc,city_id_88
2,C_ID_4e6213e9bc,city_id_88
3,C_ID_4e6213e9bc,city_id_88
4,C_ID_4e6213e9bc,city_id_88


In [5]:
# Per-card summary of the evaluated id column:
#   nunique_<id>             number of distinct ids seen for the card
#   count_<id>               number of non-null id values for the card
#   nunique_count_frac_<id>  ratio of the two
nunique_col = f"nunique_{target_id_column}"
count_col = f"count_{target_id_column}"
frac_col = f"nunique_count_frac_{target_id_column}"

id_features = id_columns.groupby("card_id").agg(["nunique", "count"]).reset_index()
# agg() with a list yields two-level column labels; replace them with flat names.
id_features.columns = ["card_id", nunique_col, count_col]
id_features[frac_col] = id_features[nunique_col] / id_features[count_col]
id_features

Unnamed: 0,card_id,nunique_city_id,count_city_id,nunique_count_frac_city_id
0,C_ID_00007093c1,5,151,0.033113
1,C_ID_0001238066,19,149,0.127517
2,C_ID_0001506ef0,3,68,0.044118
3,C_ID_0001793786,11,247,0.044534
4,C_ID_000183fdda,10,155,0.064516
...,...,...,...,...
325535,C_ID_ffff1d9928,3,16,0.187500
325536,C_ID_ffff579d3a,7,115,0.060870
325537,C_ID_ffff756266,2,25,0.080000
325538,C_ID_ffff828181,12,198,0.060606


In [6]:
# Read the training table, restricted to the card key, the regression target
# and the single baseline feature.
# `train_path` is not defined in this notebook - presumably it comes from
# `from config import *`.
train_columns = ["card_id", "target", "feature_2"]
train_file = pd.read_csv(train_path, usecols=train_columns)
train_file.head()

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [7]:
# Restore the trained embedding model from disk (loaded through nodevectors' GGVec class).
node2vec = nodevectors.GGVec.load(node2vec_path)
# Turn the model's key -> vector mapping into a frame with one row per key;
# reset_index() moves the keys into a regular column named "index".
node2vec_embeddings = pd.DataFrame.from_dict(node2vec.model, orient="index").reset_index()
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,C_ID_00007093c1,0.262889,-0.090986,-0.208524,0.071666,-3.704968e-02,-0.252409,0.091880,0.226308,-0.173623,-0.168148,-0.174751,0.115880,-0.202436,0.078944,0.101334,0.125913
1,C_ID_0001238066,0.044111,0.236128,-0.319704,-0.247515,5.264565e-02,-0.201891,-0.167338,-0.124595,0.007986,-0.328404,0.160148,-0.201556,0.153158,0.005402,-0.307129,-0.114980
2,C_ID_0001506ef0,-0.006761,-0.064617,-0.050889,-0.294524,-2.481558e-01,0.193569,0.120273,-0.057487,-0.283293,0.044175,-0.090897,0.185546,0.061776,-0.124701,0.110127,-0.043696
3,C_ID_0001793786,0.222008,-0.100929,-0.173089,-0.083545,-1.147595e-01,-0.051850,-0.201038,-0.024629,-0.077186,0.019440,0.184594,0.091809,-0.187350,-0.190229,-0.129124,-0.152603
4,C_ID_000183fdda,0.208515,0.216167,0.095760,-0.137067,-2.688479e-02,-0.152185,0.193984,0.188744,0.029499,-0.055846,-0.230720,0.149671,-0.146845,0.185582,-0.074875,0.272160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325843,city_id_92,-0.301738,0.224066,0.260797,0.150156,-1.642484e-01,-0.056709,-0.079886,0.320980,-0.041497,0.299164,-0.357450,0.198527,-0.037694,0.190856,0.147655,0.022618
325844,city_id_94,-0.000830,0.002314,0.000959,0.000007,1.409916e-04,0.000507,-0.001626,-0.001562,-0.002271,0.002103,-0.000302,-0.000545,-0.001087,-0.001036,-0.000559,0.000736
325845,city_id_96,-0.002125,-0.000591,-0.002209,-0.000681,4.664234e-04,-0.003131,0.001142,0.000066,0.000104,0.000447,-0.001175,-0.001785,0.001070,-0.001123,0.000967,-0.000926
325846,city_id_97,0.002137,-0.000901,-0.000328,0.000011,-7.168526e-04,0.000594,0.001206,0.001520,-0.000930,0.000172,0.003130,0.000266,0.000671,-0.000589,0.000063,-0.001141


In [8]:
# Attach the id embedding (e.g. the city_id embedding) to every row of
# `id_columns`, then average the embedding dimensions per "card_id".
# Because the mean runs over the rows of `id_columns`, an id that occurs more
# often for a card contributes proportionally more to that card's vector.
node2vec_embeddings = id_columns.merge(
    node2vec_embeddings,
    how="left",
    left_on=target_id_column,
    right_on="index",
)
# Drop both key columns before averaging: they hold strings, and
# groupby(...).mean() raises TypeError on non-numeric columns in pandas >= 2.0
# (older versions silently discarded them, which is what this cell relied on).
node2vec_embeddings = node2vec_embeddings.drop(["index", target_id_column], axis=1)
node2vec_embeddings = node2vec_embeddings.groupby("card_id").mean().reset_index()

In [9]:
# Display the embeddings after they have been averaged per card_id.
node2vec_embeddings

Unnamed: 0,card_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,C_ID_00007093c1,-0.001148,0.000627,-0.000708,-0.000829,0.001012,-0.000449,-0.002069,-0.000810,0.000246,-0.000350,-0.000958,0.000566,0.001182,-0.000994,0.000552,-0.001277
1,C_ID_0001238066,-0.001137,-0.000357,-0.000118,-0.000404,0.000889,0.000013,0.000453,-0.000494,0.000050,0.001225,-0.001681,-0.000373,0.000826,-0.000186,0.000896,-0.000561
2,C_ID_0001506ef0,-0.001713,0.003252,0.000660,-0.002499,0.001556,-0.001718,0.000457,-0.001280,-0.001586,-0.000263,0.000048,-0.002230,0.002771,-0.002494,0.002155,0.002446
3,C_ID_0001793786,-0.001045,0.001256,0.004397,0.001017,-0.000360,0.003427,0.001706,0.004984,0.001384,-0.000465,0.006301,-0.000493,-0.002375,-0.006762,-0.001945,-0.000945
4,C_ID_000183fdda,0.001363,-0.001268,0.000978,-0.000053,-0.000364,-0.000157,-0.000019,0.000147,-0.001225,0.001395,0.002107,-0.000280,-0.001334,-0.001224,0.000636,-0.001403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325535,C_ID_ffff1d9928,-0.001141,-0.001399,-0.000325,-0.000155,0.000255,-0.003344,0.000506,0.002431,-0.000412,0.000810,0.001229,0.001828,0.001392,-0.002580,0.004029,-0.000418
325536,C_ID_ffff579d3a,-0.001739,0.000469,-0.000030,-0.001987,0.000404,-0.000103,-0.000713,0.000494,-0.000537,0.000936,-0.000945,-0.000439,0.001445,0.000616,-0.001039,-0.000869
325537,C_ID_ffff756266,0.000207,-0.000131,0.000859,-0.000584,0.000059,0.000145,0.000888,-0.000150,-0.000086,0.000774,-0.002141,0.000209,0.002199,-0.000166,0.001070,0.001656
325538,C_ID_ffff828181,-0.001845,-0.000593,-0.001706,-0.000660,0.000561,-0.002523,0.000844,-0.000072,0.000180,0.000612,-0.001164,-0.001304,0.001078,-0.000795,0.001017,-0.000824


In [10]:
# Left-join the per-card embeddings onto the training rows, so every training
# row is kept even when no embedding row matches its card_id.
dataset = pd.merge(train_file, node2vec_embeddings, how="left", on="card_id")
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,C_ID_92a2005557,2,-0.820283,-0.002389,-0.000725,2.2e-05,-0.001412,0.001217,-0.00023,-0.000949,0.001049,0.001529,0.000368,-0.003651,-0.001193,0.001162,-2.6e-05,-0.001938,-0.001043
1,C_ID_3d0044924f,1,0.392913,-0.002044,-0.000708,0.000102,-0.001331,0.001206,-0.000324,-0.000836,0.000665,0.001531,0.000388,-0.003404,-0.000916,0.001184,-9e-05,-0.001431,-0.001084
2,C_ID_d639edf6cd,2,0.688056,-0.001825,-0.000156,0.001948,-0.001347,0.002212,-0.00119,0.000361,0.000102,0.000795,0.002661,-0.001016,0.001574,0.001283,-0.000386,-0.002404,-0.002029
3,C_ID_186d6a6901,3,0.142495,0.000229,0.000519,-0.000359,-0.000795,0.000131,-0.002327,0.000817,0.000164,-0.00024,0.000336,5.7e-05,0.001165,0.000753,-0.000759,0.001515,-0.001457
4,C_ID_cdbd2c0db2,3,-0.159749,0.000143,0.000642,-0.000577,-0.000801,0.000202,-0.002364,0.000875,0.0002,-0.000445,0.000303,0.000413,0.001188,0.000624,-0.000854,0.001588,-0.001365


In [11]:
# Left-join the per-card nunique / count / ratio features onto the dataset.
dataset = pd.merge(dataset, id_features, how="left", on="card_id")
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,9,10,11,12,13,14,15,nunique_city_id,count_city_id,nunique_count_frac_city_id
0,C_ID_92a2005557,2,-0.820283,-0.002389,-0.000725,2.2e-05,-0.001412,0.001217,-0.00023,-0.000949,...,0.000368,-0.003651,-0.001193,0.001162,-2.6e-05,-0.001938,-0.001043,9,283,0.031802
1,C_ID_3d0044924f,1,0.392913,-0.002044,-0.000708,0.000102,-0.001331,0.001206,-0.000324,-0.000836,...,0.000388,-0.003404,-0.000916,0.001184,-9e-05,-0.001431,-0.001084,9,356,0.025281
2,C_ID_d639edf6cd,2,0.688056,-0.001825,-0.000156,0.001948,-0.001347,0.002212,-0.00119,0.000361,...,0.002661,-0.001016,0.001574,0.001283,-0.000386,-0.002404,-0.002029,5,44,0.113636
3,C_ID_186d6a6901,3,0.142495,0.000229,0.000519,-0.000359,-0.000795,0.000131,-0.002327,0.000817,...,0.000336,5.7e-05,0.001165,0.000753,-0.000759,0.001515,-0.001457,7,84,0.083333
4,C_ID_cdbd2c0db2,3,-0.159749,0.000143,0.000642,-0.000577,-0.000801,0.000202,-0.002364,0.000875,...,0.000303,0.000413,0.001188,0.000624,-0.000854,0.001588,-0.001365,7,169,0.04142


In [12]:
# Display the assembled dataset (train columns, embedding dimensions, id features).
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,9,10,11,12,13,14,15,nunique_city_id,count_city_id,nunique_count_frac_city_id
0,C_ID_92a2005557,2,-0.820283,-0.002389,-0.000725,0.000022,-0.001412,0.001217,-0.000230,-0.000949,...,0.000368,-0.003651,-0.001193,0.001162,-0.000026,-0.001938,-0.001043,9,283,0.031802
1,C_ID_3d0044924f,1,0.392913,-0.002044,-0.000708,0.000102,-0.001331,0.001206,-0.000324,-0.000836,...,0.000388,-0.003404,-0.000916,0.001184,-0.000090,-0.001431,-0.001084,9,356,0.025281
2,C_ID_d639edf6cd,2,0.688056,-0.001825,-0.000156,0.001948,-0.001347,0.002212,-0.001190,0.000361,...,0.002661,-0.001016,0.001574,0.001283,-0.000386,-0.002404,-0.002029,5,44,0.113636
3,C_ID_186d6a6901,3,0.142495,0.000229,0.000519,-0.000359,-0.000795,0.000131,-0.002327,0.000817,...,0.000336,0.000057,0.001165,0.000753,-0.000759,0.001515,-0.001457,7,84,0.083333
4,C_ID_cdbd2c0db2,3,-0.159749,0.000143,0.000642,-0.000577,-0.000801,0.000202,-0.002364,0.000875,...,0.000303,0.000413,0.001188,0.000624,-0.000854,0.001588,-0.001365,7,169,0.041420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,-0.002142,0.000060,-0.000399,0.000126,0.000409,-0.000077,0.000281,...,0.000716,-0.000480,0.000450,0.000891,-0.000723,-0.001617,-0.000591,5,47,0.106383
201913,C_ID_1314773c0b,1,0.312917,0.000860,-0.000166,-0.001173,-0.001086,0.000101,-0.001121,0.000367,...,0.001985,-0.000625,0.000505,0.000741,-0.000507,0.001102,-0.000619,3,48,0.062500
201914,C_ID_7666735b3d,3,0.093494,-0.001203,0.000879,-0.000229,-0.002118,0.000827,-0.001012,-0.000174,...,0.000905,-0.002072,0.001104,-0.000817,-0.000714,-0.000883,-0.000686,10,90,0.111111
201915,C_ID_73f5a0efd0,2,-4.676589,-0.002343,-0.000823,0.000026,-0.001454,0.001174,-0.000277,-0.001002,...,0.000334,-0.003662,-0.001266,0.001223,-0.000058,-0.002012,-0.000995,2,31,0.064516


### evaluate on baseline dataset 

In [13]:
# Column groups used to assemble the feature sets of the experiments below.
target_col = "target"
baseline_feature_names = ["feature_2"]
# Columns the model should treat as categorical rather than numeric.
categorical_feature_names = ["feature_2"]
# Embedding columns are labelled with the integers 0 .. embedding_size - 1.
embedding_feature_names = [*range(embedding_size)]
id_feature_feature_names = [
    f"{prefix}_{target_id_column}"
    for prefix in ("nunique", "count", "nunique_count_frac")
]
# Cross-validation scores of each experiment, keyed by experiment name.
results = {}

In [14]:
# Baseline experiment: feature_2 is the only predictor.
features = baseline_feature_names
X = dataset[features]
y = dataset[target_col]
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # Boolean mask over X's columns marking the categorical ones.
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [15]:
# Cross-validate the baseline model, keeping train scores as well so the
# train/validation gap can be inspected. Note this call uses verbosity 10
# rather than the `verbose` variable.
scores = cross_validate(
    model, X, y,
    cv=cv, scoring=scoring, verbose=10, return_train_score=True,
)
results["baseline"] = scores

[CV] START .....................................................................
Binning 0.001 GB of training data: 0.004 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34904, val loss: 8.13150, in 0.014s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34900, val loss: 8.13149, in 0.007s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34896, val loss: 8.13148, in 0.007s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34892, val loss: 8.13147, in 0.007s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34888, val loss: 8.13147, in 0.008s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34884, val loss: 8.13146, in 0.008s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34880, val loss: 8.13146, in 0.007s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34877, val loss: 8.13145, in 0.007s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34873, val loss

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34841, val loss: 8.13141, in 0.008s
[20/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34838, val loss: 8.13141, in 0.007s
[21/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34835, val loss: 8.13141, in 0.006s
[22/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34832, val loss: 8.13141, in 0.007s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34829, val loss: 8.13141, in 0.006s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34827, val loss: 8.13141, in 0.007s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34824, val loss: 8.13141, in 0.007s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34821, val loss: 8.13141, in 0.007s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34819, val loss: 8.13141, in 0.007s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34816, val loss: 8.13141, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34814, val loss: 8.13141, in 0.00

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] END  neg_root_mean_squared_error: (train=-3.855, test=-3.833) r2: (train=0.000, test=0.000) total time=   0.1s
[CV] START .....................................................................
Binning 0.001 GB of training data: 0.003 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46241, val loss: 7.24254, in 0.008s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46237, val loss: 7.24251, in 0.008s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46233, val loss: 7.24248, in 0.009s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46229, val loss: 7.24246, in 0.008s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46225, val loss: 7.24243, in 0.009s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46221, val loss: 7.24241, in 0.008s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46217, val loss: 7.24238, in 0.008s
[8/100] 1 tree, 3 leaves, max depth = 2, trai

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.46184, val loss: 7.24217, in 0.011s
[18/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46181, val loss: 7.24215, in 0.010s
[19/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46178, val loss: 7.24213, in 0.009s
[20/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46175, val loss: 7.24212, in 0.010s
[21/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46173, val loss: 7.24210, in 0.010s
[22/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46170, val loss: 7.24208, in 0.009s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46167, val loss: 7.24207, in 0.011s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46164, val loss: 7.24205, in 0.009s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46162, val loss: 7.24204, in 0.010s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46159, val loss: 7.24202, in 0.010s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46157, val loss: 7.24201, in 0.00

Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51086, val loss: 6.62419, in 0.006s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51082, val loss: 6.62417, in 0.007s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51079, val loss: 6.62414, in 0.006s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51075, val loss: 6.62412, in 0.006s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51072, val loss: 6.62410, in 0.005s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51068, val loss: 6.62408, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51065, val loss: 6.62405, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51061, val loss: 6.62403, in 0.006s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51058, val loss: 6.62401, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51055, val loss: 6.62399, in 0.007s
[11/100] 1 

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.7s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.51021, val loss: 6.62379, in 0.006s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51018, val loss: 6.62377, in 0.007s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51016, val loss: 6.62376, in 0.007s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51013, val loss: 6.62375, in 0.007s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51011, val loss: 6.62373, in 0.007s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51009, val loss: 6.62372, in 0.007s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51007, val loss: 6.62371, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51004, val loss: 6.62370, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51002, val loss: 6.62368, in 0.007s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51000, val loss: 6.62367, in 0.007s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50998, val loss: 6.62366, in 0.00

[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43380, val loss: 6.54903, in 0.006s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43377, val loss: 6.54898, in 0.006s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43374, val loss: 6.54893, in 0.005s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43371, val loss: 6.54888, in 0.006s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43369, val loss: 6.54884, in 0.006s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43366, val loss: 6.54879, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43364, val loss: 6.54875, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43361, val loss: 6.54870, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43358, val loss: 6.54866, in 0.005s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43356, val loss: 6.54862, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43354, val loss: 6.54858, in

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.7s remaining:    0.0s


1 tree, 3 leaves, max depth = 2, train loss: 7.43336, val loss: 6.54827, in 0.007s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43334, val loss: 6.54823, in 0.006s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43332, val loss: 6.54819, in 0.005s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43330, val loss: 6.54816, in 0.006s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43328, val loss: 6.54812, in 0.006s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43327, val loss: 6.54809, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43325, val loss: 6.54806, in 0.006s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43323, val loss: 6.54802, in 0.005s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43321, val loss: 6.54799, in 0.006s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43319, val loss: 6.54796, in 0.005s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43318, val loss: 6.54793, in 0.00

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.7s finished


### evaluate on baseline with embeddings

In [16]:
# Experiment: baseline feature plus the per-card embedding dimensions.
features = baseline_feature_names + embedding_feature_names
# The embedding columns carry int labels (0, 1, ...) while "feature_2" is a
# str. scikit-learn >= 1.2 raises TypeError when a DataFrame's column labels
# mix types, so cast every label to str for the model input. rename() returns
# a new frame; `dataset` itself keeps its original labels.
X = dataset[features].rename(columns=str)
y = dataset[target_col]
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1
model_params = {
    "learning_rate": 0.01,
    "max_iter": 100,
    # Boolean mask over X's columns marking the categorical ones.
    "categorical_features": X.columns.isin(categorical_feature_names),
    "l2_regularization": 0.005,
    "early_stopping": True,
    "n_iter_no_change": 5,
    "verbose": 1,
    "random_state": 0,
    "max_depth": 5,
    "max_leaf_nodes": 20,
}
model = HistGradientBoostingRegressor(**model_params)

In [17]:
# Cross-validate the embeddings model and store its train/test scores.
scores = cross_validate(
    model, X, y,
    cv=cv, scoring=scoring, verbose=verbose, return_train_score=True,
)
results["baseline_with_embeddings"] = scores

Binning 0.020 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.347 s
Binning 0.002 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34874, val loss: 8.13137, in 0.017s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34836, val loss: 8.13113, in 0.016s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34802, val loss: 8.13093, in 0.015s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34770, val loss: 8.13072, in 0.016s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34737, val loss: 8.13052, in 0.015s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34705, val loss: 8.13033, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34675, val loss: 8.13012, in 0.015s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34643, val loss: 8.12993, in 0.015s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34614, val loss: 8.12977, in 0.016s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34584, val loss: 8.12959, in 



Binning 0.020 GB of training data: 



0.387 s
Binning 0.002 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38699, val loss: 7.81584, in 0.018s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38662, val loss: 7.81564, in 0.017s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38625, val loss: 7.81544, in 0.016s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38589, val loss: 7.81525, in 0.016s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38554, val loss: 7.81505, in 0.015s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38519, val loss: 7.81482, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38485, val loss: 7.81464, in 0.015s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38452, val loss: 7.81442, in 0.012s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38419, val loss: 7.81425, in 0.015s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38388, val loss: 7.81404, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36426, val loss: 7.80590, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36411, val loss: 7.80591, in 0.012s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36388, val loss: 7.80584, in 0.014s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36366, val loss: 7.80582, in 0.014s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36351, val loss: 7.80581, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36333, val loss: 7.80577, in 0.013s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36316, val loss: 7.80569, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36301, val loss: 7.80570, in 0.013s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36282, val loss: 7.80562, in 0.014s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36263, val loss: 7.80556, in 0.014s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36244, val lo



Binning 0.020 GB of training data: 



0.327 s
Binning 0.002 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46210, val loss: 7.24248, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46177, val loss: 7.24238, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46144, val loss: 7.24232, in 0.014s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46111, val loss: 7.24222, in 0.014s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46079, val loss: 7.24216, in 0.014s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46042, val loss: 7.24205, in 0.013s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46006, val loss: 7.24195, in 0.015s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45972, val loss: 7.24183, in 0.015s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45934, val loss: 7.24174, in 0.014s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45900, val loss: 7.24164, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43881, val loss: 7.23703, in 0.014s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43863, val loss: 7.23700, in 0.013s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43845, val loss: 7.23697, in 0.012s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43823, val loss: 7.23688, in 0.014s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43803, val loss: 7.23682, in 0.014s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43782, val loss: 7.23677, in 0.013s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43762, val loss: 7.23674, in 0.013s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43744, val loss: 7.23672, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43723, val loss: 7.23669, in 0.013s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43703, val loss: 7.23660, in 0.012s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43685, val lo



Binning 0.020 GB of training data: 



0.327 s
Binning 0.002 GB of validation data: 0.004 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51058, val loss: 6.62409, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51026, val loss: 6.62397, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50994, val loss: 6.62386, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50964, val loss: 6.62374, in 0.012s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50932, val loss: 6.62364, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50898, val loss: 6.62357, in 0.013s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50869, val loss: 6.62346, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50836, val loss: 6.62337, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50806, val loss: 6.62325, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50772, val loss: 6.62319, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48797, val loss: 6.61945, in 0.013s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48776, val loss: 6.61944, in 0.014s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48759, val loss: 6.61948, in 0.014s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48743, val loss: 6.61949, in 0.017s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48725, val loss: 6.61950, in 0.016s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48700, val loss: 6.61954, in 0.014s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.48678, val loss: 6.61952, in 0.019s
Fit 95 trees in 1.684 s, (1900 total leaves)
Time spent computing histograms: 0.272s
Time spent finding best splits:  0.079s
Time spent applying splits:      0.253s
Time spent predicting:           0.027s




Binning 0.020 GB of training data: 



0.366 s
Binning 0.002 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43360, val loss: 6.54923, in 0.016s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43323, val loss: 6.54912, in 0.014s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43287, val loss: 6.54901, in 0.014s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43250, val loss: 6.54892, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43215, val loss: 6.54882, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43181, val loss: 6.54874, in 0.013s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43146, val loss: 6.54868, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43113, val loss: 6.54854, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43080, val loss: 6.54847, in 0.014s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43047, val loss: 6.54838, in 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.5s finished


### evaluate on baseline with id features

In [18]:
# Experiment: baseline feature plus the per-card nunique / count / ratio features.
features = baseline_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]
cv = 5
scoring = ("r2", "neg_root_mean_squared_error")
verbose = 1
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # Boolean mask over X's columns marking the categorical ones.
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [19]:
# Cross-validate the id-features model and store its train/test scores.
scores = cross_validate(
    model, X, y,
    cv=cv, scoring=scoring, verbose=verbose, return_train_score=True,
)
results["baseline_with_id_features"] = scores

Binning 0.005 GB of training data: 0.038 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34860, val loss: 8.13119, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34813, val loss: 8.13089, in 0.012s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34767, val loss: 8.13057, in 0.012s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34721, val loss: 8.13028, in 0.011s
[5/100] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 20 leaves, max depth = 5, train loss: 7.34677, val loss: 8.12998, in 0.014s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34633, val loss: 8.12970, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34591, val loss: 8.12942, in 0.015s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34548, val loss: 8.12918, in 0.014s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34507, val loss: 8.12893, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34467, val loss: 8.12869, in 0.012s
[11/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34427, val loss: 8.12844, in 0.013s
[12/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34387, val loss: 8.12823, in 0.012s
[13/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34349, val loss: 8.12799, in 0.012s
[14/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34312, val loss: 8.12776, in 0.012s
[15/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34274, val loss: 8.12758, 

[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32586, val loss: 8.12154, in 0.012s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32575, val loss: 8.12152, in 0.012s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32563, val loss: 8.12145, in 0.012s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32552, val loss: 8.12141, in 0.014s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32541, val loss: 8.12139, in 0.010s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32530, val loss: 8.12135, in 0.011s
[100/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.32519, val loss: 8.12128, in 0.011s
Fit 100 trees in 1.206 s, (2000 total leaves)
Time spent computing histograms: 0.149s
Time spent finding best splits:  0.058s
Time spent applying splits:      0.287s
Time spent predicting:           0.028s
Binning 0.005 GB of training data: 0.047 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 le

[79/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36609, val loss: 7.80665, in 0.009s
[80/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36593, val loss: 7.80661, in 0.010s
[81/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36581, val loss: 7.80661, in 0.010s
[82/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36568, val loss: 7.80659, in 0.011s
[83/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36554, val loss: 7.80657, in 0.011s
[84/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36541, val loss: 7.80653, in 0.010s
[85/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36528, val loss: 7.80647, in 0.011s
[86/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36513, val loss: 7.80641, in 0.011s
[87/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36500, val loss: 7.80637, in 0.011s
[88/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36488, val loss: 7.80633, in 0.010s
[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.36475, val lo

[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44074, val loss: 7.23460, in 0.011s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44056, val loss: 7.23458, in 0.010s
[69/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44040, val loss: 7.23456, in 0.011s
[70/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44023, val loss: 7.23453, in 0.010s
[71/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.44003, val loss: 7.23448, in 0.010s
[72/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43987, val loss: 7.23446, in 0.010s
[73/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43970, val loss: 7.23443, in 0.011s
[74/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43951, val loss: 7.23439, in 0.010s
[75/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43935, val loss: 7.23435, in 0.010s
[76/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43917, val loss: 7.23432, in 0.009s
[77/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43900, val lo

[58/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49310, val loss: 6.61532, in 0.010s
[59/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49291, val loss: 6.61526, in 0.010s
[60/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49273, val loss: 6.61520, in 0.010s
[61/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49255, val loss: 6.61514, in 0.011s
[62/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49236, val loss: 6.61508, in 0.010s
[63/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49218, val loss: 6.61502, in 0.011s
[64/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49201, val loss: 6.61500, in 0.011s
[65/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49183, val loss: 6.61495, in 0.010s
[66/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49166, val loss: 6.61490, in 0.012s
[67/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49149, val loss: 6.61487, in 0.011s
[68/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.49132, val lo

[45/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42028, val loss: 6.53950, in 0.010s
[46/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42008, val loss: 6.53936, in 0.010s
[47/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41986, val loss: 6.53922, in 0.009s
[48/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41965, val loss: 6.53910, in 0.010s
[49/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41945, val loss: 6.53899, in 0.010s
[50/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41925, val loss: 6.53885, in 0.010s
[51/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41906, val loss: 6.53874, in 0.010s
[52/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41886, val loss: 6.53865, in 0.010s
[53/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41866, val loss: 6.53853, in 0.010s
[54/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41848, val loss: 6.53843, in 0.010s
[55/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.41829, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.5s finished


### evaluate on baseline with embeddings and id features

In [20]:
# Feature set for this run: baseline features, embedding features and the
# aggregated id features, concatenated in that order.
features = baseline_feature_names + embedding_feature_names + id_feature_feature_names
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings consumed by the cross_validate call in the next cell.
cv, verbose = 5, 1
scoring = ('r2', 'neg_root_mean_squared_error')

# categorical_features is a boolean mask over X's columns marking the ones
# listed in categorical_feature_names.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(categorical_feature_names),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [21]:
# Cross-validate the model configured in the previous cell. Train scores are
# returned too so the train/validation gap can be inspected afterwards.
cv_options = {
    "cv": cv,
    "scoring": scoring,
    "verbose": verbose,
    "return_train_score": True,
}
scores = cross_validate(model, X, y, **cv_options)

# Keep the per-fold scores under this experiment's name for the summary cells.
results["baseline_with_embeddings_and_id_features"] = scores

Binning 0.023 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.392 s
Binning 0.003 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34838, val loss: 8.13121, in 0.018s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34770, val loss: 8.13088, in 0.018s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34702, val loss: 8.13054, in 0.018s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34636, val loss: 8.13023, in 0.016s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34570, val loss: 8.12990, in 0.017s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34507, val loss: 8.12961, in 0.016s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34444, val loss: 8.12932, in 0.016s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34383, val loss: 8.12907, in 0.016s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34322, val loss: 8.12874, in 0.016s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34262, val loss: 8.12844, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31083, val loss: 8.11686, in 0.018s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31059, val loss: 8.11672, in 0.018s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31032, val loss: 8.11676, in 0.016s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.31008, val loss: 8.11670, in 0.016s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30981, val loss: 8.11661, in 0.016s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30958, val loss: 8.11650, in 0.017s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30929, val loss: 8.11636, in 0.016s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30902, val loss: 8.11627, in 0.015s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30880, val loss: 8.11618, in 0.014s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30853, val loss: 8.11621, in 0.015s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.30828, val lo



Binning 0.023 GB of training data: 



0.417 s
Binning 0.003 GB of validation data: 0.006 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38667, val loss: 7.81559, in 0.017s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38601, val loss: 7.81514, in 0.015s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38536, val loss: 7.81467, in 0.015s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38468, val loss: 7.81424, in 0.016s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38402, val loss: 7.81384, in 0.014s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38339, val loss: 7.81348, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38275, val loss: 7.81308, in 0.015s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38212, val loss: 7.81270, in 0.015s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38152, val loss: 7.81235, in 0.014s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38093, val loss: 7.81194, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34918, val loss: 7.79815, in 0.014s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34886, val loss: 7.79805, in 0.015s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34855, val loss: 7.79792, in 0.015s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34826, val loss: 7.79777, in 0.014s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34796, val loss: 7.79770, in 0.015s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34774, val loss: 7.79758, in 0.013s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34745, val loss: 7.79753, in 0.015s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34718, val loss: 7.79747, in 0.014s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34689, val loss: 7.79741, in 0.015s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34660, val loss: 7.79729, in 0.015s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34631, val lo



Binning 0.023 GB of training data: 



0.394 s
Binning 0.003 GB of validation data: 0.008 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46171, val loss: 7.24216, in 0.016s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46099, val loss: 7.24177, in 0.014s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46028, val loss: 7.24139, in 0.015s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45958, val loss: 7.24102, in 0.015s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45890, val loss: 7.24066, in 0.015s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45823, val loss: 7.24030, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45757, val loss: 7.23999, in 0.026s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45693, val loss: 7.23966, in 0.014s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45631, val loss: 7.23945, in 0.016s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45569, val loss: 7.23915, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42393, val loss: 7.23082, in 0.014s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42366, val loss: 7.23082, in 0.014s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42337, val loss: 7.23080, in 0.014s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42312, val loss: 7.23081, in 0.016s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42283, val loss: 7.23073, in 0.015s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42256, val loss: 7.23075, in 0.015s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42227, val loss: 7.23074, in 0.015s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42201, val loss: 7.23076, in 0.014s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42175, val loss: 7.23073, in 0.014s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42148, val loss: 7.23073, in 0.016s
Fit 98 trees in 1.983 s, (1960 total leaves)
Time spent computing hist



Binning 0.023 GB of training data: 



0.409 s
Binning 0.003 GB of validation data: 0.006 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51021, val loss: 6.62385, in 0.017s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50954, val loss: 6.62352, in 0.016s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50889, val loss: 6.62321, in 0.016s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50826, val loss: 6.62284, in 0.016s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50763, val loss: 6.62254, in 0.015s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50702, val loss: 6.62217, in 0.015s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50640, val loss: 6.62184, in 0.014s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50580, val loss: 6.62150, in 0.014s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50520, val loss: 6.62120, in 0.016s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50460, val loss: 6.62090, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47325, val loss: 6.60814, in 0.015s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47300, val loss: 6.60814, in 0.015s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47276, val loss: 6.60811, in 0.015s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47248, val loss: 6.60808, in 0.016s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47223, val loss: 6.60809, in 0.013s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47197, val loss: 6.60804, in 0.014s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47169, val loss: 6.60798, in 0.016s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47143, val loss: 6.60797, in 0.016s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47119, val loss: 6.60798, in 0.015s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47087, val loss: 6.60794, in 0.016s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.47063, val lo



Binning 0.023 GB of training data: 



0.396 s
Binning 0.003 GB of validation data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43335, val loss: 6.54898, in 0.016s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43272, val loss: 6.54865, in 0.015s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43210, val loss: 6.54829, in 0.016s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43146, val loss: 6.54794, in 0.014s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43087, val loss: 6.54758, in 0.015s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43024, val loss: 6.54724, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42968, val loss: 6.54691, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42906, val loss: 6.54658, in 0.015s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42850, val loss: 6.54631, in 0.015s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.42795, val loss: 6.54598, in 

[89/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39759, val loss: 6.53348, in 0.014s
[90/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39732, val loss: 6.53341, in 0.015s
[91/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39704, val loss: 6.53336, in 0.015s
[92/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39677, val loss: 6.53332, in 0.015s
[93/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39651, val loss: 6.53329, in 0.014s
[94/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39623, val loss: 6.53328, in 0.016s
[95/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39594, val loss: 6.53318, in 0.015s
[96/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39567, val loss: 6.53314, in 0.014s
[97/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39541, val loss: 6.53311, in 0.015s
[98/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39517, val loss: 6.53307, in 0.014s
[99/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.39488, val lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.5s finished


In [23]:
from numpy import mean
import csv

def make_output(key):
    """Summarise the cross-validation scores stored in ``results[key]``.

    Returns a flat dict holding the id column under evaluation, the
    experiment name (``key``) and the fold-averaged train/test R^2 and RMSE.
    The RMSE entries are stored negated in ``results`` (scorer name
    ``neg_root_mean_squared_error``), so the sign is flipped back here.
    """
    fold_scores = results[key]
    return {
        "id_column": target_id_column,
        "type": key,
        "train_r2": mean(fold_scores["train_r2"]),
        "test_r2": mean(fold_scores["test_r2"]),
        "train_root_mean_squared_error": -1 * mean(fold_scores["train_neg_root_mean_squared_error"]),
        "test_root_mean_squared_error": -1 * mean(fold_scores["test_neg_root_mean_squared_error"]),
    }
def save(output, path=None):
    """Append one result row to the node2vec results CSV.

    Parameters
    ----------
    output : dict
        Flat mapping of column name -> value (e.g. the dict returned by
        ``make_output``). The key order defines the column order.
    path : str or path-like, optional
        Destination CSV file. Defaults to the project's
        ``node2vec_embeddings.csv`` results file.

    A header row is written first when the destination is missing or empty,
    so a freshly created file has labelled columns. Files that already
    contain data are only appended to, exactly as before.
    """
    import os  # local import keeps this cell self-contained

    if path is None:
        path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\results\node2vec_embeddings.csv"

    # Decide before opening: append mode creates the file if it is missing.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0

    with open(path, 'a', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(output.keys()))
        if needs_header:
            writer.writeheader()
        writer.writerow(output)

In [24]:
# Print the fold-averaged summary of every experiment, in the order they were run.
for experiment in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    print(make_output(experiment))

{'id_column': 'city_id', 'type': 'baseline', 'train_r2': 0.0001655841013616044, 'test_r2': 0.00013871442920936338, 'train_root_mean_squared_error': 3.8501609494819595, 'test_root_mean_squared_error': 3.85005310803518}
{'id_column': 'city_id', 'type': 'baseline_with_embeddings', 'train_r2': 0.0025883993103015257, 'test_r2': 0.0007794791182801575, 'train_root_mean_squared_error': 3.8454919942240395, 'test_root_mean_squared_error': 3.848819861773644}
{'id_column': 'city_id', 'type': 'baseline_with_id_features', 'train_r2': 0.003026003008263678, 'test_r2': 0.0020015141675692183, 'train_root_mean_squared_error': 3.844649062660546, 'test_root_mean_squared_error': 3.8464626448809325}
{'id_column': 'city_id', 'type': 'baseline_with_embeddings_and_id_features', 'train_r2': 0.005143861448704578, 'test_r2': 0.002782436678595146, 'train_root_mean_squared_error': 3.8405635200108166, 'test_root_mean_squared_error': 3.844956103267922}


### save

In [26]:
# Append one summary row per experiment to the results CSV, in the order they were run.
for experiment in (
    "baseline",
    "baseline_with_embeddings",
    "baseline_with_id_features",
    "baseline_with_embeddings_and_id_features",
):
    save(make_output(experiment))