# ABOUT: 
- this code evaluates all of the generated node2vec embeddings as model features
- findings: 
    - using card_id embeddings appears to cause overfitting
        
- details:       
    - i.e. compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5-fold cross-validated (cv = 5 in the evaluation cells)

In [1]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [32]:
# id column read alongside card_id from the processed id table
target_id_column = "merchant_id"
# saved embedding model that this run evaluates
node2vec_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_merchant_id.zip"
# number of embedding columns (labelled 0 .. embedding_size - 1) added to the feature list
embedding_size = 8

In [33]:
# processed id table; only card_id and the target id column are loaded
# NOTE(review): presumably the file written by the commented-out to_csv call after get_id_data() - confirm
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns = pd.read_csv(path, nrows = None, usecols = ["card_id", target_id_column])
id_columns.head()

Unnamed: 0,card_id,merchant_id
0,C_ID_4e6213e9bc,M_ID_e020e9b302
1,C_ID_4e6213e9bc,M_ID_86ec983688
2,C_ID_4e6213e9bc,M_ID_979ed661fc
3,C_ID_4e6213e9bc,M_ID_e6d5ae8ea6
4,C_ID_4e6213e9bc,M_ID_e020e9b302


In [34]:
# load train target variable, plus card_id (merge key) and feature_2 (baseline feature)
# NOTE(review): train_path is not defined in this notebook; presumably it comes from `from config import *`
train_file = pd.read_csv(train_path, usecols = ["card_id","target", "feature_2"])
train_file.head()

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [20]:
# Load the saved embedding model from disk.
# NOTE(review): the file is named node2vec_* but is loaded through the GGVec class -
# confirm this matches the class it was trained and saved with.
node2vec = nodevectors.GGVec.load(node2vec_path)

# One row per node: the node label moves into an "index" column and the
# vector components become the integer-labelled columns 0, 1, 2, ...
node2vec_embeddings = pd.DataFrame.from_dict(
    node2vec.model,
    orient="index",
).reset_index()
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7
0,C_ID_00007093c1,0.181462,-0.180385,0.069815,-0.205405,0.064675,-0.067363,0.225713,-0.102242
1,C_ID_0001238066,0.274852,-0.261882,-0.029394,-0.153353,0.382448,-0.186738,-0.241454,-0.106915
2,C_ID_0001506ef0,-0.037763,-0.185104,-0.134440,-0.279209,-0.282793,-0.250288,-0.155339,-0.232126
3,C_ID_0001793786,0.271213,-0.072301,-0.186966,-0.109803,-0.103498,-0.274376,-0.163245,0.213963
4,C_ID_000183fdda,0.026458,-0.166246,0.074520,0.070202,-0.088945,-0.305559,0.081232,-0.186655
...,...,...,...,...,...,...,...,...,...
325866,merchant_category_id_885,0.000546,0.000255,-0.000658,0.000268,0.000461,-0.000809,-0.000153,-0.000121
325867,merchant_category_id_889,0.000436,0.000182,-0.000697,-0.000161,0.000338,-0.000511,-0.000102,0.000033
325868,merchant_category_id_891,0.000396,0.000159,-0.000728,0.000274,0.000154,-0.000856,-0.000117,-0.000141
325869,merchant_category_id_9,0.000430,0.000329,-0.000564,0.000289,0.000227,-0.000611,-0.000229,0.000205


In [21]:
# Attach an embedding to every transaction row, then average per card_id so
# that each card ends up with a single embedding vector.
# NOTE(review): the join key here is "card_id" itself, so every row of a card
# receives the same vector and the mean just collapses the duplicates. To
# aggregate another id's embeddings (e.g. city_id), the left key would
# presumably have to be that id column instead - confirm the intent.
node2vec_embeddings = id_columns.merge(
    node2vec_embeddings,
    how="left",
    left_on="card_id",
    right_on="index",
)
# "index" only duplicates the join key after the merge
node2vec_embeddings = node2vec_embeddings.drop("index", axis=1)
# The frame still holds the string column read via target_id_column.
# pandas >= 2.0 raises a TypeError when asked for the mean of an object column
# (older versions silently dropped it), so restrict the mean to numeric columns.
node2vec_embeddings = (
    node2vec_embeddings
    .groupby("card_id")
    .mean(numeric_only=True)
    .reset_index()
)

In [22]:
# merge id embeddings with train.csv
# left join on card_id: every training row is kept; a card with no matching embedding row gets NaN embedding columns
dataset = train_file.merge(node2vec_embeddings, on = "card_id", how = "left")
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7
0,C_ID_92a2005557,2,-0.820283,-0.000124,0.153602,-0.080683,0.105355,0.13074,0.229402,-0.074917,0.171845
1,C_ID_3d0044924f,1,0.392913,-0.324256,-0.055721,0.258252,0.129115,0.289052,-0.328472,-0.001731,-0.092374
2,C_ID_d639edf6cd,2,0.688056,0.005886,0.025906,-0.215568,-0.161745,0.038759,-0.054218,0.045277,0.276956
3,C_ID_186d6a6901,3,0.142495,-0.130948,0.151948,-0.094438,0.009352,0.257981,0.093998,-0.087205,0.187554
4,C_ID_cdbd2c0db2,3,-0.159749,-0.319891,-0.083334,-0.260943,-0.316259,0.208033,0.255889,-0.097522,-0.014094


In [23]:
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,7
0,C_ID_92a2005557,2,-0.820283,-0.000124,0.153602,-0.080683,0.105355,0.130740,0.229402,-0.074917,0.171845
1,C_ID_3d0044924f,1,0.392913,-0.324256,-0.055721,0.258252,0.129115,0.289052,-0.328472,-0.001731,-0.092374
2,C_ID_d639edf6cd,2,0.688056,0.005886,0.025906,-0.215568,-0.161745,0.038759,-0.054218,0.045277,0.276956
3,C_ID_186d6a6901,3,0.142495,-0.130948,0.151948,-0.094438,0.009352,0.257981,0.093998,-0.087205,0.187554
4,C_ID_cdbd2c0db2,3,-0.159749,-0.319891,-0.083334,-0.260943,-0.316259,0.208033,0.255889,-0.097522,-0.014094
...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,-0.113607,-0.037266,-0.054408,-0.148246,-0.027460,0.229460,-0.069233,-0.307598
201913,C_ID_1314773c0b,1,0.312917,-0.215593,-0.185984,-0.183035,0.221628,0.253526,-0.216786,-0.353921,-0.225388
201914,C_ID_7666735b3d,3,0.093494,0.282668,-0.021271,-0.096789,0.196406,-0.125175,-0.080513,-0.152618,-0.146493
201915,C_ID_73f5a0efd0,2,-4.676589,0.232744,-0.287345,0.041690,-0.244558,0.193848,0.017767,0.109645,-0.044227


### evaluate on dataset with no embeddings

In [24]:
# define columns for training
# a: baseline - feature_2 only
train_cols_a = ["feature_2"]
# b: feature_2 plus the embedding columns, which carry the integer labels 0 .. embedding_size - 1
train_cols_b = ["feature_2"] + list(range(embedding_size))
# regression target
target_col = "target"

In [25]:
# Baseline run: the model only sees feature_2 (no embedding columns).
features = train_cols_a
X = dataset[features]
y = dataset[target_col]

# Settings passed to cross_validate in the next cell.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# Hyper-parameters of the histogram gradient boosting regressor;
# the boolean mask marks feature_2 as the only categorical column.
model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    categorical_features=X.columns.isin(["feature_2"]),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [26]:
# Cross-validate the baseline model; train-fold scores are kept as well so
# that training and held-out performance can be compared afterwards.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    verbose=verbose,
    return_train_score=True,
)

Binning 0.001 GB of training data: 0.001 s
Binning 0.000 GB of validation data: 0.000 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34904, val loss: 8.13150, in 0.007s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34900, val loss: 8.13149, in 0.009s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34896, val loss: 8.13148, in 0.019s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34892, val loss: 8.13147, in 0.011s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34888, val loss: 8.13147, in 0.008s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34884, val loss: 8.13146, in 0.008s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34880, val loss: 8.13146, in 0.007s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34877, val loss: 8.13145, in 0.008s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34873, val loss: 8.13144, in 0.008s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.3486

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34841, val loss: 8.13141, in 0.009s
[20/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34838, val loss: 8.13141, in 0.008s
[21/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34835, val loss: 8.13141, in 0.007s
[22/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34832, val loss: 8.13141, in 0.007s
[23/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34829, val loss: 8.13141, in 0.010s
[24/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34827, val loss: 8.13141, in 0.008s
[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34824, val loss: 8.13141, in 0.009s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34821, val loss: 8.13141, in 0.009s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34819, val loss: 8.13141, in 0.009s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34816, val loss: 8.13141, in 0.008s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34814, val loss: 8.13141, in 0.00

[56/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46103, val loss: 7.24173, in 0.008s
[57/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46101, val loss: 7.24173, in 0.009s
[58/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46100, val loss: 7.24172, in 0.009s
[59/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46099, val loss: 7.24171, in 0.008s
[60/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46098, val loss: 7.24171, in 0.009s
[61/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46096, val loss: 7.24170, in 0.009s
[62/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46095, val loss: 7.24170, in 0.008s
[63/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46094, val loss: 7.24169, in 0.009s
[64/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46093, val loss: 7.24169, in 0.009s
[65/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46092, val loss: 7.24169, in 0.009s
[66/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46090, val loss: 7.24168

[42/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50979, val loss: 6.62357, in 0.008s
[43/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50978, val loss: 6.62356, in 0.007s
[44/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50976, val loss: 6.62355, in 0.008s
[45/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50975, val loss: 6.62354, in 0.007s
[46/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50973, val loss: 6.62354, in 0.008s
[47/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50971, val loss: 6.62353, in 0.009s
[48/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50970, val loss: 6.62352, in 0.007s
[49/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50968, val loss: 6.62352, in 0.008s
[50/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50967, val loss: 6.62351, in 0.009s
[51/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50966, val loss: 6.62350, in 0.009s
[52/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50964, val loss: 6.62350

[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43328, val loss: 6.54812, in 0.007s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43327, val loss: 6.54809, in 0.006s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43325, val loss: 6.54806, in 0.007s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43323, val loss: 6.54802, in 0.006s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43321, val loss: 6.54799, in 0.007s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43319, val loss: 6.54796, in 0.007s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43318, val loss: 6.54793, in 0.007s
[35/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43316, val loss: 6.54790, in 0.007s
[36/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43315, val loss: 6.54787, in 0.006s
[37/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43313, val loss: 6.54784, in 0.006s
[38/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43311, val loss: 6.54781

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.3s finished


In [27]:
# Collect the cross-validation output of each run under a descriptive key.
results = {"no_embeddings": scores}

### evaluate on dataset with node2vec embeddings

In [28]:
# Embeddings run: feature_2 plus the embedding columns.
features = train_cols_b
# The embedding columns carry integer labels (0, 1, ...) while "feature_2" is a
# string. scikit-learn 1.0/1.1 warns about mixed-type column labels and
# scikit-learn >= 1.2 raises a TypeError, so every label is cast to str here.
# Only the labels change; the data and the fitted model are unaffected.
X = dataset[features].rename(columns=str)
y = dataset[target_col]

# Settings passed to cross_validate in the next cell.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# Same hyper-parameters as the baseline run; the boolean mask marks feature_2
# as the only categorical column.
model_params = {
    "learning_rate": 0.01,
    "max_iter": 100,
    "categorical_features": X.columns.isin(["feature_2"]),
    "l2_regularization": 0.005,
    "early_stopping": True,
    "n_iter_no_change": 5,
    "verbose": 1,
    "random_state": 0,
    "max_depth": 5,
    "max_leaf_nodes": 20,
}
model = HistGradientBoostingRegressor(**model_params)

In [29]:
# Cross-validate the model that also sees the embedding columns, keeping the
# train-fold scores, and store the output next to the baseline run.
scores = cross_validate(
    model,
    X,
    y,
    scoring=scoring,
    cv=cv,
    verbose=verbose,
    return_train_score=True,
)
results["with_embeddings"] = scores

Binning 0.010 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.167 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34884, val loss: 8.13148, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34860, val loss: 8.13145, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34834, val loss: 8.13143, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34810, val loss: 8.13141, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34787, val loss: 8.13139, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34763, val loss: 8.13136, in 0.013s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34738, val loss: 8.13131, in 0.014s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34716, val loss: 8.13132, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34693, val loss: 8.13127, in 0.014s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34668, val loss: 8.13128, in 



0.160 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38711, val loss: 7.81610, in 0.014s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38686, val loss: 7.81613, in 0.014s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38661, val loss: 7.81620, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38637, val loss: 7.81623, in 0.014s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38614, val loss: 7.81628, in 0.013s
Fit 5 trees in 0.297 s, (100 total leaves)
Time spent computing histograms: 0.011s
Time spent finding best splits:  0.004s
Time spent applying splits:      0.016s
Time spent predicting:           0.002s
Binning 0.010 GB of training data: 



0.214 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46219, val loss: 7.24259, in 0.015s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46193, val loss: 7.24260, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46167, val loss: 7.24261, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46140, val loss: 7.24268, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46115, val loss: 7.24272, in 0.012s
Fit 5 trees in 0.354 s, (100 total leaves)
Time spent computing histograms: 0.012s
Time spent finding best splits:  0.003s
Time spent applying splits:      0.014s
Time spent predicting:           0.001s
Binning 0.010 GB of training data: 



0.162 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51067, val loss: 6.62419, in 0.012s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51041, val loss: 6.62416, in 0.013s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51018, val loss: 6.62414, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50993, val loss: 6.62413, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50972, val loss: 6.62414, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50950, val loss: 6.62408, in 0.013s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50928, val loss: 6.62413, in 0.014s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50907, val loss: 6.62410, in 0.013s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50887, val loss: 6.62411, in 0.013s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50868, val loss: 6.62408, in 



Binning 0.010 GB of training data: 



0.159 s
Binning 0.001 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43370, val loss: 6.54929, in 0.013s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43342, val loss: 6.54926, in 0.014s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43315, val loss: 6.54921, in 0.013s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43287, val loss: 6.54918, in 0.013s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43261, val loss: 6.54915, in 0.013s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43237, val loss: 6.54913, in 0.012s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43212, val loss: 6.54909, in 0.013s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43190, val loss: 6.54906, in 0.011s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43165, val loss: 6.54908, in 0.012s
[10/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43143, val loss: 6.54906, in 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.1s finished


In [30]:
from numpy import mean

# Fold-averaged negative RMSE, printed per run in the order:
# held-out folds ("test_*") first, then training folds ("train_*").
rmse_keys = ("test_neg_root_mean_squared_error", "train_neg_root_mean_squared_error")
for run_name in ('no_embeddings', 'with_embeddings'):
    for score_key in rmse_keys:
        print(mean(results[run_name][score_key]))

-3.85005310803518
-3.8501609494819595
-3.850244819699756
-3.8494747854783578


In [31]:
# Fold-averaged R^2, printed per run in the order:
# held-out folds ("test_r2") first, then training folds ("train_r2").
for run_name in ('no_embeddings', 'with_embeddings'):
    for score_key in ("test_r2", "train_r2"):
        print(mean(results[run_name][score_key]))

0.00013871442920936338
0.0001655841013616044
3.9373212538951205e-05
0.00052257569330183


In [7]:
# get id columns
def get_id_data():
    """Assemble the per-transaction table of id columns.

    Reads the id columns of the merchants, new-transaction and
    historical-transaction CSVs, stacks the two transaction tables
    (historical rows first), attaches ``merchant_group_id`` from the
    merchants table and prefixes five id columns with their own column name.

    NOTE(review): the ``*_path`` variables and ``feature_names`` are not
    defined in this notebook; presumably they come from ``config``.

    Returns:
        pandas.DataFrame: the merged transaction id table.
    """
    def read_ids(csv_path, table):
        # only the id columns registered for this table are loaded
        return pd.read_csv(csv_path, usecols=feature_names[table]['id'])

    merchant_table = read_ids(merchants_path, 'merchants')
    new_tx = read_ids(new_transactions_path, 'transactions')
    hist_tx = read_ids(historical_transactions_path, 'transactions')

    # merchant_id is repeated in the merchants table; keep the first row of each id
    first_seen = ~merchant_table["merchant_id"].duplicated()
    merchant_table = merchant_table[first_seen]

    # both transaction tables share the same columns, so they stack row-wise
    edges = pd.concat([hist_tx, new_tx], axis=0)

    # missing merchant ids are replaced by one fixed id
    # (the most frequent one, according to the original author's note)
    edges['merchant_id'] = edges['merchant_id'].fillna('M_ID_00a6ca8a8a')

    # bring in merchant_group_id, which only the merchants table provides
    group_lookup = merchant_table[["merchant_id", "merchant_group_id"]]
    edges = edges.merge(group_lookup, how="left", on="merchant_id")

    # the raw inputs are no longer needed once merged
    del new_tx, hist_tx, merchant_table, group_lookup

    # turn each id into a "<column>_<value>" string for the edge list;
    # presumably so that ids from different columns stay distinct - confirm
    for col in ('city_id', 'merchant_category_id', 'state_id', 'subsector_id', 'merchant_group_id'):
        edges[col] = f"{col}_" + edges[col].astype(str)
    return edges
# id_columns = get_id_data()
# path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
# id_columns.to_csv(path, index = False)
# id_columns