# ABOUT: 
- this notebook evaluates each of the generated node2vec embeddings as model features
- findings: 
    - using card_id embeddings appears to cause overfitting
        
- details:       
    - i.e., compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5-fold cross-validated (`cv = 5` in the cells below)

In [1]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [2]:
# Id column that is read from the processed id table together with card_id.
target_id_column = "city_id"
# Saved embedding model to evaluate.
# NOTE(review): absolute path on one machine, and the file name hard-codes
# "city_id" — presumably it must be edited together with target_id_column; confirm.
node2vec_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_city_id.zip"

In [7]:
# Processed id table (machine-specific absolute path).
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
# Read only card_id and the selected id column; nrows = None reads every row.
id_columns = pd.read_csv(path, nrows = None, usecols = ["card_id", target_id_column])
id_columns.head()

Unnamed: 0,card_id,city_id
0,C_ID_4e6213e9bc,city_id_88
1,C_ID_4e6213e9bc,city_id_88
2,C_ID_4e6213e9bc,city_id_88
3,C_ID_4e6213e9bc,city_id_88
4,C_ID_4e6213e9bc,city_id_88


In [8]:
# load train target variable
# Only the join key (card_id), the baseline feature (feature_2) and the target are read.
# train_path is not defined in this notebook; presumably it comes from
# `from config import *` — confirm.
train_file = pd.read_csv(train_path, usecols = ["card_id","target", "feature_2"])
train_file.head()

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [9]:
# Restore the saved embedding model from disk.
node2vec = nodevectors.GGVec.load(node2vec_path)
# One row per node id; reset_index() moves the node id out of the index and
# into a regular column named "index".
node2vec_embeddings = pd.DataFrame.from_dict(node2vec.model, orient="index").reset_index()
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,C_ID_00007093c1,-0.106716,-0.089236,0.113657,-0.123730,0.014258,0.113594,-0.176164,0.083533,-0.039410,...,-0.077418,-0.050492,-0.135790,0.169363,-0.117154,0.197989,-0.074490,-0.074387,-0.155933,-0.153348
1,C_ID_0001238066,0.157534,-0.101552,0.174173,0.231312,-0.318610,-0.159484,-0.116667,0.105384,0.174811,...,0.308444,-0.225374,-0.182541,-0.128666,-0.039411,0.061187,0.025898,-0.296355,0.015855,-0.031663
2,C_ID_0001506ef0,-0.141772,0.120108,0.139707,-0.089195,0.121194,0.059623,-0.160276,-0.025165,-0.221492,...,-0.040747,0.059382,-0.030425,-0.194672,-0.021321,-0.060224,0.097151,0.063103,0.037582,-0.026990
3,C_ID_0001793786,-0.039819,-0.187320,0.003942,0.128835,0.125397,0.168516,0.125239,0.128109,0.035853,...,-0.093655,-0.131937,0.243757,-0.015062,0.030878,0.143987,-0.276977,0.055190,-0.084917,-0.077475
4,C_ID_000183fdda,-0.061255,-0.012943,-0.168752,0.159792,-0.254140,0.138611,0.280408,-0.057751,-0.052864,...,-0.112484,-0.260600,0.229047,-0.016090,-0.019483,-0.002663,-0.011464,0.131821,0.124212,-0.095945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325843,city_id_92,0.828420,0.472418,-0.370669,-0.262726,-0.663897,-0.650594,-0.170845,-0.401652,-0.158569,...,-0.630482,-0.588893,-0.453237,0.084014,-0.588800,-0.306644,0.003675,0.217924,0.335679,0.578960
325844,city_id_94,0.001069,0.001242,0.000663,0.001960,0.001424,0.001180,-0.000796,-0.000426,0.000822,...,-0.001277,0.000871,-0.000769,0.000146,0.001048,0.000038,-0.001088,-0.000144,0.000550,-0.002154
325845,city_id_96,0.000590,-0.000178,0.001212,-0.000166,0.000708,-0.001587,-0.000349,0.000135,0.001348,...,-0.000283,0.000279,0.001060,0.000751,-0.001065,0.001509,0.000648,0.000361,0.000809,0.000701
325846,city_id_97,-0.000647,0.000120,0.000260,-0.000200,0.000127,0.000299,-0.001357,0.002225,-0.000659,...,-0.000751,0.001632,0.001176,0.001696,0.001337,0.001670,-0.000294,0.000038,0.000105,0.000450


In [10]:
# Attach an embedding vector to every row of the transaction-level id table,
# then collapse to one row per card_id by averaging the embedding columns.
# NOTE(review): the join key is "card_id", so only card_id embeddings are
# picked up; the target_id_column (e.g. city_id) embeddings are never looked
# up — confirm this is intended.
node2vec_embeddings = id_columns.merge(node2vec_embeddings, how = "left", left_on = "card_id", right_on = "index")
node2vec_embeddings = node2vec_embeddings.drop("index", axis = 1)
# numeric_only = True keeps the string id column (target_id_column) out of the
# mean: pandas >= 2.0 raises TypeError on it instead of silently dropping it.
node2vec_embeddings = node2vec_embeddings.groupby("card_id").mean(numeric_only = True).reset_index()

In [11]:
# Left-join the per-card embeddings onto the training rows, keyed on card_id.
dataset = pd.merge(train_file, node2vec_embeddings, how="left", on="card_id")
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,22,23,24,25,26,27,28,29,30,31
0,C_ID_92a2005557,2,-0.820283,0.069153,0.051837,-0.000265,-0.028155,0.049382,-0.085005,0.119039,...,-0.046368,0.124118,-0.197271,0.135623,-0.123432,0.01372,-0.008437,-0.055381,0.085001,-0.009585
1,C_ID_3d0044924f,1,0.392913,0.079955,-0.000385,0.192025,0.037655,-0.024281,0.01008,0.101211,...,-0.137529,-0.049119,-0.01298,-0.020049,0.184259,0.104667,-0.161247,-0.063553,-0.007475,-0.055708
2,C_ID_d639edf6cd,2,0.688056,-0.149637,-0.076601,-0.06551,-0.031559,-0.079825,0.068283,-0.012328,...,-0.042642,0.148946,0.171022,-0.113394,-0.062607,-0.109001,-0.034621,0.080301,0.031115,-0.114331
3,C_ID_186d6a6901,3,0.142495,-0.017425,-0.0205,-0.051019,0.085914,0.034674,-0.103122,-0.101948,...,0.058944,-0.018131,0.090689,-0.065021,0.164142,0.00436,-0.174505,0.147133,-0.088404,0.118497
4,C_ID_cdbd2c0db2,3,-0.159749,0.076518,-0.063663,-0.059358,-0.002621,-0.102248,-0.011622,0.147796,...,0.08888,-0.034169,0.028208,0.160501,0.104513,0.114578,0.037847,0.122616,0.012048,0.061013


## Evaluate effectiveness of card_id embeddings

In [14]:
# Feature sets to compare, and the regression target.
target_col = "target"
# a: baseline, feature_2 only
train_cols_a = ["feature_2"]
# b: feature_2 plus the embedding columns labelled 0..31
train_cols_b = [*train_cols_a, *range(32)]

In [16]:
# Baseline run: feature_2 is the only predictor.
features = train_cols_a
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings shared with the cross_validate call.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

# Estimator settings; feature_2 is flagged as categorical through a boolean
# mask over X's columns.
model_params = dict(
    learning_rate=0.05,
    max_iter=100,
    categorical_features=X.columns.isin(["feature_2"]),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
)
model = HistGradientBoostingRegressor(**model_params)

In [17]:
# Cross-validate the baseline model on X / y with the settings defined above.
# return_train_score=True also records the training-fold scores, so training
# and validation metrics can be compared afterwards.
scores = cross_validate(estimator = model, 
                        X = X, 
                        y = y, 
                        cv=cv,
                        scoring=scoring,
                        verbose = verbose,
                        return_train_score=True)

Binning 0.001 GB of training data: 0.052 s
Binning 0.000 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34887, val loss: 8.13147, in 0.019s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34869, val loss: 8.13144, in 0.013s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34852, val loss: 8.13142, in 0.007s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34836, val loss: 8.13141, in 0.007s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34823, val loss: 8.13141, in 0.007s
[6/100] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34810, val loss: 8.13141, in 0.007s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34799, val loss: 8.13143, in 0.007s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34789, val loss: 8.13144, in 0.008s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34780, val loss: 8.13146, in 0.007s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34771, val loss: 8.13148, in 0.007s
Fit 10 trees in 0.197 s, (30 total leaves)
Time spent computing histograms: 0.006s
Time spent finding best splits:  0.002s
Time spent applying splits:      0.011s
Time spent predicting:           0.004s
Binning 0.001 GB of training data: 0.001 s
Binning 0.000 GB of validation data: 0.000 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.38719, val loss: 7.81604, in 0.006s
[2/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.38702, val loss: 7.81603, in 0.007s
[3/100] 1 tree, 3 leaves, max depth = 2, train

[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43283, val loss: 6.54723, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43279, val loss: 6.54713, in 0.006s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43274, val loss: 6.54703, in 0.007s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43271, val loss: 6.54695, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43267, val loss: 6.54686, in 0.007s
[17/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43264, val loss: 6.54679, in 0.008s
[18/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43261, val loss: 6.54672, in 0.007s
[19/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43259, val loss: 6.54665, in 0.006s
[20/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43257, val loss: 6.54659, in 0.006s
[21/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43255, val loss: 6.54653, in 0.007s
[22/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43253, val loss: 6.54648

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.3s finished


In [19]:
# Collect the scores of every run by name; start with the baseline run.
results = {"no_embeddings": scores}

In [40]:
# Embedding run: feature_2 plus the 32 embedding columns.
features = train_cols_b
X,y = dataset[features], dataset[target_col]
# The embedding columns are labelled with ints (0..31) next to the string
# label "feature_2". scikit-learn >= 1.2 raises TypeError for mixed-type
# column names, so stringify the labels on this copy only (dataset itself
# keeps its original labels).
X = X.rename(columns = str)
cv = 5
scoring=('r2', 'neg_root_mean_squared_error')
verbose = 1
# NOTE(review): learning_rate (0.01) and max_leaf_nodes (20) differ from the
# baseline cell (learning_rate 0.05, max_leaf_nodes left at its default), so
# the two runs are not compared under identical settings — confirm this is
# deliberate before attributing the score difference to the embeddings.
model_params = {
    "learning_rate":0.01,
    "max_iter":100,
    # boolean mask over X's columns marking feature_2 as categorical
    "categorical_features" : X.columns.isin(["feature_2"]),
    "l2_regularization":0.005,
    "early_stopping":True,
    "n_iter_no_change":5,
    "verbose":1,
    "random_state":0,
    "max_depth":5,
    "max_leaf_nodes":20
}
model = HistGradientBoostingRegressor(**model_params)

In [41]:
# Cross-validate the model on the embedding feature set (same call shape as the
# baseline run) and keep the scores under "with_embeddings" for the comparison
# printed further down.
scores = cross_validate(estimator = model, 
                        X = X, 
                        y = y, 
                        cv=cv,
                        scoring=scoring,
                        verbose = verbose,
                        return_train_score=True)
results["with_embeddings"] = scores

Binning 0.038 GB of training data: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.680 s
Binning 0.004 GB of validation data: 0.009 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34879, val loss: 8.13150, in 0.019s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34850, val loss: 8.13150, in 0.020s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34811, val loss: 8.13150, in 0.016s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34772, val loss: 8.13150, in 0.019s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34728, val loss: 8.13150, in 0.019s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34691, val loss: 8.13151, in 0.018s
Fit 6 trees in 0.907 s, (120 total leaves)
Time spent computing histograms: 0.030s
Time spent finding best splits:  0.008s
Time spent applying splits:      0.017s
Time spent predicting:           0.002s




Binning 0.038 GB of training data: 0.838 s
Binning 0.004 GB of validation data: 0.012 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38711, val loss: 7.81606, in 0.018s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38685, val loss: 7.81607, in 0.018s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38660, val loss: 7.81608, in 0.020s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38635, val loss: 7.81609, in 0.018s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38611, val loss: 7.81611, in 0.019s
Fit 5 trees in 1.070 s, (100 total leaves)
Time spent computing histograms: 0.024s
Time spent finding best splits:  0.006s
Time spent applying splits:      0.016s
Time spent predicting:           0.002s




Binning 0.038 GB of training data: 0.786 s
Binning 0.004 GB of validation data: 0.010 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46209, val loss: 7.24253, in 0.018s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46174, val loss: 7.24249, in 0.019s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46139, val loss: 7.24249, in 0.019s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46106, val loss: 7.24247, in 0.017s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46071, val loss: 7.24246, in 0.022s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46035, val loss: 7.24240, in 0.020s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46000, val loss: 7.24231, in 0.019s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45964, val loss: 7.24226, in 0.019s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.45930, val loss: 7.24218, in 0.016s
[10/100] 1 tree, 20 leaves, max depth = 5, train lo



Binning 0.038 GB of training data: 



0.842 s
Binning 0.004 GB of validation data: 0.013 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51062, val loss: 6.62424, in 0.022s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51034, val loss: 6.62427, in 0.022s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51007, val loss: 6.62429, in 0.021s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50981, val loss: 6.62432, in 0.020s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50948, val loss: 6.62434, in 0.020s
Fit 5 trees in 1.104 s, (100 total leaves)
Time spent computing histograms: 0.031s
Time spent finding best splits:  0.008s
Time spent applying splits:      0.018s
Time spent predicting:           0.002s




Binning 0.038 GB of training data: 0.823 s
Binning 0.004 GB of validation data: 0.017 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43357, val loss: 6.54927, in 0.022s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43317, val loss: 6.54920, in 0.022s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43278, val loss: 6.54915, in 0.019s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43239, val loss: 6.54910, in 0.018s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43201, val loss: 6.54907, in 0.022s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43167, val loss: 6.54903, in 0.019s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43131, val loss: 6.54901, in 0.021s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43099, val loss: 6.54895, in 0.018s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43067, val loss: 6.54884, in 0.022s
[10/100] 1 tree, 20 leaves, max depth = 5, train lo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.1s finished


In [44]:
from numpy import mean

# Mean negative RMSE across folds for each run: validation score first,
# then training score.
for run_name in ("no_embeddings", "with_embeddings"):
    run_scores = results[run_name]
    print(mean(run_scores["test_neg_root_mean_squared_error"]))
    print(mean(run_scores["train_neg_root_mean_squared_error"]))

-3.8499928269358166
-3.850094014784659
-3.850309282642139
-3.848901897453035


In [7]:
# get id columns
def get_id_data(fill_merchant_id = 'M_ID_00a6ca8a8a'):
    """Build the transaction-level table of id columns.

    Reads the id columns of the merchants, new-transactions and
    historical-transactions CSV files, stacks the two transaction files,
    attaches each merchant's ``merchant_group_id`` and prefixes the
    categorical id values with their column name (e.g. ``city_id_88``).

    The file paths and ``feature_names`` are not defined in this cell;
    presumably they come from ``from config import *`` — confirm.

    Parameters
    ----------
    fill_merchant_id : str, optional
        Value substituted for missing ``merchant_id`` entries. The default is
        the id previously hard-coded here and described as the most frequent
        merchant.

    Returns
    -------
    pandas.DataFrame
        One row per transaction (historical rows first, then new ones) with
        the transaction id columns plus ``merchant_group_id``.
    """
    # read
    merchants = pd.read_csv(merchants_path, usecols = feature_names['merchants']['id'])#, nrows = 100000)
    new_transactions = pd.read_csv(new_transactions_path, usecols = feature_names['transactions']['id'])#, nrows = 100000)
    hist_transactions = pd.read_csv(historical_transactions_path, usecols = feature_names['transactions']['id'])#, nrows = 100000)
    # process
    # remove duplicate merchant_id - which there are (the first occurrence is kept)
    merchants = merchants.drop_duplicates(subset = "merchant_id")
    # concat historical and new transactions - they have the same columns;
    # ignore_index avoids repeated row labels from the two source files
    id_columns = pd.concat([hist_transactions, new_transactions], axis = 0, ignore_index = True)
    # fill missing merchant_id with the chosen replacement (by default the most frequent one)
    id_columns['merchant_id'] = id_columns['merchant_id'].fillna(fill_merchant_id)
    # merge transactions data with merchant information - merchant information has an additional "merchant_group_id" column
    id_columns = id_columns.merge(merchants[["merchant_id","merchant_group_id"]], how = "left", on = "merchant_id")
    # release the source frames before the string conversion below
    del new_transactions, hist_transactions, merchants
    # convert these columns to edge list: prefix every value with its column name
    to_process_cols = ['city_id', 'merchant_category_id', 'state_id','subsector_id', 'merchant_group_id']
    for c in to_process_cols:
        id_columns[c] = f"{c}_" + id_columns[c].astype(str)
    return id_columns
# id_columns = get_id_data()
# path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
# id_columns.to_csv(path, index = False)
# id_columns