# ABOUT: 
- this notebook evaluates the generated node2vec embeddings as features for predicting `target` (one saved embedding file per run, selected via `node2vec_path`)
- findings: 
    - using card_id embeddings appears to cause overfitting
        
- details:       
    - i.e. compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as feature
    - model used is Histogram Gradient boosting
    - metrics used are r2 and rmse
    - 5 fold cross validated (`cv = 5`)

In [1]:
from config import *

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate

In [2]:
# Id column that is read from the processed id file together with card_id.
target_id_column = "merchant_category_id"
# Saved embedding model, loaded further down with nodevectors.GGVec.load.
# NOTE(review): absolute, machine-specific path; the file name appears to encode
# target_id_column, so the two presumably need to be changed together — confirm.
node2vec_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_merchant_category_id.zip"

In [3]:
# Processed id file; only card_id and the selected id column are loaded.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
wanted_id_columns = ["card_id", target_id_column]
id_columns = pd.read_csv(path, usecols=wanted_id_columns)
id_columns.head()

Unnamed: 0,card_id,merchant_category_id
0,C_ID_4e6213e9bc,merchant_category_id_80
1,C_ID_4e6213e9bc,merchant_category_id_367
2,C_ID_4e6213e9bc,merchant_category_id_80
3,C_ID_4e6213e9bc,merchant_category_id_560
4,C_ID_4e6213e9bc,merchant_category_id_80


In [4]:
# Training rows: the key (card_id), the single baseline feature and the label.
train_file = pd.read_csv(
    train_path,
    usecols=["card_id", "feature_2", "target"],
)
train_file.head()

Unnamed: 0,card_id,feature_2,target
0,C_ID_92a2005557,2,-0.820283
1,C_ID_3d0044924f,1,0.392913
2,C_ID_d639edf6cd,2,0.688056
3,C_ID_186d6a6901,3,0.142495
4,C_ID_cdbd2c0db2,3,-0.159749


In [5]:
# Load the saved nodevectors GGVec model from disk.
node2vec = nodevectors.GGVec.load(node2vec_path)
# Turn the node -> vector mapping into a frame with one row per node; after
# reset_index the node key lives in the "index" column, followed by the
# embedding dimensions.
node2vec_embeddings = (
    pd.DataFrame
    .from_dict(node2vec.model, orient="index")
    .reset_index()
)
node2vec_embeddings

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,C_ID_00007093c1,-0.711161,-0.726014,0.743626,0.774802,-0.787023,-0.762485,-0.721565,-0.839366,0.774019,...,0.720581,-0.822426,0.804105,-0.729099,0.756346,0.725666,-0.751030,-0.786627,0.779041,-0.681421
1,C_ID_0001238066,0.083626,0.146398,-0.191121,0.031053,-0.987720,-0.775962,-0.514045,-0.790308,0.213312,...,0.010769,-0.890207,-0.556724,-0.011869,-0.542368,-0.608573,0.622186,-0.880643,0.724820,0.867790
2,C_ID_0001506ef0,0.605687,-0.574744,-0.305123,0.392635,1.000000,-0.345872,-0.551220,0.354372,0.988053,...,0.224684,-0.823909,-0.192172,-0.784061,0.994550,0.506569,0.469637,-0.015037,0.103247,-0.280158
3,C_ID_0001793786,0.649605,-0.232497,0.006912,0.477335,0.125062,0.167437,-0.420117,0.543700,-0.288750,...,-0.252934,0.006459,-0.342390,-0.231364,0.070232,0.175987,0.041089,0.469319,-0.290688,-0.022376
4,C_ID_000183fdda,0.367779,0.128463,0.113040,-0.166852,0.777394,0.142162,-0.502617,-0.054007,0.266776,...,0.772080,0.902140,0.993499,-0.701727,0.997306,-0.162804,-0.332992,-0.322887,0.199605,-0.476724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325866,merchant_category_id_885,0.043426,0.013879,-0.075276,0.017759,-0.031334,-0.042183,-0.015771,-0.046125,0.054176,...,0.087926,-0.072253,0.006458,-0.018079,0.024041,0.007084,0.005956,-0.009067,0.017579,-0.052787
325867,merchant_category_id_889,-0.551628,0.988355,-0.980682,-0.261231,0.144907,-0.311586,0.932659,0.337135,0.030166,...,0.967572,0.015129,0.995234,-0.808044,0.104486,0.805447,0.993861,0.999270,0.871827,-0.461966
325868,merchant_category_id_891,0.088297,0.014548,-0.150495,0.030505,-0.042222,-0.069345,-0.038439,-0.109091,0.116619,...,0.141011,-0.109357,0.015460,-0.013491,0.048491,0.013096,-0.002667,-0.025140,0.028173,-0.096071
325869,merchant_category_id_9,-0.007631,0.006221,-0.025057,0.002821,-0.018910,-0.010514,-0.016558,-0.017576,0.013766,...,0.007104,-0.027755,0.010721,0.005480,-0.010033,-0.009516,-0.011372,-0.028306,0.021587,-0.015713


In [6]:
# Attach an embedding to every row of id_columns by matching its card_id against
# the node key ("index"), then average the embedding columns per card_id.
# NOTE(review): the join key is card_id, so every row receives the card's own
# embedding; if the target_id_column embeddings (e.g. city_id embeddings) were
# meant to be aggregated per card, the join key would have to be
# target_id_column instead — confirm which one is intended.
node2vec_embeddings = (
    id_columns
    .merge(node2vec_embeddings, how="left", left_on="card_id", right_on="index")
    .drop(columns="index")
    .groupby("card_id")
    # numeric_only=True: the frame still carries the string-valued
    # target_id_column, which cannot be averaged. pandas >= 2.0 raises a
    # TypeError for it instead of silently dropping the column.
    .mean(numeric_only=True)
    .reset_index()
)

In [7]:
# Left-join the per-card embeddings onto the training rows so that every
# training row is kept, whether or not an embedding row matches its card_id.
dataset = pd.merge(train_file, node2vec_embeddings, how="left", on="card_id")
dataset.head()

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,22,23,24,25,26,27,28,29,30,31
0,C_ID_92a2005557,2,-0.820283,0.070308,0.023228,0.026288,0.035687,-0.057915,-0.074439,-0.058549,...,0.00637,0.021035,0.031432,-0.046516,0.084686,0.001601,0.00842,0.049547,-0.007827,-0.020319
1,C_ID_3d0044924f,1,0.392913,0.0449,0.070127,-0.122354,-0.150865,-0.001735,0.044793,-0.005661,...,0.033553,0.046318,0.135659,-0.056377,0.051288,0.054495,0.07693,-0.01777,-0.00511,-0.00732
2,C_ID_d639edf6cd,2,0.688056,0.038051,0.027099,-0.036467,0.029561,0.021679,-0.001191,-0.050191,...,-0.045626,-0.004545,0.008468,-0.061854,-0.050155,-0.057598,-0.006994,0.022297,-0.061618,0.061411
3,C_ID_186d6a6901,3,0.142495,-0.020454,-0.032945,0.004613,0.046724,0.030506,0.035086,0.037255,...,-0.005426,-0.088129,-0.017288,-0.04194,-0.071266,0.009992,-0.013619,0.048536,0.041364,-0.032068
4,C_ID_cdbd2c0db2,3,-0.159749,0.016984,0.075866,0.04707,0.003436,-0.015197,-0.009185,0.032109,...,-0.036195,0.011768,0.031311,0.083233,-0.043211,0.089787,0.033057,-0.039036,0.027952,-0.019096


In [8]:
# Display the merged training frame (the notebook shows a truncated rendering).
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,22,23,24,25,26,27,28,29,30,31
0,C_ID_92a2005557,2,-0.820283,0.070308,0.023228,0.026288,0.035687,-0.057915,-0.074439,-0.058549,...,0.006370,0.021035,0.031432,-0.046516,0.084686,0.001601,0.008420,0.049547,-0.007827,-0.020319
1,C_ID_3d0044924f,1,0.392913,0.044900,0.070127,-0.122354,-0.150865,-0.001735,0.044793,-0.005661,...,0.033553,0.046318,0.135659,-0.056377,0.051288,0.054495,0.076930,-0.017770,-0.005110,-0.007320
2,C_ID_d639edf6cd,2,0.688056,0.038051,0.027099,-0.036467,0.029561,0.021679,-0.001191,-0.050191,...,-0.045626,-0.004545,0.008468,-0.061854,-0.050155,-0.057598,-0.006994,0.022297,-0.061618,0.061411
3,C_ID_186d6a6901,3,0.142495,-0.020454,-0.032945,0.004613,0.046724,0.030506,0.035086,0.037255,...,-0.005426,-0.088129,-0.017288,-0.041940,-0.071266,0.009992,-0.013619,0.048536,0.041364,-0.032068
4,C_ID_cdbd2c0db2,3,-0.159749,0.016984,0.075866,0.047070,0.003436,-0.015197,-0.009185,0.032109,...,-0.036195,0.011768,0.031311,0.083233,-0.043211,0.089787,0.033057,-0.039036,0.027952,-0.019096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,0.015105,0.115888,0.044397,-0.084482,-0.020086,0.044771,-0.026798,...,0.022961,-0.004145,0.032660,0.036475,0.028201,0.020797,-0.007842,0.090610,-0.030629,-0.073059
201913,C_ID_1314773c0b,1,0.312917,-0.050937,0.006379,-0.037469,0.031873,0.041716,-0.062037,-0.024032,...,-0.063829,-0.098392,0.062554,0.056510,-0.035686,0.004304,-0.004383,-0.001546,0.050612,0.022927
201914,C_ID_7666735b3d,3,0.093494,0.101701,0.059454,0.015019,-0.049518,0.068437,0.069943,-0.038297,...,-0.072680,-0.051176,0.000435,0.089096,0.049994,0.047484,0.051700,0.088432,0.042026,-0.049972
201915,C_ID_73f5a0efd0,2,-4.676589,0.058470,-0.041716,-0.022984,-0.020506,-0.024398,-0.051296,-0.022272,...,-0.013345,-0.017586,0.017830,0.080248,-0.038077,-0.020905,-0.005659,0.012160,-0.066630,-0.010750


### evaluate on dataset with no embeddings

In [9]:
# Feature sets that are compared against each other:
#   train_cols_a - baseline, feature_2 only
#   train_cols_b - feature_2 plus the embedding columns (integer labels 0..31)
n_embedding_dims = 32
train_cols_a = ["feature_2"]
train_cols_b = [*train_cols_a, *range(n_embedding_dims)]
target_col = "target"

In [10]:
# Baseline run: feature_2 is the only predictor.
features = train_cols_a
X = dataset[features]
y = dataset[target_col]

# Cross-validation settings, consumed by the cross_validate call that follows.
cv = 5
scoring = ('r2', 'neg_root_mean_squared_error')
verbose = 1

model_params = dict(
    learning_rate=0.01,
    max_iter=100,
    # boolean mask over X's columns that marks feature_2 as categorical
    categorical_features=X.columns.isin(["feature_2"]),
    l2_regularization=0.005,
    early_stopping=True,
    n_iter_no_change=5,
    verbose=1,
    random_state=0,
    max_depth=5,
    max_leaf_nodes=20,
)
model = HistGradientBoostingRegressor(**model_params)

In [11]:
# Cross-validate the current model; train-fold scores are kept as well so the
# train/validation gap can be inspected afterwards.
scores = cross_validate(
    model,
    X,
    y,
    cv=cv,
    scoring=scoring,
    verbose=verbose,
    return_train_score=True,
)

Binning 0.001 GB of training data: 0.065 s
Binning 0.000 GB of validation data: 0.000 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34904, val loss: 8.13150, in 0.021s
[2/100] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


1 tree, 3 leaves, max depth = 2, train loss: 7.34900, val loss: 8.13149, in 0.009s
[3/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34896, val loss: 8.13148, in 0.007s
[4/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34892, val loss: 8.13147, in 0.008s
[5/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34888, val loss: 8.13147, in 0.007s
[6/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34884, val loss: 8.13146, in 0.009s
[7/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34880, val loss: 8.13146, in 0.007s
[8/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34877, val loss: 8.13145, in 0.007s
[9/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34873, val loss: 8.13144, in 0.007s
[10/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34869, val loss: 8.13144, in 0.007s
[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34866, val loss: 8.13144, in 0.008s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.34863, val loss: 8.13143, in 0.008s
[13/

[39/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46131, val loss: 7.24186, in 0.008s
[40/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46129, val loss: 7.24185, in 0.007s
[41/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46127, val loss: 7.24184, in 0.006s
[42/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46125, val loss: 7.24184, in 0.008s
[43/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46123, val loss: 7.24183, in 0.007s
[44/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46121, val loss: 7.24182, in 0.008s
[45/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46120, val loss: 7.24181, in 0.008s
[46/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46118, val loss: 7.24180, in 0.007s
[47/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46116, val loss: 7.24179, in 0.007s
[48/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46115, val loss: 7.24179, in 0.007s
[49/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.46113, val loss: 7.24178

[25/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51013, val loss: 6.62375, in 0.008s
[26/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51011, val loss: 6.62373, in 0.008s
[27/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51009, val loss: 6.62372, in 0.008s
[28/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51007, val loss: 6.62371, in 0.008s
[29/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51004, val loss: 6.62370, in 0.009s
[30/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51002, val loss: 6.62368, in 0.007s
[31/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.51000, val loss: 6.62367, in 0.007s
[32/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50998, val loss: 6.62366, in 0.008s
[33/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50996, val loss: 6.62365, in 0.009s
[34/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50994, val loss: 6.62364, in 0.008s
[35/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.50992, val loss: 6.62363

[11/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43366, val loss: 6.54879, in 0.006s
[12/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43364, val loss: 6.54875, in 0.006s
[13/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43361, val loss: 6.54870, in 0.007s
[14/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43358, val loss: 6.54866, in 0.006s
[15/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43356, val loss: 6.54862, in 0.006s
[16/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43354, val loss: 6.54858, in 0.006s
[17/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43351, val loss: 6.54854, in 0.007s
[18/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43349, val loss: 6.54850, in 0.007s
[19/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43347, val loss: 6.54846, in 0.007s
[20/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43345, val loss: 6.54842, in 0.007s
[21/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43342, val loss: 6.54838

[100/100] 1 tree, 3 leaves, max depth = 2, train loss: 7.43257, val loss: 6.54661, in 0.007s
Fit 100 trees in 0.771 s, (300 total leaves)
Time spent computing histograms: 0.072s
Time spent finding best splits:  0.016s
Time spent applying splits:      0.113s
Time spent predicting:           0.024s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.1s finished


In [12]:
# Collect the cross-validation output of every run under a descriptive key.
results = {"no_embeddings": scores}

### evaluate on dataset with node2vec embeddings

In [13]:
# Embeddings run: feature_2 plus the embedding columns.
features = train_cols_b
# The selected frame mixes a string label ("feature_2") with integer labels
# (0..31). scikit-learn >= 1.2 refuses mixed-type column labels during
# feature-name validation, so every label is converted to a string here. Only
# the labels of X change; the values and `dataset` itself are left untouched.
X = dataset[features].rename(columns=str)
y = dataset[target_col]

# Cross-validation settings, consumed by the cross_validate call that follows.
cv = 5
scoring=('r2', 'neg_root_mean_squared_error')
verbose = 1

model_params = {
    "learning_rate":0.01,
    "max_iter":100,
    # boolean mask over X's columns that marks feature_2 as categorical
    "categorical_features" : X.columns.isin(["feature_2"]),
    "l2_regularization":0.005,
    "early_stopping":True,
    "n_iter_no_change":5,
    "verbose":1,
    "random_state":0,
    "max_depth":5,
    "max_leaf_nodes":20
}
model = HistGradientBoostingRegressor(**model_params)

In [14]:
# Cross-validate the current model (train-fold scores included) and file the
# output under the "with_embeddings" key.
scores = cross_validate(
    model,
    X,
    y,
    cv=cv,
    scoring=scoring,
    verbose=verbose,
    return_train_score=True,
)
results["with_embeddings"] = scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Binning 0.038 GB of training data: 0.790 s
Binning 0.004 GB of validation data: 0.011 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34856, val loss: 8.13136, in 0.026s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34805, val loss: 8.13122, in 0.027s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34755, val loss: 8.13109, in 0.020s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34706, val loss: 8.13096, in 0.017s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34652, val loss: 8.13102, in 0.019s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34599, val loss: 8.13108, in 0.019s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34550, val loss: 8.13097, in 0.019s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34497, val loss: 8.13105, in 0.019s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.34444, val loss: 8.13111, in 0.020s
Fit 9 trees in 1.117 s, (180 total leaves)
Time spe



Binning 0.038 GB of training data: 0.685 s
Binning 0.004 GB of validation data: 0.010 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38690, val loss: 7.81601, in 0.015s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38643, val loss: 7.81598, in 0.016s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38598, val loss: 7.81595, in 0.015s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38553, val loss: 7.81593, in 0.015s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38510, val loss: 7.81593, in 0.015s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38467, val loss: 7.81593, in 0.014s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38422, val loss: 7.81593, in 0.015s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38379, val loss: 7.81594, in 0.016s
[9/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.38335, val loss: 7.81596, in 0.016s
[10/100] 1 tree, 20 leaves, max depth = 5, train lo



Binning 0.038 GB of training data: 0.672 s
Binning 0.004 GB of validation data: 0.007 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46195, val loss: 7.24257, in 0.017s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46145, val loss: 7.24259, in 0.017s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46098, val loss: 7.24261, in 0.017s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46051, val loss: 7.24265, in 0.018s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.46004, val loss: 7.24267, in 0.019s
Fit 5 trees in 0.901 s, (100 total leaves)
Time spent computing histograms: 0.024s
Time spent finding best splits:  0.006s
Time spent applying splits:      0.014s
Time spent predicting:           0.001s




Binning 0.038 GB of training data: 0.805 s
Binning 0.004 GB of validation data: 0.007 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.51036, val loss: 6.62425, in 0.017s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50985, val loss: 6.62430, in 0.017s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50933, val loss: 6.62435, in 0.017s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50891, val loss: 6.62431, in 0.015s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.50847, val loss: 6.62427, in 0.016s
Fit 5 trees in 1.056 s, (100 total leaves)
Time spent computing histograms: 0.020s
Time spent finding best splits:  0.006s
Time spent applying splits:      0.013s
Time spent predicting:           0.001s




Binning 0.038 GB of training data: 0.703 s
Binning 0.004 GB of validation data: 0.010 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43350, val loss: 6.54934, in 0.015s
[2/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43302, val loss: 6.54934, in 0.017s
[3/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43260, val loss: 6.54931, in 0.016s
[4/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43214, val loss: 6.54932, in 0.016s
[5/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43169, val loss: 6.54933, in 0.016s
[6/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43128, val loss: 6.54931, in 0.015s
[7/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43085, val loss: 6.54933, in 0.016s
[8/100] 1 tree, 20 leaves, max depth = 5, train loss: 7.43046, val loss: 6.54931, in 0.016s
Fit 8 trees in 0.997 s, (160 total leaves)
Time spent computing histograms: 0.032s
Time spent finding best splits:  0.009s
Time spent applying 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.7s finished


In [15]:
from numpy import mean

# Mean negated RMSE across folds, printed as: validation then train for the
# baseline run, followed by validation then train for the embeddings run.
for run_name in ("no_embeddings", "with_embeddings"):
    for score_key in ("test_neg_root_mean_squared_error", "train_neg_root_mean_squared_error"):
        print(mean(results[run_name][score_key]))

-3.85005310803518
-3.8501609494819595
-3.850322958015238
-3.8496290867837524


In [16]:
# Mean R2 across folds, printed as: validation then train for the baseline
# run, followed by validation then train for the embeddings run.
for run_name in ("no_embeddings", "with_embeddings"):
    for score_key in ("test_r2", "train_r2"):
        print(mean(results[run_name][score_key]))

0.00013871442920936338
0.0001655841013616044
-1.1479804909342306e-06
0.00044174938069247994


In [7]:
# get id columns
def get_id_data():
    """Build the per-transaction table of id columns.

    Reads the merchants, new-transaction and historical-transaction CSVs,
    stacks the two transaction tables (historical rows first), attaches
    ``merchant_group_id`` from the merchants table and prefixes the values of
    the categorical id columns with their column name.

    The file paths and ``feature_names`` are not defined in this notebook;
    presumably they are provided by ``from config import *`` — verify.

    Returns:
        pandas.DataFrame: the combined transaction id columns plus
        ``merchant_group_id``, with the processed columns holding strings of
        the form ``"<column>_<value>"``.
    """
    transaction_id_cols = feature_names['transactions']['id']
    merchant_table = pd.read_csv(merchants_path, usecols=feature_names['merchants']['id'])
    new_tx = pd.read_csv(new_transactions_path, usecols=transaction_id_cols)
    hist_tx = pd.read_csv(historical_transactions_path, usecols=transaction_id_cols)

    # merchant_id occurs more than once in the merchants file; keep the first
    # row of each so the merge below cannot multiply transaction rows
    merchant_table = merchant_table.drop_duplicates(subset="merchant_id")

    # historical and new transactions share the same columns
    id_columns = pd.concat([hist_tx, new_tx], axis=0)

    # missing merchant_id is replaced by a fixed id (described by the original
    # author as the most frequent one)
    id_columns['merchant_id'] = id_columns['merchant_id'].fillna('M_ID_00a6ca8a8a')

    # bring in merchant_group_id, the one extra id column the merchants table has
    id_columns = id_columns.merge(
        merchant_table[["merchant_id", "merchant_group_id"]],
        how="left",
        on="merchant_id",
    )
    del new_tx, hist_tx, merchant_table

    # prefix every value with its column name so values coming from different
    # id columns stay distinct when used as edge-list nodes
    for column in ('city_id', 'merchant_category_id', 'state_id', 'subsector_id', 'merchant_group_id'):
        prefix = f"{column}_"
        id_columns[column] = prefix + id_columns[column].astype(str)
    return id_columns
# id_columns = get_id_data()
# path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
# id_columns.to_csv(path, index = False)
# id_columns