# ABOUT: 
- this code evaluates the node2vec embeddings
- findings: 
    - using card_id embeddings appears to cause overfitting
        
- details:       
    - i.e., compared to the baseline, performance on the training set is better but performance on the validation set is worse
    - baseline - using just feature_2 as the only feature
    - model used is histogram-based gradient boosting (scikit-learn's HistGradientBoostingRegressor)
    - metrics used are r2 and rmse
    - 3-fold cross-validated

In [134]:

from config import train_path, test_path, feature_names

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nodevectors
from sklearn.model_selection import cross_validate


###  load id data

In [7]:
# get id columns
def get_id_data(fill_merchant_id = 'M_ID_00a6ca8a8a', nrows = None):
    """
    Build a single DataFrame of id columns from the merchants and transactions CSVs.

    Steps: read the id columns of the three CSVs, drop repeated merchant_id rows
    from the merchants table, stack historical and new transactions, fill missing
    merchant_id values, left-join merchant_group_id from the merchants table and
    prefix selected id columns with their column name so that values coming from
    different columns cannot collide when used as node labels.

    Parameters
    ----------
    fill_merchant_id : str, default 'M_ID_00a6ca8a8a'
        Value used to replace missing merchant_id entries in the transactions.
        The default is the id the original code hard-coded (described by the
        original author as the most frequent merchant_id).
    nrows : int or None, default None
        Forwarded to every pd.read_csv call; None reads each file completely.
        Pass a small number for a quick trial run.

    Returns
    -------
    pandas.DataFrame
        One row per transaction, with the transaction id columns plus
        merchant_group_id.

    NOTE(review): merchants_path, new_transactions_path and
    historical_transactions_path are read from the notebook namespace, but the
    import cell visible in this notebook only imports train_path, test_path and
    feature_names from config - confirm these names are defined before this runs.
    """
    merchant_id_cols = feature_names['merchants']['id']
    transaction_id_cols = feature_names['transactions']['id']

    # read
    merchants = pd.read_csv(merchants_path, usecols = merchant_id_cols, nrows = nrows)
    new_transactions = pd.read_csv(new_transactions_path, usecols = transaction_id_cols, nrows = nrows)
    hist_transactions = pd.read_csv(historical_transactions_path, usecols = transaction_id_cols, nrows = nrows)

    # process
    # keep only the first row for each merchant_id - the merchants file contains repeats
    merchants = merchants[~merchants.merchant_id.duplicated()]
    # concat historical and new transactions - they have the same columns
    id_columns = pd.concat([hist_transactions, new_transactions], axis = 0)
    # fill missing merchant_id with the configured fallback id
    id_columns['merchant_id'] = id_columns['merchant_id'].fillna(fill_merchant_id)
    # merge transactions data with merchant information - merchant information has an additional "merchant_group_id" column
    id_columns = id_columns.merge(merchants[["merchant_id","merchant_group_id"]], how = "left", on = "merchant_id")
    # release the source frames before the string conversion below allocates new columns
    del new_transactions, hist_transactions, merchants

    # prefix each value with its column name, e.g. city_id 88 -> "city_id_88"
    # NOTE(review): merchant_group_id comes from a left join; if any merchant_id has no
    # match the column would presumably contain NaN and the labels would change - verify
    to_process_cols = ['city_id', 'merchant_category_id', 'state_id','subsector_id', 'merchant_group_id']
    for c in to_process_cols:
        id_columns[c] = f"{c}_" + id_columns[c].astype(str)
    return id_columns
id_columns = get_id_data()

In [10]:
# write the processed id columns to CSV (index not written), then display the frame
# NOTE(review): absolute, machine-specific path - consider sourcing it from config like train_path
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns.to_csv(path, index = False)
id_columns

Unnamed: 0,card_id,city_id,merchant_category_id,merchant_id,state_id,subsector_id,merchant_group_id
0,C_ID_4e6213e9bc,city_id_88,merchant_category_id_80,M_ID_e020e9b302,state_id_16,subsector_id_37,merchant_group_id_35
1,C_ID_4e6213e9bc,city_id_88,merchant_category_id_367,M_ID_86ec983688,state_id_16,subsector_id_16,merchant_group_id_2084
2,C_ID_4e6213e9bc,city_id_88,merchant_category_id_80,M_ID_979ed661fc,state_id_16,subsector_id_37,merchant_group_id_27369
3,C_ID_4e6213e9bc,city_id_88,merchant_category_id_560,M_ID_e6d5ae8ea6,state_id_16,subsector_id_34,merchant_group_id_24104
4,C_ID_4e6213e9bc,city_id_88,merchant_category_id_80,M_ID_e020e9b302,state_id_16,subsector_id_37,merchant_group_id_35
...,...,...,...,...,...,...,...
31075387,C_ID_1320dee851,city_id_142,merchant_category_id_309,M_ID_7754b67f3b,state_id_19,subsector_id_21,merchant_group_id_35
31075388,C_ID_f112aa3381,city_id_158,merchant_category_id_560,M_ID_da063195b7,state_id_15,subsector_id_34,merchant_group_id_13452
31075389,C_ID_bd97b86450,city_id_69,merchant_category_id_278,M_ID_9a9ccb6544,state_id_9,subsector_id_37,merchant_group_id_27710
31075390,C_ID_c0513fd84f,city_id_130,merchant_category_id_367,M_ID_40c28d596f,state_id_7,subsector_id_16,merchant_group_id_35


### load node2vec

In [21]:
# read only the columns used in this notebook: the card key, the target and feature_2
train_usecols = ["card_id", "target", "feature_2"]
train_file = pd.read_csv(train_path, usecols = train_usecols)

In [13]:
# load the previously trained embedding model from its zip archive
# NOTE(review): the variable and file name say "node2vec" but the loader called is
# nodevectors.GGVec - confirm which algorithm actually produced this file
# NOTE(review): absolute, machine-specific path - consider sourcing it from config like train_path
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model\node2vec_card_id_merchant_group_id.zip"
node2vec = nodevectors.GGVec.load(path)

In [45]:
# build a frame from node2vec.model with one row per key; reset_index moves the
# keys out of the row index into a regular column named "index"
node2vec_embeddings = (
    pd.DataFrame
    .from_dict(node2vec.model, orient = "index")
    .reset_index()
)

In [50]:
# attach the embedding columns to every training row (left join keeps all rows of
# train_file), then discard the join key brought in from the embeddings frame
dataset = (
    train_file
    .merge(node2vec_embeddings, how = "left", left_on = "card_id", right_on = "index")
    .drop(columns = "index")
)

In [61]:
# display the merged training frame: card_id, feature_2, target and the embedding columns
dataset

Unnamed: 0,card_id,feature_2,target,0,1,2,3,4,5,6,...,22,23,24,25,26,27,28,29,30,31
0,C_ID_92a2005557,2,-0.820283,-0.220568,-0.139744,0.057934,-0.015119,0.318756,0.018661,0.230598,...,0.246385,-0.140476,0.229934,-0.199353,-0.044059,-0.024957,0.123961,0.149826,0.230108,0.006520
1,C_ID_3d0044924f,1,0.392913,-0.239454,-0.405854,0.046126,-0.339812,0.537390,0.241904,-0.179123,...,0.276161,-0.029742,0.458275,-0.236365,-0.429708,-0.210461,-0.172655,-0.176038,-0.069350,-0.316585
2,C_ID_d639edf6cd,2,0.688056,-0.112437,0.090886,0.058245,0.062859,0.123670,-0.208891,0.241053,...,-0.076500,-0.260998,0.182565,-0.091890,0.296772,0.030815,0.044964,-0.154956,0.049484,-0.107843
3,C_ID_186d6a6901,3,0.142495,-0.279849,0.072260,-0.258601,0.263341,0.120510,0.260998,-0.110376,...,-0.213643,-0.171976,0.118833,-0.122932,0.248441,0.062957,0.314313,-0.195974,0.188340,0.290067
4,C_ID_cdbd2c0db2,3,-0.159749,-0.108046,0.033870,-0.110723,0.154037,0.237220,0.161799,0.132301,...,-0.126480,-0.058793,0.030644,-0.042563,0.350222,0.052203,-0.133354,-0.078576,-0.056353,0.354240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2,-2.740821,0.003949,-0.088620,0.126074,-0.230720,-0.095478,0.047710,-0.345991,...,-0.247296,-0.121769,0.196081,0.005684,0.071363,0.023308,-0.104513,-0.142490,-0.208996,-0.144854
201913,C_ID_1314773c0b,1,0.312917,-0.156092,-0.012785,-0.164592,-0.175823,0.214624,-0.053483,-0.060479,...,-0.286465,0.261342,0.131991,-0.093283,0.120997,-0.084877,-0.265694,-0.132654,0.196770,-0.129306
201914,C_ID_7666735b3d,3,0.093494,0.158885,-0.111571,-0.128393,-0.014080,-0.210952,-0.055708,0.028582,...,-0.202771,-0.192850,-0.217950,0.250106,-0.155840,-0.059609,0.244078,0.057832,-0.287865,0.411941
201915,C_ID_73f5a0efd0,2,-4.676589,-0.317371,0.311942,0.116634,-0.062602,0.033486,0.261600,-0.116269,...,-0.016839,-0.023256,-0.220083,-0.156501,0.111673,-0.028209,0.244969,0.002849,-0.001838,0.098923


## Evaluate effectiveness of card_id embeddings

In [127]:
# feature sets compared below
#   a - baseline: feature_2 only
#   b - feature_2 plus the embedding columns, which carry the integer labels 0..31
target_col = "target"
train_cols_a = ["feature_2"]
train_cols_b = [*train_cols_a, *range(32)]

In [136]:
# baseline experiment: feature_2 is the only input
features = train_cols_a
X,y = dataset[features], dataset[target_col]

# cross-validation settings shared by the cross_validate call in the next cell
cv = 3
scoring=('r2', 'neg_root_mean_squared_error')
verbose = 10

model_params = {
    "max_iter":100,
    # boolean mask over X's columns marking feature_2 as categorical
    "categorical_features" : X.columns.isin(["feature_2"]),
    # fix the estimator's internal randomness (e.g. the early-stopping validation
    # split) so repeated runs of this cell give the same scores
    "random_state" : 0,
}
model = HistGradientBoostingRegressor(**model_params)

In [137]:
# run the cross-validation configured in the previous cell; train scores are
# requested as well so train and validation performance can be compared
scores = cross_validate(
    model, X, y,
    cv = cv,
    scoring = scoring,
    verbose = verbose,
    return_train_score = True,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................
[CV] END  neg_root_mean_squared_error: (train=-3.864, test=-3.823) r2: (train=0.000, test=0.000) total time=   0.4s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] END  neg_root_mean_squared_error: (train=-3.844, test=-3.862) r2: (train=0.000, test=0.000) total time=   0.2s
[CV] START .....................................................................
[CV] END  neg_root_mean_squared_error: (train=-3.842, test=-3.866) r2: (train=0.000, test=0.000) total time=   0.0s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s finished


In [150]:
from numpy import mean

# Print the mean of every cross-validation metric, labeled with its name.
# This reads the global `scores`, so it reports whichever cross_validate cell ran last -
# run it directly after the cross_validate cell it is meant to summarise.
# NOTE(review): the saved output of this cell is identical to the output of the
# embeddings summary further down, and its train_r2 (0.0141) disagrees with the
# per-fold baseline log (train r2 = 0.000). It looks like `scores` had already been
# overwritten by the embeddings run when this cell was last executed - re-run in order.
metrics = ["train_r2", 'test_r2', 'train_neg_root_mean_squared_error', 'test_neg_root_mean_squared_error']
for m in metrics:
    print(f"{m}: {mean(scores[m])}")

0.014138117973720998
-0.00041725237082449834
-3.8231240649221405
-3.8512397294878475


In [147]:
# embeddings experiment: feature_2 plus the embedding columns
features = train_cols_b
# The embedding columns have integer labels while feature_2 is a string label.
# scikit-learn rejects DataFrames whose column labels mix str and non-str types,
# so every label is converted to str before the frame reaches the estimator.
X,y = dataset[features].rename(columns = str), dataset[target_col]

# cross-validation settings shared by the cross_validate call in the next cell
cv = 3
scoring=('r2', 'neg_root_mean_squared_error')
verbose = 10

model_params = {
    "max_iter":100,
    # boolean mask over X's columns marking feature_2 as categorical
    "categorical_features" : X.columns.isin(["feature_2"]),
    # fix the estimator's internal randomness (e.g. the early-stopping validation
    # split) so repeated runs of this cell give the same scores
    "random_state" : 0,
}
model = HistGradientBoostingRegressor(**model_params)

In [148]:
# run the cross-validation configured in the previous cell; train scores are
# requested as well so train and validation performance can be compared
scores = cross_validate(
    model, X, y,
    cv = cv,
    scoring = scoring,
    verbose = verbose,
    return_train_score = True,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV] END  neg_root_mean_squared_error: (train=-3.823, test=-3.825) r2: (train=0.021, test=-0.001) total time=   1.1s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s


[CV] END  neg_root_mean_squared_error: (train=-3.825, test=-3.862) r2: (train=0.010, test=-0.000) total time=   0.8s
[CV] START .....................................................................
[CV] END  neg_root_mean_squared_error: (train=-3.822, test=-3.867) r2: (train=0.011, test=-0.000) total time=   0.9s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.3s finished


In [151]:
from numpy import mean

# Print the mean of every cross-validation metric, labeled with its name.
# This reads the global `scores`, so it reports whichever cross_validate cell ran last -
# run it directly after the cross_validate cell it is meant to summarise.
metrics = ["train_r2", 'test_r2', 'train_neg_root_mean_squared_error', 'test_neg_root_mean_squared_error']
for m in metrics:
    print(f"{m}: {mean(scores[m])}")

0.014138117973720998
-0.00041725237082449834
-3.8231240649221405
-3.8512397294878475


# pipeline
- features - different embeddings
- cross validated