# ABOUT: 
- Average the trained node2vec embeddings of each id column per `card_id`, join them onto the training and test sets, and save the resulting embedding tables

In [1]:
from config import *
import pandas as pd
import nodevectors
import os
from tqdm import tqdm

In [2]:
# Id columns that each have their own trained card_id <-> id model archive.
id_cols = [
    'merchant_id',
    'merchant_group_id',
    'merchant_category_id',
    'subsector_id',
    'city_id',
    'state_id',
]
# Archive file names, listed in the same order as id_cols.
paths = [
    'node2vec_card_id_merchant_id.zip',
    'node2vec_card_id_merchant_group_id.zip',
    'node2vec_card_id_merchant_category_id.zip',
    'node2vec_card_id_subsector_id.zip',
    'node2vec_card_id_city_id.zip',
    'node2vec_card_id_state_id.zip',
]
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
node2vec_dir = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model"
# The archive names are relative, so later loads depend on this working directory.
os.chdir(node2vec_dir)
node2vec_paths = {id_col: archive for id_col, archive in zip(id_cols, paths)}
node2vec_paths

{'merchant_id': 'node2vec_card_id_merchant_id.zip',
 'merchant_group_id': 'node2vec_card_id_merchant_group_id.zip',
 'merchant_category_id': 'node2vec_card_id_merchant_category_id.zip',
 'subsector_id': 'node2vec_card_id_subsector_id.zip',
 'city_id': 'node2vec_card_id_city_id.zip',
 'state_id': 'node2vec_card_id_state_id.zip'}

In [3]:
# Read only the card_id key of the train and test sets; the embedding
# columns are joined onto these frames further down.
train, test = (
    pd.read_csv(csv_path, usecols=["card_id"])
    for csv_path in (train_path, test_path)
)
test

Unnamed: 0,card_id
0,C_ID_0ab67a22ab
1,C_ID_130fd0cbdd
2,C_ID_b709037bc5
3,C_ID_d27d835a9f
4,C_ID_2b5e3df5c2
...,...
123618,C_ID_7a239d2eda
123619,C_ID_75ace375ae
123620,C_ID_21d56d950c
123621,C_ID_6c46fc5a9d


In [4]:
# Load card_id together with every id column listed in id_cols.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns = pd.read_csv(path, usecols=["card_id", *id_cols])
id_columns

Unnamed: 0,card_id,city_id,merchant_category_id,merchant_id,state_id,subsector_id,merchant_group_id
0,C_ID_4e6213e9bc,city_id_88,merchant_category_id_80,M_ID_e020e9b302,state_id_16,subsector_id_37,merchant_group_id_35
1,C_ID_4e6213e9bc,city_id_88,merchant_category_id_367,M_ID_86ec983688,state_id_16,subsector_id_16,merchant_group_id_2084
2,C_ID_4e6213e9bc,city_id_88,merchant_category_id_80,M_ID_979ed661fc,state_id_16,subsector_id_37,merchant_group_id_27369
3,C_ID_4e6213e9bc,city_id_88,merchant_category_id_560,M_ID_e6d5ae8ea6,state_id_16,subsector_id_34,merchant_group_id_24104
4,C_ID_4e6213e9bc,city_id_88,merchant_category_id_80,M_ID_e020e9b302,state_id_16,subsector_id_37,merchant_group_id_35
...,...,...,...,...,...,...,...
31075387,C_ID_1320dee851,city_id_142,merchant_category_id_309,M_ID_7754b67f3b,state_id_19,subsector_id_21,merchant_group_id_35
31075388,C_ID_f112aa3381,city_id_158,merchant_category_id_560,M_ID_da063195b7,state_id_15,subsector_id_34,merchant_group_id_13452
31075389,C_ID_bd97b86450,city_id_69,merchant_category_id_278,M_ID_9a9ccb6544,state_id_9,subsector_id_37,merchant_group_id_27710
31075390,C_ID_c0513fd84f,city_id_130,merchant_category_id_367,M_ID_40c28d596f,state_id_7,subsector_id_16,merchant_group_id_35


In [1]:
# Peek at the first 100 rows; id_columns must already be loaded by the cell above.
id_columns.iloc[:100]

NameError: name 'id_columns' is not defined

## generate node2vec embeddings 

In [5]:
def generate_embeddings(transactions, target_id_column, node2vec_path):
    """Average the node2vec vectors of one id column per ``card_id``.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Frame with a ``card_id`` column and ``target_id_column``. Every row
        contributes to the mean, so ids that occur repeatedly for a card
        weigh proportionally more.
    target_id_column : str
        Name of the column whose values are looked up in the model.
    node2vec_path : str
        Path handed to ``nodevectors.GGVec.load``.

    Returns
    -------
    pandas.DataFrame
        One row per ``card_id`` with columns ``card_id`` and
        ``embedding_{target_id_column}_{dim}`` for every embedding dimension.
        Ids missing from the model are ignored in the mean; a card whose ids
        are all missing gets NaN in every embedding column.
    """
    # load trained node2vec
    node2vec = nodevectors.GGVec.load(node2vec_path)
    # convert embeddings to dataframe: one row per node, one column per dimension
    node_vectors = pd.DataFrame.from_dict(node2vec.model, orient = "index")
    embedding_dims = list(node_vectors.columns)
    node_vectors = node_vectors.reset_index()
    # attach the vector of each row's id (left join keeps rows with unknown ids as NaN)
    merged = transactions.merge(node_vectors, how = "left", left_on = target_id_column, right_on = "index")
    # Aggregate only the embedding dimensions. The merged frame still carries
    # the string-valued id column, and pandas >= 2.0 raises a TypeError when
    # mean() is asked to average a non-numeric column.
    card_embeddings = merged.groupby("card_id")[embedding_dims].mean().reset_index()
    card_embeddings.columns = ["card_id"] + [f"embedding_{target_id_column}_{dim}" for dim in embedding_dims]
    return card_embeddings
# Start from the bare card_id key so that re-running this cell does not merge
# the same embedding columns a second time (pandas would then emit duplicated
# columns with _x/_y suffixes).
train = train[["card_id"]]
test = test[["card_id"]]
# Join the per-card mean embedding of every id column onto both sets.
for target_id_column, node2vec_path in tqdm(node2vec_paths.items()):
    embeddings = generate_embeddings(id_columns[["card_id",target_id_column]], target_id_column, node2vec_path)
    # left joins keep every train/test card, even those without embeddings
    train = train.merge(embeddings, on = "card_id", how = "left")
    test = test.merge(embeddings, on = "card_id", how = "left")

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [03:47<00:00, 37.94s/it]


In [6]:
# Display the training frame after the embedding columns have been merged in.
train  

Unnamed: 0,card_id,embedding_merchant_id_0,embedding_merchant_id_1,embedding_merchant_id_2,embedding_merchant_id_3,embedding_merchant_id_4,embedding_merchant_id_5,embedding_merchant_id_6,embedding_merchant_id_7,embedding_merchant_id_8,...,embedding_state_id_6,embedding_state_id_7,embedding_state_id_8,embedding_state_id_9,embedding_state_id_10,embedding_state_id_11,embedding_state_id_12,embedding_state_id_13,embedding_state_id_14,embedding_state_id_15
0,C_ID_92a2005557,-0.021422,-0.048420,-0.069924,-0.235514,-0.066599,-0.007033,-0.037319,-0.079851,0.003120,...,-0.004938,-0.002624,0.000850,-0.002822,-0.000432,-0.001827,0.002113,0.000528,-0.001570,0.002187
1,C_ID_3d0044924f,-0.027878,-0.036459,-0.063783,-0.138645,-0.040927,-0.025110,-0.100333,-0.079696,0.013673,...,-0.004180,-0.002642,0.000839,-0.002639,-0.000403,-0.001928,0.001855,0.000794,-0.001388,0.001819
2,C_ID_d639edf6cd,-0.064158,0.006955,-0.094551,0.030421,-0.022097,-0.057407,0.037134,-0.084187,-0.021349,...,-0.001579,0.000784,-0.001795,-0.002831,-0.001593,-0.000011,0.000307,-0.001649,-0.000292,0.000362
3,C_ID_186d6a6901,0.014432,0.062962,-0.018725,0.090656,0.136996,0.139103,-0.171963,-0.084831,0.135640,...,-0.001622,0.000298,-0.000334,-0.000762,0.000128,-0.001378,0.000264,0.000427,-0.000211,0.000176
4,C_ID_cdbd2c0db2,0.081563,0.114104,0.006516,0.125304,0.210970,0.211177,-0.192805,-0.098114,0.180337,...,-0.001591,0.000718,-0.000517,-0.000258,0.000252,-0.000961,0.000265,0.000176,-0.000061,0.000072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,0.023376,-0.320476,-0.200911,0.146055,-0.126526,0.242035,0.051884,-0.158640,-0.049487,...,-0.000433,-0.000712,-0.001026,0.001749,0.000094,0.000990,0.000909,0.000843,0.000255,0.000140
201913,C_ID_1314773c0b,-0.175326,0.249148,0.275323,0.135643,0.055214,-0.214280,-0.238401,-0.324303,-0.023397,...,-0.001664,-0.000069,0.000583,0.001279,-0.001497,0.003895,0.001198,0.001665,-0.000519,0.000641
201914,C_ID_7666735b3d,-0.093119,-0.093210,0.055223,0.147493,-0.118970,0.096318,-0.103726,0.059491,0.161142,...,0.000537,-0.001881,0.000747,0.002086,0.000424,-0.001706,0.000163,0.004237,-0.000984,0.000550
201915,C_ID_73f5a0efd0,-0.013375,-0.068527,-0.092815,-0.205066,-0.048569,-0.005745,-0.131231,-0.084432,0.031983,...,-0.004985,-0.002663,0.000883,-0.002835,-0.000425,-0.001852,0.002142,0.000555,-0.001584,0.002206


In [7]:
# Display the test frame after the embedding columns have been merged in.
test

Unnamed: 0,card_id,embedding_merchant_id_0,embedding_merchant_id_1,embedding_merchant_id_2,embedding_merchant_id_3,embedding_merchant_id_4,embedding_merchant_id_5,embedding_merchant_id_6,embedding_merchant_id_7,embedding_merchant_id_8,...,embedding_state_id_6,embedding_state_id_7,embedding_state_id_8,embedding_state_id_9,embedding_state_id_10,embedding_state_id_11,embedding_state_id_12,embedding_state_id_13,embedding_state_id_14,embedding_state_id_15
0,C_ID_0ab67a22ab,-0.055192,0.011766,0.269195,0.144707,-0.093502,-0.005717,-0.050028,0.046280,0.140804,...,0.000426,0.001220,-0.000728,-0.000056,-0.001665,-0.001724,-0.000455,0.001732,-0.000669,-0.000824
1,C_ID_130fd0cbdd,0.237315,-0.197994,0.095006,0.214218,-0.375538,0.110544,-0.196008,-0.139800,-0.217380,...,-0.001080,-0.002692,-0.000830,0.000722,0.001065,0.000084,-0.000550,0.000056,-0.001192,0.000014
2,C_ID_b709037bc5,-0.007985,0.197631,-0.329219,0.171940,-0.101896,-0.075731,0.561958,-0.125975,-0.096082,...,-0.000666,0.000227,-0.001436,-0.002229,-0.001318,-0.000721,-0.000101,-0.001080,-0.000241,0.000200
3,C_ID_d27d835a9f,-0.046348,-0.068238,-0.054618,-0.176786,-0.040879,-0.001683,-0.083053,-0.067383,0.002496,...,-0.004654,-0.002574,0.000790,-0.002810,-0.000444,-0.001871,0.002003,0.000554,-0.001492,0.002040
4,C_ID_2b5e3df5c2,0.229831,-0.082966,0.129265,0.209686,-0.490513,0.011477,-0.196259,-0.107796,-0.281888,...,-0.001338,-0.002481,-0.000818,0.000654,0.000986,0.000179,-0.000530,-0.000034,-0.001104,0.000199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123618,C_ID_7a239d2eda,0.094353,-0.291801,-0.110410,0.001610,0.148609,-0.016962,-0.027606,-0.016250,0.013708,...,0.002850,-0.003240,0.000494,-0.001951,0.000148,-0.004466,-0.000727,0.002846,0.000242,-0.001574
123619,C_ID_75ace375ae,-0.102121,0.025592,-0.085662,-0.240763,-0.194907,0.116414,0.022433,0.012758,-0.084117,...,-0.004644,-0.002318,0.000615,-0.002835,-0.000542,-0.001668,0.001959,0.000335,-0.001455,0.002022
123620,C_ID_21d56d950c,-0.030793,0.121836,-0.231608,0.134719,-0.080220,-0.068310,0.347490,-0.137227,-0.078426,...,-0.000981,0.000670,-0.001777,-0.002757,-0.001523,-0.000266,0.000099,-0.001413,-0.000162,0.000077
123621,C_ID_6c46fc5a9d,0.018481,-0.279081,-0.221584,0.085877,-0.058881,0.229616,-0.003448,-0.125106,0.013101,...,0.001141,-0.000097,-0.001642,0.003227,0.000269,0.001858,0.000470,0.000970,0.000871,-0.000578


In [15]:
# Persist the train embedding table as a pickle.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\train_df.pkl\train_node2vec_embeddings.pkl"
pd.to_pickle(train, path)
# Persist the test embedding table next to it.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\train_df.pkl\test_node2vec_embeddings.pkl"
pd.to_pickle(test, path)

In [8]:
# List the column labels of the test frame to check the generated embedding names.
test.columns

Index(['card_id', 'embedding_merchant_id_0', 'embedding_merchant_id_1',
       'embedding_merchant_id_2', 'embedding_merchant_id_3',
       'embedding_merchant_id_4', 'embedding_merchant_id_5',
       'embedding_merchant_id_6', 'embedding_merchant_id_7',
       'embedding_merchant_id_8', 'embedding_merchant_id_9',
       'embedding_merchant_id_10', 'embedding_merchant_id_11',
       'embedding_merchant_id_12', 'embedding_merchant_id_13',
       'embedding_merchant_id_14', 'embedding_merchant_id_15',
       'embedding_merchant_group_id_0', 'embedding_merchant_group_id_1',
       'embedding_merchant_group_id_2', 'embedding_merchant_group_id_3',
       'embedding_merchant_group_id_4', 'embedding_merchant_group_id_5',
       'embedding_merchant_group_id_6', 'embedding_merchant_group_id_7',
       'embedding_merchant_group_id_8', 'embedding_merchant_group_id_9',
       'embedding_merchant_group_id_10', 'embedding_merchant_group_id_11',
       'embedding_merchant_group_id_12', 'embedding_mer

In [12]:
# Spot-check dimension 15 of the subsector, state and city embeddings per test card.
test.loc[:, ["card_id", "embedding_subsector_id_15", "embedding_state_id_15", "embedding_city_id_15"]]

Unnamed: 0,card_id,embedding_subsector_id_15,embedding_state_id_15,embedding_city_id_15
0,C_ID_0ab67a22ab,-0.000123,-0.000824,-0.002632
1,C_ID_130fd0cbdd,-0.000118,0.000014,0.001585
2,C_ID_b709037bc5,-0.000109,0.000200,-0.000888
3,C_ID_d27d835a9f,-0.000238,0.002040,-0.000557
4,C_ID_2b5e3df5c2,-0.000139,0.000199,0.002227
...,...,...,...,...
123618,C_ID_7a239d2eda,-0.000244,-0.001574,-0.001772
123619,C_ID_75ace375ae,-0.000087,0.002022,0.001380
123620,C_ID_21d56d950c,-0.000261,0.000077,-0.000838
123621,C_ID_6c46fc5a9d,-0.000239,-0.000578,0.000693


In [10]:
# Reload the card_id/subsector_id model for manual inspection.
# The relative file name resolves against the directory set by os.chdir earlier.
node2vec = nodevectors.GGVec.load('node2vec_card_id_subsector_id.zip')

In [14]:
# Look up the vector the loaded model stores under this card_id key.
node2vec.model["C_ID_0ab67a22ab"]

array([ 0.12624051,  0.03521788,  0.04406326, -0.05836817, -0.0310752 ,
        0.10785004,  0.09721222, -0.0145225 ,  0.08446085, -0.08169481,
        0.05288555,  0.11896229,  0.07825649,  0.10237395, -0.01012974,
        0.16195668])