In [7]:
import os
import nodevectors
from numpy import dot
from collections import defaultdict
import json
import pandas as pd
import numpy as np
from tqdm import tqdm 

In [8]:
# Identifier columns for which a card_id <-> identifier node2vec model archive is listed below.
id_cols = [
    'merchant_id',
    'merchant_group_id',
    'merchant_category_id',
    'subsector_id',
    'city_id',
    'state_id',
]
# Model archive file names, listed in the same order as id_cols.
paths = [
    'node2vec_card_id_merchant_id.zip',
    'node2vec_card_id_merchant_group_id.zip',
    'node2vec_card_id_merchant_category_id.zip',
    'node2vec_card_id_subsector_id.zip',
    'node2vec_card_id_city_id.zip',
    'node2vec_card_id_state_id.zip',
]
node2vec_dir = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\model"
# The archive names above are relative, so make the model directory the working directory.
os.chdir(node2vec_dir)
# Map each identifier column to its model archive (pairs are matched by position).
node2vec_paths = {column: archive for column, archive in zip(id_cols, paths)}
node2vec_paths

{'merchant_id': 'node2vec_card_id_merchant_id.zip',
 'merchant_group_id': 'node2vec_card_id_merchant_group_id.zip',
 'merchant_category_id': 'node2vec_card_id_merchant_category_id.zip',
 'subsector_id': 'node2vec_card_id_subsector_id.zip',
 'city_id': 'node2vec_card_id_city_id.zip',
 'state_id': 'node2vec_card_id_state_id.zip'}

In [9]:
def get_edge_path(target_id, directory = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\edgelist for node2vec"):
    """Return the path of the edge-list CSV for `target_id`.

    The file name is ``card_id_<target_id>.csv`` and it is joined onto
    `directory`. The path is only built here; nothing is opened or checked.
    """
    filename = "card_id_{}.csv".format(target_id)
    return os.path.join(directory, filename)
def get_jsonsave_path(target_id, directory = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\node2vec similarities"):
    """Return the path of the JSON file used for the raw similarities of `target_id`.

    The file name is ``<target_id>_similarities.json`` and it is joined onto
    `directory`. The path is only built here; nothing is opened or checked.
    """
    filename = "{}_similarities.json".format(target_id)
    return os.path.join(directory, filename)
def get_picklesave_path(target_id, directory = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\node2vec similarities"):
    """Return the path of the pickle file holding the similarity features of `target_id`.

    The file name is ``<target_id>_similarity_features.pkl`` and it is joined
    onto `directory`. The path is only built here; nothing is opened or checked.
    """
    filename = "{}_similarity_features.pkl".format(target_id)
    return os.path.join(directory, filename)

In [10]:
"""
Compute the node similarity for one edge and append it to the similarity_score dict
"""
def process(id1,id2, target_id):
    """Score one (id1, id2) pair and record the score under id1.

    The score comes from get_score(id1, id2) and is stored through
    add_score(), keyed by `id1` and `target_id`. Nothing is returned.
    """
    pair_score = get_score(id1, id2)
    add_score(id1, pair_score, target_id)
"""
wrapped in process
"""
def add_score(card_id, score, target_id):
    """Append `score` to the list kept for `card_id`.

    Writes to the module-level `similarity_score` mapping, under the key
    ``"<target_id>_similarities"``. That mapping must exist before this is called.
    """
    # global - similarity_score
    bucket_key = f"{target_id}_similarities"
    per_card_scores = similarity_score[bucket_key]
    per_card_scores[card_id].append(score)
def get_score(id1,id2):
    """Return the dot product of the vectors stored for `id1` and `id2`.

    Both vectors are looked up in the module-level `node2vec.model`, so a
    model must have been loaded into `node2vec` before this is called.
    """
    # global - node2vec.model
    first_vector = node2vec.model[id1]
    second_vector = node2vec.model[id2]
    return dot(first_vector, second_vector)

"""
Computes min, max, sum, mean and standard deviation of similarity scores
"""
def compute_similarity_features(card_id: str, similarity_score:list, target_id: str):
    """Summarise the similarity scores collected for one card.

    Args:
        card_id: identifier passed through unchanged as the first tuple element.
        similarity_score: the scores to summarise.
        target_id: accepted for call-site symmetry; not used in the computation.

    Returns:
        A 6-tuple ``(card_id, min, max, sum, mean, std)``, where mean and std
        come from ``np.mean`` / ``np.std`` called with their default arguments.
    """
    lowest = min(similarity_score)
    highest = max(similarity_score)
    total = sum(similarity_score)
    average = np.mean(similarity_score)
    spread = np.std(similarity_score)
    return card_id, lowest, highest, total, average, spread
#     return {
#         "card_id":card_id,
#         f"{target_id}_similarity_min":min(similarity_score),
#         f"{target_id}_similarity_max":max(similarity_score),
#         f"{target_id}_similarity_sum":sum(similarity_score),
#         f"{target_id}_similarity_mean":np.mean(similarity_score)
#     }

### Compute similarity features

In [11]:
# For every identifier type: load its model and edge list, score each edge,
# summarise the scores per card and pickle the resulting feature table.
for target_id_feature in ["merchant_id",'merchant_group_id', 'merchant_category_id','subsector_id', 'city_id', 'state_id']:

    print(f"On {target_id_feature}")
    scores_key = f"{target_id_feature}_similarities"
    # Fresh module-level accumulator for this identifier; add_score() appends into it.
    similarity_score = {scores_key: defaultdict(list)}

    print("Loading node2vec model..")
    # Module-level model that get_score() reads its vectors from.
    model_archive = node2vec_paths[target_id_feature]
    node2vec = nodevectors.GGVec.load(model_archive)

    print("Loading edgelist..")
    path = get_edge_path(target_id_feature)
    # The CSV is read without a header row; the two columns are named here.
    edges = pd.read_csv(path, header = None, names = ["card_id",target_id_feature])

    print("Computing similarities..")
    # Called only for its side effect: process() stores one score per edge.
    edges.apply(
        lambda row: process(row["card_id"], row[target_id_feature], target_id_feature),
        axis = 1,
    )

    print("Computing similarity features")
    similarity_features = [
        compute_similarity_features(card, card_scores, target_id_feature)
        for card, card_scores in tqdm(similarity_score[scores_key].items())
    ]

    print("saving..")
    feature_columns = ["card_id"] + [
        f"{target_id_feature}_similarity_{stat}"
        for stat in ("min", "max", "sum", "mean", "std")
    ]
    output = pd.DataFrame(similarity_features, columns = feature_columns)
    path = get_picklesave_path(target_id_feature)
    output.to_pickle(path)

On merchant_id
Loading node2vec model..
Loading edgelist..
Computing similarities..
Computing similarity features


100%|███████████████████████████████████████████████████████████████████████| 325540/325540 [00:13<00:00, 24062.07it/s]


saving..
On merchant_group_id
Loading node2vec model..
Loading edgelist..
Computing similarities..
Computing similarity features


100%|███████████████████████████████████████████████████████████████████████| 325540/325540 [00:12<00:00, 26522.39it/s]


saving..
On merchant_category_id
Loading node2vec model..
Loading edgelist..
Computing similarities..
Computing similarity features


100%|███████████████████████████████████████████████████████████████████████| 325540/325540 [00:12<00:00, 25341.47it/s]


saving..
On subsector_id
Loading node2vec model..
Loading edgelist..
Computing similarities..
Computing similarity features


100%|███████████████████████████████████████████████████████████████████████| 325540/325540 [00:10<00:00, 30216.40it/s]


saving..
On city_id
Loading node2vec model..
Loading edgelist..
Computing similarities..
Computing similarity features


100%|███████████████████████████████████████████████████████████████████████| 325540/325540 [00:10<00:00, 31895.69it/s]


saving..
On state_id
Loading node2vec model..
Loading edgelist..
Computing similarities..
Computing similarity features


100%|███████████████████████████████████████████████████████████████████████| 325540/325540 [00:10<00:00, 32080.89it/s]


saving..


In [39]:

# path = get_jsonsave_path(target_id_feature)
# with open(path, "w") as f:
#     json.dump(similarity_score, f, indent=4)
# path = get_jsonsave_path(target_id_feature)
# with open(path, "r") as f:
#     similarity_score = json.load(f)