In [6]:
import os
import sys
import pandas as pd
import numpy as np
import sklearn
import glob
import pickle
import random
from joblib import Parallel, delayed
import yaml
import math
from collections import Counter
sys.path.append('.')
sys.path.append('./..')
import model_file
from sklearn.metrics.pairwise import cosine_similarity




def get_initial_entity_embeddings(
    data_file='train_data.csv',
    model_data_save_dir='.',
    id_col = 'PanjivaRecordID',
    embed_dim = 256,
    epochs = 500
):
    """Train pairwise co-occurrence embeddings and save smoothed per-domain copies.

    Steps:
      1. Read ``data_file``; every column except ``id_col`` is a domain. Domains
         are processed in sorted column-name order.
      2. Load ``coOccMatrix_dict.pkl`` and ``domain_dims.pkl`` from the current
         working directory.
      3. Build (or load from ``X_ij.pkl``) one co-occurrence count per record
         and per domain pair (i < j).
      4. Build and train the model through ``model_file``.
      5. Load ``embedding_w_<i>.npy`` for each domain index i, blend every
         entity embedding with a co-occurrence weighted mean of the embeddings
         of the other domains, and save the result as
         ``init_embedding<domain>.npy`` inside ``model_data_save_dir``.

    Args:
        data_file: CSV file with one integer entity id per domain column.
        model_data_save_dir: Directory receiving the ``init_embedding*.npy``
            files; created if it does not exist.
        id_col: Name of the record-id column to exclude from the domains.
        embed_dim: Embedding width handed to ``model_file.get_model``.
        epochs: Number of epochs handed to ``model_file.train_model``.

    Returns:
        None. Results are written to disk.

    Raises:
        ValueError: If ``id_col`` is not a column of ``data_file``.
    """
    train_df = pd.read_csv(data_file)
    domains = sorted(train_df.columns)
    domains.remove(id_col)

    data = train_df[domains].values
    nd = len(domains)
    num_c = nd * (nd - 1) // 2

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use these pickles when they come from a trusted source.
    with open("coOccMatrix_dict.pkl", 'rb') as fh:
        coOccMatrix_dict = pickle.load(fh)

    with open("domain_dims.pkl", 'rb') as fh:
        domain_dims = pickle.load(fh)

    # =====
    # X_ij is kept in a flattened format: one column per domain pair, i < j
    # =====
    X_ij = None
    if os.path.exists("X_ij.pkl"):
        with open("X_ij.pkl", "rb") as fh:
            X_ij = pickle.load(fh)
        # A cache produced from a different data set must not be reused.
        if np.shape(X_ij) != (data.shape[0], num_c):
            X_ij = None

    if X_ij is None:
        X_ij = _build_pairwise_cooccurrence(data, domains, coOccMatrix_dict)
        with open("X_ij.pkl", "wb") as fh:
            pickle.dump(X_ij, fh, pickle.HIGHEST_PROTOCOL)

    # Per-pair maxima, listed in the same (i < j) order as the columns of
    # X_ij rather than in dictionary order.
    # NOTE(review): presumably the model scales column k of X_ij by entry k of
    # this list - confirm against model_file.
    X_ij_max = [
        np.max(coOccMatrix_dict[key]) for key in _domain_pair_keys(domains)
    ]

    # Dimensions are looked up per domain so that position i always describes
    # domains[i], independent of the key order of the pickled dictionary.
    model = model_file.get_model(
        domain_dimesnsions = [domain_dims[dom] for dom in domains],
        num_domains = nd,
        embed_dim = embed_dim,
        _X_ij_max = X_ij_max
    )

    model_file.train_model(
        model,
        data,
        X_ij,
        epochs = epochs
    )

    # ----
    # Collect the embeddings (weights) in a dictionary keyed by domain name.
    # NOTE(review): the embedding_w_<i>.npy files are presumably written by
    # model_file.train_model - confirm.
    # ----
    emb_w = {}
    for i, dom in enumerate(domains):
        emb_w[dom] = np.load('embedding_w_{}.npy'.format(i))

    new_embeddings = _blend_with_neighbor_embeddings(
        domains, domain_dims, coOccMatrix_dict, emb_w
    )

    # Write the embeddings to file
    os.makedirs(model_data_save_dir, exist_ok=True)
    for d in domains:
        print(' >> ', d)
        file_name = os.path.join(
            model_data_save_dir,
            'init_embedding' + d + '.npy'
        )
        # Each file receives the matrix of its own domain.
        np.save(
            file = file_name,
            arr = new_embeddings[d]
        )

    def test():
        """Ad-hoc sanity check; defined for manual use and not called here.

        Prints the 10 ShipmentDestination ids that co-occur most often with
        HSCode 25 in the training data, followed by the 10 destinations whose
        new embedding has the highest cosine similarity to that HSCode.
        """
        hscode = 25
        # The 10 ShipmentDestination values seen most often with this HSCode
        df = train_df.loc[train_df['HSCode']==hscode]
        df = df.groupby(['HSCode','ShipmentDestination']).size().reset_index(name='counts')
        df = df.sort_values(by=['counts'])

        k_closest = df.tail(10)['ShipmentDestination'].values
        print(k_closest)

        hs_code_vec = np.reshape(new_embeddings['HSCode'][hscode], [1, -1])
        wt = new_embeddings['ShipmentDestination']

        res = {}
        for i in range(wt.shape[0]):
            a = np.reshape(wt[i], [1, -1])
            # Store a plain float so the frame can be sorted by similarity.
            res[i] = float(cosine_similarity(a, hs_code_vec)[0, 0])

        new_df = pd.DataFrame(list(res.items()))
        new_df = new_df.sort_values(by=[1])
        print(new_df.tail(10))


def _domain_pair_keys(domains):
    """Return the co-occurrence dictionary key of every domain pair with i < j."""
    return [
        domains[i] + '_+_' + domains[j]
        for i in range(len(domains))
        for j in range(i + 1, len(domains))
    ]


def _build_pairwise_cooccurrence(data, domains, coOccMatrix_dict):
    """Look up, for every record, the co-occurrence count of each domain pair.

    Column k of the result belongs to the k-th pair produced by
    ``_domain_pair_keys`` and holds ``coOccMatrix_dict[key][e_i, e_j]`` for the
    entity ids e_i, e_j found in that record.
    """
    nd = len(domains)
    X_ij = np.zeros([data.shape[0], nd * (nd - 1) // 2])
    k = 0
    for i in range(nd):
        ids_i = data[:, i].astype(np.int64)
        for j in range(i + 1, nd):
            ids_j = data[:, j].astype(np.int64)
            matrix = np.asarray(coOccMatrix_dict[domains[i] + '_+_' + domains[j]])
            # One vectorized lookup replaces a Python loop over all records.
            X_ij[:, k] = matrix[ids_i, ids_j]
            k += 1
    return X_ij


def _blend_with_neighbor_embeddings(domains, domain_dims, coOccMatrix_dict, emb_w):
    """Average each entity embedding with its co-occurrence weighted neighbours.

    For an entity E of domain D:
        x      = sum over domains d != D of
                 sum over entities E' of d of
                 CoOcc(E, E') / max(sum_E' CoOcc(E, E'), 1) * emb(E')
        new(E) = 0.5 * (emb(E) + x)

    Rows of an embedding matrix beyond ``domain_dims[D]`` are left at zero.
    """
    new_embeddings = {}
    for domain_i in domains:
        own = emb_w[domain_i]
        n_entities = domain_dims[domain_i]
        neighbor_sum = np.zeros((n_entities,) + own.shape[1:])

        for domain_j in domains:
            if domain_j == domain_i:
                continue
            pair = sorted([domain_i, domain_j])
            counts = np.asarray(coOccMatrix_dict['_+_'.join(pair)], dtype=float)
            # Matrices are stored with the alphabetically first domain on the
            # rows; transpose so that rows always index entities of domain_i.
            if domain_i != pair[0]:
                counts = counts.T
            counts = counts[:n_entities]

            # Entities that never co-occur keep a divisor of 1 (no 0/0).
            totals = np.maximum(counts.sum(axis=1, keepdims=True), 1)
            neighbor_sum += np.dot(counts / totals, emb_w[domain_j])

        blended = np.zeros(own.shape)
        blended[:n_entities] = 0.5 * (neighbor_sum + own[:n_entities])
        new_embeddings[domain_i] = blended
    return new_embeddings

In [5]:
# Run the embedding pipeline with its default arguments
# (data_file='train_data.csv', model_data_save_dir='.', id_col='PanjivaRecordID').
get_initial_entity_embeddings()





__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 4)            0                                            
__________________________________________________________________________________________________
split_layer (Lambda)            [(None, 1), (None, 1 0           input_1[0][0]                    
__________________________________________________________________________________________________
embedding_w_0 (Embedding)       (None, 1, 256)       14592       split_layer[0][0]                
__________________________________________________________________________________________________
embedding_w_1 (Embedding)       (None, 1, 256)       7680        split_layer[0][1]                
__________________________________________________________________________________________________
embedd

array([ 1,  7, 17, 29,  9, 26, 27,  6, 22, 24])