In [20]:
import tensorflow as tf
import keras.backend as K
import numpy as np
import os
from keras import Model
from keras.layers import Input, Embedding, Dot, Reshape, Add
from keras.layers import Lambda
import sys
import os
import yaml
import pandas as pd
import numpy as np
import os
import glob
from collections import OrderedDict
import pickle
import logging
import multiprocessing as mp
from sklearn.metrics.pairwise import cosine_similarity
try:
    from . import utils_1
except:
    import utils_1
    

In [2]:
# Module-level holder for the co-occurrence maxima used to scale the loss:
# get_model() assigns it from its `_X_ij_max` argument and
# custom_loss_function() reads it when dividing `y_true`.
X_ij_max = None
# =================================
# Co-occurrence based embedding model
# Adapting GloVe to multivariate categorical data
# =================================


def get_model(
    domain_dimesnsions = None,
    num_domains = 4,
    embed_dim = 16,
    _X_ij_max = None
):
    """
    Build and compile the pairwise co-occurrence embedding model.

    Every input record holds one entity id per domain. Each domain gets a
    weight embedding ('embedding_w_<i>') and a scalar bias embedding
    ('embedding_b_<i>'). For every domain pair i < j the model outputs
    dot(w_i, w_j) + b_i + b_j, which the loss compares against the log of the
    observed co-occurrence count.

    :param domain_dimesnsions: indexable of vocabulary sizes, one per domain
        (the misspelt name is kept because callers pass it by keyword).
    :param num_domains: number of domains, i.e. width of an input record.
    :param embed_dim: size of each weight embedding vector.
    :param _X_ij_max: value stored in the module-level `X_ij_max`, which
        custom_loss_function reads.
    :return: compiled Keras model whose output has one column per domain pair,
        ordered (0,1), (0,2), ..., (num_domains-2, num_domains-1).
    """
    global X_ij_max
    X_ij_max = _X_ij_max

    record_input = Input(shape=(num_domains,))

    # Split the record into one single-column tensor per domain.
    def _split_by_domain(tensor):
        return tf.split(tensor, num_or_size_splits=num_domains, axis=-1)

    domain_ids = Lambda(_split_by_domain, name='split_layer')(record_input)

    weight_vectors = []
    bias_terms = []
    for dom_idx in range(num_domains):
        ids_for_domain = domain_ids[dom_idx]

        weight_lookup = Embedding(
            input_dim=domain_dimesnsions[dom_idx],
            output_dim=embed_dim,
            embeddings_initializer='random_uniform',
            name='embedding_w_' + str(dom_idx)
        )
        weight_vectors.append(weight_lookup(ids_for_domain))

        bias_lookup = Embedding(
            input_dim=domain_dimesnsions[dom_idx],
            output_dim=1,
            input_length=1,
            embeddings_initializer='random_uniform',
            name='embedding_b_' + str(dom_idx)
        )
        bias_terms.append(bias_lookup(ids_for_domain))

    # One prediction of log(X_ij) per unordered domain pair.
    pair_outputs = []
    for left in range(num_domains):
        for right in range(left + 1, num_domains):
            similarity = Dot(axes=-1)([weight_vectors[left], weight_vectors[right]])
            similarity = Reshape(target_shape=(1,))(similarity)
            log_count = Add()([similarity, bias_terms[left], bias_terms[right]])
            pair_outputs.append(Reshape(target_shape=(1,))(log_count))

    # Stack the per-pair outputs along axis 1, then drop the trailing unit axis.
    def _stack_pairs(tensors):
        return tf.stack(tensors, axis=1)

    def _drop_last_axis(tensor):
        return tf.squeeze(tensor, axis=-1)

    stacked = Lambda(_stack_pairs, name='stack_layer')(pair_outputs)
    predictions = Lambda(_drop_last_axis, name='squeeze_layer')(stacked)

    model = Model(record_input, predictions)
    model.compile(loss=custom_loss_function, optimizer='adam')
    return model

def custom_loss_function(
        y_true,
        y_pred
):
    """
    Weighted squared error between predictions and log co-occurrence counts.

    Each term is (y_pred - log(y_true + eps))^2, weighted by
    clip(y_true / X_ij_max, 0, 1) ** 0.75, and the terms are summed over the
    last axis. `X_ij_max` is the module-level value set by get_model().

    :param y_true: observed co-occurrence counts.
    :param y_pred: predicted log co-occurrence values.
    :return: per-sample loss tensor.
    """
    global X_ij_max
    weight_exponent = 0.75
    log_offset = 0.000001  # keeps log() finite when a count is zero

    log_target = K.log(y_true + log_offset)
    squared_error = K.square(y_pred - log_target)

    relative_count = K.clip(y_true / X_ij_max, 0.0, 1.0)
    weight = K.pow(relative_count, weight_exponent)

    return K.sum(weight * squared_error, axis=-1)

def train_model(
        model,
        x,
        y_true,
        file_save_loc,
        epochs=100
):
    """
    Fit `model` on (x, y_true) and save its weight embeddings.

    Prints the model summary, trains with batch size 256 and shuffling
    enabled, then calls save_model() with `file_save_loc`.

    :param model: compiled model, as returned by get_model().
    :param x: training inputs.
    :param y_true: training targets.
    :param file_save_loc: directory handed to save_model().
    :param epochs: number of training epochs.
    :return: the same model object after training.
    """
    model.summary()

    fit_settings = dict(
        batch_size=256,
        epochs=epochs,
        verbose=1,
        shuffle=True
    )
    model.fit(x=x, y=y_true, **fit_settings)

    save_model(model, file_save_loc)
    return model

def save_model(model, file_save_loc):
    """
    Save the weight matrix of every 'embedding_w*' layer as a .npy file.

    Each matching layer is written to <file_save_loc>/<layer name>.npy using
    the first array returned by the layer's get_weights().

    :param model: object exposing `.layers`; each layer needs `.name` and
        `.get_weights()`.
    :param file_save_loc: output directory; created if it does not exist.
    """
    # np.save does not create missing parent directories.
    os.makedirs(file_save_loc, exist_ok=True)

    for layer in model.layers:
        if 'embedding_w' not in layer.name:
            continue
        f_path = os.path.join(file_save_loc, layer.name + ".npy")
        np.save(f_path, arr=layer.get_weights()[0])

In [8]:
# ==================== Global variables ===================== #
# Path of the YAML configuration file read by setup_config().
CONFIG_FILE = 'config_1.yaml'

# Placeholders: setup_config() assigns the real values.
CONFIG = None              # parsed contents of CONFIG_FILE
DIR = None                 # CONFIG['DIR'], or the override passed to setup_config()
DATA_DIR = None            # CONFIG['DATA_DIR']
OP_DIR = None              # CONFIG['OP_DIR'] joined with DIR
modelData_SaveDir = None   # CONFIG['model_data_save_dir'] joined with DIR
num_jobs = None            # min(cpu count, CONFIG['num_jobs'])
Refresh_Embeddings = None  # CONFIG[DIR]['Refresh_Embeddings']
# ============================================================ #

In [13]:
def setup_config(_DIR=None):
    """
    Load CONFIG_FILE and populate the module-level configuration globals.

    Sets CONFIG, DIR, DATA_DIR, modelData_SaveDir, OP_DIR, Refresh_Embeddings
    and num_jobs, and makes sure the output and model-data directories exist.

    :param _DIR: data-set directory name; when None, CONFIG['DIR'] is used.
    :raises KeyError: if a required key is missing from the configuration.
    """
    global CONFIG_FILE
    global DATA_DIR
    global modelData_SaveDir
    global OP_DIR
    global DIR
    global num_jobs
    global Refresh_Embeddings
    global CONFIG

    with open(CONFIG_FILE) as f:
        CONFIG = yaml.safe_load(f)

    DIR = CONFIG['DIR'] if _DIR is None else _DIR
    DATA_DIR = os.path.join(CONFIG['DATA_DIR'])
    modelData_SaveDir = os.path.join(CONFIG['model_data_save_dir'], DIR)

    # makedirs(exist_ok=True) handles nested paths and avoids the race between
    # an exists() check and mkdir().
    os.makedirs(CONFIG['OP_DIR'], exist_ok=True)
    OP_DIR = os.path.join(CONFIG['OP_DIR'], DIR)
    os.makedirs(OP_DIR, exist_ok=True)

    Refresh_Embeddings = CONFIG[DIR]['Refresh_Embeddings']
    # Never ask for more workers than the machine has CPUs.
    num_jobs = min(mp.cpu_count(), CONFIG['num_jobs'])

    os.makedirs(CONFIG['model_data_save_dir'], exist_ok=True)
    os.makedirs(modelData_SaveDir, exist_ok=True)

    print(' Set up config')
    return

# Load CONFIG_FILE and fill in the configuration globals (CONFIG, DIR,
# DATA_DIR, OP_DIR, modelData_SaveDir, num_jobs, Refresh_Embeddings).
setup_config()

 Set up config


In [5]:
def create_coocc_matrix(df, col_1, col_2):
    """
    Count how often each (col_1 id, col_2 id) pair occurs in `df`.

    Entity ids are used directly as row / column indices, so they are expected
    to be non-negative integers. The matrix is sized by the largest id seen,
    so gaps in the ids yield all-zero rows / columns instead of an IndexError.

    :param df: DataFrame containing both columns.
    :param col_1: column whose ids index the rows.
    :param col_2: column whose ids index the columns.
    :return: float ndarray where entry [i, j] is the number of records with
        col_1 == i and col_2 == j.
    """
    count_1 = len(set(df[col_1]))
    count_2 = len(set(df[col_2]))

    pair_counts = (
        df[[col_1, col_2]]
        .groupby([col_1, col_2])
        .size()
        .reset_index(name='count')
    )
    rows = pair_counts[col_1].values.astype(np.int64)
    cols = pair_counts[col_2].values.astype(np.int64)

    # Size by max id + 1: with contiguous ids this equals the distinct count.
    n_rows = max(count_1, int(rows.max()) + 1) if rows.size else count_1
    n_cols = max(count_2, int(cols.max()) + 1) if cols.size else count_2

    coocc = np.zeros([n_rows, n_cols])
    # groupby yields each (row, col) pair once, so plain assignment suffices.
    coocc[rows, cols] = pair_counts['count'].values

    print('Col 1 & 2', col_1, col_2, coocc.shape, '>>', (count_1, count_2))
    return coocc



def get_coOccMatrix_dict(df, id_col):
    """
    Build a co-occurrence matrix for every unordered pair of feature columns.

    Feature columns are all columns of `df` except `id_col`, in sorted order.
    Keys have the form '<col_a>_+_<col_b>' with col_a before col_b in that
    order.

    :param df: DataFrame of records.
    :param id_col: name of the record-id column to exclude.
    :return: OrderedDict mapping pair key to the matrix returned by
        create_coocc_matrix().
    """
    feature_names = list(df.columns)
    feature_names.remove(id_col)
    feature_names.sort()

    pairwise = OrderedDict()
    for position, first in enumerate(feature_names):
        for second in feature_names[position + 1:]:
            pairwise[first + '_+_' + second] = create_coocc_matrix(df, first, second)
    return pairwise

In [44]:
def _load_or_build_coocc_dict(train_df, id_col, cache_file):
    """Return the pairwise co-occurrence matrices, using the pickle cache if present."""
    if os.path.exists(cache_file):
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # cache files that this pipeline wrote itself.
        with open(cache_file, 'rb') as fh:
            return pickle.load(fh)

    coocc_dict = get_coOccMatrix_dict(train_df, id_col=id_col)
    with open(cache_file, "wb") as fh:
        pickle.dump(coocc_dict, fh, pickle.HIGHEST_PROTOCOL)
    return coocc_dict


def _load_or_build_pair_counts(data, domains, coocc_dict, cache_file):
    """Return X_ij: for each record, the co-occurrence count of each domain pair i < j."""
    num_domains = len(domains)
    expected_shape = (data.shape[0], num_domains * (num_domains - 1) // 2)

    if os.path.exists(cache_file):
        cached = np.load(cache_file)
        if cached.shape == expected_shape:
            return cached
        # The cache was built for different data; fall through and rebuild it.

    ids = data.astype(np.int64)
    X_ij = np.zeros(expected_shape, dtype=np.int32)
    k = 0
    for i in range(num_domains):
        for j in range(i + 1, num_domains):
            key = domains[i] + '_+_' + domains[j]
            # One lookup per record, done in a single fancy-indexing step.
            X_ij[:, k] = coocc_dict[key][ids[:, i], ids[:, j]]
            k += 1

    np.save(cache_file, X_ij)
    return X_ij


def _project_embeddings(domains, domain_dims, emb_w, coocc_dict):
    """
    Blend each entity's embedding with those of the entities it co-occurs with.

    For entity E of domain D:
        x = sum over domains d != D, entities e in d of
                (CoOcc(E, e) / max(sum_e' CoOcc(E, e'), 1)) ** 2 * emb(e)
        new_emb(E) = 0.5 * (emb(E) + x)
    """
    new_embeddings = {}
    for domain_i in domains:
        own = emb_w[domain_i]
        n_entities = domain_dims[domain_i]
        neighbour_sum = np.zeros((n_entities, own.shape[1]))

        for domain_j in domains:
            if domain_j == domain_i:
                continue
            pair = sorted([domain_i, domain_j])
            counts = coocc_dict['_+_'.join(pair)]
            # Matrices are stored with the alphabetically first domain on the
            # rows; transpose so that rows always index domain_i.
            if domain_i != pair[0]:
                counts = counts.T
            counts = counts[:n_entities]

            totals = np.maximum(counts.sum(axis=1, keepdims=True), 1)
            scale = counts / totals
            np.square(scale, out=scale)
            neighbour_sum += np.dot(scale, emb_w[domain_j])

        projected = np.zeros(own.shape)
        projected[:n_entities] = 0.5 * (neighbour_sum + own[:n_entities])
        new_embeddings[domain_i] = projected
    return new_embeddings


def _debug_print_nearest_ports(train_df, new_embeddings):
    """
    Debug aid: for a few HSCode ids, print the most frequently co-occurring
    PortOfLading ids in the data, then the 10 PortOfLading ids whose
    embeddings are most cosine-similar to that HSCode embedding.
    """
    if 'HSCode' not in new_embeddings or 'PortOfLading' not in new_embeddings:
        print('Skipping embedding sanity check: HSCode / PortOfLading not available')
        return

    port_vectors = new_embeddings['PortOfLading']
    for hscode in [10, 25, 35, 40, 50, 55, 75, 90]:
        print('-----> ::: ', hscode)

        observed = train_df.loc[train_df['HSCode'] == hscode]
        observed = observed.groupby(['HSCode', 'PortOfLading']).size().reset_index(name='counts')
        observed = observed.sort_values(by=['counts'])
        print(observed.tail(10)['PortOfLading'].values)

        hs_code_vec = np.reshape(new_embeddings['HSCode'][hscode], [1, -1])
        similarity = cosine_similarity(port_vectors, hs_code_vec).ravel()
        # Ascending order, so the most similar port is printed last.
        print(np.argsort(similarity)[-10:].tolist())

        print('----->')


def get_initial_entity_embeddings(
        train_data_file,
        model_data_save_dir,
        DATA_DIR,
        embedding_dims,
        num_epochs,
        id_col='PanjivaRecordID',
        run_test=True
):
    """
    Train (or load) co-occurrence embeddings and derive initial entity embeddings.

    Steps:
      1. Read the training CSV; every column except `id_col` is a domain.
         Domains are handled in sorted order everywhere, matching the pair
         keys produced by get_coOccMatrix_dict().
      2. Load or build the pairwise co-occurrence matrices and the per-record
         pair-count matrix X_ij. Both are cached in `model_data_save_dir`.
      3. Train the model unless 'embedding_w_*' files already exist there.
      4. Blend every entity embedding with the embeddings of the entities it
         co-occurs with, and save the result per domain as
         'init_embedding_<domain>_<embedding_dims>.npy'.

    :param train_data_file: CSV file name inside `DATA_DIR`.
    :param model_data_save_dir: directory for caches, weights and outputs.
    :param DATA_DIR: directory holding the CSV and 'domain_dims.pkl'.
    :param embedding_dims: embedding vector size.
    :param num_epochs: number of training epochs, used only if training runs.
    :param id_col: record-id column that is excluded from the domains.
    :param run_test: when True (default), print the HSCode / PortOfLading
        nearest-neighbour sanity check after saving.
    :return: dict mapping domain name to its array of new embeddings.
    """
    train_df = pd.read_csv(os.path.join(DATA_DIR, train_data_file))

    domains = list(train_df.columns)
    domains.remove(id_col)
    domains.sort()
    print(domains)

    data = train_df[domains].values
    # ------------------------------- #
    coOcc_dict_file = os.path.join(model_data_save_dir, "coOccMatrix_dict.pkl")
    X_ij_file = os.path.join(model_data_save_dir, "X_ij.npy")
    domain_dims_file = os.path.join(DATA_DIR, "domain_dims.pkl")
    # Indexed by domain name below; the values are used as entity counts.
    domain_dims = utils_1.get_domain_dims(domain_dims_file)

    coOccMatrix_dict = _load_or_build_coocc_dict(train_df, id_col, coOcc_dict_file)
    X_ij = _load_or_build_pair_counts(data, domains, coOccMatrix_dict, X_ij_file)

    # X_ij_max is needed for scaling the loss: one maximum per domain pair,
    # in the same order as the columns of X_ij.
    num_domains = len(domains)
    X_ij_max = [
        np.max(coOccMatrix_dict[domains[i] + '_+_' + domains[j]])
        for i in range(num_domains)
        for j in range(i + 1, num_domains)
    ]

    print(domain_dims.values())

    model = get_model(
        # Vocabulary sizes are looked up by name so that they line up with the
        # data columns whatever order domain_dims is stored in.
        domain_dimesnsions=[domain_dims[dom] for dom in domains],
        num_domains=num_domains,
        embed_dim=embedding_dims,
        _X_ij_max=X_ij_max
    )

    # Train only when no saved weight embeddings are present.
    _present = len(glob.glob(os.path.join(model_data_save_dir, 'embedding_w_**'))) > 0
    if not _present:
        model = train_model(
            model,
            data,
            X_ij,
            file_save_loc=model_data_save_dir,
            epochs=num_epochs
        )

    # Layer 'embedding_w_<i>' belongs to the i-th domain in sorted order.
    emb_w = {}
    for i, dom in enumerate(domains):
        f_path = os.path.join(model_data_save_dir, 'embedding_w_{}.npy'.format(i))
        emb_w[dom] = np.load(f_path)

    new_embeddings = _project_embeddings(domains, domain_dims, emb_w, coOccMatrix_dict)

    # Write the embeddings to file
    for domain_i in domains:
        print(' >> ', domain_i)
        file_name = os.path.join(
            model_data_save_dir,
            'init_embedding_' + domain_i + '_' + str(embedding_dims) + '.npy'
        )
        np.save(
            file=file_name,
            arr=new_embeddings[domain_i]
        )

    if run_test:
        _debug_print_nearest_ports(train_df, new_embeddings)

    return new_embeddings

In [45]:
# Train (or load) the co-occurrence embeddings for the configured data set,
# using 256-dimensional vectors and 40 training epochs.
training_data_file = CONFIG['train_data_file']
src_DIR = os.path.join(DATA_DIR, DIR)

embeddings = get_initial_entity_embeddings(
    train_data_file=training_data_file,
    model_data_save_dir=modelData_SaveDir,
    DATA_DIR=src_DIR,
    embedding_dims=256,
    num_epochs=40
)

['Carrier', 'ConsigneePanjivaID', 'HSCode', 'PortOfLading', 'PortOfUnlading', 'ShipmentDestination', 'ShipmentOrigin', 'ShipperPanjivaID']
dict_values([548, 5113, 95, 238, 64, 113, 116, 6193])
 >>  Carrier
 >>  ConsigneePanjivaID
 >>  HSCode
 >>  PortOfLading
 >>  PortOfUnlading
 >>  ShipmentDestination
 >>  ShipmentOrigin
 >>  ShipperPanjivaID
-----> :::  10
[ 63  79 107 234]
[37, 54, 235, 10, 84, 62, 49, 127, 11, 234]
----->
-----> :::  25
[  4  58 126 217 184 106 164  96 130 146]
[153, 34, 115, 202, 9, 161, 96, 51, 204, 146]
----->
-----> :::  35
[ 27  80  96 151 164 146 184 130]
[204, 9, 96, 153, 152, 184, 53, 118, 52, 130]
----->
-----> :::  40
[217  66  10 211   5  84 111 216 138 184]
[84, 220, 236, 24, 111, 216, 99, 235, 205, 138]
----->
-----> :::  50
[234 146 164  54 184 126 232 217  96 130]
[233, 130, 152, 181, 137, 151, 75, 32, 144, 52]
----->
-----> :::  55
[111 102 163 130 184  41  71 220 103  96]
[68, 130, 153, 20, 71, 161, 43, 28, 118, 96]
----->
-----> :::  75
[ 82 106 