In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from random import shuffle
from torch.nn.parameter import Parameter
# Select the compute device. CUDA details are only queried when CUDA is
# actually available: torch.cuda.current_device() / get_device_name() raise
# on CPU-only machines, which previously made this cell fail before the
# CPU fallback below could be reached.
if torch.cuda.is_available():
    dev = "cuda:0"
    print(True, torch.cuda.current_device(), torch.cuda.get_device_name(0))
else:
    dev = "cpu"
    print(False)

device = torch.device(dev)

import numpy as np
import os
import sys
import os
import yaml
import pandas as pd
import numpy as np
import os
import glob
from collections import OrderedDict
import pickle

import multiprocessing as mp
from sklearn.metrics.pairwise import cosine_similarity
try:
    from . import utils_1
except (ImportError, ValueError, SystemError):
    # Not running as part of a package (e.g. inside a notebook kernel), so the
    # relative import is unavailable: fall back to a top-level import.
    # Depending on the Python version a failed relative import surfaces as
    # ImportError, ValueError or SystemError; anything else (KeyboardInterrupt,
    # errors raised inside utils_1 itself, ...) is deliberately not swallowed.
    import utils_1
    

True 0 Tesla P100-PCIE-16GB


In [2]:
# YAML file read by setup_config().
CONFIG_FILE = 'config_1.yaml'
# Column of the training CSV that identifies a record (not a feature).
id_col = 'PanjivaRecordID'

# Module-level settings; the ones setup_config() assigns are filled in there.
CONFIG = None
DIR = OP_DIR = DATA_DIR = modelData_SaveDir = None
train_data_file = domain_dims = None
num_jobs = Refresh_Embeddings = None
logger = None
# ------ #

def get_domain_dims(dd_file_path):
    """Load the pickled ``{domain: value}`` mapping, ordered by domain name.

    Parameters
    ----------
    dd_file_path : str
        Path of the pickle file holding a dict keyed by domain name.

    Returns
    -------
    dict
        The same key/value pairs, inserted in ascending key order. An empty
        pickled dict yields an empty dict (the previous DataFrame round-trip
        raised ``KeyError`` in that case).
    """
    # SECURITY: pickle.load can execute arbitrary code; only point this at
    # files produced by this project's own preprocessing.
    with open(dd_file_path, 'rb') as fh:
        domain_dims = pickle.load(fh)
    # Sorting the keys directly replaces the former pandas round-trip.
    return {domain: domain_dims[domain] for domain in sorted(domain_dims)}


def setup_config(_DIR=None):
    """Read ``CONFIG_FILE`` and populate the module-level settings.

    Parameters
    ----------
    _DIR : str, optional
        Name of the data-set sub-directory. When omitted, ``CONFIG['DIR']``
        is used.

    Side effects
    ------------
    Sets the globals ``CONFIG``, ``DIR``, ``DATA_DIR``, ``modelData_SaveDir``,
    ``train_data_file``, ``OP_DIR``, ``Refresh_Embeddings``, ``num_jobs`` and
    ``domain_dims``; creates the output and model-data directories if they do
    not exist; prints a confirmation line.

    Raises
    ------
    KeyError
        If a required key is missing from the YAML configuration.
    """
    global CONFIG_FILE
    global DATA_DIR
    global modelData_SaveDir
    global OP_DIR
    global DIR
    global num_jobs
    global Refresh_Embeddings
    global logger
    global CONFIG
    global domain_dims
    global train_data_file

    with open(CONFIG_FILE) as f:
        CONFIG = yaml.safe_load(f)
    DIR = CONFIG['DIR'] if _DIR is None else _DIR

    DATA_DIR = os.path.join(CONFIG['DATA_DIR'])
    modelData_SaveDir = os.path.join(
        CONFIG['model_data_save_dir'],
        DIR
    )
    train_data_file = CONFIG['train_data_file']

    # makedirs(exist_ok=True) creates any missing parent directories and does
    # not fail if the directory appears between a check and the creation,
    # unlike the former "if not exists: mkdir" pairs.
    OP_DIR = os.path.join(CONFIG['OP_DIR'], DIR)
    os.makedirs(OP_DIR, exist_ok=True)

    Refresh_Embeddings = CONFIG[DIR]['Refresh_Embeddings']
    # Never request more workers than the machine has cores.
    num_jobs = min(mp.cpu_count(), CONFIG['num_jobs'])

    os.makedirs(modelData_SaveDir, exist_ok=True)

    domain_dims_file = os.path.join(DATA_DIR, DIR, "domain_dims.pkl")
    domain_dims = get_domain_dims(domain_dims_file)
    print(' Set up config')
    return

setup_config()

 Set up config


In [3]:
def create_coocc_matrix(df, col_1, col_2):
    """Count how often each pair of ids from two columns occurs in the same row.

    The values of ``col_1`` and ``col_2`` are used directly as row / column
    indices of the result, so they are expected to be non-negative integer ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col_1, col_2 : str
        Names of the two id columns.

    Returns
    -------
    numpy.ndarray
        Float matrix where entry ``[a][b]`` is the number of rows with
        ``df[col_1] == a`` and ``df[col_2] == b``. Each axis is as long as the
        number of distinct ids in the column, or ``max id + 1`` when that is
        larger. The latter covers id sets with gaps, which previously raised
        ``IndexError`` because the matrix was sized by the distinct count only.
    """
    count_1 = len(set(df[col_1]))
    count_2 = len(set(df[col_2]))

    if len(df) == 0:
        dim_1, dim_2 = count_1, count_2
    else:
        # Make sure the largest id is a valid index even if some ids are absent.
        dim_1 = max(count_1, int(df[col_1].max()) + 1)
        dim_2 = max(count_2, int(df[col_2].max()) + 1)
    coocc = np.zeros([dim_1, dim_2])

    pair_counts = (
        df[[col_1, col_2]]
        .groupby([col_1, col_2])
        .size()
        .reset_index(name='count')
    )
    # Each (id_1, id_2) pair appears once after the groupby, so a single
    # fancy-indexed assignment replaces the former row-by-row loop.
    rows = pair_counts[col_1].values.astype(np.intp)
    cols = pair_counts[col_2].values.astype(np.intp)
    coocc[rows, cols] = pair_counts['count'].values

    print('Col 1 & 2', col_1, col_2, coocc.shape, '>>', (count_1, count_2))
    return coocc



def get_coOccMatrix_dict(df, id_col):
    """Build the co-occurrence matrix of every unordered pair of feature columns.

    Every column of ``df`` except ``id_col`` is a feature. Features are sorted
    by name and each pair ``(first, second)`` with ``first`` before ``second``
    in that order is stored under the key ``first + '_+_' + second``.

    Returns an ``OrderedDict`` mapping those keys to the result of
    ``create_coocc_matrix(df, first, second)``.
    """
    feature_names = [name for name in df.columns]
    feature_names.remove(id_col)
    feature_names.sort()

    pairwise = OrderedDict()
    for position, first in enumerate(feature_names):
        # Only pair with columns that come later in sorted order.
        for second in feature_names[position + 1:]:
            pairwise[first + '_+_' + second] = create_coocc_matrix(df, first, second)
    return pairwise

In [4]:
# Load the training records for the configured data set.
src_DIR = os.path.join(DATA_DIR, DIR)
training_data_file = CONFIG['train_data_file']
train_df = pd.read_csv(os.path.join(src_DIR, train_data_file))

# Feature columns = every column except the record id, in sorted order.
# The order matters: the co-occurrence keys ("<col_a>_+_<col_b>") and the
# dict returned by get_domain_dims() are both ordered by sorted column name,
# and later cells address columns by position. Relying on the CSV column
# order only worked when the file happened to be sorted already.
feature_cols = list(train_df.columns)
feature_cols.remove(id_col)
feature_cols = sorted(feature_cols)
domains = list(feature_cols)
print(feature_cols)

model_data_save_dir = modelData_SaveDir

# Feature matrix, one row per record and one column per entry of feature_cols.
data = train_df[feature_cols].values
# ------------------------------- #
# Cache files for the derived co-occurrence data.
coOcc_dict_file = os.path.join(model_data_save_dir, "coOccMatrix_dict.pkl")
X_ij_file = os.path.join(model_data_save_dir, "X_ij.npy")
domain_dims_file = os.path.join(src_DIR, "domain_dims.pkl")
domain_dims = get_domain_dims(domain_dims_file)

['Carrier', 'ConsigneePanjivaID', 'HSCode', 'PortOfLading', 'PortOfUnlading', 'ShipmentDestination', 'ShipmentOrigin', 'ShipperPanjivaID']


In [5]:
# Display the mapping returned by get_domain_dims() (keys sorted by name).
domain_dims

{'Carrier': 548,
 'ConsigneePanjivaID': 5113,
 'HSCode': 95,
 'PortOfLading': 238,
 'PortOfUnlading': 64,
 'ShipmentDestination': 113,
 'ShipmentOrigin': 116,
 'ShipperPanjivaID': 6193}

In [6]:
# -----
# Load the cached pairwise co-occurrence dictionary, or build and cache it.
# -----
if os.path.exists(coOcc_dict_file):
    # SECURITY: pickle.load can execute arbitrary code; only load cache files
    # written by this notebook.
    with open(coOcc_dict_file, 'rb') as fh:
        coOccMatrix_dict = pickle.load(fh)
else:
    coOccMatrix_dict = get_coOccMatrix_dict(train_df, id_col=id_col)
    with open(coOcc_dict_file, "wb") as fh:
        pickle.dump(coOccMatrix_dict, fh, pickle.HIGHEST_PROTOCOL)


# ----------------
# Ensure X_ij: for every record d and every pair of feature columns (i, j),
# X_ij[d][i][j] is the co-occurrence count of the two ids that record d has in
# columns i and j. The diagonal (i == j) stays 0.
# ----------------
if os.path.exists(X_ij_file):
    with open(X_ij_file, 'rb') as fh:
        X_ij = np.load(fh)

else:
    nd = len(feature_cols)
    X_ij = np.zeros([data.shape[0], nd, nd], dtype=np.int32)
    print(X_ij.shape)

    for i in range(nd):
        for j in range(i + 1, nd):
            # The dictionary keys are "<first>_+_<second>" with the two column
            # names in sorted order; rows of the matrix belong to <first>.
            (name_1, pos_1), (name_2, pos_2) = sorted(
                [(feature_cols[i], i), (feature_cols[j], j)]
            )
            key = name_1 + '_+_' + name_2
            # One vectorised lookup for all records instead of a Python loop
            # over every (record, i, j) triple.
            pair_counts = coOccMatrix_dict[key][
                data[:, pos_1].astype(np.intp),
                data[:, pos_2].astype(np.intp),
            ]
            # The count is symmetric in (i, j).
            X_ij[:, i, j] = pair_counts
            X_ij[:, j, i] = pair_counts

    with open(X_ij_file, "wb") as fh:
        np.save(fh, X_ij)


            

In [7]:
# X_ij_max[i][j]: largest co-occurrence count observed between feature columns
# i and j (symmetric; the diagonal is left at 0).
nd = len(feature_cols)
X_ij_max = np.zeros([nd, nd])
for i in range(nd):
    for j in range(i + 1, nd):
        # Dictionary keys hold the two column names in sorted order.
        first, second = sorted((feature_cols[i], feature_cols[j]))
        key = first + '_+_' + second
        # Compute the maximum once per unordered pair and mirror it.
        pair_max = np.max(coOccMatrix_dict[key])
        X_ij_max[i][j] = pair_max
        X_ij_max[j][i] = pair_max
        

In [32]:
# Shift every entry by one. The diagonal of X_ij_max is 0 (it is skipped when
# the matrix is filled), and custom_loss divides by this matrix, so the shift
# keeps that division away from zero.
# NOTE(review): this cell is not idempotent - every re-run adds another 1 -
# and its execution count shows it was run out of order. Run it exactly once
# after X_ij_max is computed.
X_ij_max = X_ij_max+1

In [9]:

# =================================
# Co-occurrence based embedding model
# Projecting GloVe to multivariate categorical 
# =================================



In [38]:
# y : shape [ ?, d, d]
def custom_loss(y_pred, y_true, x_ij_max=None, a=0.5, epsilon=0.000001):
    """Weighted squared error between predictions and log co-occurrence counts.

    For every entry the squared error ``(y_pred - log(y_true + epsilon))**2``
    is weighted by ``clamp(y_true / x_ij_max, 0, 1) ** a``. The weighted errors
    are summed over the last two dimensions and averaged over the batch.

    Parameters
    ----------
    y_pred : torch.Tensor
        Predictions, shape ``[?, d, d]``.
    y_true : torch.Tensor
        Co-occurrence counts, same shape as ``y_pred``.
    x_ij_max : array-like of shape ``[d, d]``, optional
        Normaliser used in the weight. Defaults to the module-level
        ``X_ij_max``.
    a : float, optional
        Exponent of the weighting term.
    epsilon : float, optional
        Added to ``y_true`` before taking the logarithm.

    Returns
    -------
    torch.Tensor
        Scalar loss.
    """
    if x_ij_max is None:
        x_ij_max = X_ij_max
    # Create the normaliser with the dtype and on the device of the
    # predictions; a plain FloatTensor always lives on the CPU and breaks as
    # soon as the inputs are on a GPU.
    norm = torch.as_tensor(x_ij_max, dtype=y_pred.dtype, device=y_pred.device)

    squared_err = torch.pow(y_pred - torch.log(y_true + epsilon), 2)
    # [?, d, d] / [d, d] broadcasts over the batch dimension, so the matrix
    # no longer has to be physically repeated once per sample.
    weight = torch.pow(torch.clamp(y_true / norm, 0.0, 1.0), a)

    sample_loss = torch.sum(weight * squared_err, keepdim=False, dim=-1)
    sample_loss = torch.sum(sample_loss, keepdim=False, dim=-1)
    return torch.mean(sample_loss)

In [13]:
class Net(nn.Module):
    """Pairwise score model with one embedding table and one bias table per domain.

    For an input row of ids ``x`` the score of the domain pair ``(i, j)``,
    ``i != j``, is ``<W_i[x_i], W_j[x_j]> + B_i[x_i] + B_j[x_j]``; the diagonal
    of the output is 0.

    Parameters
    ----------
    emb_dim : int
        Size of every embedding vector.
    domain_dims : sequence of int
        Number of distinct ids of each domain, in input-column order.
    """

    def __init__(
        self,
        emb_dim,
        domain_dims
    ):
        super(Net, self).__init__()
        self.num_domains = len(domain_dims)
        self.domain_dims = domain_dims
        self.emb_dim = emb_dim

        self.list_W_m = []
        self.list_B_m = []
        # No separate "context" tables are created; these attributes are kept
        # (empty) so existing attribute access keeps working.
        self.list_W_c = []
        self.list_B_c = []

        for d_idx in range(self.num_domains):
            # The Embedding modules live in plain Python lists, so their
            # weights are registered on this module explicitly (as 'e_<idx>'
            # and 'b_<idx>') to make them visible to parameters().
            e = nn.Embedding(num_embeddings=domain_dims[d_idx], embedding_dim=emb_dim)
            e.weight = Parameter(torch.empty(domain_dims[d_idx], emb_dim).uniform_(-1, 1))
            self.register_parameter('e_' + str(d_idx), e.weight)
            self.list_W_m.append(e)

            b = nn.Embedding(num_embeddings=domain_dims[d_idx], embedding_dim=1)
            b.weight = Parameter(torch.empty(domain_dims[d_idx], 1).uniform_(-1, 1))
            self.register_parameter('b_' + str(d_idx), b.weight)
            self.list_B_m.append(b)

    # --------------------------------------
    # Define network structure
    # x : [? , dims]
    # --------------------------------------
    def forward(self, x):
        """Return the ``[?, d, d]`` matrix of pairwise scores for a batch of id rows.

        ``x`` is an integer tensor with one column per domain (``[?, d]``).
        """
        nd = self.num_domains
        split_x = torch.chunk(
            x,
            chunks=nd,
            dim=1
        )

        # Look every domain up once (instead of once per pair) and stack along
        # a domain axis: w is [?, d, emb_dim], b is [?, d, 1].
        w = torch.cat(
            [self.list_W_m[d_idx](split_x[d_idx]) for d_idx in range(nd)],
            dim=1
        )
        b = torch.cat(
            [self.list_B_m[d_idx](split_x[d_idx]) for d_idx in range(nd)],
            dim=1
        )

        # scores[n, i, j] = <w_i, w_j> + b_i + b_j
        scores = torch.bmm(w, w.transpose(1, 2)) + b + b.transpose(1, 2)

        # A domain is not scored against itself. The mask is created on the
        # device of the scores; the former zero tensor was forced onto the CPU.
        diagonal = torch.eye(nd, dtype=torch.bool, device=scores.device)
        return scores.masked_fill(diagonal, 0.0)
    
        

In [14]:
# Size of every embedding vector learned by Net.
emb_dim = 256
# Per-domain sizes as a list, in the key order of domain_dims
# (get_domain_dims returns the keys sorted by domain name).
# NOTE(review): Net indexes this list by input-column position, so this
# presumably requires feature_cols to be in the same sorted order - confirm.
domain_dims_vals = list( domain_dims.values() )

In [39]:
# Build the model, an Adam optimizer over all of its registered parameters,
# and use custom_loss as the training criterion.
# NOTE(review): `net` is never moved to `device` in the cells shown here, and
# the training loop builds CPU tensors - confirm that CPU training is intended.
net = Net(emb_dim,domain_dims_vals)
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
criterion = custom_loss

In [40]:
# Sanity check: Net registers one embedding and one bias table per domain,
# so this should equal 2 * number of domains.
len(list(net.parameters()))

16

In [25]:
# Model input: the feature columns of the training frame as a NumPy array
# (converted to a LongTensor batch by batch in the training loop).
train_x = train_df[feature_cols].values

In [41]:
# Expected layout: (number of records, number of features, number of features).
X_ij.shape

(140382, 8, 8)

In [42]:
# Training hyper-parameters.
num_epochs = 25      # full passes over the training data
bs = 256             # records per mini-batch
log_interval = 50    # print the loss every `log_interval` batches

In [43]:
# Number of *full* batches (floor division); the training loop iterates
# num_batches + 1 times so that the remaining records are used as well.
num_batches = train_x.shape[0]//bs 

In [44]:

# Mini-batch training: every epoch reshuffles the records (inputs and targets
# with the same permutation) and performs one optimizer step per batch.
num_samples = train_x.shape[0]

for epoch in range(num_epochs):
    # Shuffle
    ind_list = list(range(num_samples))
    shuffle(ind_list)
    _train_x = train_x[ind_list, :]
    _y = X_ij[ind_list, :, :]

    # Step through the data in strides of `bs`. The last batch holds the
    # remaining records and is never empty; iterating num_batches + 1 times
    # produced an empty batch (and a NaN loss) whenever the number of records
    # was an exact multiple of `bs`.
    for batch_idx, start in enumerate(range(0, num_samples, bs)):
        _x_pos = _train_x[start:start + bs]
        _y_true = _y[start:start + bs]
        # feed tensor
        _x_pos = torch.LongTensor(_x_pos)
        _y_true = torch.FloatTensor(_y_true)
        # ----- #
        optimizer.zero_grad()
        output = net(_x_pos)

        loss = criterion(output, _y_true)
        loss.backward()
        optimizer.step()
        # ----- #
        if batch_idx % log_interval == 0:
            # .item() yields a plain Python float for formatting.
            print('Train ::  Epoch: {}, Batch {}, Loss {:4f}'.format(epoch, batch_idx, loss.item()))
                        

Train ::  Epoch: 0, Batch 0, Loss 1565.781006
Train ::  Epoch: 0, Batch 50, Loss 759.853210
Train ::  Epoch: 0, Batch 100, Loss 406.476044
Train ::  Epoch: 0, Batch 150, Loss 243.695419
Train ::  Epoch: 0, Batch 200, Loss 157.073227
Train ::  Epoch: 0, Batch 250, Loss 110.142258
Train ::  Epoch: 0, Batch 300, Loss 79.698776
Train ::  Epoch: 0, Batch 350, Loss 57.867954
Train ::  Epoch: 0, Batch 400, Loss 52.459496
Train ::  Epoch: 0, Batch 450, Loss 40.097603
Train ::  Epoch: 0, Batch 500, Loss 37.207954
Train ::  Epoch: 1, Batch 0, Loss 32.085220
Train ::  Epoch: 1, Batch 50, Loss 25.074657
Train ::  Epoch: 1, Batch 100, Loss 22.791748
Train ::  Epoch: 1, Batch 150, Loss 19.727421
Train ::  Epoch: 1, Batch 200, Loss 17.088781
Train ::  Epoch: 1, Batch 250, Loss 15.921331
Train ::  Epoch: 1, Batch 300, Loss 12.981608
Train ::  Epoch: 1, Batch 350, Loss 12.816526
Train ::  Epoch: 1, Batch 400, Loss 12.145791
Train ::  Epoch: 1, Batch 450, Loss 11.652188
Train ::  Epoch: 1, Batch 500, Lo