In [1]:
import os
import sys
import pandas as pd
import numpy as np
import sklearn
import glob
import pickle
import random
from joblib import Parallel, delayed
import yaml
import math
from collections import Counter
sys.path.append('.')
sys.path.append('./..')
import model_file

Using TensorFlow backend.


In [2]:
# Name of the record-identifier column; the functions below exclude it when
# they build their list of feature columns.
id_col = 'PanjivaRecordID'

In [3]:

def replace_attr_with_id(row, attr, val2id_dict):
    """
    Translate the value stored under `attr` in `row` into its id.

    :param row: mapping-like record (e.g. a DataFrame row) indexable by `attr`
    :param attr: name of the attribute / column to translate
    :param val2id_dict: dict mapping raw values to ids
    :return: the id for the value, or None when the value has no entry
             (the attribute name and the offending value are printed)
    """
    raw_value = row[attr]
    if raw_value in val2id_dict:
        return val2id_dict[raw_value]

    # Unknown value: report it and signal the miss with None
    print(attr, raw_value)
    return None

In [4]:
def convert_to_ids(
        df,
        save_dir,
        id_column=None
):
    """
    Replace every feature value with an integer id and save the domain sizes.

    For each column other than the identifier column, the distinct values are
    sorted and numbered 0..n-1, so the same input always yields the same ids.
    The number of distinct values per column is pickled to
    `<save_dir>/domain_dims.pkl`, with columns inserted in sorted order.

    The input frame is not modified; a converted copy is returned.
    NOTE(review): columns are expected to be free of missing values (the
    notebook drops them beforehand) — confirm for other callers.

    :param df: DataFrame holding the identifier column plus feature columns
    :param save_dir: directory for `domain_dims.pkl` (created if missing)
    :param id_column: name of the identifier column; defaults to the
                      module-level `id_col`
    :return: (converted DataFrame, {column: {value: id}})
    :raises ValueError: if the identifier column is not present in `df`
    """
    if id_column is None:
        # Fall back to the notebook-level identifier column name
        id_column = id_col

    # Work on a copy so the caller's frame is not altered as a side effect
    df = df.copy()

    feature_columns = list(df.columns)
    feature_columns.remove(id_column)

    dict_DomainDims = {}
    col_val2id_dict = {}

    for col in sorted(feature_columns):
        distinct = set(df[col])
        # Sort so that id assignment does not depend on set iteration order
        try:
            vals = sorted(distinct)
        except TypeError:
            # Values of mixed, non-comparable types: order by string form
            vals = sorted(distinct, key=lambda v: (str(v), type(v).__name__))

        # ----
        #   item1 : 0 ,
        #   item2 : 1 ,
        #   ...
        # ----
        val2id_dict = {val: idx for idx, val in enumerate(vals)}
        col_val2id_dict[col] = val2id_dict

        # Replace values by ids in one vectorised lookup
        df[col] = df[col].map(val2id_dict)
        dict_DomainDims[col] = len(vals)

    print(' Feature columns :: ', feature_columns)
    print('dict_DomainDims ', dict_DomainDims)

    # -------------
    # Save the domain dimensions
    # -------------
    file = 'domain_dims.pkl'
    # makedirs also handles nested paths and an already existing directory
    os.makedirs(save_dir, exist_ok=True)
    f_path = os.path.join(save_dir, file)

    with open(f_path, 'wb') as fh:
        pickle.dump(
            dict_DomainDims,
            fh,
            pickle.HIGHEST_PROTOCOL
        )
    return df, col_val2id_dict

In [5]:
def HSCode_cleanup(list_df, hs_code_filter_file='hscode_filter_file.txt'):
    """
    Normalise the 'HSCode' column of each frame and keep only curated codes.

    Steps applied to every DataFrame in `list_df`:
      1. drop rows containing missing values,
      2. convert the code to text, truncate it to 6 characters, and restore a
         leading zero on 5-character codes,
      3. keep a row only if the first 2 or first 4 characters of its code
         appear in the curated list,
      4. drop the rows rejected in step 3.

    :param list_df: list of DataFrames, each with an 'HSCode' column
    :param hs_code_filter_file: headerless text/CSV file whose first column
        lists the curated codes (only the first 4 characters are used)
    :return: list of cleaned DataFrames, in the same order as the input
    """
    hscode_col = 'HSCode'

    # ----- #
    # Expert curated HS codes.
    # Read as text: numeric parsing would drop leading zeros ('0304' -> 304),
    # so those codes could never match.
    tmp = pd.read_csv(
        hs_code_filter_file,
        index_col=None,
        header=None,
        dtype=str
    )
    # A set gives constant-time membership tests in the row filter below
    target_codes = {str(_).strip()[:4] for _ in tmp[0].dropna()}

    def filter_by_ExpertHSCodeList(_code, target_codes):
        if _code[:2] in target_codes or _code[:4] in target_codes:
            return _code
        return None

    def add_preceeding_zero(_code):
        _code = _code.strip()
        # A code that passed through a float column is rendered as '30499.0'
        if _code.endswith('.0'):
            _code = _code[:-2]
        if len(_code) > 6:
            _code = _code[:6]
        elif len(_code) == 5:
            _code = '0' + _code
        return _code

    list_processed_df = []
    for df in list_df:
        # Explicit copy: the column assignment below must not hit a view
        df = df.dropna().copy()
        df[hscode_col] = (
            df[hscode_col]
            .astype(str)
            .apply(add_preceeding_zero)
            .apply(filter_by_ExpertHSCodeList, args=(target_codes,))
        )
        # Rows whose code was rejected now hold None and are removed here
        df = df.dropna()
        list_processed_df.append(df)
    # --------- #

    return list_processed_df

In [6]:
def remove_low_frequency_values(df, freq_bound=3, id_column=None):
    """
    Drop rows that contain a rarely occurring value in any feature column.

    A value is "rare" when it occurs fewer than `freq_bound` times in its
    column. All rare values are determined on the input frame first; rows are
    removed afterwards, so counts are not re-evaluated after rows disappear.

    :param df: DataFrame holding the identifier column plus feature columns
    :param freq_bound: minimum number of occurrences a value needs to be kept
    :param id_column: name of the identifier column (excluded from the
                      check); defaults to the module-level `id_col`
    :return: filtered DataFrame (original index labels are kept)
    :raises ValueError: if the identifier column is not present in `df`
    """
    if id_column is None:
        # Fall back to the notebook-level identifier column name
        id_column = id_col

    feature_cols = list(df.columns)
    feature_cols.remove(id_column)

    # ----
    # figure out which entities are to be removed
    # ----
    freq_column_value_filters = {
        c: [
            _item
            for _item, _count in Counter(list(df[c])).items()
            if _count < freq_bound
        ]
        for c in feature_cols
    }

    print('Removing :: ')
    for c, _items in freq_column_value_filters.items():
        print('column : ', c, 'count', len(_items))

    # Length reported here is the size BEFORE any row is removed
    print(' DF length : ', len(df))
    for col, val in freq_column_value_filters.items():
        df = df.loc[
            (~df[col].isin(val))
        ]

    return df

In [7]:
def clean_train_data(files=None):
    """
    Load the raw shipment records and return the cleaned training frame.

    Pipeline: read the selected columns from each CSV, drop rows with missing
    values, normalise/filter the HS codes (`HSCode_cleanup`), combine the
    frames, then remove rows holding low-frequency values
    (`remove_low_frequency_values`).

    :param files: list of CSV paths to load; defaults to the Costa Rica
                  exports file used by this notebook
    :return: cleaned DataFrame
    """
    use_cols = ['PanjivaRecordID', 'ShipperPanjivaID','ShipmentDestination', 'TransportMethod', 'HSCode']

    if files is None:
        files = ['./../../Data/panjiva_costa_rica_exports_02_2017.csv']

    list_df = [
        pd.read_csv(_file, usecols=use_cols, low_memory=False).dropna()
        for _file in files
    ]

    # use_cols
    print (list_df[0].columns)

    list_df = HSCode_cleanup(list_df)

    if len(list_df) == 1:
        # Single source: keep its index labels, but detach from the input
        master_df = list_df[0].copy()
    else:
        # DataFrame.append was removed in pandas 2.0; concat builds the
        # combined frame in one step and renumbers the index
        master_df = pd.concat(list_df, ignore_index=True)

    master_df = remove_low_frequency_values(master_df)

    return master_df


In [8]:
# Build the cleaned training frame: load the CSV, filter HS codes and
# remove rows holding low-frequency values.
train_df = clean_train_data()

Index(['PanjivaRecordID', 'ShipperPanjivaID', 'ShipmentDestination',
       'TransportMethod', 'HSCode'],
      dtype='object')
Removing :: 
column :  ShipperPanjivaID count 51
column :  ShipmentDestination count 9
column :  TransportMethod count 0
column :  HSCode count 26
 DF length :  2686


In [9]:
# Output directory handed to convert_to_ids (domain_dims.pkl is written there)
save_dir = '.'
# Preview the cleaned records while they still hold their raw values
train_df.head(10)

Unnamed: 0,PanjivaRecordID,ShipperPanjivaID,ShipmentDestination,TransportMethod,HSCode
18,4432892,29142268,United States,Maritime,950669
42,4449340,29142268,United States,Air,950669
86,4356348,45735618,India,Maritime,440799
87,4356412,36856429,India,Maritime,440349
88,4361788,41666887,Chile,Maritime,940340
139,4377660,44479072,India,Maritime,440729
152,4381116,45735618,India,Maritime,440349
158,4387068,39846525,Bangladesh,Maritime,440399
172,4393212,29142268,United States,Maritime,950669
174,4406780,45722841,India,Maritime,440796


In [10]:
# File name and full path (under save_dir) — presumably for a pickle of the
# per-column value -> id dictionary; NOTE(review): confirm intended use.
column_valuesId_dict_file = 'column_valuesId_dict.pkl'
column_valuesId_dict_path = os.path.join(save_dir, column_valuesId_dict_file)

In [11]:
# Replace every feature value by its integer id
# (convert_to_ids also writes domain_dims.pkl into save_dir)
train_df, col_val2id_dict = convert_to_ids(
    train_df,
    save_dir
)

# Persist the {column: {value: id}} mapping; without it the ids in
# train_data.csv cannot be translated back to the original entities.
with open(column_valuesId_dict_path, 'wb') as fh:
    pickle.dump(col_val2id_dict, fh, pickle.HIGHEST_PROTOCOL)

id_col = 'PanjivaRecordID'

# Column layout of the saved file: id column first, then features sorted by name
feature_cols = list(train_df.columns)
feature_cols.remove(id_col)
feature_cols = list(sorted(feature_cols))
all_cols = [id_col]
all_cols.extend(feature_cols)

train_df = train_df[all_cols]
train_df_file='train_data.csv'
train_df.to_csv(train_df_file, index=False)


 Feature columns ::  ['ShipperPanjivaID', 'ShipmentDestination', 'TransportMethod', 'HSCode']
dict_DomainDims  {'HSCode': 57, 'ShipmentDestination': 30, 'ShipperPanjivaID': 91, 'TransportMethod': 3}


In [12]:

def create_coocc_matrix(df, col_1, col_2):
    """
    Count how often each (col_1 id, col_2 id) pair occurs in `df`.

    The matrix has one row per distinct value of `col_1` and one column per
    distinct value of `col_2`; the values themselves are used directly as
    row/column indices, so they must be integer ids in 0..n-1.

    :param df: DataFrame whose `col_1` / `col_2` columns hold integer ids
    :param col_1: column providing the row index
    :param col_2: column providing the column index
    :return: float ndarray of shape (#distinct col_1, #distinct col_2) where
             entry [i, j] is the number of rows with col_1 == i and col_2 == j
    :raises IndexError: if an id is not a valid index into the matrix
    """
    count_1 = len(set(df[col_1]))
    count_2 = len(set(df[col_2]))
    coocc = np.zeros([count_1, count_2])

    # One row per distinct (col_1, col_2) pair together with its frequency
    pair_counts = (
        df[[col_1, col_2]]
        .groupby([col_1, col_2])
        .size()
        .reset_index(name='count')
    )

    if len(pair_counts) > 0:
        # Pairs are unique after the groupby, so a single indexed assignment
        # fills every non-zero cell (no Python-level row loop needed)
        coocc[
            pair_counts[col_1].values,
            pair_counts[col_2].values
        ] = pair_counts['count'].values

    print('Col 1 & 2', col_1, col_2, coocc.shape, '>>', (count_1, count_2))
    return coocc


'''
Create co-occurrence between entities using training data. 
Returns a dict { Domain1_+_Domain2 : __matrix__ }
Domain1 and Domain2 are sorted lexicographically
'''

from collections import OrderedDict

def get_coOccMatrix_dict(df, id_col):
    """
    Build the co-occurrence matrix for every unordered pair of feature columns.

    :param df: DataFrame holding the identifier column plus feature columns
    :param id_col: name of the identifier column (left out of the pairing)
    :return: OrderedDict keyed by '<colA>_+_<colB>' (colA before colB in
             sorted order), each value produced by create_coocc_matrix
    """
    domains = list(df.columns)
    domains.remove(id_col)
    domains.sort()

    pairwise = OrderedDict()
    for pos, first in enumerate(domains):
        # Pair each column only with those that follow it in sorted order
        for second in domains[pos + 1:]:
            pairwise[first + '_+_' + second] = create_coocc_matrix(df, first, second)
    return pairwise

In [13]:
# Pairwise co-occurrence matrices of the id-encoded training data,
# keyed by '<colA>_+_<colB>'
coOccMatrix_dict = get_coOccMatrix_dict(train_df, id_col='PanjivaRecordID')

Col 1 & 2 HSCode ShipmentDestination (57, 30) >> (57, 30)
Col 1 & 2 HSCode ShipperPanjivaID (57, 91) >> (57, 91)
Col 1 & 2 HSCode TransportMethod (57, 3) >> (57, 3)
Col 1 & 2 ShipmentDestination ShipperPanjivaID (30, 91) >> (30, 91)
Col 1 & 2 ShipmentDestination TransportMethod (30, 3) >> (30, 3)
Col 1 & 2 ShipperPanjivaID TransportMethod (91, 3) >> (91, 3)


In [14]:
# Store the co-occurrence matrices in the working directory
with open("coOccMatrix_dict.pkl",'wb') as fh:
    pickle.dump( coOccMatrix_dict, fh, pickle.HIGHEST_PROTOCOL )


In [15]:
# Reload the per-column domain sizes written by convert_to_ids.
# The relative path matches the file only because save_dir is '.'.
with open("domain_dims.pkl",'rb') as fh:
    domain_dims = pickle.load( fh )

In [16]:
# Largest co-occurrence count of every column pair, collected in the
# iteration order of coOccMatrix_dict
X_ij_max = []
for k,v in coOccMatrix_dict.items():
    print(k, np.max(v))
    X_ij_max.append(np.max(v))

HSCode_+_ShipmentDestination 318.0
HSCode_+_ShipperPanjivaID 176.0
HSCode_+_TransportMethod 342.0
ShipmentDestination_+_ShipperPanjivaID 253.0
ShipmentDestination_+_TransportMethod 595.0
ShipperPanjivaID_+_TransportMethod 253.0


In [17]:
# ======================================= #

In [18]:
# Display the per-pair maxima that are handed to the model below
X_ij_max

[318.0, 176.0, 342.0, 253.0, 595.0, 253.0]

In [19]:
# Build the model from the project-local model_file module.
# domain_dims keeps the insertion order used by convert_to_ids (columns in
# sorted order), so the list below follows the sorted feature-column order.
# NOTE(review): the keyword spelling 'domain_dimesnsions' is taken as-is from
# this call — confirm it matches the signature in model_file.
# NOTE(review): num_domains is hard-coded; presumably it must equal
# len(domain_dims) — confirm.
model = model_file.get_model(
    domain_dimesnsions = list(domain_dims.values()),
    num_domains = 4,
    embed_dim = 16,
    _X_ij_max = X_ij_max
)







In [20]:
# Create training data
feature_cols = sorted(list(train_df.columns))
feature_cols = list(feature_cols)
feature_cols.remove(id_col)
data = train_df[feature_cols].values

In [21]:
# For every record, look up the co-occurrence count of each pair of its
# feature values: X_ij[d, k] is the count for record d and the k-th column
# pair, with pairs enumerated in the same (i < j) order as coOccMatrix_dict.
nd = len(feature_cols)
num_c = nd *(nd-1) // 2
X_ij = np.zeros([data.shape[0], num_c ])

k = 0
for i in range(nd):
    for j in range(i+1, nd):
        key = feature_cols[i]+ '_+_' + feature_cols[j]
        print('>>',key)

        # Vectorised lookup: index the pair's matrix with the whole id
        # columns at once instead of looping over records in Python
        X_ij[:, k] = coOccMatrix_dict[key][data[:, i], data[:, j]]
        k+=1


with open("X_ij.pkl","wb") as fh:
    pickle.dump(X_ij,fh,pickle.HIGHEST_PROTOCOL)
    

>> HSCode_+_ShipmentDestination
>> HSCode_+_ShipperPanjivaID
>> HSCode_+_TransportMethod
>> ShipmentDestination_+_ShipperPanjivaID
>> ShipmentDestination_+_TransportMethod
>> ShipperPanjivaID_+_TransportMethod


In [24]:
# Train on the id-encoded records (data) with the per-record pairwise
# co-occurrence counts (X_ij).
# NOTE(review): the output recorded for this cell ends in an
# InvalidArgumentError raised during training; the failing code lives in
# model_file and is not visible in this notebook.
model_file.train_model(
    model,
    data,
    X_ij
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 4)            0                                            
__________________________________________________________________________________________________
split_layer (Lambda)            [(None, 1), (None, 1 0           input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 16)        912         split_layer[0][0]                
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 16)        480         split_layer[0][1]                
__________________________________________________________________________________________________
embedding_

InvalidArgumentError: data[0].shape = [2] does not start with indices[0].shape = [3]
	 [[{{node training/Adam/gradients/loss/stack_layer_loss/Sum_grad/DynamicStitch}}]]