# Case study based on US Imports

## train on data: 8 months of 2015
## use the next 4 months as test data (no filtering)
## analyze results with low scores - find a transaction such that the combination of 2 entities is rare and is also covered by some filter.
## test on just the test data, no anomalies injected

In [33]:
import pandas as pd 
import numpy as np
import pickle
import os
import sys
import itertools
from itertools import combinations

In [19]:
# Root folder of the generated inputs (CSV splits and pickled lookups are read from it below).
DATA_DIR = 'generated_data'
# Data-set sub-folder; joined under both DATA_DIR and 'output' below.
_DIR = 'us_import'
# Name of the column used to look up a single record in test_df.
id_col ='PanjivaRecordID'

In [16]:
# Read the train and test splits from the data set's folder.
_data_root = os.path.join(DATA_DIR, _DIR)
train_df = pd.read_csv(os.path.join(_data_root, 'train_data.csv'))
test_df = pd.read_csv(os.path.join(_data_root, 'test_data.csv'))

In [17]:
test_df.head(10)

Unnamed: 0,PanjivaRecordID,Carrier,ConsigneePanjivaID,PortOfLading,PortOfUnlading,ShipmentDestination,ShipmentOrigin,ShipperCountry,ShipperPanjivaID,hscode_6
0,114779263,180,2830,212,15,104,116,115,2362,64
1,113632447,44,5436,38,51,15,81,81,4013,50
2,113695935,14,2625,38,51,30,81,81,4231,21
3,113721855,127,3301,199,15,82,81,81,4135,6
4,113688319,514,4012,142,51,44,81,81,1364,45
5,113695487,84,5929,142,51,99,81,81,20,21
6,113732799,44,281,46,63,56,81,81,3722,26
7,113701823,538,3141,46,14,103,81,81,5073,50
8,113722943,127,413,46,15,104,81,81,2007,50
9,113674047,376,432,38,15,104,81,81,6255,21


In [18]:
# Scored records for this data set; its 'PanjivaRecordID' and 'score' columns are read below.
result_df = pd.read_csv(os.path.join('output', _DIR, 'result_1.csv'))

In [175]:
# Record ids (kept as a pandas Series) and their scores (plain list),
# both in the row order of result_df.
anomaly_id_list = result_df['PanjivaRecordID']
score_list = result_df['score'].tolist()

In [86]:
# Load the per-column value -> id mappings, then invert each one so that
# encoded ids can be translated back to their original values.
col_val2id_dict_file = os.path.join(DATA_DIR, _DIR, 'col_val2id_dict.pkl')
with open(col_val2id_dict_file, 'rb') as fh:
    col_val2id_dict = pickle.load(fh)

col_id2val_dict = {
    column: {encoded: raw for raw, encoded in val2id.items()}
    for column, val2id in col_val2id_dict.items()
}
    

In [92]:
def convert_row2vals( _row_dict, col_id2val_dict ):
    """Translate a row of encoded ids back to their original values.

    For every (column, id) pair in `_row_dict`, the column's id -> value
    mapping is taken from `col_id2val_dict` and applied to the id.
    A column or id missing from the mappings raises KeyError.
    """
    return {
        column: col_id2val_dict[column][encoded]
        for column, encoded in _row_dict.items()
    }

In [101]:
# Keep the 'hscode_6' values of the rows in collated_hscode_filters.csv
# whose 'count' is at least 1.
hdf_df = pd.read_csv('collated_hscode_filters.csv')
tmp = hdf_df[hdf_df['count'] >= 1]

hdf_list = tmp['hscode_6'].tolist()

In [122]:
# Unpickle the embeddings saved for the training records and the
# per-domain dimension mapping.
_emb_path = os.path.join('output', _DIR, 'train_embeddings.pkl')
_dims_path = os.path.join(DATA_DIR, _DIR, 'domain_dims.pkl')
with open(_emb_path, 'rb') as fh:
    train_x_emb = pickle.load(fh)
with open(_dims_path, 'rb') as fh:
    domain_dims = pickle.load(fh)

In [123]:
train_x_emb.shape

(222443, 9, 12)

In [118]:
# Unpickle the matrix of encoded training records.
train_x_pos_file = os.path.join(DATA_DIR, _DIR, 'matrix_train_positive_v1.pkl')
with open(train_x_pos_file, 'rb') as fh:
    train_x = pickle.load(fh)

In [124]:
train_x.shape

(222443, 9)

In [138]:
# Embedding width is the last axis of the embedding tensor; domain names and
# sizes come from domain_dims, and positions in that ordering act as domain ids.
emb_size = train_x_emb.shape[-1]
domain_names = list(domain_dims)
domain_sizes = [domain_dims[name] for name in domain_names]
domain_emb_dict = {}
domain_id2name = dict(enumerate(domain_names))
domain_name2id = {name: idx for idx, name in domain_id2name.items()}

In [139]:
domain_id2name

{0: 'Carrier',
 1: 'ConsigneePanjivaID',
 2: 'PortOfLading',
 3: 'PortOfUnlading',
 4: 'ShipmentDestination',
 5: 'ShipmentOrigin',
 6: 'ShipperCountry',
 7: 'ShipperPanjivaID',
 8: 'hscode_6'}

In [140]:
# Pre-allocate one zero-filled (domain size x emb_size) table per domain.
domain_emb_dict.update(
    (name, np.zeros([size, emb_size]))
    for size, name in zip(domain_sizes, domain_names)
)

num_cols = train_x_emb.shape[1]

In [141]:
# Fill each domain's embedding table from the training data: for every id
# that appears in column c, store the embedding of its first occurrence.
for c in range(num_cols):
    # np.unique(..., return_index=True) returns each distinct id together with
    # the row index of its first occurrence. This keeps ids as integers
    # (no round-trip through a float DataFrame) and avoids a per-row
    # Python loop over the de-duplicated frame.
    uniq_ids, first_rows = np.unique(train_x[:, c], return_index=True)
    table = domain_emb_dict[domain_id2name[c]]
    table[uniq_ids.astype(int)] = train_x_emb[first_rows, c, :]
        

In [153]:
domain_id2name

{0: 'Carrier',
 1: 'ConsigneePanjivaID',
 2: 'PortOfLading',
 3: 'PortOfUnlading',
 4: 'ShipmentDestination',
 5: 'ShipmentOrigin',
 6: 'ShipperCountry',
 7: 'ShipperPanjivaID',
 8: 'hscode_6'}

In [154]:
def get_embedding(id_value, domain_id, domain_emb_dict):
    """Return the embedding stored for one entity id of one domain.

    `domain_id` is translated to a domain name through the module-level
    `domain_id2name`; the result is `domain_emb_dict[name][id_value]`.
    The id is echoed to stdout before the lookup.
    """
    print(id_value)
    embeddings = domain_emb_dict[domain_id2name[domain_id]]
    return embeddings[id_value]

def get_emb_arr(arr, domain_emb_dict):
    """Collect the embeddings of every entity id in a record.

    The position of each id in `arr` is passed to `get_embedding` as its
    domain id; the resulting vectors are returned as one numpy array in
    the same order.
    """
    vectors = [
        get_embedding(entity_id, position, domain_emb_dict)
        for position, entity_id in enumerate(arr)
    ]
    return np.array(vectors)

def dissect(arr):
    """Project each row of `arr` onto the direction of the rows' mean.

    Parameters
    ----------
    arr : np.ndarray
        2-D array with one vector per row.

    Returns
    -------
    np.ndarray
        1-D array with one entry per row; entry i equals
        dot(mean, arr[i]) / ||mean||_2, where mean is the column-wise
        mean of `arr`.
    """
    mean_vec = np.mean(arr, axis=0)
    # The norm of the mean is identical for every row, so compute it once.
    mean_norm = np.linalg.norm(mean_vec, ord=2)
    # Rows are used as-is (not unit-normalised), so an all-zero row simply
    # projects to 0 without triggering a divide warning.
    return np.array(
        [np.dot(mean_vec, arr[i]) / mean_norm for i in range(arr.shape[0])]
    )

In [None]:
# create a list of valid PanjivaIDs where HS Code filters match


In [None]:
def investigate(target_id, train_df, test_df,_score):
    """Print a diagnostic report for one scored test record.

    Reads the module-level `id_col`, `hdf_list`, `col_id2val_dict` and
    `domain_emb_dict`.

    Steps:
      1. Locate the row of `test_df` whose `id_col` equals `target_id` and
         print it (without the id column).
      2. Decode the row with `convert_row2vals`; if its 'hscode_6' value is
         not in `hdf_list`, stop here.
      3. Print the score and, per attribute, the projection returned by
         `dissect` on the row's embeddings, marking values below the mean
         projection with '::>' and the others with '[[ ... ]]'.
      4. For every pair of attributes, count the rows of `train_df` sharing
         both values and print the pairs with fewer than 10 matches.

    Parameters
    ----------
    target_id : value compared against `test_df[id_col]`.
    train_df : pandas.DataFrame queried for attribute-pair co-occurrence.
    test_df : pandas.DataFrame containing the record to inspect.
    _score : value printed as the record's score.

    Raises
    ------
    ValueError
        If `test_df` does not contain exactly one row for `target_id`.
    """
    matches = test_df.loc[test_df[id_col] == target_id]
    if len(matches) != 1:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError(
            'expected exactly one test row with {} == {!r}, found {}'.format(
                id_col, target_id, len(matches)
            )
        )
    target_row = matches.iloc[0]
    print('ID ', target_row[id_col])
    # The id is not an entity attribute; drop it before any further analysis.
    target_row = target_row.drop(labels=id_col)
    print(target_row)

    attributes = list(target_row.index)
    entity_ids = list(target_row)

    val_dict = convert_row2vals(target_row.to_dict(), col_id2val_dict)
    if val_dict['hscode_6'] not in hdf_list:
        return
    print('-------------> Filter Valid')
    print('Score ',_score)

    prj = dissect(get_emb_arr(entity_ids, domain_emb_dict))
    _mean = np.mean(prj)

    for attr, projection in zip(attributes, prj):
        if projection < _mean:
            print ('::>', attr, projection)
        else:
            print ('[[', attr, projection, ']]')

    print(val_dict)

    # Report attribute pairs whose joint values occur in fewer than 10 training rows.
    for comb in combinations(attributes, 2):
        query_str = ' & '.join(
            ' ' + _c + ' == ' + str(target_row[_c]) for _c in comb
        )
        res_query = train_df.query(query_str)
        if len(res_query) < 10:
            print('>>>', query_str)
            print(len(res_query))
    print('@==========================================================================@')

In [186]:
# Run the report over a fixed window of positions in the result lists.
first_idx, stop_idx = 95000, 95500
for idx in range(first_idx, stop_idx):
    _score = score_list[idx]
    investigate(anomaly_id_list[idx], train_df, test_df, _score)

ID  116068918
Carrier                 584
ConsigneePanjivaID     1233
PortOfLading            236
PortOfUnlading            0
ShipmentDestination      26
ShipmentOrigin           81
ShipperCountry           81
ShipperPanjivaID       4477
hscode_6                 50
Name: 41898, dtype: int64
-------------> Filter Valid
Score  0.7350806593894958
584
1233
236
0
26
81
81
4477
50
::> Carrier -0.007047368637631641
[[ ConsigneePanjivaID 0.18429571062629735 ]]
::> PortOfLading 0.06760882182655184
::> PortOfUnlading 0.029924892042672794
[[ ShipmentDestination 0.24617104199971734 ]]
[[ ShipmentOrigin 0.2896446005968335 ]]
[[ ShipperCountry 0.19534922220904555 ]]
::> ShipperPanjivaID -0.27891570383534176
[[ hscode_6 0.24234573268082613 ]]
{'Carrier': 'SHKK - Schenkerocean Limited', 'ConsigneePanjivaID': 44210348.0, 'PortOfLading': 'Ningpo, China', 'PortOfUnlading': 'Baltimore, Maryland', 'ShipmentDestination': 'Baltimore, Maryland', 'ShipmentOrigin': 'China', 'ShipperCountry': 'China', 'ShipperPa

In [190]:
np.percentile(score_list,7)



0.5180734539031983