In [1]:
import pickle
import numpy as np
import pandas as pd
import sys
from sklearn.decomposition import PCA
import os
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from scipy.spatial.distance import euclidean

In [2]:
# Dataset sub-directory name; it is joined onto both base directories below.
_DIR = 'china_import'
# Base directory from which train_embedding_values.pkl is read.
_FILE_DIR = './../output'
# Base directory from which domain_dims.pkl and the test/anomaly CSV files are read.
DATA_DIR = './../../../generated_data'

In [3]:
# Load the pickled training data; element 0 becomes train_x and element 1
# becomes train_x_emb below.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open(os.path.join(_FILE_DIR, _DIR, 'train_embedding_values.pkl'),'rb') as fh:
    arr = pickle.load(fh)
    
# Load domain_dims; later cells read its keys as domain names and its values
# as domain sizes.
with open(os.path.join(DATA_DIR, _DIR, 'domain_dims.pkl'),'rb') as fh:
    domain_dims =  pickle.load(fh)
    
train_x = arr[0]
train_x_emb = arr[1]
# The length of the last axis of train_x_emb is used as the embedding width.
emb_size = train_x_emb.shape[-1]

# The per-domain embedding arrays are created in the cells that follow.


In [4]:
# Domain names and their sizes, in the key order of domain_dims.
domain_names = [*domain_dims]
domain_sizes = [domain_dims[name] for name in domain_names]
# Will map domain id -> embedding table; filled in by later cells.
domain_emb_dict = dict()

In [5]:
# Two-way lookup between a domain's position in domain_names and its name.
domain_id2name = dict(enumerate(domain_names))
domain_name2id = {name: idx for idx, name in enumerate(domain_names)}

In [6]:
# Allocate one zero-filled (domain size x emb_size) table per domain,
# keyed by the domain's integer id.
for _dn, _ds in zip(domain_names, domain_sizes):
    a = np.zeros(shape=(_ds, emb_size))
    domain_emb_dict[domain_name2id[_dn]] = a

In [7]:
# Sanity check: shape of the table allocated for domain id 1.
domain_emb_dict[1].shape

(3375, 12)

In [8]:
# Fill each domain's embedding table from the training data.
# Column c of train_x holds entity ids and train_x_emb[:, c, :] holds the
# embedding vector stored alongside each id; domain_emb_dict[c] is indexed by id.
# NOTE(review): this relies on the column order of train_x matching the key
# order of domain_dims (which defines the ids in domain_emb_dict) -- confirm.
num_cols = train_x_emb.shape[1]
for c in range(num_cols):
    ids = train_x[:, c].astype(np.int64)
    vals = train_x_emb[:, c, :]
    # return_index=True yields the row of the FIRST occurrence of every id,
    # i.e. the same row the previous drop_duplicates(subset=['id']) kept,
    # without building a DataFrame and iterating over it row by row.
    unique_ids, first_rows = np.unique(ids, return_index=True)
    domain_emb_dict[c][unique_ids] = vals[first_rows]

In [9]:
# Paths of the test-record CSV and the anomaly CSV for the selected dataset.
test_x_df_file = os.path.join(DATA_DIR, _DIR, 'test_data.csv')
anomalies_c1_df_file = os.path.join(DATA_DIR, _DIR, 'anomalies_c1_data.csv')

In [10]:
# select 100 of the test instances


In [11]:
# Display the resolved path of the anomaly CSV.
anomalies_c1_df_file

'./../../../generated_data/china_import/anomalies_c1_data.csv'

In [12]:
# NOTE(review): `suffix` is not referenced by any other cell shown here; it
# presumably documents the 3-character tail that set_ref_id strips from
# PanjivaRecordID -- confirm.
suffix = '011'

# Load the anomaly records.
a_c1_df = pd.read_csv(anomalies_c1_df_file)

In [13]:
def set_ref_id(row):
    """Return the row's PanjivaRecordID with its last three characters removed.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide a 'PanjivaRecordID' entry.

    Returns
    -------
    int
        The integer parsed from the textual id after dropping its final
        three characters.
    """
    record_id_text = str(row['PanjivaRecordID'])
    return int(record_id_text[:-3])

In [14]:
# Derive ref_id for every anomaly row via set_ref_id.
# The former `a_c1_df['ref_id'] = 0` initialisation was a dead store: the column
# was overwritten by the very next statement and set_ref_id never reads it.
a_c1_df['ref_id'] = a_c1_df.apply(set_ref_id, axis=1)

In [15]:
# Preview the first 10 anomaly rows, including the derived ref_id column.
a_c1_df.head(10)

Unnamed: 0,PanjivaRecordID,AdminRegion,ConsigneePanjivaID,CountryOfSale,Province,ShipmentOrigin,TradeType,TransportMethod,hscode_6,ref_id
0,202269631011,176,2264,57,18,51,3,4,1,202269631
1,202802495011,115,2242,25,22,28,3,1,44,202802495
2,202272191011,176,1741,6,18,20,3,1,44,202272191
3,202212287011,296,2713,11,18,10,3,1,44,202212287
4,202210175011,273,289,103,10,67,3,4,22,202210175
5,202159295011,213,2587,97,18,111,7,4,22,202159295
6,203268287011,30,368,95,24,104,3,1,24,203268287
7,203182847011,6,816,34,24,40,3,1,42,203182847
8,202282431011,305,2775,58,10,62,3,1,42,202282431
9,203409535011,84,2495,99,2,108,3,1,61,203409535


In [31]:
# Load the test records.
test_df = pd.read_csv(test_x_df_file)
# a_c1_df is deliberately not re-read here; the frame loaded earlier (which
# already carries the ref_id column) is reused.
# a_c1_df = pd.read_csv(anomalies_c1_df_file)

In [32]:
# NOTE(review): only referenced by the commented-out `test_df.head(count)` in a
# later cell, so it currently has no effect.
count = 4000

In [33]:
# Number of test records loaded.
len(test_df)

14420

In [34]:
# Keep only the anomaly rows whose ref_id appears among the PanjivaRecordID
# values of the test records. (Truncating test_df with head(count) is disabled.)
a_c1_df = a_c1_df[a_c1_df['ref_id'].isin(test_df['PanjivaRecordID'].tolist())]


In [35]:
# Preview the first 10 anomaly rows that remain after filtering.
a_c1_df.head(10)

Unnamed: 0,PanjivaRecordID,AdminRegion,ConsigneePanjivaID,CountryOfSale,Province,ShipmentOrigin,TradeType,TransportMethod,hscode_6,ref_id
0,202269631011,176,2264,57,18,51,3,4,1,202269631
1,202802495011,115,2242,25,22,28,3,1,44,202802495
2,202272191011,176,1741,6,18,20,3,1,44,202272191
3,202212287011,296,2713,11,18,10,3,1,44,202212287
4,202210175011,273,289,103,10,67,3,4,22,202210175
5,202159295011,213,2587,97,18,111,7,4,22,202159295
6,203268287011,30,368,95,24,104,3,1,24,203268287
7,203182847011,6,816,34,24,40,3,1,42,203182847
8,202282431011,305,2775,58,10,62,3,1,42,202282431
9,203409535011,84,2495,99,2,108,3,1,61,203409535


In [36]:
# contrast will have : Actual +  Generated Anomaly
# Each entry is a 2-row array: row 0 holds the test record's field ids and
# row 1 holds the field ids of the first anomaly whose ref_id equals that
# record's PanjivaRecordID.
contrast_data = []
for i, row in test_df.iterrows():
    _tmp = a_c1_df.loc[a_c1_df['ref_id'] == row['PanjivaRecordID']]
    if _tmp.empty:
        # No anomaly refers to this test record. This is the one case the old
        # bare `except: pass` was needed for; any other failure (for example a
        # column-count mismatch in vstack) now surfaces instead of being hidden.
        continue
    # drop() returns new objects, so neither test_df's row nor the a_c1_df
    # slice is mutated (the old `del` acted on a slice of a_c1_df).
    v = row.drop(labels=['PanjivaRecordID']).values
    _tmp = _tmp.drop(columns=['PanjivaRecordID', 'ref_id']).values[0]
    contrast_data.append(np.vstack([v, _tmp]))

In [37]:
# Number of (test record, anomaly) pairs that were built.
len(contrast_data)

3991

In [38]:
def get_embedding(id_value, domain_id, domain_emb_dict):
    """Look up one entry in a domain's embedding table.

    Parameters
    ----------
    id_value : index into the selected domain's table.
    domain_id : key selecting the table inside domain_emb_dict.
    domain_emb_dict : mapping from domain id to an indexable table.

    Returns
    -------
    Whatever is stored at domain_emb_dict[domain_id][id_value].
    """
    domain_table = domain_emb_dict[domain_id]
    return domain_table[id_value]

def get_emb_arr(arr,domain_emb_dict):
    """Gather the embedding of every field of one record.

    The value at position i of `arr` is looked up in the table of domain i
    via get_embedding.

    Parameters
    ----------
    arr : sequence of ids, one per domain, ordered by domain id.
    domain_emb_dict : mapping from domain id to that domain's table.

    Returns
    -------
    numpy.ndarray built from the looked-up entries, one per element of `arr`.
    """
    return np.array([
        get_embedding(id_value, position, domain_emb_dict)
        for position, id_value in enumerate(arr)
    ])

In [39]:
def get_reduce_dim(arr):
    """Reduce the rows of `arr` to a single PCA component.

    Fits a one-component PCA (svd_solver='full') on `arr` and returns the
    transformed data reshaped to have exactly one column.
    """
    n_components = 1
    reducer = PCA(n_components=n_components, svd_solver='full')
    projected = reducer.fit_transform(arr)
    return projected.reshape(-1, n_components)

In [40]:
def dissect_v1(arr):
    """Print each row's leave-one-out effect on the squared norm of the row sum.

    For every row i the row is zeroed in a copy of `arr`, the squared L2 norm
    of the column-wise sum is recomputed, and the drop relative to the full
    array is printed as ``i <difference>``.

    Parameters
    ----------
    arr : numpy.ndarray, indexed as arr[row, column]. It is not modified.

    Returns
    -------
    None. Results are only printed.
    """
    # Squared L2 norm of the sum over all rows.
    val_1 = np.square(np.linalg.norm(np.sum(arr, axis=0), ord=2))

    # leave one out
    for i in range(arr.shape[0]):
        # Work on a copy so the caller's array is left untouched.
        tmp_copy = np.array(arr)
        tmp_copy[i, :] = 0
        val_2 = np.square(np.linalg.norm(np.sum(tmp_copy, axis=0), ord=2))
        print(i, val_1 - val_2)

In [41]:
def dissect_v2(arr):
    """Print, per row, the summed absolute deviation from each column's mean.

    For every column the mean over rows is computed; each row accumulates
    ``abs(value - column mean)`` across all columns. The resulting
    ``{row index: total}`` dict is printed prefixed with '>>'.

    Parameters
    ----------
    arr : numpy.ndarray, indexed as arr[row, column].

    Returns
    -------
    None. Results are only printed.
    """
    n_rows = arr.shape[0]
    deviation_by_row = {j: 0 for j in range(n_rows)}
    # column wise
    for i in range(arr.shape[1]):
        column = np.array(arr[:, i])
        column_mean = np.mean(column)
        # how far each row's component lies from the column mean
        for j in range(n_rows):
            deviation_by_row[j] += abs(column[j] - column_mean)
    print ('>>',deviation_by_row)

In [42]:
def dissect_v3(arr):
    """Print each row's mean cosine distance to the rows of `arr`.

    A symmetric matrix of pairwise cosine distances is built (diagonal left
    at 0) and its row means are printed. The mean is taken over all columns,
    so the zero self-distance on the diagonal is included in the average.

    Parameters
    ----------
    arr : numpy.ndarray, one vector per row.

    Returns
    -------
    None. Results are only printed.
    """
    n_rows = arr.shape[0]
    dist = np.zeros([n_rows, n_rows])
    # Visit every pair (r, c) with r < c once and mirror the value.
    upper_rows, upper_cols = np.triu_indices(n_rows, k=1)
    for r, c in zip(upper_rows, upper_cols):
        dist[r, c] = dist[c, r] = cosine(arr[r], arr[c])

    print(np.mean(dist, axis=1))

In [47]:
def dissect_v4(arr):
    """Return the scalar projection of every row onto the sum of all rows.

    For each row the value ``dot(row_sum, row) / ||row_sum||_2`` is computed,
    where ``row_sum`` is the column-wise sum of `arr`. The rows themselves are
    not normalised.

    Parameters
    ----------
    arr : numpy.ndarray, one vector per row.

    Returns
    -------
    list
        One projection value per row, in row order (empty for zero rows).
        If the row sum is the zero vector the division yields nan/inf.
    """
    arr_sum = np.sum(arr, axis=0)
    # The norm does not depend on the row, so compute it once up front.
    sum_norm = np.linalg.norm(arr_sum, ord=2)
    return [np.dot(arr_sum, arr[i]) / sum_norm for i in range(arr.shape[0])]

In [48]:
# Print float array elements with three decimal places.
np.set_printoptions(formatter={'float': "{0:0.3f}".format})


In [50]:
# Score the localisation heuristic: for every pair in contrast_data, check
# whether the field with the smallest dissect_v4 projection (computed on the
# second row's embeddings) is the field in which the two rows differ.
pos = 0
for instance in contrast_data:
    n, a = instance[0], instance[1]
    # XOR is non-zero exactly where the two id rows differ; take the first such column.
    diff_idx = np.nonzero(np.bitwise_xor(n, a))[0][0]
    n_emb = get_emb_arr(n, domain_emb_dict)
    a_emb = get_emb_arr(a, domain_emb_dict)

    r = dissect_v4(a_emb)
    min_idx = np.argmin(r)
    print(min_idx, diff_idx)

    # Count a hit when the flagged field is the differing one.
    pos += int(min_idx == diff_idx)

    # Squared L2 norm of the summed field embeddings, squashed with tanh for display.
    norm_n = np.square(np.linalg.norm(n_emb.sum(axis=0), ord=2))
    norm_a = np.square(np.linalg.norm(a_emb.sum(axis=0), ord=2))
    print(np.tanh(norm_n) , ' || ', np.tanh(norm_a))
    print('----')

print(pos, len(contrast_data))

True    [ 176 2264   57   18   51    3    4    1]
Anomaly [ 176 2264   57   18   63    3    4    1]
4 4
0.5284968880447095  ||  0.46382239885846027
----
True    [ 115 2242   25   22   28    3    1   44]
Anomaly [ 266 2242   25   22   28    3    1   44]
0 0
0.5657034380592846  ||  0.44237318310095985
----
True    [ 176 1741    6   18   20    3    1   44]
Anomaly [ 176 1741   18   18   20    3    1   44]
2 2
0.6237544905894483  ||  0.417615638138817
----
True    [ 296 2713   11   18   10    3    1   44]
Anomaly [ 313 2713   11   18   10    3    1   44]
0 0
0.3814939379235955  ||  0.2280959249542215
----
True    [273 289 103  10  67   3   4  22]
Anomaly [305 289 103  10  67   3   4  22]
6 0
0.5457903735005546  ||  0.3997594211756469
----
True    [ 213 2587   97   18  111    7    4   22]
Anomaly [ 213 2587   97   18   54    7    4   22]
5 4
0.45718245147211867  ||  0.43865790000891725
----
True    [ 30 368  95  24 104   3   1  24]
Anomaly [ 30 368  95  24 104   3   1   7]
7 7
0.52504006253

In [46]:
# Fraction of pairs for which the flagged field was the differing one.
# Derived from the evaluation loop's counters instead of hand-typed numbers, so
# the value stays in sync when the notebook is re-run.
pos / len(contrast_data) if contrast_data else float('nan')

0.6825357053370082