In [2]:
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import visdom
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
try:
    from IPython.display import display, HTML
    %load_ext autoreload
    %autoreload 2
except:
    pass

In [3]:
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
import os
import sys
import glob
from tqdm import tqdm
from sklearn.preprocessing import normalize

sys.path.append('./.')
sys.path.append('./..')
from pathlib import Path
import argparse
import pickle
import copy
import json
from onlineGD import onlineGD
from loss_function_grad import maxDotProd_gradient, calculate_cosineDist_gradient
from linear_model_v2 import linearClassifier_bEF

import seaborn as sns
from matplotlib import pyplot as plt
from record import record_class
import yaml
import time
from collections import OrderedDict
from common_utils import utils
from sklearn.utils import shuffle

explantions_file_path = None
embedding_data_path = None
serialID_mapping_loc = None
anomalies_pos_fpath = None
anomalies_neg_fpath = None
feedback_batch_size = None
top_K_count = None
interaction_type = 'concat'
'''
embedding_data_path  = './../../createGraph_trade/saved_model_data/{}'.format(DIR)
serialID_mapping_loc = './../../generated_data_v1/{}/idMapping.csv'.format(DIR)
anomalies_pos_fpath = './../../generated_data_v1/generated_anomalies/{}/pos_anomalies.csv'.format(DIR)
anomalies_neg_fpath = './../../generated_data_v1/generated_anomalies/{}/neg_anomalies.csv'.format(DIR)
explantions_f_path =  './../../generated_data_v1/generated_anomalies/{}/pos_anomalies_explanations.json'.format(DIR)
'''

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


"\nembedding_data_path  = './../../createGraph_trade/saved_model_data/{}'.format(DIR)\nserialID_mapping_loc = './../../generated_data_v1/{}/idMapping.csv'.format(DIR)\nanomalies_pos_fpath = './../../generated_data_v1/generated_anomalies/{}/pos_anomalies.csv'.format(DIR)\nanomalies_neg_fpath = './../../generated_data_v1/generated_anomalies/{}/neg_anomalies.csv'.format(DIR)\nexplantions_f_path =  './../../generated_data_v1/generated_anomalies/{}/pos_anomalies_explanations.json'.format(DIR)\n"

In [5]:
def setup_config(DIR):
    global explantions_file_path
    global embedding_data_path
    global serialID_mapping_loc
    global anomalies_pos_fpath
    global anomalies_neg_fpath
    global domain_dims
    global test_data_serialized_loc
    with open('config.yaml', 'r') as fh:
        config = yaml.safe_load(fh)

    serialID_mapping_loc = config['serialID_mapping_loc'].format(DIR)
    embedding_data_path = config['embedding_data_path'].format(DIR)
    explantions_file_path = config['explantions_file_path'].format(DIR)
    anomalies_pos_fpath = config['anomalies_pos_fpath'].format(DIR)
    anomalies_neg_fpath = config['anomalies_neg_fpath'].format(DIR)
    test_data_serialized_loc = config['test_data_serialized_loc'].format(DIR)

    with open(config['domain_dims_file_path'].format(DIR), 'rb') as fh:
        domain_dims = OrderedDict(pickle.load(fh))
    return

DIR = 'us_import3'
setup_config(DIR)

In [6]:
# ---------------------------------------------------------------------------------
def get_serialID_to_entityID():
    global serialID_mapping_loc
    idMapper_file = os.path.join(serialID_mapping_loc)
    mapping_df = pd.read_csv(idMapper_file, index_col=None)
    serialID_to_entityID = {}

    for i, row in mapping_df.iterrows():
        serialID_to_entityID[row['serial_id']] = row['entity_id']
    return serialID_to_entityID


# ---------------------------
# Get records which are deemed nominal/normal
# ---------------------------
def obtain_normal_samples():
    global test_data_serialized_loc
    normal_data = pd.read_csv(
        test_data_serialized_loc, index_col=None
    )

    _df = normal_data.sample(5000)
    obj_list = []
    for i in tqdm(range(_df.shape[0])):
        obj = record_class(_df.iloc[i].to_dict(), -1)
        obj_list.append(obj)
    data_x = []
    for _obj in obj_list:
        data_x.append(_obj.x)
    data_x = np.stack(data_x)
    return data_x


In [7]:

def get_trained_classifier(X, y, num_domains, emb_dim, num_epochs=10000):
    global domain_dims
    global interaction_type
    classifier_obj = linearClassifier_bEF(
        num_domains=num_domains,
        emb_dim=emb_dim,
        num_epochs=num_epochs,
        L2_reg_lambda=0.0025,
        force_reg=False,
        interaction_type=interaction_type
    )

    classifier_obj.setup_binaryFeatures(
        domain_dims,
        binaryF_domains=['ConsigneePanjivaID', 'ShipperPanjivaID']
    )

    # classifier_obj.fit_on_pos(X, np.ones(X.shape[0]),n_epochs=10000)
    classifier_obj.fit(X, y, log_interval=5000)
    classifier_obj.fit_on_pos(X, y, n_epochs=num_epochs // 2, log_interval=1000)
    return classifier_obj


def fetch_entityID_arr_byList(data_df, id_list):
    global domain_dims
    domain_list = list(domain_dims.keys())
    ID_COL = 'PanjivaRecordID'
    data_df = data_df.copy(deep=True)
    data_df = data_df.loc[data_df[ID_COL].isin(id_list)]
    # Order of id_list has to be preserved!!!
    X = []
    for _id in id_list:
        _tmp = data_df.loc[data_df[ID_COL] == _id][domain_list].iloc[0].values.tolist()
        X.append(_tmp)
    return np.array(X).astype(int)


In [26]:
def display_df(_df):  
    display(HTML(_df.to_html()))


In [9]:

data_source_loc = './../generated_data_v1/'
loc = os.path.join(data_source_loc, DIR)
with open(os.path.join(loc, 'domain_dims.pkl'), 'rb') as fh:
    domain_dims = OrderedDict(pickle.load(fh))

domain_list = list(domain_dims.keys())
num_domains = len(domain_dims)
domain_dims


OrderedDict([('Carrier', 600),
             ('ConsigneePanjivaID', 5601),
             ('HSCode', 126),
             ('PortOfLading', 244),
             ('PortOfUnlading', 51),
             ('ShipmentDestination', 112),
             ('ShipmentOrigin', 107),
             ('ShipperPanjivaID', 6893)])

In [110]:
with open(os.path.join(loc, 'col_val2id_dict.pkl'), 'rb') as fh:
    colval2id = pickle.load(fh)
colval2id
col_id2_val = { dom: {v:k for k,v in _dict.items()} for dom,_dict in colval2id.items()}

In [10]:
interactionID2_pair = {}
k = 0 
for i in range(num_domains):
    for j in range(i+1,num_domains):
        interactionID2_pair[k] = (domain_list[i],domain_list[j])
        k+=1

In [176]:
            
        
def highlight_pairs(row):
    # extract pairs
    global interactionID2_pair
    label_col ='Label (Relevant)'
    color_list = []
    columns = list(OrderedDict(row.to_dict()).keys())
    try:
        exp_1 = interactionID2_pair[row['expl_1']]
        exp_2 = interactionID2_pair[row['expl_2']]
    except:
         
        for column in columns:
            if column == label_col:
                color_list.append( 'background-color: {};  text-align: center; border-width: 2px; height: 50px; font-size: 11.0pt'.format('rgb(255, 0, 8)' ))
            else:
                color_list.append('font-size: 11.0pt;  text-align: center; border-width: 1px; height: 50px;')
                
        return color_list
    
    
    valid_column2color = []
    colors = [('rgb(255, 255, 128)','rgb(255, 204, 102)'),('rgb(204, 221, 255)','rgb(255, 204, 255)')]
    for e, c in zip([exp_1,exp_2],colors):
        for _e,_c  in  zip(e,c):
            valid_column2color.append((_e, _c))
    _multiple_count = [ key for key,v in (Counter([ _[0] for _ in valid_column2color])).items()  if v > 1]
    _match = False
    for column in columns:
        _match = False
        for item in valid_column2color:
            if item[0] == column:
                __color_str = item[1]
                if column in _multiple_count:
                    __color_str = 'yellow'
                color_list.append( 'background-color: {};  text-align: center;font-size: 11.5pt; height: 50px;'.format(__color_str ))
                _match = True
                break
        
        if column == label_col:
            _match = True
            if row[column] == 1:
                color_list.append( 'background-color: {}; font-size: 11.0pt; text-align: center; border-width: 2px; border-color: black; height: 50px;'.format('limegreen' ))
            else:
                color_list.append( 'background-color: {};  text-align: center; font-size: 11.0pt ;height: 50px;'.format('red' ))
        if _match == False:
            color_list.append('font-size: 11.0pt;  text-align: center; border-width: 1px; height: 50px;')
        
            
    return color_list

In [21]:
# tmp = all_data_df.head(5)
# exp1 = [17,10,15,12,20]
# exp2 = [17,4,5,2,10]
# tmp['expl_1'] = exp1
# tmp['expl_2'] = exp2

# tmp = tmp.style.apply(highlight_pairs, axis=1)
# tmp.hide_columns(['expl_1','expl_2'])
# # tmp.data = tmp.data[['PanjivaRecordID','Carrier','ConsigneePanjivaID','HSCode', 'PortOfUnlading','ShipmentDestination','ShipmentOrigin','ShipperPanjivaID']]
# # help(tmp)
# # tmp.table_styles
# tmp

In [12]:
# ============================================

anom_pos_df = pd.read_csv(anomalies_pos_fpath, index_col=None)
anom_neg_df = pd.read_csv(anomalies_neg_fpath, index_col=None)

# ============================================
# setup objects

serialID_to_entityID = get_serialID_to_entityID()
record_class.__setup_embedding__(embedding_data_path, serialID_to_entityID, _normalize=True)
emb_dim = record_class.embedding['HSCode'].shape[1]

# main_data_df has the records with entity ids
main_data_df = pd.concat([anom_pos_df, anom_neg_df], axis=0)
main_data_df = utils.convert_to_UnSerializedID_format(main_data_df, DIR)
# -------------------------------------------
obj_list = []
for i in tqdm(range(anom_neg_df.shape[0])):
    obj = record_class(anom_neg_df.iloc[i].to_dict(), -1)
    obj_list.append(obj)

for i in tqdm(range(anom_pos_df.shape[0])):
    obj = record_class(anom_pos_df.iloc[i].to_dict(), 1)
    obj_list.append(obj)

# Read in the explantions
with open(explantions_file_path, 'rb') as fh:
    explanations = json.load(fh)

explanations = {int(k): [sorted(_) for _ in v] for k, v in explanations.items()}
num_domains = len(domain_dims)
domain_idx = {e[0]: e[1] for e in enumerate(domain_dims.keys())}

domainInteraction_index = {}
k = 0
for i in range(num_domains):
    for j in range(i + 1, num_domains):
        domainInteraction_index['_'.join((domain_idx[i], domain_idx[j]))] = k
        k += 1

data_x = []
data_id = []
data_label = []
data_ID_to_matrix = {}

for _obj in obj_list:
    data_x.append(_obj.x)
    data_id.append(_obj.id)
    data_label.append(_obj.label)
    data_ID_to_matrix[_obj.id] = _obj.x

data_x = np.stack(data_x)
data_label = np.array(data_label)
data_id = np.array(data_id)

idx = np.arange(len(data_id), dtype=int)
np.random.shuffle(idx)

data_x = data_x[idx]
data_label = data_label[idx]
data_id = data_id[idx]

X_0 = data_x  # Relevant anomalies
X_1 = obtain_normal_samples()  # Nominal
y_0 = np.ones(X_0.shape[0])
y_1 = -1 * np.ones(X_1.shape[0])
y = np.hstack([y_0, y_1])
X = np.vstack([X_0, X_1])
num_coeff = len(domainInteraction_index)
classifier_obj = get_trained_classifier(X, y, num_domains, emb_dim)
W = classifier_obj.W.cpu().data.numpy()
emb_dim = W.shape[-1]

# classifier_obj.predict_score_op(X_0)
# Create a reference dataframe  :: data_reference_df
data_reference_df = pd.DataFrame(
    data=np.vstack([data_id, data_label]).transpose(),
    columns=['PanjivaRecordID', 'label']
)

data_reference_df['baseID'] = data_reference_df['PanjivaRecordID'].apply(lambda x: str(x)[:-3])
data_reference_df['expl_1'] = -1
data_reference_df['expl_2'] = -1
data_reference_df['original_score'] = 1

for i, row in data_reference_df.iterrows():
    _id = int(row['PanjivaRecordID'])
    if _id in explanations.keys():
        entry = explanations[_id]
        domain_1 = entry[0][0]
        domain_2 = entry[0][1]
        data_reference_df.loc[i, 'expl_1'] = domainInteraction_index['_'.join(sorted([domain_1, domain_2]))]
        domain_1 = entry[1][0]
        domain_2 = entry[1][1]
        data_reference_df.loc[i, 'expl_2'] = domainInteraction_index['_'.join(sorted([domain_1, domain_2]))]
    _x = data_ID_to_matrix[_id]
    data_reference_df.loc[i, 'original_score'] = classifier_obj.predict_score_op(np.array([_x]))[0]

data_reference_df['cur_score'] = data_reference_df['original_score'].values

100%|██████████| 8/8 [00:15<00:00,  1.91s/it]
100%|██████████| 340/340 [00:00<00:00, 6739.84it/s]
100%|██████████| 340/340 [00:00<00:00, 6173.73it/s]
100%|██████████| 5000/5000 [00:00<00:00, 7072.04it/s]
  1%|          | 53/10000 [00:00<00:35, 278.15it/s]

Step 1 Loss 0.5230


 51%|█████     | 5056/10000 [00:15<00:16, 299.26it/s]

Step 5001 Loss 0.1604


100%|██████████| 10000/10000 [00:31<00:00, 320.09it/s]
  1%|          | 60/5000 [00:00<00:16, 298.59it/s]

Step 1 Loss 0.1501


 21%|██▏       | 1068/5000 [00:03<00:10, 368.31it/s]

Step 1001 Loss 0.0018


 41%|████      | 2043/5000 [00:05<00:09, 326.12it/s]

Step 2001 Loss 0.0004


 61%|██████▏   | 3069/5000 [00:08<00:05, 342.10it/s]

Step 3001 Loss 0.0001


 81%|████████  | 4045/5000 [00:11<00:02, 319.68it/s]

Step 4001 Loss 0.0000


100%|██████████| 5000/5000 [00:14<00:00, 342.83it/s]


In [161]:
import os
import pathlib
import numpy as np
import random
fakeName_dict = {}

def create_fakename_mapping(attribute, count):
    _attr = ''.join([char for char in attribute if char.isupper()] )
    cur_path  = './tmp'
    
    with open(os.path.join(cur_path, 'colors.txt'), 'r') as f:
        _colors_ = [str(_).strip() for _ in f.readlines()]

    with open(os.path.join(cur_path, 'usernames.txt'), 'r', encoding="latin-1") as f:
        _words_ =  [ str(_).strip() for _ in f.readlines() if len(_)>4  and len(_) < 12]
    np.random.shuffle(_colors_)
    np.random.shuffle(_words_)
    _u = np.random.choice(_colors_, size = count, replace=True )
    _v = np.random.choice(_words_, size = count, replace=False )
    map_list = {}
    i = 0
    for a,b in zip(_u,_v):
        map_list[i] = str(_attr) + '-' + a.title()[:3] + b.title()[:3]
        i += 1
        
    return map_list

In [162]:
def setup_FakeNameSetup(domain_dims):
    global fakeName_dict 
    fakeName_dict = {}
    global col_id2_val
    
    for dom,dim in domain_dims.items():
        if dom == 'HSCode':
             fakeName_dict[dom] = col_id2_val[dom]
        else:
            fakeName_dict[dom] = create_fakename_mapping(dom,dim)
    return 

setup_FakeNameSetup(domain_dims)

In [73]:
def execute_with_input(
        clf_obj,
        working_df,
        ref_data_df,
        domainInteraction_index,
        num_coeff,
        emb_dim,
        data_ID_to_matrix,
        check_next_value=20,
        batch_size=10, 
        max_iter = None
):
    global domain_dims
    global interaction_type
    ID_COL = 'PanjivaRecordID'
    BATCH_SIZE = batch_size
    working_df['delta'] = 0
    OGD_obj = onlineGD(num_coeff, emb_dim, calculate_cosineDist_gradient, interaction_type = interaction_type)
    W = clf_obj.W.cpu().data.numpy()
    OGD_obj.set_original_W(W)

    max_num_batches = len(working_df) // BATCH_SIZE + 1
    if max_iter is not None:
        max_num_batches = max_iter
    acc = []
    recall = []
    domain_list = list(domain_dims.keys())
    discovered_count = [0.0]
    total_relCount = len(working_df.loc[working_df['label'] == 1])

    # -------------------------------------------------
    #  Main loop
    # -------------------------------------------------
    next_K_precision = []
    W_list = []
    
    for batch_idx in tqdm(range(max_num_batches)):
         
        cur = working_df.head(BATCH_SIZE)
#         print(len(cur))
#         display_details(cur.loc[cur['label']==1])
        display_details(cur)
        flags = []  # Whether a pos anomaly or not
        terms = []  # Explanation terms

        # Count( of discovered in the current batch ( at the top; defined by batch size )
        cum_cur_discovered = discovered_count[-1] + len(cur.loc[cur['label'] == 1])
        _recall = float(cum_cur_discovered) / total_relCount
        discovered_count.append(
            cum_cur_discovered
        )
        recall.append(_recall)

        x_ij = []
        x_entityIds = []
        W_list.append(clf_obj.W.cpu().data.numpy())
        for i, row in cur.iterrows():
            _mask = np.zeros(len(domainInteraction_index))
            if row['label'] == 1:
                _mask[row['expl_1']] = 1
                _mask[row['expl_2']] = 1
                flags.append(1)
                terms.append((row['expl_1'], row['expl_2'],))
            else:
                flags.append(0)
                terms.append(())
            id_value = row['PanjivaRecordID']
            x_ij.append(data_ID_to_matrix[id_value])

            row_dict = ref_data_df.loc[(ref_data_df[ID_COL] == id_value)].iloc[0].to_dict()
            x_entityIds.append([row_dict[d] for d in domain_list])

        x_entityIds = np.array(x_entityIds)
        x_ij = np.array(x_ij)
        
        
        final_gradient, _W = OGD_obj.update_weight(
            flags,
            terms,
            x_ij
        )
        # ----------------------------------------------------
        # Update Model
        # ----------------------------------------------------
        clf_obj.update_W(_W)
        clf_obj.update_binary_VarW(x_entityIds, flags)

        _tail_count = len(working_df) - BATCH_SIZE
        working_df = working_df.tail(_tail_count).reset_index(drop=True)

        if len(working_df) == 0:
            break

        # Obtain scores
        x_ij_test = []
        x_entityIds = fetch_entityID_arr_byList(
            ref_data_df,
            working_df['PanjivaRecordID'].values.tolist()
        )
        for _id in working_df['PanjivaRecordID'].values:
            x_ij_test.append(data_ID_to_matrix[_id])

        x_ij_test = np.array(x_ij_test)
        new_scores = clf_obj.predict_bEF(x_entityIds, x_ij_test)
        old_scores = working_df['cur_score'].values
        _delta = new_scores - old_scores
        working_df['delta'] = _delta
        working_df = working_df.sort_values(by='delta', ascending=False)
        working_df = working_df.reset_index(drop=True)
        
        check_next = check_next_value 
        tmp = working_df.head(check_next)
        minMax_scaler_obj = MinMaxScaler()
        tmp.loc[:,'delta'] = minMax_scaler_obj.fit_transform(tmp['delta'].values.reshape([-1,1]))
        
          
        _labels = tmp['label'].values
        res = len(np.where(_labels == 1)[0])
        _precision = res / check_next

        
    return W_list

In [15]:
all_data_df = pd.DataFrame(anom_pos_df).copy().append(anom_neg_df,ignore_index=True)

In [134]:
def display_details(_df):
    global all_data_df
    global domain_dims
    global fakeName_dict
    global serialID_to_entityID
    viz_cols = ['PanjivaRecordID'] + list(domain_dims.keys()) + ['label']
    
    _hide_cols = [ _ for _ in list(_df.columns) if _ not in viz_cols ]
    _df = _df.merge( all_data_df, on='PanjivaRecordID', how='left')
    
    # Reorganize columns
    _cols = list(_df.columns)
    _cols1 = [ _ for _ in  _cols if _ in viz_cols ]
    _cols2 = [ _ for _ in  _cols if _ not in viz_cols ]
    _cols = viz_cols + _cols2
    
    _df = _df[_cols]
    import hashlib
    _df['PanjivaRecordID'] = _df['PanjivaRecordID'].apply(lambda x: abs(hash(str(x))) % (10 ** 10))
    # Replace IDs with fake names
    def _replace_wfakeName(row):
        for key in fakeName_dict.keys():
            row[key] = fakeName_dict[key][serialID_to_entityID[row[key]]]
        return row
    _df['label'] = _df['label'].apply(lambda x: x==1)
    _df = _df.rename(columns={'label':'Label (Relevant)'})
    _df = _df.apply(_replace_wfakeName, axis=1) 
    
    _df = _df.style.apply(highlight_pairs,axis=1)
    
 
    _df = _df.hide_columns(_hide_cols)
    display(_df)
    return _df

In [56]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>


In [203]:
# To get random results
# Randomization
cur_df = data_reference_df.copy()
cur_df = cur_df.sample(frac=1).reset_index(drop=True)
cur_df = shuffle(cur_df).reset_index(drop=True)

r = execute_with_input(
    clf_obj=copy.deepcopy(classifier_obj),
    working_df= cur_df,
    ref_data_df= main_data_df,
    domainInteraction_index=domainInteraction_index,
    num_coeff=num_coeff,
    emb_dim=emb_dim,
    data_ID_to_matrix=data_ID_to_matrix,
    batch_size=10,
    check_next_value = 10,
    max_iter = 3
)

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,PanjivaRecordID,Carrier,ConsigneePanjivaID,HSCode,PortOfLading,PortOfUnlading,ShipmentDestination,ShipmentOrigin,ShipperPanjivaID,Label (Relevant)
0,3155365412,C-ChaRev,CPID-AspWra,440793,POL-CarCle,POU-FucVar,SD-JetGod,SO-AspEch,SPID-NavUni,False
1,7732863662,C-JetYan,CPID-SilDer,940161,POL-ClaBur,POU-AvoMcb,SD-SieBer,SO-AshMin,SPID-CamCab,False
2,5774331259,C-BeiUnt,CPID-ClaLia,441820,POL-PinFus,POU-KhaWoo,SD-RusSpr,SO-AspEch,SPID-HeaBeh,False
3,7240517823,C-SalSan,CPID-GolGee,940161,POL-MarWhi,POU-CriAba,SD-MinDep,SO-AspEch,SPID-ChaVan,False
4,1295329695,C-RedSun,CPID-RusMud,441900,POL-GreCat,POU-AspZei,SD-BonMan,SO-AspEch,SPID-AshCac,True
5,2918176595,C-PinClo,CPID-BonUne,930200,POL-GreCut,POU-JetPar,SD-EboVel,SO-BlaEar,SPID-AmbJoc,False
6,3277466340,C-BroWir,CPID-AshCyt,940161,POL-CanVic,POU-CoaSno,SD-CorMis,SO-AspEch,SPID-ChaYap,False
7,9443138766,C-VioSiv,CPID-BraKar,940360,POL-IvoMan,POU-CorDea,SD-AmbBel,SO-CarMar,SPID-BroSca,False
8,6318664768,C-AubDev,CPID-CorLiq,940360,POL-CorLoo,POU-NavLud,SD-VioBox,SO-AspEch,SPID-LimCen,False
9,4038752068,C-BeiUnt,CPID-CorSco,920992,POL-CrePat,POU-FucVar,SD-YelBol,SO-CorOct,SPID-IvoAng,False


 33%|███▎      | 1/3 [00:00<00:01,  1.03it/s]

Unnamed: 0,PanjivaRecordID,Carrier,ConsigneePanjivaID,HSCode,PortOfLading,PortOfUnlading,ShipmentDestination,ShipmentOrigin,ShipperPanjivaID,Label (Relevant)
0,1857765465,C-RedSun,CPID-RusMud,821192,POL-CyaBob,POU-AspZei,SD-BonMan,SO-VioGov,SPID-AshCac,True
1,4983549840,C-RedSun,CPID-RusMud,821110,POL-RedPor,POU-AspZei,SD-MagInt,SO-AspEch,SPID-AshCac,True
2,4019591314,C-RedSun,CPID-BeiLus,940360,POL-ClaBur,POU-AspZei,SD-BroMet,SO-ChaRol,SPID-TanShr,False
3,4622729728,C-RusAly,CPID-RusMud,441900,POL-CyaBob,POU-AspZei,SD-ChaSin,SO-AspEch,SPID-AshCac,True
4,1323461951,C-BlaCop,CPID-GraGal,441875,POL-ClaBur,POU-SilDem,SD-ChoSax,SO-AspEch,SPID-AshCac,True
5,1378999876,C-CanCer,CPID-RusMud,821193,POL-AmbWal,POU-AspZei,SD-DenUnt,SO-AspEch,SPID-AshCac,True
6,7510368444,C-AzuTri,CPID-FucPla,940179,POL-BluDra,POU-MagFoo,SD-BonMan,SO-CitCou,SPID-TomCab,True
7,5549153683,C-KhaJoh,CPID-CreDul,441820,POL-CyaBob,POU-CorBer,SD-GolCum,SO-AspEch,SPID-AshCac,True
8,8786412467,C-IvoNeg,CPID-RusMud,940171,POL-WhiWoo,POU-ChaGer,SD-CamMin,SO-AspEch,SPID-AshCac,True
9,1686130451,C-CoaYar,CPID-WhiRes,440290,POL-CyaBob,POU-CitExe,SD-VioBox,SO-AspEch,SPID-AshCac,True


 67%|██████▋   | 2/3 [00:01<00:00,  1.01it/s]

Unnamed: 0,PanjivaRecordID,Carrier,ConsigneePanjivaID,HSCode,PortOfLading,PortOfUnlading,ShipmentDestination,ShipmentOrigin,ShipperPanjivaID,Label (Relevant)
0,656555997,C-CanCer,CPID-RusMud,940350,POL-FucPal,POU-AspZei,SD-DenUnt,SO-BisLan,SPID-AshCac,True
1,1239591722,C-BeiPhe,CPID-RusMud,940350,POL-FucPal,POU-AspZei,SD-AquErn,SO-AspEch,SPID-AshCac,True
2,1390838468,C-AvoCom,CPID-RusMud,940350,POL-BroCur,POU-RusBob,SD-DenUnt,SO-AspEch,SPID-AshCac,True
3,3029673120,C-RedDag,CPID-RusMud,940390,POL-CyaBob,POU-ChaGer,SD-CamMin,SO-DenLau,SPID-AshCac,True
4,7635555101,C-RedDag,CPID-RusMud,940330,POL-ChaCap,POU-ChaGer,SD-CamMin,SO-AspEch,SPID-AshCac,True
5,408512347,C-SalCas,CPID-RusMud,940330,POL-CyaBob,POU-ChaGer,SD-RusSpr,SO-AspEch,SPID-AshCac,True
6,1891586041,C-CanCer,CPID-DesLin,940389,POL-RedEmp,POU-FucVar,SD-CorMis,SO-AspEch,SPID-YelVan,False
7,1019029641,C-AzuTri,CPID-FucPla,940320,POL-RusCat,POU-MagFoo,SD-BonMan,SO-CofDau,SPID-TomCab,True
8,1758368795,C-AzuTri,CPID-FucPla,940320,POL-BluDra,POU-OraPre,SD-MagInt,SO-CofDau,SPID-TomCab,True
9,602787953,C-CanCer,CPID-JetKio,820190,POL-TanClo,POU-CriHal,SD-CorMis,SO-AspEch,SPID-ChaMor,True


100%|██████████| 3/3 [00:03<00:00,  1.00s/it]


In [197]:
col_id2_val

{'Carrier': {0: 'AAAG - 8 Atc Logistics Inc',
  1: 'AAHD - Amada Shipping Inc',
  2: 'AARG - Alpi Air & Sea',
  3: 'ABFB - Arc Best International Inc',
  4: 'ABPG - Albini & Pitigliani Spa',
  5: 'ABTB - Albatrans Spa',
  6: 'ACDF - Acs Lines',
  7: 'ACEO - Ace Ocean Express Co Ltd',
  8: 'ACGD - Ace Logistics Co Ltd',
  9: 'ACLU - Atlantic Container Line Ab',
  10: 'ACZD - Acl Marine Ltd',
  11: 'ADPR - Adp Global Logistics Inc (Usa)',
  12: 'AEDM - Allied Transport System (Usa) Inc',
  13: 'AEIG - American International Cargo Service Inc',
  14: 'AELB - Ael Berkman Forwarding (Hk) Limited',
  15: 'AERR - Acme Freight Services Corp',
  16: 'AFNC - Afe International Group Inc',
  17: 'AFOJ - All American Forwarding Inc',
  18: 'AFPN - Afco Shipping Line Llc',
  19: 'AGWT - Air & Ground World Transport',
  20: 'AIEI - Agriculture Investment Export Inc',
  21: 'AIRL - Airlift (U S A) Inc',
  22: 'AIWL - Ait Worldwide Logistics Inc',
  23: 'ALPJ - Allcargo Logistics Limited',
  24: 'ALRB 

KeyError: 125135714001

In [49]:
from hashlib import blake2b, blake2s

h = blake2s(digest_size=4).hexdigest()
h.update(b'the same content')
'6fa1d8fcfd719046d762'

'6fa1d8fcfd719046d762'

In [128]:
import hashlib
s = '6fa1d8ytescfd719046d762'
abs(hash(s)) % (10 ** 8)


35096588