In [1]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('./../..')
sys.path.append('./..')
import glob
from tqdm import tqdm
from itertools import combinations
import joblib
from joblib import Parallel, delayed
from pandarallel import pandarallel
pandarallel.initialize()
import re
import yaml
from collections import Counter
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
id_col = 'PanjivaRecordID'
import networkx as nx
import operator
import collections
import argparse

CONFIG = None
DIR_LOC = None 
CONFIG = None
CONFIG_FILE = 'config.yaml'
save_dir = None
id_col = 'PanjivaRecordID'
use_cols = None
freq_bound = None
attribute_columns = None
domain_dims=None

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [11]:
def set_up_config(_DIR=None):
    global DIR
    global CONFIG
    global CONFIG_FILE
    global use_cols
    global num_neg_samples
    global DATA_SOURCE
    global DIR_LOC
    global save_dir
    global id_col
    global attribute_columns
    global domain_dims
    
    DATA_SOURCE = './../generated_data_v1/'
    with open(CONFIG_FILE) as f:
        CONFIG = yaml.safe_load(f)

    if _DIR is not None:
        DIR = _DIR
        CONFIG['DIR'] = _DIR
    else:
        DIR = CONFIG['DIR']

    DIR_LOC = re.sub('[0-9]', '', DIR)
    DATA_SOURCE = os.path.join(DATA_SOURCE, DIR)
    save_dir = 'stage_2'
    save_dir = os.path.join(
        DATA_SOURCE,
        save_dir
    )

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    use_cols = CONFIG[DIR]['use_cols']
    _cols = list(use_cols)
    _cols.remove(id_col)
    attribute_columns = list(sorted(_cols))
    with open(os.path.join(DATA_SOURCE,'domain_dims.pkl'),'rb') as fh:
        domain_dims = pickle.load(fh)
    print('Domains and sizes', domain_dims)

    return

def get_positive_nodes():
    global save_dir
    with open(os.path.join(save_dir,'seed_nodes.pkl'),'rb') as fh:
        nodes_dict = pickle.load(fh)
       
    return nodes_dict

# Remove duplicates from test set1
def check_suprious_coOcc(
    target_df, 
    ref_df, 
    domain_dims,
    actor_columns = ['ConsigneePanjvaID','ShipperPanjivaID'],
    id_col='PanjivaRecordID'):
    
    # =========================================
    # create a hash
    # =========================================
    print(len(target_df))
    domains = [ _ for _ in domain_dims.keys() if _ not in actor_columns]
    domain_pairs = [sorted(a) for a in combinations(domains,2)]  
    domain_pair_keys = ['_'.join(a) for a in domain_pairs]
    valid_values_dict = {}
   
    for domain_pair in domain_pairs:
        df_tmp = ref_df.groupby(domain_pair).size().reset_index(name='count')
        d1 = domain_pair[0]
        d2 = domain_pair[1]
        key = '_'.join(domain_pair)
        df_tmp['pair'] = df_tmp.apply(lambda x: str(x[d1]) + '_' + str(x[d2]), axis = 1)
        valid_values_dict[key] = list(df_tmp['pair'].values)
    
    def aux_check(row, domain_pairs):
        flag = True
        for domain_pair in domain_pairs:
            d1 = domain_pair[0]
            d2 = domain_pair[1]
            key = '_'.join(domain_pair)
            value = str(row[d1]) + '_' + str(row[d2])
            if value not  in valid_values_dict[key]: 
                flag = False
            
        return flag
    
    target_df['valid'] =  target_df.parallel_apply(aux_check, axis=1,args=(domain_pairs,))
    target_df = target_df.loc[target_df['valid']==True]
    del target_df['valid']
    print( ' Post check length of test set::', len(target_df) )
    return target_df


def perturb_row(
    row,
    fixed_columns,
    domain_dims,
    perturb_count = 3,
    id_col = 'PanjivaRecordID'
):
    new_row = row.copy()
    row_dict = row.to_dict()
    domains_perturb = [_  for _ in  domain_dims.keys() if _ not in fixed_columns]
    
    domains_perturb = np.random.choice(domains_perturb,size=perturb_count,replace=False)
    
    for i in range(perturb_count):
        d = domains_perturb[i]
        e = row_dict[d]
        # select a random entity
        while True:
            rnd_e = np.random.randint(0, domain_dims[d])
            if rnd_e == e : 
                continue
            new_row[d]=rnd_e
            break
    #Perturb the ID
    new_row[id_col] = int(str(new_row[id_col]) + str(1001))
    return new_row
            
    
def generate_anomalies(
    target_df, 
    actor_columns, 
    domain_dims
):
    anomalous_records = target_df.parallel_apply(
        perturb_row,
        axis=1,
        args=(actor_columns, domain_dims , )
    )
    return anomalous_records

    

def main():
    
    global DAT_SOURCE
    global domain_dims
    global save_dir
    
    
    actor_pos_nodes_dict = get_positive_nodes()
    actor_columns = ['ConsigneePanjivaID','ShipperPanjivaID']
    

    train_df = pd.read_csv(os.path.join(DATA_SOURCE,'train_data.csv' ),low_memory=False)
    test_df = pd.read_csv(os.path.join(DATA_SOURCE,'test_data.csv' ),low_memory=False)
    test_df = test_df.drop_duplicates(list(domain_dims.keys()))


    cleaned_records = check_suprious_coOcc(
        test_df, 
        train_df, 
        domain_dims
    )


    # Select some records that should be used  to generate anomalies

    positive_samples = cleaned_records.loc[(cleaned_records['ConsigneePanjivaID'].isin(actor_pos_nodes_dict['ConsigneePanjivaID']))|(cleaned_records['ShipperPanjivaID'].isin(actor_pos_nodes_dict['ShipperPanjivaID']))]
    num_positive_samples = len(positive_samples)

    candidates = cleaned_records.loc[~(cleaned_records['ConsigneePanjivaID'].isin(actor_pos_nodes_dict['ConsigneePanjivaID'])) & ~(cleaned_records['ShipperPanjivaID'].isin(actor_pos_nodes_dict['ShipperPanjivaID']))]
    negative_samples = candidates.sample(num_positive_samples)

    print('Print # positive, negative samples', num_positive_samples,len(negative_samples))
    pos_neg_IDs = list(negative_samples[id_col].values) + list(positive_samples[id_col].values)

    negative_samples = generate_anomalies(
        negative_samples, 
        actor_columns, 
        domain_dims
    )

    positive_samples = generate_anomalies(
        positive_samples, 
        actor_columns, 
        domain_dims
    )
    positive_samples[id_col] = positive_samples[id_col].apply(lambda x: int(str(x) + str(1002)))
    normal_samples = cleaned_records.loc[~(cleaned_records[id_col].isin(pos_neg_IDs))]

    # ========================================
    # Save the data to csv file 
    # ========================================

    # Normal samples 
    save_path = os.path.join(save_dir, 'test_normal_data_csv')
    normal_samples.to_csv(save_path,index=None)

    # Positive samples 
    save_path = os.path.join(save_dir, 'test_pos_data_csv')
    positive_samples.to_csv(save_path,index=None)

    # Negative samples 
    save_path = os.path.join(save_dir, 'test_neg_data_csv')
    negative_samples.to_csv(save_path,index=None)

    # Save all the cleaned records
    save_path = os.path.join(save_dir, 'cleaned_test_data_csv')




parser = argparse.ArgumentParser()
parser.add_argument(
    '--DIR', choices=['us_import1', 'us_import2', 'us_import3' ],
    default= 'us_import1'
)

args = parser.parse_args()
DIR = args.DIR

set_up_config(DIR)
main()

37367
 Post check length of test set:: 22935
Print # positive, negative samples 466 466


In [16]:
positive_samples.head(10)

NameError: name 'positive_samples' is not defined

In [426]:
len(positive_samples),len(negative_samples)

(466, 466)

[['Carrier', 'ConsigneePanjivaID'],
 ['Carrier', 'HSCode'],
 ['Carrier', 'PortOfLading'],
 ['Carrier', 'PortOfUnlading'],
 ['Carrier', 'ShipmentDestination'],
 ['Carrier', 'ShipmentOrigin'],
 ['Carrier', 'ShipperPanjivaID'],
 ['ConsigneePanjivaID', 'HSCode'],
 ['ConsigneePanjivaID', 'PortOfLading'],
 ['ConsigneePanjivaID', 'PortOfUnlading'],
 ['ConsigneePanjivaID', 'ShipmentDestination'],
 ['ConsigneePanjivaID', 'ShipmentOrigin'],
 ['ConsigneePanjivaID', 'ShipperPanjivaID'],
 ['HSCode', 'PortOfLading'],
 ['HSCode', 'PortOfUnlading'],
 ['HSCode', 'ShipmentDestination'],
 ['HSCode', 'ShipmentOrigin'],
 ['HSCode', 'ShipperPanjivaID'],
 ['PortOfLading', 'PortOfUnlading'],
 ['PortOfLading', 'ShipmentDestination'],
 ['PortOfLading', 'ShipmentOrigin'],
 ['PortOfLading', 'ShipperPanjivaID'],
 ['PortOfUnlading', 'ShipmentDestination'],
 ['PortOfUnlading', 'ShipmentOrigin'],
 ['PortOfUnlading', 'ShipperPanjivaID'],
 ['ShipmentDestination', 'ShipmentOrigin'],
 ['ShipmentDestination', 'ShipperPanj