In [1]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('./..')
from pathlib import Path
import yaml
import pickle
import glob
from itertools import combinations
from joblib import Parallel, delayed
from tqdm import tqdm
import multiprocessing as mp
import re
from collections import  Counter
import argparse
from operator import itemgetter
import json
DIR = None
DATA_LOC = None
train_data_loc = None
test_data_loc = None
anomaly_data_loc = None
domain_dims = None
dist_df_dict = None
reference_df = None

In [2]:
def set_up(_DIR):
    
    global DIR, DATA_LOC, train_data_loc, test_data_loc, anomaly_data_loc, domain_dims, dist_df_dict, reference_df
 
    with open('config.yaml', 'r') as fh:
        config = yaml.safe_load(fh)
        
    DATA_LOC = config['DATA_LOC']
    pairWiseDist_dir = os.path.join(config['pairWiseDist_dir'],DIR)
    anomaly_data_loc = os.path.join(config['anomaly_data_loc'],DIR)
    train_data_loc = os.path.join(DATA_LOC, DIR, 'train_data.csv')
    test_data_loc = os.path.join(DATA_LOC, DIR, 'test_data.csv')
    with open(os.path.join(DATA_LOC, DIR, 'domain_dims.pkl'.format(DIR)), 'rb')  as fh:
        domain_dims = pickle.load(fh)
    dist_df_dict = {}
    for _file in glob.glob(os.path.join(config['pairWiseDist_dir'],DIR,'**.csv')):
         
        _filename = os.path.split(_file)[-1].split('.')[0]
        _parts = _filename.split('_')
        key = (_parts[1],_parts[2])
     
        dist_df_dict[key] = pd.read_csv(_file,index_col=None)    
    return

In [3]:
def perturb_randomly(
    record
):
    global reference_df
    global domain_dims
    num_pert = np.random.randint(2,4)
    row_copy = record.copy()
    perturb_domains = list(domain_dims.keys())
    perturb_domains = np.random.choice(perturb_domains, num_pert)
    
    for i in range(num_pert):
        _dom = perturb_domains[i]
        row_copy[_dom] = np.random.choice(np.arange(domain_dims[_dom], dtype=int),size=1)[0]
   
    res = pd.concat([row_copy],axis=1).transpose().reset_index(drop=True)
    return row_copy

In [5]:
# neg_anomalies = pd.read_csv(os.path.join(anomaly_data_loc, 'neg_anomalies.csv') ,index_col=None)
# neg_anomalies.iloc[100]
# perturb_randomly(neg_anomalies.iloc[100])

In [6]:
# --------------------------------
# Calculte precision & Recall 
# ---------------------------

def analyze_record(
    record
):
    global reference_df
    global domain_dims
    global dist_df_dict 
    tqdm._instances.clear()
    try:
        ref_row = reference_df.loc[reference_df['PanjivaRecordID']==record['PanjivaRecordID']].iloc[0]
    except:
        _id =  int( str(record['PanjivaRecordID']) + '0' )
        ref_row = reference_df.loc[reference_df['PanjivaRecordID']==_id].iloc[0]
        
    wrong_domains = {}
    for d in domain_dims.keys():
        if ref_row[d]!= record[d]:
            wrong_domains[d] = record[d]
    
    # format domain1,domain2, dist
    record_entDist = []
    for pair in combinations(list(domain_dims.keys()),2):
        pair = sorted(pair)
        d1,d2 = pair[0],pair[1]
        key = (d1,d2)
        tmp_df = dist_df_dict[key]
        e1 = int(record[d1])
        e2 = int(record[d2])
        
        _dist = tmp_df.loc[(tmp_df[d1]==e1)&(tmp_df[d2]==e2)]['dist'].values[0]
        record_entDist.append([d1,d2,_dist]) 
        
    record_entDist = list(sorted(record_entDist, key = lambda x: x[2], reverse=True))
    wrong_domain_list = list(wrong_domains.keys())
    
    if len(wrong_domain_list) == 0:
        print(record)
        record = perturb_randomly(record)
        # perturb any 2 or 3 
        print(record)
        # recompute 
        wrong_domains = {}
        for d in domain_dims.keys():
            if ref_row[d]!= record[d]: wrong_domains[d] = record[d]
        wrong_domain_list = list(wrong_domains.keys())
        
    seen = set()
    precision = []
    recall = []
    # An item is correct if 
    correct_count =0 
    idx = 1
    for item in record_entDist:
        correct = False
        _item = []
        if item[0] in wrong_domain_list: 
            _item =  item[0]
            seen = seen.union(set([_item]))
            correct = True
        if  item[1] in wrong_domain_list:
            _item =  item[1]
            correct = True
            seen = seen.union(set([_item]))
        if correct :
            correct_count += 1
        _prec = correct_count/idx
        _rec =  len(seen)/len(wrong_domain_list)
        precision.append(_prec)
        recall.append(_rec)
        idx +=1
        if idx > 15:
            break
          
    return (precision, recall)

In [7]:
def main():
    
    global DIR, DATA_LOC, train_data_loc, test_data_loc, anomaly_data_loc, domain_dims, dist_df_dict, reference_df
    pos_anomalies = pd.read_csv(os.path.join(anomaly_data_loc, 'pos_anomalies.csv') ,index_col=None).reset_index(drop=True)
    neg_anomalies = pd.read_csv(os.path.join(anomaly_data_loc, 'neg_anomalies.csv') ,index_col=None)
    reference_df = pd.read_csv(train_data_loc, index_col=None).reset_index(drop=True)
    neg_anomalies['PanjivaRecordID'] = neg_anomalies['PanjivaRecordID'].apply(lambda x : int(str(x)[:-3]) ).reset_index(drop=True)
    pos_anomalies['PanjivaRecordID'] = pos_anomalies['PanjivaRecordID'].apply(lambda x : int(re.subn('00\d{1,2}$', '', str(x))[0]) )

    anomalies = neg_anomalies.append(pos_anomalies, ignore_index=True)
    anomalies = anomalies.reset_index(drop=True)

    results = {}

    results = Parallel(n_jobs=mp.cpu_count())(
        delayed(analyze_record)(anomalies.iloc[i])  for i in tqdm(range(anomalies.shape[0])))

    P = []
    R = []
    for r in results:
        P.append(r[0])
        R.append(r[1])
    P = np.array(P)
    R = np.array(R)
    
    idx = np.arange(1,P.shape[1]+1)
    P = np.mean(P,axis=0)
    R = np.mean(R,axis=0)
    _df_ = pd.DataFrame({
        'idx':idx,
        'precision':P,
        'recall': R
    }) 
    
    
    Path('results_4').mkdir(exist_ok=True, parents=True)
    result_path = os.path.join('results_4',DIR)
    Path(result_path).mkdir(exist_ok=True, parents=True)  
    result_path = os.path.join('results_4',DIR,'results.csv')
    _df_.to_csv(result_path, index=None)
    
    return _df_

In [38]:
DIR ='us_import1'
set_up(DIR)
res_dict = main()




  0%|          | 0/3920 [00:00<?, ?it/s][A[A[A


  1%|          | 40/3920 [00:00<00:39, 97.29it/s][A[A[A


 53%|█████▎    | 2119/4000 [19:47<17:34,  1.78it/s]A[A[A
 39%|███▉      | 1559/4000 [20:59<32:51,  1.24it/s]



  3%|▎         | 120/3920 [00:03<02:00, 31.50it/s][A[A[A


  4%|▍         | 160/3920 [00:04<02:09, 28.96it/s][A[A[A


  5%|▌         | 200/3920 [00:06<02:21, 26.21it/s][A[A[A


  6%|▌         | 240/3920 [00:08<02:23, 25.62it/s][A[A[A


  7%|▋         | 280/3920 [00:09<02:24, 25.22it/s][A[A[A


  8%|▊         | 320/3920 [00:11<02:31, 23.80it/s][A[A[A


  9%|▉         | 360/3920 [00:13<02:30, 23.62it/s][A[A[A


 10%|█         | 400/3920 [00:15<02:31, 23.18it/s][A[A[A


 11%|█         | 440/3920 [00:17<02:32, 22.78it/s][A[A[A


 12%|█▏        | 480/3920 [00:18<02:33, 22.48it/s][A[A[A


 13%|█▎        | 520/3920 [00:20<02:32, 22.33it/s][A[A[A


 14%|█▍        | 560/3920 [00:22<02:29, 22.54it/s][A[A[A


 15%|█▌        | 600/3920 

In [39]:
DIR ='us_import2'
set_up(DIR)
res_dict = main()


  0%|          | 0/3960 [00:00<?, ?it/s][A
  2%|▏         | 80/3960 [00:00<00:05, 681.13it/s][A
  4%|▍         | 160/3960 [00:01<00:38, 99.45it/s][A
  6%|▌         | 240/3960 [00:02<00:47, 77.78it/s][A
  7%|▋         | 280/3960 [00:04<01:05, 55.92it/s][A
  8%|▊         | 320/3960 [00:05<01:18, 46.62it/s][A
  9%|▉         | 360/3960 [00:06<01:26, 41.42it/s][A
 10%|█         | 400/3960 [00:07<01:33, 38.27it/s][A
 11%|█         | 440/3960 [00:09<01:38, 35.85it/s][A
 12%|█▏        | 480/3960 [00:10<01:40, 34.69it/s][A
 13%|█▎        | 520/3960 [00:11<01:41, 33.88it/s][A
 14%|█▍        | 560/3960 [00:12<01:42, 33.32it/s][A
 15%|█▌        | 600/3960 [00:14<01:43, 32.61it/s][A
 16%|█▌        | 640/3960 [00:15<01:42, 32.54it/s][A
 17%|█▋        | 680/3960 [00:16<01:41, 32.41it/s][A
 18%|█▊        | 720/3960 [00:17<01:39, 32.45it/s][A
 19%|█▉        | 760/3960 [00:19<01:38, 32.39it/s][A
 20%|██        | 800/3960 [00:20<01:38, 32.16it/s][A
 21%|██        | 840/3960 [00:21<01:3

In [8]:
DIR ='us_import3'
set_up(DIR)
res_dict = main()

100%|██████████| 4000/4000 [02:58<00:00, 22.41it/s]


In [9]:
DIR ='us_import4'
set_up(DIR)
res_dict = main()

100%|██████████| 4000/4000 [03:01<00:00, 22.00it/s]


In [99]:
parser = argparse.ArgumentParser()
parser.add_argument(
    '--DIR',
    choices=['us_import1', 'us_import2', 'us_import3', 'us_import4'],
    default=None
)


# ----------------------------------------
args = parser.parse_args()
DIR = args.DIR
set_up(DIR)
main()

usage: ipykernel_launcher.py [-h]
                             [--DIR {us_import1,us_import2,us_import3,us_import4}]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/ddatta/.local/share/jupyter/runtime/kernel-5adc8ce9-3bc7-462d-b894-5e632b7e7afc.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [98]:
import json
dictionary ={  
    "name" : "sathiyajith",  
    "rollno" : 56,  
    "cgpa" : 8.6,  
    "phonenumber" : "9976770500"
}  


In [None]:
pd.read_Csv