In [23]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('./..')
from pathlib import Path
import yaml
import pickle
import glob
from itertools import combinations
from joblib import Parallel, delayed
from tqdm import tqdm
import multiprocessing as mp
import re
from collections import  Counter
import argparse
from operator import itemgetter
import json
DIR = None
DATA_LOC = None
train_data_loc = None
test_data_loc = None
anomaly_data_loc = None
domain_dims = None
dist_df_dict = None
reference_df = None

In [24]:
def set_up(_DIR):
    
    global DIR, DATA_LOC, train_data_loc, test_data_loc, anomaly_data_loc, domain_dims, dist_df_dict, reference_df
 
    with open('config.yaml', 'r') as fh:
        config = yaml.safe_load(fh)
        
    DATA_LOC = config['DATA_LOC']
    pairWiseDist_dir = os.path.join(config['pairWiseDist_dir'],DIR)
    anomaly_data_loc = os.path.join(config['anomaly_data_loc'],DIR)
    train_data_loc = os.path.join(DATA_LOC, DIR, 'train_data.csv')
    test_data_loc = os.path.join(DATA_LOC, DIR, 'test_data.csv')
    with open(os.path.join(DATA_LOC, DIR, 'domain_dims.pkl'.format(DIR)), 'rb')  as fh:
        domain_dims = pickle.load(fh)
    dist_df_dict = {}
    for _file in glob.glob(os.path.join(config['pairWiseDist_dir'],DIR,'**.csv')):
         
        _filename = os.path.split(_file)[-1].split('.')[0]
        _parts = _filename.split('_')
        key = (_parts[1],_parts[2])
     
        dist_df_dict[key] = pd.read_csv(_file,index_col=None)    
    return

In [62]:
def analyze_record(
    record
):
    global reference_df
    global domain_dims
    global dist_df_dict 
    tqdm._instances.clear()
    try:
        ref_row = reference_df.loc[reference_df['PanjivaRecordID']==record['PanjivaRecordID']].iloc[0]
    except:
        _id =  int( str(record['PanjivaRecordID']) + '0' )
        ref_row = reference_df.loc[reference_df['PanjivaRecordID']==_id].iloc[0]
        
    wrong_domains = {}
    for d in domain_dims.keys():
        if ref_row[d]!= record[d]:
            wrong_domains[d] = record[d]
    wrong_domains_list = list(wrong_domains.keys())
    
    # format domain1,domain2, dist
    record_entDist = []
    for pair in combinations(list(domain_dims.keys()),2):
        pair = sorted(pair)
        d1,d2 = pair[0],pair[1]
        key = (d1,d2)
        tmp_df = dist_df_dict[key]
        e1 = int(record[d1])
        e2 = int(record[d2])
        
        _dist = tmp_df.loc[(tmp_df[d1]==e1)&(tmp_df[d2]==e2)]['dist'].values[0]
        record_entDist.append([d1,d2,_dist]) 
        
    record_entDist = list(sorted(record_entDist, key = lambda x: x[2], reverse=True))
    wrong_domain_list = list(wrong_domains.keys())
    print('>',wrong_domain_list)
    print(record_entDist)
    seen = set()
    count = 0 
    cur = 1
    for item in record_entDist:
        print(item)
        _item = []
        if item[0] in wrong_domain_list: 
            _item =  item[0]
            seen = seen.union(set([_item]))
        if  item[1] in wrong_domain_list:
            _item =  item[1]
            seen = seen.union(set([_item]))
        print(seen)
        if len(seen) == len(wrong_domain_list):
            break
        cur +=1
    print(cur)   
    return cur

In [63]:
pos_anomalies = pd.read_csv(os.path.join(anomaly_data_loc, 'pos_anomalies.csv') ,index_col=None).reset_index(drop=True)
neg_anomalies = pd.read_csv(os.path.join(anomaly_data_loc, 'neg_anomalies.csv') ,index_col=None)
reference_df = pd.read_csv(train_data_loc, index_col=None).reset_index(drop=True)
neg_anomalies['PanjivaRecordID'] = neg_anomalies['PanjivaRecordID'].apply(lambda x : int(str(x)[:-3]) ).reset_index(drop=True)
pos_anomalies['PanjivaRecordID'] = pos_anomalies['PanjivaRecordID'].apply(lambda x : int(re.subn('00\d{1,2}$', '', str(x))[0]) )

anomalies = neg_anomalies.append(pos_anomalies, ignore_index=True)
anomalies = anomalies.reset_index(drop=True)

In [64]:
def main():
    
    global DIR, DATA_LOC, train_data_loc, test_data_loc, anomaly_data_loc, domain_dims, dist_df_dict, reference_df
    pos_anomalies = pd.read_csv(os.path.join(anomaly_data_loc, 'pos_anomalies.csv') ,index_col=None).reset_index(drop=True)
    neg_anomalies = pd.read_csv(os.path.join(anomaly_data_loc, 'neg_anomalies.csv') ,index_col=None)
    reference_df = pd.read_csv(train_data_loc, index_col=None).reset_index(drop=True)
    neg_anomalies['PanjivaRecordID'] = neg_anomalies['PanjivaRecordID'].apply(lambda x : int(str(x)[:-3]) ).reset_index(drop=True)
    pos_anomalies['PanjivaRecordID'] = pos_anomalies['PanjivaRecordID'].apply(lambda x : int(re.subn('00\d{1,2}$', '', str(x))[0]) )

    anomalies = neg_anomalies.append(pos_anomalies, ignore_index=True)
    anomalies = anomalies.reset_index(drop=True)

    results = {}

    res_neg = Parallel(n_jobs=mp.cpu_count())(
        delayed(analyze_record)(neg_anomalies.iloc[i])  for i in tqdm(range(neg_anomalies.shape[0])))

    res_pos = Parallel(n_jobs = mp.cpu_count())(
        delayed(analyze_record)(pos_anomalies.iloc[i])  for i in tqdm(range(pos_anomalies.shape[0])))

    res = res_neg + res_pos
    res_dict = Counter(res)
    
    res = list(sorted(res_dict.items(), key = itemgetter(0), reverse=False))
    total_count = sum([_[1] for _ in res])
    c = 0
    recall_dict = {}
    for item in res:
        c+= item[1]
        recall_dict[item[0]] = c/total_count
    
    Path('results_3').mkdir(exist_ok=True, parents=True)
    result_path = os.path.join('results_3',DIR)
    Path(result_path).mkdir(exist_ok=True, parents=True)  
    result_path = os.path.join('results_3',DIR,'results.json')
    with open(result_path, "w") as fh:  
        json.dump(recall_dict, fh) 
    return result_path

In [65]:
DIR ='us_import1'
set_up(DIR)
res_dict = main()

100%|██████████| 2940/2940 [02:05<00:00, 23.49it/s]
100%|██████████| 980/980 [00:35<00:00, 27.66it/s]


In [11]:
DIR ='us_import2'
set_up(DIR)
res_dict = main()

100%|██████████| 2970/2970 [03:28<00:00, 14.21it/s]
100%|██████████| 990/990 [01:06<00:00, 14.84it/s]


In [27]:
res_dict

'results_3/us_import1/results.json'

In [12]:
DIR ='us_import3'
set_up(DIR)
res_dict = main()

100%|██████████| 3000/3000 [03:19<00:00, 15.02it/s]
100%|██████████| 1000/1000 [00:50<00:00, 19.89it/s]


In [13]:
DIR ='us_import4'
set_up(DIR)
res_dict = main()

100%|██████████| 3000/3000 [03:23<00:00, 14.77it/s]
100%|██████████| 1000/1000 [01:08<00:00, 14.51it/s]


In [99]:
parser = argparse.ArgumentParser()
parser.add_argument(
    '--DIR',
    choices=['us_import1', 'us_import2', 'us_import3', 'us_import4'],
    default=None
)


# ----------------------------------------
args = parser.parse_args()
DIR = args.DIR
set_up(DIR)
main()

usage: ipykernel_launcher.py [-h]
                             [--DIR {us_import1,us_import2,us_import3,us_import4}]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/ddatta/.local/share/jupyter/runtime/kernel-5adc8ce9-3bc7-462d-b894-5e632b7e7afc.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [98]:
import json
dictionary ={  
    "name" : "sathiyajith",  
    "rollno" : 56,  
    "cgpa" : 8.6,  
    "phonenumber" : "9976770500"
}  
