In [198]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('./..')
sys.path.append('./../..')
from pandarallel import pandarallel
pandarallel.initialize()
import glob 
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import auc
try:
    from common_utils import AD_result_fetcher
except:
    from .common_utils import AD_result_fetcher
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from time import time
from matplotlib import pyplot as plt
from sklearn.metrics import silhouette_score

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# ---------------------------------
## (Modified) implementatation of : Improve Blackbox Sequential Anomaly dtetctor Relevancy with Limited Human Feedback
# ---------------------------------

In [119]:
DIR = 'us_import1'

PERCENTILE_THRESHOLD = 100 - 5

UPDATE_scoreUB_percentile =  10 
UPDATE_scoreUB = 0.5
id_col = 'PanjivaRecordID'
attribute_SHIPPER = 'ShipperPanjivaID'
attribute_CONSIGNEE = 'ConsigneePanjivaID'

In [194]:
labelled_results = AD_result_fetcher.read_in_AD_result(DIR)
score_threshold = np.percentile(labelled_results['score'],PERCENTILE_THRESHOLD)
print(PERCENTILE_THRESHOLD, score_threshold)
labelled_results = labelled_results.sort_values(by=['score'],ascending=False)

working_df = labelled_results.copy()
working_df['dynamic_score'] = labelled_results['score'].values
# ------------------
# Type conversion : to ensure no bugs
# ------------------

working_df['PanjivaRecordID'] = working_df['PanjivaRecordID'].astype(int)
working_df['ConsigneePanjivaID'] = working_df['ConsigneePanjivaID'].astype(int)
working_df['ShipperPanjivaID'] = working_df['ShipperPanjivaID'].astype(int)
working_df = working_df.reset_index(drop=True)


22935 22648
95 0.4913615251753746


In [8]:
def get_domain_dims(DIR):
    file_path = './../generated_data_v1/{}/domain_dims.pkl'.format(DIR)
    with open(file_path,'rb') as fh:
        dd = pickle.load(fh)
    return dd

def get_serial_mappingId(DIR):
    file_path = './../generated_data_v1/{}/idMapping.csv'.format(DIR)
    df = pd.read_csv(file_path,index_col=None)
    return df

In [9]:
# The lowest score till which a new score should be calculated
UPDATE_scoreUB = np.percentile(labelled_results['score'],100-UPDATE_scoreUB_percentile)
print(UPDATE_scoreUB)

print('Number of positive anomalies : ', len(labelled_results.loc[labelled_results['label']==1]))
score_threshold = np.percentile(labelled_results['score'],PERCENTILE_THRESHOLD)

0.4539567681357035
Number of positive anomalies :  0


In [116]:
# --------------------------------------------
# Select entitties from records with a MIN_ANOM_SCORE
# Set to top 25 percentile
# --------------------------------------------
ENTITY_MIN_ANOM_SCORE = np.percentile(labelled_results['score'],100-25)
ENTITY_MIN_ANOM_SCORE


0.4002390717618687

In [55]:
def fetch_embedding(DIR, serialized=True):
    global attribute_SHIPPER
    global attribute_CONSIGNEE
    valid_domains = [attribute_CONSIGNEE, attribute_SHIPPER]
    
    embedding_data_loc = './../createGraph_trade/saved_model_data/{}'.format(DIR)
    files = sorted(glob.glob(os.path.join(embedding_data_loc,'**.npy')))
 
    domain_dims = get_domain_dims(DIR)
   
   
    serialIdMapping = get_serial_mappingId(DIR)
    # ===========================================================================================================
    # Create synthetic ids so that we can map from the entities of attribute_SHIPPER and attribute_CONSIGNEE only
    # -----------------------------------------------------------------------------------------------------------
    serialID_2_syntheticID = {}
    
    cur = 0
    for domain in valid_domains:
       
        _df = serialIdMapping.loc[serialIdMapping['domain']==domain]
        for e in enumerate(_df['serial_id'].values.tolist(),0):
            _syn_id = e[0]+cur
            serialID_2_syntheticID[e[1]] = _syn_id 
         
        cur += len(_df)
        print(max(serialID_2_syntheticID.keys()))
        
    num_entities = len(serialID_2_syntheticID) 
    combined_emb = None 
    for domain in valid_domains:
        file = sorted(glob.glob(os.path.join(embedding_data_loc,'**{}.npy'.format(domain))))[0]
        emb_data = np.load(file) 
        emb_dim  = emb_data.shape[1]
        # Initialize :
        if combined_emb is None: 
            combined_emb = np.zeros([num_entities,emb_dim])
        _df = serialIdMapping.loc[serialIdMapping['domain'] == domain]
        for i, row in _df.iterrows():
            _serial_id = row['serial_id']
            try:
                _syn_id = serialID_2_syntheticID[_serial_id]
                combined_emb[_syn_id] = emb_data[row['entity_id']]
            except:
                pass
            
    return combined_emb, serialID_2_syntheticID



In [56]:
embedding, serialID_2_syntheticID = fetch_embedding(DIR)

4895
11135


In [68]:
# ==================================
# Perform dimensionality reduction
# ==================================
from sklearn.decomposition import TruncatedSVD
def reduce_dimension(data, target_dim=16):
    svd_obj = TruncatedSVD (n_components = target_dim)
    svd_obj.fit(data)
    xformed = svd_obj.transform(data)
    return xformed

In [176]:
def perform_clustering( X, num_clusters = 2):
#     model = MiniBatchKMeans(
#         init='k-means++', 
#         n_clusters=num_clusters, 
#         verbose=0
#     )
    model = KMeans(
        init='k-means++', 
        n_clusters=num_clusters, 
        verbose=0
    )
    
    t0 = time()
    model.fit(X)
    t1 = time() - t0
    print('Time taken {:.4f}'.format(t1))
    return model

In [187]:
def obtain_custers( scored_df, embedding, serialID_2_syntheticID , ENTITY_MIN_ANOM_SCORE, reduced_dim_clustering=True, reduced_dim=16):
    global attribute_CONSIGNEE
    global attribute_SHIPPER
    
    valid_domains = [attribute_CONSIGNEE, attribute_SHIPPER]
    
    df = scored_df.loc[scored_df['score']>=ENTITY_MIN_ANOM_SCORE]
    valid_entities = []
    for d in valid_domains:
        valid_entities.extend(df[d].values.tolist())
      
    valid_entities = list(set(sorted(valid_entities)))
    # -----------------------------------------------------
    # convert from serial_IDs to  clustering_syn_ID
    # serial_IDs -> syntheticID -> access embedding value
    # -----------------------------------------------------
    
    serialID_2_tmpSynID = {e[1]:e[0] for e in enumerate(valid_entities)}
    clustering_data = np.zeros((len(valid_entities), embedding.shape[1]))
    
    for idx in valid_entities:
        syn_id = serialID_2_syntheticID[idx]
        tmpSynID = serialID_2_tmpSynID[idx]
        clustering_data[tmpSynID] = embedding[syn_id]
    
    print(clustering_data.shape)
    # Perform clustering
    if reduced_dim_clustering:
        clustering_data = reduce_dimension(clustering_data, target_dim=reduced_dim)
    
   
    max_silhoutte = -10
    clustering_model_obj = None
    valid_K_values = list(range(2,11))
   
    for k in valid_K_values:
        X = clustering_data.copy()
        _model_obj = perform_clustering( X, num_clusters = k)
        xformed = _model_obj.transform(X)
        silhoutte = silhouette_score(xformed,_model_obj.labels_ )
        print(' K = {}, silhoutte score {:.4f}'.format(k,silhoutte))
       
        if silhoutte > max_silhoutte:
            max_silhoutte = silhoutte
            clustering_model_obj = _model_obj
      
    # Create a mapping from serial_ID to cluster_id
    tmpSynID_2_serialID = {v:k for k,v in serialID_2_tmpSynID.items()}
    
    num_clusters = len(set(clustering_model_obj.labels_))
    serialID_clusterID_map = {}
    labels = np.array(clustering_model_obj.labels_)
    print(labels)
    for idx in range(labels.shape[0]):
        serial_id = tmpSynID_2_serialID[idx] 
        serialID_clusterID_map[serial_id] = clustering_model_obj.labels_[idx]
        
    return serialID_clusterID_map

    
    

In [191]:
serialID_clusterID_map = obtain_custers( labelled_results, embedding, serialID_2_syntheticID , ENTITY_MIN_ANOM_SCORE, reduced_dim_clustering=True, reduced_dim=4)

(3835, 128)
Time taken 0.0895
 K = 2, silhoutte score 0.3487
Time taken 0.1039
 K = 3, silhoutte score 0.3376
Time taken 0.1374
 K = 4, silhoutte score 0.2832
Time taken 0.1741
 K = 5, silhoutte score 0.2632
Time taken 0.1931
 K = 6, silhoutte score 0.2574
Time taken 0.1659
 K = 7, silhoutte score 0.3094
Time taken 0.3504
 K = 8, silhoutte score 0.3051
Time taken 0.2439
 K = 9, silhoutte score 0.3188
Time taken 0.2979
 K = 10, silhoutte score 0.3160
[1 1 1 ... 1 1 1]


In [195]:
# ------------------------------------
def calculate_cluster_relevancy(df_cur, serialID_clusterID_map, label_top_k=10 ):

    global id_col
    df_cur = df_cur.sort_values(by='score',ascending=False)
    seen_ids = []
    count = 0
    labelled_df = None
    idx = 0
    for i,row in df_cur.iterrows():
        _id = int(row[id_col])
        idx = i
        seen_ids.append(_id)
        
        if row['label']==1: 
            count +=1
            if count > label_top_k : 
                break
        
    seen_ids = set(seen_ids)
    labelled_df = df_cur.iloc[:idx,:]
    unlabelled_df = df_cur.iloc[idx:,:]
    
    print('# of records revealed to find {} instances at the top :: {}'.format(
        label_top_k, 
        len(labelled_df))
    )
    # ----

In [196]:
calculate_cluster_relevancy(working_df.copy(), serialID_clusterID_map )

# of records revealed to find 10 instances at the top :: 20856


In [112]:
xformed = reduce_dimension(reduced_dimX,2)

colors = ['r','b','g','c','m']
for l in set(clustering_model_obj.labels_):
    t = xformed[clustering_model_obj.labels_==l]
    plt.scatter(t[:,0],t[:,1],color=colors[l],s=1, alpha=0.8)
plt.show()

In [114]:
silhouette_score(xformed,clustering_model_obj.labels_ )

0.10725698295519771