In [1]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('./..')
sys.path.append('./../..')
from joblib import Parallel, delayed
import pickle
import argparse
import multiprocessing
from pandarallel import pandarallel
pandarallel.initialize()
import networkx as nx

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Prefer the package-qualified module; fall back to a flat import when the
# notebook is executed from inside the source directory.
try:
    from src.data_fetcher import data_fetcher_v2 as data_fetcher
except ImportError:
    from data_fetcher import data_fetcher_v2 as data_fetcher

# A relative import fails when there is no parent package (e.g. inside a
# notebook kernel). Depending on the Python version that failure surfaces as
# ImportError, SystemError or ValueError, so all three trigger the fallback.
# Anything else (KeyboardInterrupt, errors raised inside the module) propagates.
try:
    from . import network_similarity_v1 as NS
except (ImportError, SystemError, ValueError):
    import network_similarity_v1 as NS


# --------------------------------------------

# Name of the dataset sub-directory being processed; unset here and expected
# to be assigned by the caller before setup() or read_target_data() is used.
DIR = None
# Directory name that setup() forwards to NS.initialize().
model_use_data_DIR = 'model_use_data'
# Root folder under which read_target_data() looks for <DIR>/scored_test_data.csv.
TARGET_DATA_SOURCE = './../../AD_system_output'

def setup():
    """Initialise the network-similarity module ``NS``.

    Passes the current module-level values of ``DIR`` and
    ``model_use_data_DIR`` to ``NS.initialize``. Nothing is returned.
    """
    # Both names are only read here, so no ``global`` declaration is needed.
    NS.initialize(DIR, model_use_data_DIR)


# ---------------
# Algorithm ::
# Create a network
# Calculate SimRank between the Transaction nodes
#
# Using the partially labelled data, train a classifier.
# Classify the unlabelled points (transaction instances, whose features are entities + anomaly scores).
# Set the final label of each point as
# sign( lambda * (similarity-weighted combination of the labels of its K nearest labelled neighbors) + (1 - lambda) * predicted label )
# ----------------

def get_training_data(DIR):
    """Fetch the training data for the dataset sub-directory ``DIR``.

    Delegates to ``data_fetcher.get_train_x_csv`` using the generated-data
    root folder and returns its result unchanged.
    """
    return data_fetcher.get_train_x_csv('./../../generated_data_v1', DIR)


def get_domain_dims(DIR):
    """Load the pickled ``domain_dims`` object stored for dataset ``DIR``.

    Reads ``domain_dims.pkl`` from the ``DIR`` sub-directory of the
    generated-data root and returns whatever object was pickled there.
    """
    pkl_path = os.path.join('./../../generated_data_v1/', DIR, 'domain_dims.pkl')
    with open(pkl_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


def read_target_data():
    """Read the scored test data for the active dataset.

    The CSV path is ``<TARGET_DATA_SOURCE>/<DIR>/scored_test_data.csv``,
    built from the module-level ``TARGET_DATA_SOURCE`` and ``DIR`` values.

    Returns the DataFrame produced by ``pd.read_csv`` with ``index_col=None``.
    """
    target_csv = os.path.join(TARGET_DATA_SOURCE, DIR, 'scored_test_data.csv')
    return pd.read_csv(target_csv, index_col=None)

# -----------------------------------

def get_tranasaction_pair_similarity():
    """Build the graph for the active dataset and print a SimRank similarity.

    Steps:
      1. Load the domain dimensions and training data for the module-level ``DIR``.
      2. Build the initial graph with ``NS.get_initial_graph``.
      3. Read the scored test data and pass it, with the graph, to
         ``NS.get_graph_W_transaction_nodes``.
      4. Print the result of ``nx.simrank_similarity(G, 10, 100)``.

    Returns None; the similarity is only printed.
    """
    import networkx as nx

    domain_dims = get_domain_dims(DIR)
    train_df = get_training_data(DIR)
    G = NS.get_initial_graph(train_df, domain_dims)
    # read_target_data() takes no parameters (it reads TARGET_DATA_SOURCE and
    # DIR from module globals); passing arguments raised a TypeError.
    test_data_df = read_target_data()
    G = NS.get_graph_W_transaction_nodes(G, test_data_df)
    # NOTE(review): in networkx's signature the 2nd and 3rd positional
    # arguments are the `source` and `target` nodes — confirm that 10 and 100
    # are the intended node ids.
    print(nx.simrank_similarity(G, 10, 100))
    return

# -----------------------------------

In [4]:
# Select the dataset sub-directory. This must happen before setup(), which
# forwards the module-level DIR to NS.initialize.
DIR = 'us_import1'

setup()
print (NS.model_use_data_DIR)
# Build the initial graph from the training data and the domain dimensions.
domain_dims = get_domain_dims(DIR)
df = get_training_data(DIR)
G = NS.get_initial_graph(df, domain_dims)
# Extend the graph using the scored test data (see NS.get_graph_W_transaction_nodes).
# G is left as a notebook-level variable for the following cell.
df_test = read_target_data()
G = NS.get_graph_W_transaction_nodes(G, df_test)

model_use_data/us_import1
Mapping to serial ids file :: model_use_data/us_import1/Serialized_Mapping.csv
model_use_data/us_import1/Serialized_Mapping.csv
['Carrier', 'ConsigneePanjivaID', 'HSCode', 'PortOfLading', 'PortOfUnlading', 'ShipmentDestination', 'ShipmentOrigin', 'ShipperPanjivaID']
Path ::  model_use_data/us_import1/nodeObj_dict_00.pkl
{'Carrier': 655, 'ConsigneePanjivaID': 7569, 'HSCode': 1510, 'PortOfLading': 266, 'PortOfUnlading': 68, 'ShipmentDestination': 131, 'ShipmentOrigin': 116, 'ShipperPanjivaID': 9415}
['HSCode', 'PortOfLading']
9465
Number of nodes and edges ::  1776 9465
['PanjivaRecordID', 'HSCode']
47196
Number of nodes and edges ::  48972 56661
['PanjivaRecordID', 'ConsigneePanjivaID']
47196
Number of nodes and edges ::  54490 103857
['PanjivaRecordID', 'ShipperPanjivaID']
47196
Number of nodes and edges ::  61380 151053


In [None]:
# Compute SimRank similarity over the graph G built in the previous cell.
# NOTE(review): simrank_similarity_numpy appears to be deprecated/removed in
# newer NetworkX releases — confirm the installed version, or switch to
# nx.simrank_similarity (whose return type may differ).
nx.simrank_similarity_numpy(G)