In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import sys
import basictest as bt
from datetime import datetime
from networkx.linalg.graphmatrix import incidence_matrix

# Location of the transaction log. The companion CSV files (alert accounts,
# accounts) are read from the same directory.
filepath = '../datasets/16K_5/transactions.csv'

# Directory prefix of `filepath`, including the trailing '/'. rpartition
# yields an empty prefix when the path has no '/', so a bare file name
# resolves against the current working directory. This avoids hard-coding
# the length of the transactions file name.
data_dir = "".join(filepath.rpartition("/")[:2])

df = pd.read_csv(filepath)
# NOTE(review): a disabled line in the original notebook read
# "sar_accounts.csv" instead of "alert_accounts.csv" -- confirm which file
# the dataset at hand provides.
df_sar_accounts = pd.read_csv(data_dir + "alert_accounts.csv")
df_accounts = pd.read_csv(data_dir + "accounts.csv")

# Alias kept because later code refers to the accounts table by this name.
accounts_df = df_accounts

# Quick summary of how many accounts are flagged.
print("SAR percentage:",len(df_sar_accounts)/len(df_accounts))
print('num sar accoutns',len(df_sar_accounts))
print('num all accounts',len(df_accounts))

# Build the graphs used by the rest of the notebook:
#   graph_orig - multigraph holding one edge per transaction (parallel edges kept)
#   graph2     - plain digraph; only nodes are added in this section
#   graph3     - plain digraph; only nodes are added in this section
graph_orig = nx.MultiDiGraph()
graph2 = nx.DiGraph()
graph3 = nx.DiGraph()

# One node per account. Only graph_orig stores the initial deposit.
# Iterating the columns directly avoids the per-row Series that
# DataFrame.iterrows() builds.
for acct_id, initial_deposit in zip(accounts_df["acct_id"].tolist(),
                                    accounts_df["initial_deposit"].tolist()):
    graph_orig.add_node(acct_id, initial_deposit=initial_deposit)
    graph2.add_node(acct_id)
    graph3.add_node(acct_id)

# One edge per transaction, carrying amount, SAR flag, type and a POSIX timestamp.
edge_columns = ["orig_acct", "bene_acct", "base_amt", "is_sar", "tx_type", "tran_timestamp"]
for orig_acct, bene_acct, base_amt, is_sar, tx_type, date_string in zip(
        *(df[column].tolist() for column in edge_columns)):
    # The trailing "Z" marks the time as UTC. Parsing it with %z produces a
    # timezone-aware datetime, so .timestamp() no longer depends on the local
    # timezone (or DST rules) of the machine running the notebook.
    datetime_object = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S%z")
    timestamp = datetime_object.timestamp()
    graph_orig.add_edge(orig_acct, bene_acct, base_amt=base_amt, is_sar=is_sar,
                        tx_type=tx_type, timestamp=timestamp)

# Ids of the flagged accounts.
# NOTE(review): a disabled line in the original notebook read the column
# 'ACCOUNT_ID' instead of 'acct_id' -- confirm against the dataset in use.
sar_accounts = list(df_sar_accounts['acct_id'])
print(sar_accounts)

# Per-account feature extraction.
#
# For every node of graph_orig this section:
#   * summarises incoming and outgoing transaction amounts and degrees,
#   * replays the transactions in time order to track the account balance,
#   * stores the balance summary as node attributes on graph_orig,
#   * adds one weighted edge per distinct beneficiary to graph3
#     (weight = total amount sent to that beneficiary),
#   * collects a 25-value feature row; the rows end up in accounts_np.
#
# For a quick smoke test, iterate over list(graph_orig.nodes)[:100] instead.


def _amount_stats(amounts):
    """Return [total, min, max, mean, median, std] of *amounts*.

    An empty list is treated as a single zero so every statistic is 0.
    """
    values = amounts if amounts else [0]
    return [np.sum(values), np.min(values), np.max(values),
            np.mean(values), np.median(values), np.std(values)]


sar_account_set = set(sar_accounts)  # set gives O(1) membership tests in the loop
feature_rows = []  # one list of 25 values per node; converted to an array once at the end

for node in graph_orig.nodes:
    node_isSar = 1 if node in sar_account_set else 0

    init_deposit = graph_orig.nodes[node].get("initial_deposit")
    if init_deposit is None:
        # Without a starting balance the arithmetic below would fail with an
        # obscure TypeError, so fail with a message that names the account.
        raise ValueError(f"node {node!r} has no 'initial_deposit' attribute")

    transactions = []  # (timestamp, signed amount); negative means money leaving

    # ---- incoming transactions ----
    amt_in_arr = []
    deg_in_arr = []
    for source, _, edge_data in graph_orig.in_edges(node, data=True):
        deg_in_arr.append(source)
        amt_in_arr.append(edge_data["base_amt"])
        transactions.append((edge_data["timestamp"], edge_data["base_amt"]))

    (node_amt_total_in, node_amt_min_in, node_amt_max_in,
     node_amt_mean_in, node_amt_median_in, node_amt_std_in) = _amount_stats(amt_in_arr)
    node_deg_in = len(deg_in_arr)              # parallel edges counted individually
    node_deg_in_unique = len(set(deg_in_arr))  # distinct senders

    # ---- outgoing transactions ----
    amt_out_arr = []
    deg_out_arr = []
    weight_by_target = {}  # total amount sent per distinct beneficiary
    for _, target, edge_data in graph_orig.out_edges(node, data=True):
        amount = edge_data["base_amt"]
        deg_out_arr.append(target)
        amt_out_arr.append(amount)
        transactions.append((edge_data["timestamp"], -amount))
        weight_by_target[target] = weight_by_target.get(target, 0) + amount

    (node_amt_total_out, node_amt_min_out, node_amt_max_out,
     node_amt_mean_out, node_amt_median_out, node_amt_std_out) = _amount_stats(amt_out_arr)
    node_deg_out = len(deg_out_arr)              # parallel edges counted individually
    node_deg_out_unique = len(set(deg_out_arr))  # distinct beneficiaries

    node_deg_total = node_deg_in + node_deg_out

    # ---- balance replay ----
    transactions.sort(key=lambda t: t[0])  # chronological order (stable sort)

    # Running balance: the first entry is the initial deposit, entry i is the
    # balance after the i-th transaction.
    balances = np.cumsum([init_deposit] + [amount for _, amount in transactions])
    max_account_balance = balances.max()
    min_account_balance = balances.min()
    starting_balance = init_deposit
    ending_balance = balances[-1]

    # Gaps between consecutive transactions. These are not part of the feature
    # row. With fewer than two transactions there is no gap, so use NaN
    # directly instead of letting numpy warn about an empty slice.
    transaction_times = [ts for ts, _ in transactions]
    diff_transactions = np.diff(transaction_times)
    if diff_transactions.size:
        mean_transaction_diff = np.mean(diff_transactions)
        std_transaction_diff = np.std(diff_transactions)
    else:
        mean_transaction_diff = np.nan
        std_transaction_diff = np.nan

    max_balance_shift = max_account_balance - min_account_balance  # widest balance swing
    limit_balance_shift = starting_balance - ending_balance        # net change, start minus end

    graph_orig.nodes[node]['max_account_balance'] = max_account_balance
    graph_orig.nodes[node]['min_account_balance'] = min_account_balance
    graph_orig.nodes[node]['starting_balance'] = starting_balance
    graph_orig.nodes[node]['ending_balance'] = ending_balance

    # The order of these 25 values must match the column names used when the
    # feature table is written out.
    feature_rows.append([
        node_amt_total_in, node_amt_min_in, node_amt_max_in, node_amt_mean_in,
        node_amt_median_in, node_amt_std_in, node_deg_in, node_deg_in_unique,
        node_amt_total_out, node_amt_min_out, node_amt_max_out, node_amt_mean_out,
        node_amt_median_out, node_amt_std_out,
        node_deg_out, node_deg_out_unique, node_deg_total,
        max_account_balance, min_account_balance, starting_balance, ending_balance,
        max_balance_shift, limit_balance_shift,
        int(node), node_isSar,
    ])

    # Collapse parallel edges: one weighted edge per distinct beneficiary.
    for target, total_weight in weight_by_target.items():
        graph3.add_edge(node, target, weight=total_weight)

# Build the array once. The reshape keeps the (0, 25) shape for an empty graph.
accounts_np = np.array(feature_rows, dtype=float).reshape(-1, 25)
    
# Node-level scores from the project module `bt`, computed on a copy of graph3.
# NOTE(review): the indexing ([0][2] / [0][1]) assumes each call returns a
# nested structure whose selected element is a dict with one value per node --
# confirm against basictest.
results = {}
G = graph3.copy()
results['GAW']    = bt.strengthDegree(G,1000,0.05)[0][2]
results['GAW10']  = bt.strengthDegree10(G,1000,0.05)[0][2]
results['GAW20']  = bt.strengthDegree20(G,1000,0.05)[0][2]
results['Std Degree']  = bt.getTotalDegree(G)[0][1]

# One column per score, in this fixed order.
# NOTE(review): rows are matched to accounts_np purely by position, i.e. the
# dict values are assumed to follow the node order of graph_orig -- verify.
score_keys = ['GAW', 'GAW10', 'GAW20', 'Std Degree']
results_arr = np.empty((len(results['GAW']), len(score_keys)))
for col, key in enumerate(score_keys):
    results_arr[:, col] = np.array(list(results[key].values()))

# Scores first, then the per-account features.
node_features = np.concatenate((results_arr, accounts_np), axis=1)

column_names = ['GAW', 'GAW10', 'GAW20', 'Std_Degree', \
                'node_amt_total_in', 'node_amt_min_in', 'node_amt_max_in', 'node_amt_mean_in', 'node_amt_median_in', 'node_amt_std_in', 'node_deg_in', 'node_deg_in_unique', \
                'node_amt_total_out', 'node_amt_min_out', 'node_amt_max_out', 'node_amt_mean_out', 'node_amt_median_out', 'node_amt_std_out',\
                'node_deg_out', 'node_deg_out_unique', 'node_deg_total', 'max_account_balance', 'min_account_balance', 'starting_balance', 'ending_balance', 'max_balance_shift', 'limit_balance_shift', \
                'id', 'node_isSar']
accounts_df = pd.DataFrame(node_features, columns=column_names)

# Write the table next to the transactions file. rpartition keeps the
# directory prefix (with its trailing '/') without hard-coding the length of
# the file name.
output_dir = "".join(filepath.rpartition("/")[:2])
accounts_df.to_csv(output_dir + 'account_attributes.csv', index=False)


SAR percentage: 0.051578618318224446
num sar accoutns 825
num all accounts 15995
[28, 14342, 3395, 14122, 3681, 159, 12800, 13654, 3551, 1509, 10943, 8004, 12858, 7646, 723, 2246, 8053, 4296, 15393, 15827, 23, 6199, 12447, 10568, 3179, 2680, 2807, 8489, 11686, 459, 15281, 14801, 11634, 1488, 2567, 280, 8521, 11426, 13926, 6313, 10599, 15578, 13928, 313, 4491, 14743, 14278, 14899, 341, 9886, 1283, 11181, 2291, 9287, 67, 1432, 11001, 3068, 7859, 4625, 12704, 732, 10908, 7321, 3510, 5785, 7851, 6579, 507, 11378, 7140, 4785, 13527, 6719, 1371, 3504, 4869, 14319, 589, 14771, 15970, 10587, 4458, 482, 6347, 10318, 15648, 1878, 14135, 12895, 9544, 7373, 203, 2753, 4106, 13479, 437, 8970, 8055, 6005, 460, 14134, 8142, 11146, 2261, 3597, 6870, 6173, 9972, 6157, 524, 327, 8996, 4352, 3, 9225, 7767, 12508, 82, 1998, 13794, 5444, 1989, 162, 2873, 3558, 13577, 442, 6847, 6314, 15920, 6272, 8133, 705, 8108, 2221, 14955, 2035, 12652, 1941, 126, 10225, 1835, 15975, 1000, 99, 12502, 2075, 2073, 14777, 4

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
