In [1]:
import os
# Switch the working directory two levels up so the relative paths used later
# ('data/', 'json/') are resolved from there. Re-running this cell moves up
# two more levels, so run it only once per kernel session.
os.chdir('../..')

In [2]:
import pandas as pd
import numpy as np

from datetime import datetime, date
import matplotlib.pyplot as plt

import json

# Path prefix (relative to the working directory) for the input CSVs and groups.json.
# NOTE: the graph-export cells later rebind `data` to a dict, so cells that build
# paths from it cannot be re-run after that point without re-running this one.
data = 'data/'

In [8]:
# Load the four "small" input tables from the data directory, in a fixed order.
csv_names = ('atms.small.csv',
             'clients.small.csv',
             'companies.small.csv',
             'transactions.small.csv')

atms, clients, companies, transactions = [pd.read_csv(data + name) for name in csv_names]

In [281]:
# Tally how often each value occurs in the first two '/'-separated fields of
# the 'date' column.
#   days   : value of the first field  -> number of transactions
#   months : value of the second field -> number of transactions
#   counts : transactions where BOTH fields exceed 12, i.e. rows that cannot be
#            read as a calendar date in either day/month or month/day order
# The names assume day/month order; the outputs recorded below show the second
# field reaching 31, so that assumption does not hold for every row.
#
# The loop variable is deliberately not called `date`: that name is the class
# imported from `datetime`, and rebinding it to a string breaks later cells
# that call `date.today()` when the notebook is run top to bottom.
months = {}
days = {}
counts = 0
for date_str in transactions['date']:
    fields = date_str.split('/')
    day = int(fields[0])
    month = int(fields[1])

    months[month] = months.get(month, 0) + 1
    days[day] = days.get(day, 0) + 1

    if day > 12 and month > 12:
        counts += 1

In [282]:
# Share of transactions in which both parsed date fields exceed 12.
counts/len(transactions)

0.11905019472090005

In [54]:
# Number of transactions per value of the second '/'-separated date field.
months

{1: 618,
 2: 622,
 3: 663,
 4: 527,
 5: 518,
 6: 661,
 7: 564,
 8: 570,
 9: 627,
 10: 450,
 11: 505,
 12: 507,
 13: 759,
 14: 552,
 15: 575,
 16: 612,
 17: 843,
 18: 675,
 19: 579,
 20: 922,
 21: 612,
 22: 709,
 23: 752,
 24: 556,
 25: 503,
 26: 474,
 27: 646,
 28: 515,
 29: 558,
 30: 579,
 31: 235}

In [57]:
# Number of transactions per value of the first '/'-separated date field.
days

{1: 1246,
 2: 1301,
 3: 1188,
 4: 1265,
 5: 1283,
 6: 1368,
 7: 1241,
 8: 1317,
 9: 1307,
 10: 1270,
 11: 1284,
 12: 1542,
 13: 49,
 14: 52,
 15: 43,
 16: 49,
 17: 46,
 18: 40,
 19: 45,
 20: 45,
 21: 37,
 22: 44,
 23: 41,
 24: 233,
 25: 524,
 26: 573,
 27: 1055}

## Extract time and flow patterns

In [10]:
# Two transactions are close in time when their times of day differ by less
# than this many minutes.
threshold_minutes = 15
# Two amounts are similar when their absolute difference is below this
# fraction of their mean (also used when comparing the mean amounts of groups).
threshold_amount = 0.1

In [11]:
def _pairwise_links(frame, max_minutes, max_rel_amount):
    """Return a square boolean matrix marking look-alike transaction pairs.

    Entry [j, k] is True when rows j and k of `frame` (by position) have
    times of day less than `max_minutes` apart AND amounts whose absolute
    difference is below `max_rel_amount` times the mean of the two amounts.

    The 'time' column is parsed with the '%H:%M:%S' format; only the time of
    day is compared, the 'date' column plays no role here.
    """
    clock = [datetime.strptime(t, '%H:%M:%S') for t in frame['time']]
    seconds = np.array([t.hour * 3600 + t.minute * 60 + t.second for t in clock],
                       dtype=float)
    amount = np.asarray(frame['amount'], dtype=float)

    # Broadcasting builds every pairwise difference at once instead of
    # looping over all (j, k) pairs in Python.
    time_gap = np.abs(seconds[:, None] - seconds[None, :])
    amount_gap = np.abs(amount[:, None] - amount[None, :])
    pair_mean = (amount[:, None] + amount[None, :]) / 2

    return (time_gap < max_minutes * 60) & (amount_gap < max_rel_amount * pair_mean)


def _linked_runs(mat):
    """Scan the link matrix row by row and return lists of linked positions.

    Starting at row 0, collect every column linked to the current row, then
    jump to the row just after the last linked column. Only runs with more
    than two members are returned. Each row is scanned in full, so a run may
    contain positions lower than the row it was found on.
    """
    runs = []
    j = 0
    while j < len(mat):
        linked = np.flatnonzero(mat[j])
        # A row normally links to itself, so the last linked column is >= j.
        # That fails when the amount is zero, negative or NaN; in that case
        # still move forward so the scan cannot raise or loop forever.
        last = int(linked[-1]) if len(linked) else j
        j = max(last, j) + 1
        if len(linked) > 2:
            runs.append(linked.tolist())
    return runs


# groups[key][kind] is a list of groups; each group is a list of transaction
# ids that share the same `key` party ('source' or 'target') and resemble
# each other in time of day and amount.
#   'flow': all linked transactions carry the same 'date' value
#   'time': the linked transactions span more than one 'date' value
groups = {'source': {'time': [], 'flow': []},
          'target': {'time': [], 'flow': []}}

ids = {'source': transactions['source'].unique(), 'target': transactions['target'].unique()}

for key in ids.keys():
    print('Check for {}'.format(key))
    for id_ in ids[key]:
        # Positional index 0..n-1 so matrix positions map directly to rows.
        df = transactions[transactions[key] == id_].reset_index(drop=True)

        mat = _pairwise_links(df, threshold_minutes, threshold_amount)
        # tolist() yields plain Python values, which keeps the groups
        # JSON-serialisable whatever the dtype of the 'id' column is.
        row_ids = df['id'].tolist()

        for positions in _linked_runs(mat):
            linked = [row_ids[k] for k in positions]
            if len(df[df['id'].isin(linked)]['date'].unique()) == 1:
                groups[key]['flow'].append(linked)
            else:
                groups[key]['time'].append(linked)

Check for source
Check for target


In [12]:
# Report how many groups were found per party role and pattern kind.
for key, by_kind in groups.items():
    print('{}:'.format(key))
    for type_, found in by_kind.items():
        print('  {}: {}'.format(type_, len(found)))

source:
  time: 9
  flow: 100
target:
  time: 16
  flow: 158


# Check for similar transactions

In [13]:
def _merge_similar_groups(group_list, key, transactions, threshold_amount):
    """Merge groups that belong to the same party and have similar amounts.

    Each group is a list of transaction ids. The party of a group is the
    `key` column ('source' or 'target') of its first matching transaction.
    A later group is folded into the FIRST group seen for the same party
    when the two groups' mean amounts differ by less than `threshold_amount`
    times the mean of those two means. Later groups are only ever compared
    with that first group, never with each other.

    Returns a new list of groups; `group_list` itself is not modified.
    """
    def rows_of(group):
        return transactions[transactions['id'].isin(group)]

    # Party -> position of the first group seen for it (dict lookup instead
    # of repeated list.index scans).
    first_seen = {}
    candidates = []
    for i, g in enumerate(group_list):
        party = rows_of(g)[key].iloc[0]
        if party in first_seen:
            candidates.append((first_seen[party], i))
        else:
            first_seen[party] = i

    merged = [list(g) for g in group_list]
    absorbed = set()
    for keep, drop in candidates:
        # Means are taken over the groups as they were before any merging.
        amount1 = rows_of(group_list[keep])['amount'].mean()
        amount2 = rows_of(group_list[drop])['amount'].mean()

        delta = np.abs(amount1 - amount2)
        if delta < threshold_amount * np.mean([amount1, amount2]):
            # Plain list concatenation keeps the ids as native Python values;
            # np.append would turn them into NumPy scalars, which json.dump
            # rejects for non-string ids.
            merged[keep] = merged[keep] + list(group_list[drop])
            absorbed.add(drop)

    return [g for i, g in enumerate(merged) if i not in absorbed]


for key in groups.keys():
    for type_ in groups[key].keys():
        groups[key][type_] = _merge_similar_groups(
            groups[key][type_], key, transactions, threshold_amount)

In [14]:
# Report the number of groups left per party role and pattern kind.
for key, by_kind in groups.items():
    print('{}:'.format(key))
    for type_, remaining in by_kind.items():
        print('  {}: {}'.format(type_, len(remaining)))

source:
  time: 9
  flow: 79
target:
  time: 14
  flow: 131


In [15]:
# Persist the detected groups as JSON next to the input data.
groups_path = data + 'groups.json'
with open(groups_path, 'w') as outfile:
    outfile.write(json.dumps(groups))

# Extract JSON for d3js

In [16]:
# Reload the groups written earlier so this section can start from the file.
with open(data + 'groups.json', 'r') as infile:
    groups = json.load(infile)

# Party shared by each 'time' group, in the same order as the groups:
# source_ids[i] is the sender common to groups['source']['time'][i].
source_ids = []
for g in groups['source']['time']:
    susp_node = transactions[transactions['id'].isin(g)]['source'].iloc[0]
    source_ids.append(susp_node)

# target_ids[i] is the receiver common to groups['target']['time'][i].
# Target groups are built by grouping on the 'target' column, so the shared
# party must be read from 'target'. Reading 'source' here made the later
# inbound-pattern lookup (`susp_node in target_ids`) compare against unrelated nodes.
target_ids = []
for g in groups['target']['time']:
    susp_node = transactions[transactions['id'].isin(g)]['target'].iloc[0]
    target_ids.append(susp_node)

In [17]:
# group 0 = suspect
# group 1 = accomplice
# group 2 = accomplice edge
# group 3 = normal
# group 4 = normal edge

In [35]:
# Position, within groups['source']['time'], of the group to export; it also
# becomes the output file name ('json/<id_>.json').
id_ = 2

In [36]:
# Graph payload that is filled in by the following cells and dumped to JSON.
# NOTE: this rebinds `data`, which until now held the 'data/' path prefix;
# earlier cells that build file paths from `data` cannot be re-run after this.
data = {'nodes': [], 'links': [], 'type': None}

In [37]:
# The suspect is the sender shared by the selected source/time group.
suspect_rows = transactions[transactions['id'].isin(groups['source']['time'][id_])]
susp_node = suspect_rows['source'].iloc[0]
data['nodes'].append(dict(id=susp_node, tag='suspect', type='suspect'))

In [38]:
# Row labels (in `transactions`) of the transactions in the selected group.
in_selected_group = transactions['id'].isin(groups['source']['time'][id_])
susp_index = transactions[in_selected_group].index.tolist()

In [39]:
# Every transaction sent by the suspect becomes a link; each distinct receiver
# becomes a node, tagged according to the first transaction it appears in.
out = transactions[transactions['source'] == susp_node]

out_nodes = []

for i in out.index:
    row = out.loc[i]
    # Transactions belonging to the selected group mark an accomplice.
    tag = 'accomplice' if i in susp_index else 'usual'

    receiver = row['target']
    if receiver not in out_nodes:
        out_nodes.append(receiver)
        data['nodes'].append({'id': receiver, 'tag': tag, 'type': 'target'})

    link = {'source': susp_node, 'target': receiver, 'tag': tag}
    for field in ('date', 'time', 'amount', 'currency'):
        link[field] = row[field]
    data['links'].append(link)

In [40]:
# Every transaction received by the suspect becomes a link; each distinct
# sender becomes a node, tagged according to the first transaction it appears in.
in_ = transactions[transactions['target'] == susp_node]

in_nodes = []

# The suspect may also head a target/time group; if so, the transactions of
# that group are the ones flagged as coming from accomplices.
susp_in = susp_node in target_ids
if susp_in:
    data['type'] = 'Time Pattern Outflow + Inflow'
    idx = target_ids.index(susp_node)
    susp_index_in = list(transactions[transactions['id'].isin(groups['target']['time'][idx])].index)
else:
    data['type'] = 'Time Pattern Outflow'

for i in in_.index:
    row = in_.loc[i]
    tag = 'accomplice' if (susp_in and i in susp_index_in) else 'usual'

    sender = row['source']
    if sender not in in_nodes:
        in_nodes.append(sender)
        data['nodes'].append({'id': sender, 'tag': tag, 'type': 'source'})

    link = {'source': sender, 'target': susp_node, 'tag': tag}
    for field in ('date', 'time', 'amount', 'currency'):
        link[field] = row[field]
    data['links'].append(link)

In [41]:
# Write the graph for the selected group to json/<id_>.json.
fn = str(id_) + '.json'
graph_path = 'json/' + fn

with open(graph_path, 'w') as outfile:
    outfile.write(json.dumps(data))