In [282]:
import os
import pandas as pd
import json
import folium
import seaborn as sns
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime, date, timedelta
from node2vec_utils import read_node_vecs, load_json_dict, invert_dict
from suspicion_tools import cosine_sim, cycle_suspicion_desc, cycle_suspicion_score, cycle_suspicion_for_agg
%matplotlib inline  
%load_ext autoreload
%autoreload 2
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [283]:
transactions = pd.read_csv('../../data/transactions.small.csv')
transactions = transactions.rename(columns={'id':'tid'})
transactions['source'] = transactions['source'].astype(str)
transactions['target'] = transactions['target'].astype(str)
transactions['date'] = pd.to_datetime(transactions.date)
transactions['month'] = transactions['date'].apply(lambda x: x.month)
transactions['day'] = transactions['date'].apply(lambda x: x.day)
transactions['year'] = transactions['date'].apply(lambda x: x.year)
transactions['time'] = pd.to_datetime(transactions['time'])
transactions['amount'] = transactions.amount.astype(float)

In [284]:
node_vectors = read_node_vecs('vecs.emd')
ids_dict = load_json_dict('id_dict.json')
reversed_dict = invert_dict(ids_dict)


The similarity values can be used to propagate similarities across a path, and they can be used more directly in the context of a single transaction (to see if it's an obscure transaction or a transaction between known individuals - this can also help us determine the strategies of different money launderers; whether they prefer to work with longer-term partners or just send their money through some guy they don't know in some remote corner of the world).

In [285]:

transactions['node_similarity'] = transactions.apply(lambda row: cosine_sim(ids_dict[row['source']], ids_dict[row['target']], node_vectors), axis=1)


In [286]:
mean_similarity = transactions['node_similarity'].mean()
std_similarity = transactions['node_similarity'].std()

print(transactions['node_similarity'].min())
print(transactions['node_similarity'].max())
print(mean_similarity)
print(std_similarity)

0.246691875579
0.99475487613
0.8399258272757715
0.1108750264162977


In [287]:
transactions.head()

Unnamed: 0,tid,source,target,date,time,amount,currency,month,day,year,node_similarity
0,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,0.956734
1,59328a0d-92bb-43a5-8925-5db6a693dde9,c81d20dd-8b65-43d1-bb07-6988fffa299f,d84e8a14-d531-497c-87cb-ee985099e9f9,2016-12-26,2017-11-12 03:57:40,940.451548,IRR,12,26,2016,0.727009
2,f0326cb7-9c59-49d0-9944-781bbea9546b,124e9c28-03a7-4bf5-81e5-13c21513ad51,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,2015-01-27,2017-11-12 02:56:57,371.643714,,1,27,2015,0.632
3,3a667343-bc3b-4038-86bb-66358b38db36,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,19ce1dd2-f9e8-4a59-9261-7bbd4dba0398,2015-01-27,2017-11-12 12:56:29,372.561046,,1,27,2015,0.485097
4,732d967d-647e-410e-b997-41180b4d51c3,779bbd4a-bb73-410e-a308-1da3138c1998,11c9de39-9ce8-44ab-9080-850f07b32a8e,2016-05-27,2017-11-12 13:58:13,2260.001111,,5,27,2016,0.895929


In [288]:
clients = pd.read_csv('../../data/clients.small.csv')
clients.head()

Unnamed: 0,id,first_name,last_name,age,email,occupation,political_views,nationality,university,academic_degree,address,postal_code,country,city
0,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,24 South Van Ness Nene,57776,Andorra,Easton
1,2d0090d1-aac7-49a8-84f9-3641e7081a29,Calandra,Mcdonald,17,cloherty1854@gmail.com,Tanker Driver,Communist,Dominican,Salem State University,,401 Fresnel Place,57776,Samoa,Loma Linda
2,9c1a02be-e1df-41df-b165-e13ebe0a5446,Gabriel,Carpenter,25,forger1848@yandex.com,Laboratory Analyst,Conservative,Costa Rican,Massachusetts College of Liberal Arts,,1089 Polaris Concession road,57776,Curaçao,Winona
3,de265d69-6944-4e23-aedd-b48d00f11f8a,Drusilla,Riggs,42,exciton1869@gmail.com,Charity Worker,Communist,Romanian,University of Georgia (UGA),,1007 Trainor Gardens,57776,Turkey,Tooele
4,605090dc-0221-47f1-a652-4b7697e9f07f,Tamisha,Mercado,39,panthers2028@live.com,Road Sweeper,Anarchism,Irish,Georgia Gwinnett College,,930 Harrison Pike,57776,Serbia,Englewood


In [289]:
companies = pd.read_csv('../../data/companies.small.csv')
companies.head()

Unnamed: 0,id,type,name,country
0,3ddf9481-7d31-4fec-819c-601da5a17df9,Limited Liability Company,Gemini Sound Products,Austria
1,180b30c2-a2ce-4f9f-863a-cc2de6f4191a,Limited Liability Partnership,Zappos.com,Niue
2,f71e0fd8-260b-4723-8ee8-b6d2ece4e550,Limited Liability Limited Partnership,Venus Swimwear,Kiribati
3,458f588c-3d71-4e41-9b67-bd5958067261,Limited Liability Company,Frontier Airlines,Fiji
4,b6539a68-b373-4cac-9290-a5f4b524c1e1,Incorporated,Dole Foods,United Arab Emirates


In [290]:
atms = pd.read_csv('../../data/atms.small.csv')
atms.head()

Unnamed: 0,id,latitude,longitude
0,be5a9198-32e1-4922-a8bd-aa3825c75d26,81.017848,-147.505813


In [291]:
atm_transactions = transactions.merge(atms, left_on = 'target', right_on='id')

In [292]:
transactions.merge(atms, left_on = 'source', right_on='id')

Unnamed: 0,tid,source,target,date,time,amount,currency,month,day,year,node_similarity,id,latitude,longitude


In [293]:
incoming = clients.merge(transactions, left_on='id', right_on='target')

In [294]:
incoming.head()

Unnamed: 0,id,first_name,last_name,age,email,occupation,political_views,nationality,university,academic_degree,...,source,target,date,time,amount,currency,month,day,year,node_similarity
0,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,d2a81efe-91aa-49ae-be0f-cc0a2e9a53fa,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,2015-03-28,2017-11-12 10:52:10,19671.648782,,3,28,2015,0.891272
1,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,b57dac23-a8f8-4154-93b2-cbb7fdae2c7d,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,2017-08-27,2017-11-12 04:44:31,1917.893038,THB,8,27,2017,0.844297
2,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,2cf9281a-1dab-4931-a641-4a2f9ddc44ec,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,2017-08-27,2017-11-12 04:44:37,1917.893038,,8,27,2017,0.736142
3,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,4d2adb86-6667-451d-b11c-b12e832d7c96,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,2017-11-18,2017-11-12 10:35:40,22602.872493,,11,18,2017,0.891068
4,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,a437aee1-e908-45db-bb22-15be85a29484,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,2015-08-21,2017-11-12 08:00:13,13.581721,BWP,8,21,2015,0.89422


In [295]:
outgoing = clients.merge(transactions, left_on='id', right_on='source')

In [296]:
outgoing.head()

Unnamed: 0,id,first_name,last_name,age,email,occupation,political_views,nationality,university,academic_degree,...,source,target,date,time,amount,currency,month,day,year,node_similarity
0,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,06bda2fe-858f-4054-aea8-8765f6c0a2c0,2017-08-27,2017-11-12 04:44:55,548.487784,,8,27,2017,0.749352
1,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,8a64381b-0c2f-49f3-9b0f-c15bb686775f,2015-03-28,2017-11-12 19:52:04,2720.2472,,3,28,2015,0.851681
2,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,d6c5d04e-7f7f-478b-994b-964196354b38,2017-08-27,2017-11-12 04:44:40,548.487784,,8,27,2017,0.84294
3,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,1b8537e7-f804-49d2-b22e-b80d589e1ae1,2015-03-28,2017-11-12 04:52:58,2720.2472,,3,28,2015,0.875589
4,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,508824a6-4eb5-4722-8861-b454e691fd3b,2015-03-28,2017-11-12 01:52:00,2720.2472,,3,28,2015,0.862141


In [297]:
pd.concat((outgoing,incoming), axis=0).head()

Unnamed: 0,id,first_name,last_name,age,email,occupation,political_views,nationality,university,academic_degree,...,source,target,date,time,amount,currency,month,day,year,node_similarity
0,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,06bda2fe-858f-4054-aea8-8765f6c0a2c0,2017-08-27,2017-11-12 04:44:55,548.487784,,8,27,2017,0.749352
1,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,8a64381b-0c2f-49f3-9b0f-c15bb686775f,2015-03-28,2017-11-12 19:52:04,2720.2472,,3,28,2015,0.851681
2,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,d6c5d04e-7f7f-478b-994b-964196354b38,2017-08-27,2017-11-12 04:44:40,548.487784,,8,27,2017,0.84294
3,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,1b8537e7-f804-49d2-b22e-b80d589e1ae1,2015-03-28,2017-11-12 04:52:58,2720.2472,,3,28,2015,0.875589
4,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,Nicholle,Berger,56,sighless1826@yandex.com,Nuclear Scientist,Liberal,Russian,Fitchburg State University,PhD,...,1eea5fcb-05ff-4d8f-a45b-3547a2ca81da,508824a6-4eb5-4722-8861-b454e691fd3b,2015-03-28,2017-11-12 01:52:00,2720.2472,,3,28,2015,0.862141


Here, we join the transactions target to source: each row in the result will have a pair of transactions with the target of the first one equal to the source of the second. Then we calculate the time difference between the two and filter out those with a great time or amount difference. Then we can get all of these pairs, or only those that have a later (or earlier) in time (time of incoming transaction) than their out time (time of outgoing transaction), or similarly for the amounts.

Next up is actually counting those that have some spread: one in, several out, such that the sum of the outs is close to the in. Or vice versa.

In [298]:
in_and_out = transactions.merge(transactions, left_on='target', right_on='source')
in_and_out.head()

Unnamed: 0,tid_x,source_x,target_x,date_x,time_x,amount_x,currency_x,month_x,day_x,year_x,...,source_y,target_y,date_y,time_y,amount_y,currency_y,month_y,day_y,year_y,node_similarity_y
0,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,97ce7411-39fb-4da5-b990-463ea8425077,2017-05-24,2017-11-12 18:47:00,343.572126,,5,24,2017,0.941448
1,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,c615b3e2-794e-4920-8e79-327ba280b98a,2015-08-30,2017-11-12 19:25:43,4197.05762,LBP,8,30,2015,0.909533
2,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2bef971a-f3d1-495c-b302-18ba49afb6fe,2016-12-31,2017-11-12 18:58:48,1707.600629,,12,31,2016,0.636809
3,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,20828d8e-2442-49b2-88ba-dac4d9b85ae8,2016-12-31,2017-11-12 18:58:40,1707.600629,,12,31,2016,0.835557
4,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,a0ac0e7d-abe2-46be-8ea5-a226e1dafef5,2017-05-24,2017-11-12 15:47:42,343.572126,,5,24,2017,0.902075


In [299]:
in_and_out_f1 = in_and_out.copy()
in_and_out_f1['timediff'] = in_and_out['time_x'] - in_and_out['time_y']
in_and_out_f1['timediff'] = in_and_out_f1['timediff'].apply(lambda x: timedelta.total_seconds(x))
in_and_out_f1['timediff'] = in_and_out_f1['timediff'] + 24*3600*(in_and_out_f1['day_x']-in_and_out_f1['day_y']) + 30*24*3600*(in_and_out_f1['month_x']-in_and_out_f1['month_y']) + 365*30*24*3600*(in_and_out_f1['year_x']-in_and_out_f1['year_y'])
in_and_out_f1['abstimediff'] = abs(in_and_out_f1['timediff'])
in_and_out_f1.head()

Unnamed: 0,tid_x,source_x,target_x,date_x,time_x,amount_x,currency_x,month_x,day_x,year_x,...,date_y,time_y,amount_y,currency_y,month_y,day_y,year_y,node_similarity_y,timediff,abstimediff
0,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,2017-05-24,2017-11-12 18:47:00,343.572126,,5,24,2017,0.941448,-64764.0,64764.0
1,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,2015-08-30,2017-11-12 19:25:43,4197.05762,LBP,8,30,2015,0.909533,1883799000.0,1883799000.0
2,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,2016-12-31,2017-11-12 18:58:48,1707.600629,,12,31,2016,0.636809,927265700.0,927265700.0
3,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,2016-12-31,2017-11-12 18:58:40,1707.600629,,12,31,2016,0.835557,927265700.0,927265700.0
4,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,2017-05-24,2017-11-12 00:47:36,457.952172,ISK,5,24,2017,...,2017-05-24,2017-11-12 15:47:42,343.572126,,5,24,2017,0.902075,-54006.0,54006.0


The table "shady_deals_1" is concerned with pairs of one incoming and one outgoing payment of very similar values in a very short timeframe. It does not aggregate over payments, so it only deals with the most naive form of laundering: when you receive money and then send it away immediately. However, they level of confidence for these is very high.
We are focused on the "starting node" of the cycle, i.e. the node that first sends the money and receives almost the same amount later. In choosing which transaction to keep, priority is given to the closest transaction in time.

In [300]:
#Nonnegative timediff means this client first sent stuff and then received stuff.
in_and_out_f2 = in_and_out_f1[(in_and_out_f1.abstimediff < 7*24*3600) & (in_and_out_f1.timediff >= 0)]
in_and_out_f2 = in_and_out_f2[(abs(in_and_out_f2.amount_x  - in_and_out_f2.amount_y) < 0.1*in_and_out_f2.amount_x) & (in_and_out_f2.amount_x < in_and_out_f2.amount_y)]

In [301]:
shady_deals_1 = in_and_out_f2[['tid_x', 'tid_y', 'source_x', 'target_x', 'target_y','abstimediff','timediff','amount_x','currency_x','amount_y','currency_y','node_similarity_x', 'node_similarity_y']]
print(shady_deals_1.shape)
shady_deals_dropped = shady_deals_1.sort_values(by='abstimediff', ascending=True).drop_duplicates(subset=['tid_x'], keep='first')
temp1 = shady_deals_dropped[shady_deals_dropped.amount_x+shady_deals_dropped.amount_y > 40]
temp1['explanation'] = temp1.apply(lambda row: cycle_suspicion_desc(row['node_similarity_x'], row['node_similarity_y'], mean_similarity, std_similarity), axis=1)
temp1['suspicion_score'] = temp1.apply(lambda row: cycle_suspicion_score(row['node_similarity_x'], row['node_similarity_y'], mean_similarity, std_similarity), axis=1)
temp1

(558, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,tid_x,tid_y,source_x,target_x,target_y,abstimediff,timediff,amount_x,currency_x,amount_y,currency_y,node_similarity_x,node_similarity_y,explanation,suspicion_score
84170,b16abcc1-60b0-49ab-94ec-fac47db90f7d,927b74ee-8d11-4109-b207-93e34db789d1,47baeac5-b075-437f-9b4d-9fc0b2fc2e63,743189b4-6ad1-4c66-97ac-4fb4072cc1d0,2132a902-7930-405a-aca0-da0160c06b4e,0.0,0.0,937.457723,,940.451548,,0.575907,0.758713,Client has had suspicious incoming transaction...,1.153144
51872,93929256-352c-4416-ba62-30382690fe57,2cdeb125-e83a-4d25-89f1-83e610d45e8c,d06357e2-41f2-4313-b8d5-6d8fd88b536b,b7b4bf00-65e8-4219-826c-17a41a030491,388350c5-da53-4c53-bbca-c0c15c72e6bb,0.0,0.0,937.457723,,940.451548,,0.880004,0.878915,Client's suspicious behaviour is with parties ...,1.000000
22012,80891dc0-0643-4854-9ddf-2c3ab7802f41,ce7a673d-a2e7-4cd4-a366-23a302bc61d9,dec6c080-7317-48d5-89d3-64fd7ddef45e,62ac9d0d-d095-4419-ac17-3a14403b3788,2b6d52d9-7a20-483c-950b-176f2d79c7be,0.0,0.0,937.457723,TRY,940.451548,,0.880699,0.817876,Client's suspicious behaviour is with parties ...,1.000000
137446,6d154a46-1ad4-4d08-89cf-bfe6951beb32,75936993-1fbc-4c11-b81c-e6977ff1d410,f296f4aa-904d-4398-be80-bea88949a1fe,ea979a24-5db3-4631-ab13-dbfe9be6bbe7,70b77aa2-a5ca-450f-a445-c8a85f0fa989,0.0,0.0,371.643714,,372.561046,,0.777662,0.903934,Client's suspicious behaviour is with parties ...,1.000000
21895,dfcaef03-ad13-472f-9e60-6cb214dd42ae,0362be13-bf8b-42cc-906f-e263c187fd68,14d283a5-c6e6-4c25-9afb-ebb2e1ff2a7e,62ac9d0d-d095-4419-ac17-3a14403b3788,d84e8a14-d531-497c-87cb-ee985099e9f9,0.0,0.0,937.457723,,940.451548,,0.914913,0.572161,Client has had suspicious outgoing transaction...,1.156890
116645,d1c84909-57b6-4bd0-a164-ff487bc67119,7e2cbd1b-d7d9-4488-9e75-588e23b43e2c,2b71c8cb-8069-4b08-9da3-c7870f44dae7,511cd838-e9f0-4e5d-b785-62ba666d4175,6e91725e-699f-4957-9d44-a17d733ba070,0.0,0.0,937.457723,,940.451548,,0.729382,0.787902,Client's suspicious behaviour is with parties ...,1.000000
60360,26658100-3fc5-47f8-9fe6-e5b677acc0e6,e631badf-07ca-4449-b820-185c3cb26cc7,124e9c28-03a7-4bf5-81e5-13c21513ad51,baec1c93-520f-449e-8f0c-67fcb8071f86,51bcc2ca-e13c-4d30-a7e7-ec2972f9857a,0.0,0.0,371.643714,,372.561046,BWP,0.798404,0.867003,Client's suspicious behaviour is with parties ...,1.000000
97778,9353421c-6c99-448e-a498-5d9ae89d9a4a,077c6dd7-9f32-4b43-a618-e67c820767de,eb858845-e4c6-4a6b-bae8-4f305fec2072,7208b226-5803-4ae0-8318-6145c8abfb82,03dd228f-b6a9-49c3-861e-711770f11a35,0.0,0.0,937.457723,TZS,940.451548,,0.953848,0.717299,Client has had suspicious outgoing transaction...,1.011752
62589,0d2c5f49-dd88-4b8a-b750-907c92e79966,a38313cc-3bc9-455b-94bf-743e7eef0e4c,8e9807f6-0b84-494c-9f5b-1dbc30f35aa4,773d2745-7c0c-4ec0-95d1-180a384316da,1f82df7d-5d1c-457f-9c24-b0b56da86bc0,0.0,0.0,937.457723,,940.451548,,0.492494,0.583143,Client has had both incoming and outgoing tran...,1.382465
163591,493cd271-495a-4a22-99bf-b8175c918f32,be2bbbaa-f54f-4790-be1d-4816d3b5f40f,a3f80d53-6e21-491a-90b3-334704a6e66a,311be468-c6ad-4ad4-8c59-c87ac6be6047,099c538d-0b5b-4b8e-b688-eb68b3b91f53,0.0,0.0,371.643714,,372.561046,,0.908462,0.783812,Client's suspicious behaviour is with parties ...,1.000000


In [302]:
shady_deals_1.drop_duplicates(subset=['tid_y'])

Unnamed: 0,tid_x,tid_y,source_x,target_x,target_y,abstimediff,timediff,amount_x,currency_x,amount_y,currency_y,node_similarity_x,node_similarity_y
1200,15895b8a-fa35-4f52-81c7-d015846401f3,3a667343-bc3b-4038-86bb-66358b38db36,0d819a35-7a25-42c0-a6cc-691672ea2d34,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,19ce1dd2-f9e8-4a59-9261-7bbd4dba0398,28780.0,28780.0,371.643714,VEF,372.561046,,0.825665,0.485097
1201,15895b8a-fa35-4f52-81c7-d015846401f3,75b7f80f-19f6-4432-8bdc-e55b1e5bac96,0d819a35-7a25-42c0-a6cc-691672ea2d34,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,70b77aa2-a5ca-450f-a445-c8a85f0fa989,35972.0,35972.0,371.643714,VEF,372.561046,,0.825665,0.636572
1204,15895b8a-fa35-4f52-81c7-d015846401f3,cae19cad-5cc1-4810-a30d-208f54ce99ab,0d819a35-7a25-42c0-a6cc-691672ea2d34,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,706e90d5-966b-4de2-a0a6-dda58e03b24a,7204.0,7204.0,371.643714,VEF,372.561046,,0.825665,0.651858
1205,15895b8a-fa35-4f52-81c7-d015846401f3,2a0a27cb-9945-468b-aaa1-e4f1ee3b6ec9,0d819a35-7a25-42c0-a6cc-691672ea2d34,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,099c538d-0b5b-4b8e-b688-eb68b3b91f53,14396.0,14396.0,371.643714,VEF,372.561046,SRD,0.825665,0.671334
1206,15895b8a-fa35-4f52-81c7-d015846401f3,702f3830-cd1b-42d6-9ef8-a589d6a5c204,0d819a35-7a25-42c0-a6cc-691672ea2d34,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,51bcc2ca-e13c-4d30-a7e7-ec2972f9857a,21588.0,21588.0,371.643714,VEF,372.561046,,0.825665,0.732781
1302,ce4165cc-0f9b-41c0-820a-dfe2170ffc2a,24312015-d5c3-41c7-98bd-82e1c5fd88e4,f296f4aa-904d-4398-be80-bea88949a1fe,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,6c4eeaaf-b089-4e19-ad9e-ae1f61f1b010,7192.0,7192.0,371.643714,,372.561046,,0.647214,0.740379
1303,ce4165cc-0f9b-41c0-820a-dfe2170ffc2a,86ffb82b-ed22-49dd-90d1-eb5174e0168f,f296f4aa-904d-4398-be80-bea88949a1fe,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,f14cf002-0a02-49a8-bbff-ed3a777edfe0,0.0,0.0,371.643714,,372.561046,,0.647214,0.723565
1567,5ac6091b-ed10-4417-94f0-c175c9585d4e,17aa76b7-275b-4783-9c0a-2431657c61a4,a1a53c61-32e0-454f-9db0-70d03c0b869c,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,75aea411-042e-41c9-b84d-b59ec594ac1a,568729.0,568729.0,11.923067,GTQ,12.638264,XBD,0.797900,0.840710
9068,d8322d33-86fa-41f1-8d5d-67b77857ca53,7a914f84-3052-4e0a-b05d-706344202027,a7de8922-df18-40fe-b288-775a1f99f07c,ad794383-40e8-4010-b171-18e481382032,9028cab1-27a9-4a9b-bba5-39f6212589c0,10770.0,10770.0,2708.577340,,2711.964847,,0.453937,0.714626
9781,eedd870f-4f35-4082-898b-0bb669734c42,59328a0d-92bb-43a5-8925-5db6a693dde9,2b71c8cb-8069-4b08-9da3-c7870f44dae7,c81d20dd-8b65-43d1-bb07-6988fffa299f,d84e8a14-d531-497c-87cb-ee985099e9f9,28816.0,28816.0,937.457723,,940.451548,IRR,0.884513,0.727009


This time, we want aggregations: we want to first match nearby incoming and outgoing transactions with the target of the later one being the source of the earlier one, and then we want to see if there are people who have, within that small timeframe, had a considerable circulation of credit without much of that credit actually remaining in their accounts. Obviously, this is less accurate and less reliable than the previous one, but we will account for that by assigning a lower "cycle suspiciousness score" to these.

We will work by assigning to each of these guys two "week numbers" (their week and their previous week) so that stuff can be matched based on time and then be aggregated; then we can drop any duplicates.

For the scoring, we will consider the mean of the similarities and also that the incoming money should be less than the outgoing.

In [303]:
in_and_out_f3 = in_and_out_f1[(in_and_out_f1.abstimediff < 4*24*3600) & (in_and_out_f1.timediff >= 0)]
in_and_out_f3[['tid_x', 'tid_y', 'source_x', 'target_x', 'target_y','timediff','date_x','amount_x','currency_x','amount_y','currency_y','node_similarity_x','node_similarity_y']]

Unnamed: 0,tid_x,tid_y,source_x,target_x,target_y,timediff,date_x,amount_x,currency_x,amount_y,currency_y,node_similarity_x,node_similarity_y
18,0a6dfc9d-c66e-4546-a2d7-3e910e6478e3,9f957d49-c2b0-4f09-ae0d-0b28b794876f,a9c9e8db-c796-4d87-bdba-bbc9bf683d40,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,509e52c6-e76b-4a02-981e-c9e55ccdf2e4,24.0,2017-05-24,457.952172,ISK,343.572126,,0.956734,0.905544
56,6a63e069-35dd-42d4-ad7b-c032c4906f2c,9f957d49-c2b0-4f09-ae0d-0b28b794876f,999aeafd-bb36-47d5-9875-bb8271d002b2,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,509e52c6-e76b-4a02-981e-c9e55ccdf2e4,12.0,2017-05-24,457.952172,XUA,343.572126,,0.971825,0.905544
94,dbc7647e-a122-431b-a5f7-69cded41045e,9f957d49-c2b0-4f09-ae0d-0b28b794876f,feb37038-d91c-4747-bb01-b7defe0c8a67,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,509e52c6-e76b-4a02-981e-c9e55ccdf2e4,36.0,2017-05-24,457.952172,,343.572126,,0.961954,0.905544
155,fd1bddb0-998c-4a74-8e08-eeb6bf75c08a,a117e2db-c6d0-4ef3-968b-f1ed52484e4a,2255b1df-3568-4749-b377-77961314683b,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,20828d8e-2442-49b2-88ba-dac4d9b85ae8,4.0,2016-12-31,3984.306397,,1707.600629,,0.881344,0.835557
157,fd1bddb0-998c-4a74-8e08-eeb6bf75c08a,f196cc35-5979-43b9-a2a5-2da9a4fab778,2255b1df-3568-4749-b377-77961314683b,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,37870994-3280-4a85-91ae-023fcee687df,43200.0,2016-12-31,3984.306397,,1707.600629,XBD,0.881344,0.903406
161,fd1bddb0-998c-4a74-8e08-eeb6bf75c08a,b7ce6aed-cee8-4356-9075-76fd630593c4,2255b1df-3568-4749-b377-77961314683b,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,0b1caa1b-2f32-4f32-ae98-fa62a8dcf27d,43208.0,2016-12-31,3984.306397,,1707.600629,,0.881344,0.839674
166,fd1bddb0-998c-4a74-8e08-eeb6bf75c08a,d2271a25-be54-411b-b579-041a6ba1d511,2255b1df-3568-4749-b377-77961314683b,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,53c727ff-f142-44fb-8876-37ef4863fd3d,12.0,2016-12-31,3984.306397,,1707.600629,,0.881344,0.706153
172,fd1bddb0-998c-4a74-8e08-eeb6bf75c08a,2f6f9649-8566-448b-8e15-e3f9198a95fe,2255b1df-3568-4749-b377-77961314683b,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,d369e6fc-5dc2-4fa7-8838-07aa4aa5670d,43192.0,2016-12-31,3984.306397,,1707.600629,,0.881344,0.763207
208,1fa01a8c-8936-4819-ab1a-ec995cdb2ab7,9f957d49-c2b0-4f09-ae0d-0b28b794876f,ad964847-eecb-4e52-95fc-a3d05a6bc3f7,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,509e52c6-e76b-4a02-981e-c9e55ccdf2e4,0.0,2017-05-24,457.952172,,343.572126,,0.747456,0.905544
398,940d8c41-2e5c-48c4-9e49-dd86177b8b67,9f957d49-c2b0-4f09-ae0d-0b28b794876f,a3b9bdf1-da56-4564-92be-728cef460ccf,98fdf8fb-ec37-42ed-a890-d862ef93e7bb,509e52c6-e76b-4a02-981e-c9e55ccdf2e4,12.0,2017-05-24,457.952172,BYR,343.572126,,0.916381,0.905544


In [304]:
dates_joiner = in_and_out_f3[['date_x']].drop_duplicates().apply(lambda x: int(((x['date_x']-datetime(1900,1,1)).days-1)/7), axis=1)
dates_joiner = pd.DataFrame(pd.concat((dates_joiner, in_and_out_f3[['date_x']].drop_duplicates().apply(lambda x: int(((x['date_x']-datetime(1900,1,1)).days)/7), axis=1))))

In [305]:
in_and_out_f4 = pd.merge(in_and_out_f3, dates_joiner, left_index = True, right_index = True)
in_and_out_f4 = in_and_out_f4.rename(columns = {0:'week_number'})

In [306]:
agg_on_weeks = in_and_out_f4.groupby(['week_number', 'target_x']).agg({'amount_x':'sum','amount_y':'sum','node_similarity_x':'mean','node_similarity_y':'mean'}).reset_index()
final_agg = agg_on_weeks[(abs(agg_on_weeks.amount_x  - agg_on_weeks.amount_y) < 0.1*agg_on_weeks.amount_x)].drop_duplicates(subset=['target_x', 'amount_x', 'amount_y'])

In [307]:
final_agg = final_agg[final_agg.amount_x + final_agg.amount_y > 40]
final_agg.shape

(31, 6)

In [308]:
final_agg['suspicion_score'] = final_agg.apply(lambda x: cycle_suspicion_for_agg(x.amount_x, x.amount_y, x.node_similarity_x, x.node_similarity_y, mean_similarity, std_similarity), axis=1)
final_agg['explanation'] = final_agg.apply(lambda row: cycle_suspicion_desc(row['node_similarity_x'], row['node_similarity_y'], mean_similarity, std_similarity), axis=1)

In [309]:
all_suspects = pd.concat((temp1['target_x'].drop_duplicates(), final_agg['target_x'].drop_duplicates())).drop_duplicates()
len(all_suspects)

71

In [310]:
loopsjson = pd.read_json('loops.json', lines=True)
loopsjson.shape

(22, 133)

In [311]:
all_json_ids = []
for i in range(loopsjson.shape[0]):
    if loopsjson.loc[i,0] is not None:
        all_json_ids.append(loopsjson.loc[i,0]['from']['id'])
        #all_json_ids.append(loopsjson.loc[i,j]['to']['id'])
            
len(all_json_ids)

22

In [312]:
counter=0
for item in all_suspects:
    if item in all_json_ids:
        counter += 1
        
print(counter)

14


In [318]:
final_df = pd.concat((temp1[['target_x', 'node_similarity_x', 'node_similarity_y', 'amount_x', 'amount_y', 'explanation', 'suspicion_score']], final_agg[['target_x', 'node_similarity_x', 'node_similarity_y', 'amount_x', 'amount_y', 'explanation', 'suspicion_score']])).sort_values(by='suspicion_score', ascending=False).drop_duplicates(subset=['target_x'], keep='first')

In [320]:
final_df = final_df.sort_values(by='suspicion_score', ascending=False)
final_df

Unnamed: 0,target_x,node_similarity_x,node_similarity_y,amount_x,amount_y,explanation,suspicion_score
62589,773d2745-7c0c-4ec0-95d1-180a384316da,0.492494,0.583143,937.457723,940.451548,Client has had both incoming and outgoing tran...,1.382465
195582,b2dadf07-d8bd-4aa6-b315-93f04c343d13,0.788934,0.416182,30265.157604,30460.319685,Client has had suspicious outgoing transaction...,1.312868
1220,9e4c7d85-e510-4be8-9b6c-3442e2c9dadf,0.679330,0.485097,371.643714,372.561046,Client has had both incoming and outgoing tran...,1.293675
9068,ad794383-40e8-4010-b171-18e481382032,0.453937,0.714626,2708.577340,2711.964847,Client has had both incoming and outgoing tran...,1.289538
119471,459a3be7-def8-451b-bf7f-a9e11056c2a5,0.537664,0.767979,371.643714,372.561046,Client has had suspicious incoming transaction...,1.191387
217461,1c323eb5-f70b-4b8c-aa94-7c71dbaceaa4,0.713849,0.582402,53548.837346,54152.905661,Client has had both incoming and outgoing tran...,1.161850
89762,c5932de4-b8d1-4f32-a3de-9b5c71b01c6e,0.569672,0.907460,937.457723,940.451548,Client has had suspicious incoming transaction...,1.159379
21895,62ac9d0d-d095-4419-ac17-3a14403b3788,0.914913,0.572161,937.457723,940.451548,Client has had suspicious outgoing transaction...,1.156890
84170,743189b4-6ad1-4c66-97ac-4fb4072cc1d0,0.575907,0.758713,937.457723,940.451548,Client has had suspicious incoming transaction...,1.153144
230210,8b429945-43c9-4004-a86a-232a7c509e49,0.639389,0.671561,26801.730804,26827.090601,Client has had both incoming and outgoing tran...,1.147151


In [281]:
json_result = final_df[['target_x','explanation','suspicion_score']].rename(columns={'target_x':'id', 'suspicion_score':'score', 'explanation':'message'}).to_json(orient='records')
json.dump(json_result, open('ranked_list.json', mode='w'))