In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Combining Wallet Cheating Probabilities from Clustering and MIDAS

In [6]:
# Importing clustering results
# avgMonthPay is in the unit of Storj token
# LSCPprob is the probability of cheating (the closer to 1 the more suspicious)
clustering_results = pd.read_csv('data/LSCPoutputFinal.csv', index_col=0)
clustering_results

Unnamed: 0,wallet,LSCP_score,avgMonthPay,prob
0,0x000000000c7c9cbb8485c5e38c6c0da6a1017c1f,0.931822,20.254692,0.824286
1,0x000000043dc3052d771845a71efc05b67f40abb4,-0.578138,10.161788,0.281585
2,0x00083c3ae5730507fa39a90220df704c09cf124d,0.040648,10.152021,0.516212
3,0x000e4b8d275cbe6af5c01a73bab484282317d8bb,1.794154,10.208978,0.963606
4,0x00101a0e98ecea02681a5447c3cab0c787d109f6,0.032456,1.091716,0.512946
...,...,...,...,...
5305,0xffa41d204d22962c199cf75bc08d01497b4467bb,0.682992,19.353835,0.752694
5306,0xffbfc143924950bbd4ac6b6b4879001b6252dc40,0.338919,5.391815,0.632665
5307,0xffcfa143fbbc303790b6e5458cc3e3b570cf5343,-0.018348,3.504446,0.492681
5308,0xffe336267854a7133a99a26910564439e5d57610,0.567333,5.269874,0.714756


In [8]:
# Importing midas results
# Values are the probability of cheating (the closer to 1 the more suspicious)
# max is the highest probability of transactions involving that wallet
# mean is the average probability of transactions involving that wallet
midas_results = pd.read_csv('data/midas_wallet_anomalousprobs.csv')
midas_results

Unnamed: 0,max,mean,wallet
0,6.582532e-06,4.870944e-08,0xf3d1f26b31db83c8899626746d0bbc59d9dd223c
1,2.661345e-10,1.330672e-10,0x4fc6044e44978cf9ea78eb43c2832416996c63ad
2,1.873271e-07,9.366356e-08,0xb8dcf450328f64182a2b3efbece2d072f558230a
3,0.000000e+00,0.000000e+00,0xaa92358737c1a59caee1a5114e1d1ef4c99498e2
4,3.349674e-04,4.762558e-05,0xe0a97c2634c0b1d2530b2f92e46697460692ca0c
...,...,...,...
32298,0.000000e+00,0.000000e+00,0xca265359f2e7466b755bbb6fccc63a07b05e656b
32299,0.000000e+00,0.000000e+00,0xbc7dcf868a5f465d8e0f37ae52ef8cd8b569d178
32300,0.000000e+00,0.000000e+00,0xd9a3b4cea7813cae12cf1c9e83829e631ef87831
32301,0.000000e+00,0.000000e+00,0x84ff6064e303ddde2abec41dc1f3d5ce700564e7


In [9]:
# This merging will filter midas results to only wallets that have received any payment in the last 6 months (up to Apr 2021)
wallet_probs = clustering_results.merge(midas_results, how='left', on='wallet')
wallet_probs

Unnamed: 0,wallet,LSCP_score,avgMonthPay,prob,max,mean
0,0x000000000c7c9cbb8485c5e38c6c0da6a1017c1f,0.931822,20.254692,0.824286,1.000000,0.547652
1,0x000000043dc3052d771845a71efc05b67f40abb4,-0.578138,10.161788,0.281585,1.000000,0.421226
2,0x00083c3ae5730507fa39a90220df704c09cf124d,0.040648,10.152021,0.516212,0.999999,0.545074
3,0x000e4b8d275cbe6af5c01a73bab484282317d8bb,1.794154,10.208978,0.963606,1.000000,0.646507
4,0x00101a0e98ecea02681a5447c3cab0c787d109f6,0.032456,1.091716,0.512946,0.999998,0.461926
...,...,...,...,...,...,...
5305,0xffa41d204d22962c199cf75bc08d01497b4467bb,0.682992,19.353835,0.752694,1.000000,0.713088
5306,0xffbfc143924950bbd4ac6b6b4879001b6252dc40,0.338919,5.391815,0.632665,1.000000,0.685635
5307,0xffcfa143fbbc303790b6e5458cc3e3b570cf5343,-0.018348,3.504446,0.492681,1.000000,0.615309
5308,0xffe336267854a7133a99a26910564439e5d57610,0.567333,5.269874,0.714756,1.000000,0.580501


In [10]:
# Check if there is any duplicated wallets in combined_results
wallet_probs.duplicated(keep=False).sum()

0

In [15]:
# Combining probability results and weighting the combined results based on the wallets' avgMonthPay
wallet_probs['payWeight'] = wallet_probs.apply(lambda row: 1/(1+np.exp(4-0.8*row['avgMonthPay'])) - (row['avgMonthPay']==0)*(1/(1+np.exp(4))) if row['avgMonthPay']<10 else 1, axis=1)
wallet_probs['combined_prob'] = wallet_probs.apply(lambda row: row['payWeight'] * (0.7*row['prob'] + 0.3*row['mean']), axis=1)
wallet_probs = wallet_probs.sort_values('combined_prob', ascending=False).reset_index(drop=True)
wallet_probs = wallet_probs.rename({'avgMonthPay': 'avg_month_pay','prob': 'LSCP_prob', 'max': 'MIDAS_max_prob', 'mean': 'MIDAS_mean_prob'}, axis=1)
wallet_probs

Unnamed: 0,wallet,LSCP_score,avg_month_pay,LSCP_prob,MIDAS_max_prob,MIDAS_mean_prob,payWeight,combined_prob
0,0xf16c1659ae0f623d0fed9c70d656010ee19e4cfb,3.981664,26.996233,0.999966,9.999998e-01,9.400243e-01,1.000000,0.981983
1,0xd4dc78f61b91e342ebb2b4ccbd96830b0387a674,2.377910,318.925693,0.991294,9.999999e-01,9.356822e-01,1.000000,0.974611
2,0xb3edb2b352396ecd5c2f30065ca8da7d9d45fc1f,2.278720,15.390177,0.988658,9.999998e-01,9.413098e-01,1.000000,0.974454
3,0xdbfd53f1b9b7d21926a96061624a597da0015295,2.611965,14.042047,0.995499,9.999998e-01,9.155026e-01,1.000000,0.971500
4,0xf9cd0615f1900f0ae661c6f8675d5289fc5b39d6,2.311699,94.454830,0.989603,9.999998e-01,9.276778e-01,1.000000,0.971025
...,...,...,...,...,...,...,...,...
5305,0x123ba8077274650a09c3cd2e5dfb1528ee76b7d4,-1.534603,0.063618,0.062441,8.021942e-07,2.673981e-07,0.018908,0.000826
5306,0x2a06cfcb392708612c593282ffbcff102563fdf9,-1.517442,0.000172,0.064578,0.000000e+00,0.000000e+00,0.017989,0.000813
5307,0x32daaefec55607c05678f998fa29152b7a85a7ef,-1.587363,0.012682,0.056215,0.000000e+00,0.000000e+00,0.018166,0.000715
5308,0x2658307f042be881bc24dc344bb6f4dca66326fe,-1.628406,0.028089,0.051719,0.000000e+00,0.000000e+00,0.018387,0.000666


In [16]:
wallet_probs[wallet_probs['combined_prob']>0.95]

Unnamed: 0,wallet,LSCP_score,avg_month_pay,LSCP_prob,MIDAS_max_prob,MIDAS_mean_prob,payWeight,combined_prob
0,0xf16c1659ae0f623d0fed9c70d656010ee19e4cfb,3.981664,26.996233,0.999966,1.0,0.940024,1.0,0.981983
1,0xd4dc78f61b91e342ebb2b4ccbd96830b0387a674,2.37791,318.925693,0.991294,1.0,0.935682,1.0,0.974611
2,0xb3edb2b352396ecd5c2f30065ca8da7d9d45fc1f,2.27872,15.390177,0.988658,1.0,0.94131,1.0,0.974454
3,0xdbfd53f1b9b7d21926a96061624a597da0015295,2.611965,14.042047,0.995499,1.0,0.915503,1.0,0.9715
4,0xf9cd0615f1900f0ae661c6f8675d5289fc5b39d6,2.311699,94.45483,0.989603,1.0,0.927678,1.0,0.971025
5,0xbd44e954f85d2c2188239d903c723697cf3bd37e,2.308915,45.357305,0.989526,1.0,0.926341,1.0,0.97057
6,0x8cbb890bbcdc4c0788c2017cce1eebb7b2c449c2,2.614006,506.575121,0.995526,1.0,0.909737,1.0,0.969789
7,0xb887239b672cf7c8c5c39c01eb061a9de691f70c,2.405947,27.708232,0.991935,1.0,0.917792,1.0,0.969692
8,0x650efbd5cf60eff44cad1999abdf3aabf7fef77d,2.708456,101.513118,0.99662,1.0,0.888693,1.0,0.964242
9,0x3c0da17b90323897e54d5732a72ae5f348ed7e99,2.207535,21.862317,0.986362,1.0,0.910752,1.0,0.963679


In [18]:
# (Optional) Export wallet probabilities to csv
# wallet_probs.to_csv('data/wallet_combinedProb.csv', columns=['wallet', 'avg_month_pay', 'LSCP_prob', 'MIDAS_max_prob', 'MIDAS_mean_prob', 'combined_prob'], index=False)

## Translating Wallets to Node Operators

In [19]:
apNodes = pd.read_csv('data/ap_nodes_a.csv', usecols=['id', 'created_at', 'wallet', 'email_hash'])
euNodes = pd.read_csv('data/eu_nodes_a.csv', usecols=['id', 'created_at', 'wallet', 'email_hash'])
usNodes = pd.read_csv('data/us_nodes_a.csv', usecols=['id', 'created_at', 'wallet', 'email_hash'])
allNodes = apNodes.append(euNodes).append(usNodes)
allNodes['wallet'] = allNodes['wallet'].astype(str).str.lower()
allNodes['id'] = allNodes['id'].astype(str).str.lower()
allNodes['email_hash'] = allNodes['email_hash'].astype(str).str.lower()
allNodes['created_at'] = pd.to_datetime(allNodes['created_at'])

In [20]:
len(allNodes)

111417

In [21]:
len(allNodes.drop_duplicates())

46661

In [22]:
# There are duplicated node IDs with multiple created_at values
allNodes.groupby('id').agg({'created_at': 'nunique'}).sort_values('created_at', ascending=False)

Unnamed: 0_level_0,created_at
id,Unnamed: 1_level_1
8e2422c64a04d197bb2045df3fc738f4880ef494fbb90be02fbafa1900000000,3
acde9d993f49609db2386b43096d3c23d4d1a5956405d031bd0d17c880000000,3
d1690f12ae3ea8c97f7e4257f66c2a715926ebe8fe246fcbc50ce05000000000,3
39801cf1d6984b3df51e4a1698daa066a208c557d0a04e774a186bad80000000,3
d16ce1110aee9de9dd527824856d69a87752235bd96c40932c93850140000000,3
...,...
5e86fdea798b7264ede7f7a5af164ef48c52aeb083e35e7b9442a88000000000,1
5e8543ea7fc31062dbee5dc5c39d836b9307aa39820ad6e3de036a5000000000,1
5e84dd880773bf6ce740159399d11de4aa8797795a38c1a6205d293000000000,1
5e82b22d457b1194eb056f9a159e6cb0f76b6a0c11581e4aedc3b5b000000000,1


In [23]:
# For this task, we are ignoring the duplication issue
allNodes = allNodes.drop_duplicates(subset=['id', 'wallet', 'email_hash'])
len(allNodes)

37277

### Nodes Having Multiple Wallets

There are node IDs that have multiple wallets. The maximum number of wallets used by a node ID is 2. We will assign the cheating probability of a node ID as the maximum cheating probability of the wallets that are associated to that node ID.

In [24]:
# Checking for node IDs that have multiple wallets
node_agg = allNodes.groupby('id').agg({'wallet': 'nunique'}).sort_values('wallet', ascending=False).reset_index().rename({'wallet': 'wallet_count'})
node_agg

Unnamed: 0,id,wallet
0,ace4e4644a0af8c49ba45b0b3bc23d91b208d44dbce55f...,2
1,f8bc81398b8e7a3a8cbf70bd38de6c87da4bfd29fd96bd...,2
2,524a841c8321108650bda2aa7d968906c84852d957a3a0...,2
3,1c77791adc39ed11c82332691328ed657cc6a767ab765a...,2
4,2797dac28ee81e166e8e4494859071877083b08a622139...,2
...,...,...
37245,554f02f9ba69ed248cd6e6e777f847748fd93edb4849fd...,1
37246,554f9187576595875338070f8488aef93a50da5a2b3643...,1
37247,55517281c98ddd0d2b0d01fc2186cfde952631ef3f75dc...,1
37248,5552365b9673e57f34ef455128b022078093c9c10bd520...,1


In [25]:
# A node's cheating probability is taken as the maximum cheating probability from the wallets that are associated to that node
node_probs = allNodes[['id', 'wallet']].merge(wallet_probs[['wallet', 'combined_prob']], how='left', on='wallet').fillna(0)
node_probs = node_probs.groupby('id').agg({'combined_prob': 'max'}).sort_values('combined_prob', ascending=False).reset_index()
node_probs

Unnamed: 0,id,combined_prob
0,84c9d498a3759a23eb237d70372148ba01fd5c1d758afb...,0.981983
1,789e8862a7414125f220bab72ffad3cf83f93701668c8f...,0.981983
2,477268212dd0031948591b8c970321a1c9807c2f164d5b...,0.981983
3,dd37c97114e40ce45623347b69de8cb108af44aa4d2e0a...,0.981983
4,476afc843485e97a7de5ee133ded17ac9e01cc0f47b58a...,0.981983
...,...,...
37245,72c77c90adbce066a073eb20a20d05e55c5187542c2b4f...,0.000000
37246,72b96faa0577359cddc2d8ef0aff03b22909a3454378f8...,0.000000
37247,72b780609ab53e21818fea008a26f59a54665ae025f2b6...,0.000000
37248,72b5dde5cf1415572f0efd538c5a20feb05b6561583578...,0.000000


In [26]:
# (Optional) Export node probabilities to csv
# node_probs.to_csv('data/node_combinedProb.csv', index=False)

In [27]:
# Ordering the nodes by the largest number of associated wallets
node_agg.merge(node_probs, how='left', on='id')

Unnamed: 0,id,wallet,combined_prob
0,ace4e4644a0af8c49ba45b0b3bc23d91b208d44dbce55f...,2,0.000000
1,f8bc81398b8e7a3a8cbf70bd38de6c87da4bfd29fd96bd...,2,0.000000
2,524a841c8321108650bda2aa7d968906c84852d957a3a0...,2,0.750000
3,1c77791adc39ed11c82332691328ed657cc6a767ab765a...,2,0.000000
4,2797dac28ee81e166e8e4494859071877083b08a622139...,2,0.028705
...,...,...,...
37245,554f02f9ba69ed248cd6e6e777f847748fd93edb4849fd...,1,0.000000
37246,554f9187576595875338070f8488aef93a50da5a2b3643...,1,0.010396
37247,55517281c98ddd0d2b0d01fc2186cfde952631ef3f75dc...,1,0.000000
37248,5552365b9673e57f34ef455128b022078093c9c10bd520...,1,0.041497


From the above output, having multiple wallets do not seem to be strongly associated with high cheating probability.

### Wallets Used by Multiple Nodes

On the other hand, there are wallets that used by multiple node IDs. We will check the cheating probabilities of such wallets, ordered from the largest number of associated node IDs.

In [28]:
# Checking for wallets that are used by multiple node IDs
wallet_agg = allNodes.groupby('wallet').agg({'id': 'nunique'}).sort_values('id', ascending=False).reset_index().rename({'id': 'node_count'})
wallet_agg

Unnamed: 0,wallet,id
0,0x37c611913ac8a0f3004e3180606720bf186588d6,1390
1,0x74756b75fc77896a88ba3544572ce1595d78d7d9,265
2,0x9b354bdc877839d290c6d2a729a9a9c2e875a4a6,222
3,0xd4dc78f61b91e342ebb2b4ccbd96830b0387a674,203
4,0x6ecd8f156488ca3591ef01e34daff6c360274069,196
...,...,...
16524,0x671fd14199c751ebb0916c7f983de8f6cb8e86d9,1
16525,0x6723035c3d8472f43c4012aa2a9b7f01c9c44999,1
16526,0x672380b39ade17a286128b9ffbbaef9cb9ea234e,1
16527,0x6723a33fd0add3c925b072c94b751c166e91a7b5,1


In [29]:
wallet_agg.merge(wallet_probs[['wallet', 'combined_prob']], how='left', on='wallet')

Unnamed: 0,wallet,id,combined_prob
0,0x37c611913ac8a0f3004e3180606720bf186588d6,1390,0.962678
1,0x74756b75fc77896a88ba3544572ce1595d78d7d9,265,
2,0x9b354bdc877839d290c6d2a729a9a9c2e875a4a6,222,0.897160
3,0xd4dc78f61b91e342ebb2b4ccbd96830b0387a674,203,0.974611
4,0x6ecd8f156488ca3591ef01e34daff6c360274069,196,
...,...,...,...
16524,0x671fd14199c751ebb0916c7f983de8f6cb8e86d9,1,
16525,0x6723035c3d8472f43c4012aa2a9b7f01c9c44999,1,0.187297
16526,0x672380b39ade17a286128b9ffbbaef9cb9ea234e,1,
16527,0x6723a33fd0add3c925b072c94b751c166e91a7b5,1,


While some of the top wallets have NaN values as their cheating probabilities (meaning they were excluded from the clustering approach because they did not receive any payment in the last six months up to April 2021), some of the other top wallets have high cheating probabilities. In fact, the node IDs with cheating probabilities above 0.95 seem to be closely associated with such multi-account wallets, as shown below.

In [30]:
# Node IDs with cheating probabilities > 0.95
node_probs[node_probs['combined_prob']>0.95]

Unnamed: 0,id,combined_prob
0,84c9d498a3759a23eb237d70372148ba01fd5c1d758afb...,0.981983
1,789e8862a7414125f220bab72ffad3cf83f93701668c8f...,0.981983
2,477268212dd0031948591b8c970321a1c9807c2f164d5b...,0.981983
3,dd37c97114e40ce45623347b69de8cb108af44aa4d2e0a...,0.981983
4,476afc843485e97a7de5ee133ded17ac9e01cc0f47b58a...,0.981983
...,...,...
2695,2435f720db5f5c4509a028d029112fdc1cff2c9c88cfb6...,0.951487
2696,2a63b9ba91f796cd6569a5d75ca8985651da060e9fdeb9...,0.951487
2697,b048bfaa8f0d5950cef84e82cb3ca27cb9271d2ae4af28...,0.951487
2698,86dcde2b6bfd2c9ec3eb81b15339b74f1e0066255c9890...,0.951487


In [31]:
# Counts of nodes with the same cheating probabilities (> 0.95)
node_probs[node_probs['combined_prob']>0.95]['combined_prob'].value_counts()

0.962678    1390
0.974611     203
0.981983     193
0.969789     180
0.969692     134
0.974454      79
0.956579      72
0.952820      67
0.970570      61
0.952733      53
0.961475      52
0.971500      49
0.951487      49
0.964242      43
0.963679      30
0.971025      27
0.953393      18
Name: combined_prob, dtype: int64