In [None]:
# Anomaly (fraud) detection on Ethereum accounts, using the Kaggle dataset: https://www.kaggle.com/datasets/vagifa/ethereum-frauddetection-dataset/data

In [1]:
# Set TensorFlow's native log-level filter before TensorFlow is imported in the next cell
# (level 3 is presumably meant to silence its C++ log output — TODO confirm).
%env TF_CPP_MIN_LOG_LEVEL=3

env: TF_CPP_MIN_LOG_LEVEL=3


In [2]:
import warnings

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed, to_categorical
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm that a blanket filter is really intended.
warnings.filterwarnings('ignore')

In [13]:
# Load the raw dataset from a path relative to the notebook's working directory.
data = pd.read_csv("./data/dataset.csv")

In [14]:
# Number of rows and columns in the raw dataset.
data.shape

(9841, 51)

In [15]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,0,1,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,...,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,1,2,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,...,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,2,3,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON
3,3,4,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,...,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,4,5,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,...,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS


In [16]:
# Class balance of the target: share of rows per FLAG value.
# `normalize=True` returns the proportions directly, so the counts are computed
# once instead of being divided by a second, hand-summed value_counts() call.
data["FLAG"].value_counts(normalize=True)

FLAG
0    0.778579
1    0.221421
Name: count, dtype: float64

In [17]:
# Column dtypes and non-null counts of the raw dataset.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 51 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            9841 non-null   int64  
 1   Index                                                 9841 non-null   int64  
 2   Address                                               9841 non-null   object 
 3   FLAG                                                  9841 non-null   int64  
 4   Avg min between sent tnx                              9841 non-null   float64
 5   Avg min between received tnx                          9841 non-null   float64
 6   Time Diff between first and last (Mins)               9841 non-null   float64
 7   Sent tnx                                              9841 non-null   int64  
 8   Received Tnx                                          9841

In [18]:
## data cleaning

# List the raw column names; several carry leading/trailing spaces and mixed case.
data.columns

Index(['Unnamed: 0', 'Index', 'Address', 'FLAG', 'Avg min between sent tnx',
       'Avg min between received tnx',
       'Time Diff between first and last (Mins)', 'Sent tnx', 'Received Tnx',
       'Number of Created Contracts', 'Unique Received From Addresses',
       'Unique Sent To Addresses', 'min value received', 'max value received ',
       'avg val received', 'min val sent', 'max val sent', 'avg val sent',
       'min value sent to contract', 'max val sent to contract',
       'avg value sent to contract',
       'total transactions (including tnx to create contract',
       'total Ether sent', 'total ether received',
       'total ether sent contracts', 'total ether balance',
       ' Total ERC20 tnxs', ' ERC20 total Ether received',
       ' ERC20 total ether sent', ' ERC20 total Ether sent contract',
       ' ERC20 uniq sent addr', ' ERC20 uniq rec addr',
       ' ERC20 uniq sent addr.1', ' ERC20 uniq rec contract addr',
       ' ERC20 avg time between sent tnx', ' ERC20 

In [19]:

# Normalise the column names: lower-case them and trim surrounding whitespace.
data.columns = data.columns.str.lower().str.strip()

# Columns that are excluded from the model inputs.
cols_to_drop = [
    "erc20 most sent token type",
    "erc20_most_rec_token_type",
    "address",
    "index",
    "unnamed: 0",
]

# Every remaining column except the target ("flag") becomes a candidate feature.
features = [name for name in data.columns if name not in ("flag", *cols_to_drop)]

In [20]:
# Inspect the candidate feature names.
features

['avg min between sent tnx',
 'avg min between received tnx',
 'time diff between first and last (mins)',
 'sent tnx',
 'received tnx',
 'number of created contracts',
 'unique received from addresses',
 'unique sent to addresses',
 'min value received',
 'max value received',
 'avg val received',
 'min val sent',
 'max val sent',
 'avg val sent',
 'min value sent to contract',
 'max val sent to contract',
 'avg value sent to contract',
 'total transactions (including tnx to create contract',
 'total ether sent',
 'total ether received',
 'total ether sent contracts',
 'total ether balance',
 'total erc20 tnxs',
 'erc20 total ether received',
 'erc20 total ether sent',
 'erc20 total ether sent contract',
 'erc20 uniq sent addr',
 'erc20 uniq rec addr',
 'erc20 uniq sent addr.1',
 'erc20 uniq rec contract addr',
 'erc20 avg time between sent tnx',
 'erc20 avg time between rec tnx',
 'erc20 avg time between rec 2 tnx',
 'erc20 avg time between contract tnx',
 'erc20 min val rec',
 'erc20

In [22]:
# Count the distinct values of each candidate feature.
nunique_values = data[features].nunique()
nunique_values

avg min between sent tnx                                5013
avg min between received tnx                            6223
time diff between first and last (mins)                 7810
sent tnx                                                 641
received tnx                                             727
number of created contracts                               20
unique received from addresses                           256
unique sent to addresses                                 258
min value received                                      4589
max value received                                      6302
avg val received                                        6767
min val sent                                            4719
max val sent                                            6647
avg val sent                                            5854
min value sent to contract                                 3
max val sent to contract                                   4
avg value sent to contra

In [23]:
# Drop constant features: keep only the columns that have more than one
# distinct value across the dataset.
varying_features = nunique_values.index[nunique_values > 1]
features = [name for name in features if name in varying_features]

In [24]:
# Inspect the feature names that remain after removing constant columns.
features

['avg min between sent tnx',
 'avg min between received tnx',
 'time diff between first and last (mins)',
 'sent tnx',
 'received tnx',
 'number of created contracts',
 'unique received from addresses',
 'unique sent to addresses',
 'min value received',
 'max value received',
 'avg val received',
 'min val sent',
 'max val sent',
 'avg val sent',
 'min value sent to contract',
 'max val sent to contract',
 'avg value sent to contract',
 'total transactions (including tnx to create contract',
 'total ether sent',
 'total ether received',
 'total ether sent contracts',
 'total ether balance',
 'total erc20 tnxs',
 'erc20 total ether received',
 'erc20 total ether sent',
 'erc20 total ether sent contract',
 'erc20 uniq sent addr',
 'erc20 uniq rec addr',
 'erc20 uniq sent addr.1',
 'erc20 uniq rec contract addr',
 'erc20 min val rec',
 'erc20 max val rec',
 'erc20 avg val rec',
 'erc20 min val sent',
 'erc20 max val sent',
 'erc20 avg val sent',
 'erc20 uniq sent token name',
 'erc20 uni

In [25]:
# Dtypes and non-null counts of the selected feature columns.
data.loc[:, features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 38 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   avg min between sent tnx                              9841 non-null   float64
 1   avg min between received tnx                          9841 non-null   float64
 2   time diff between first and last (mins)               9841 non-null   float64
 3   sent tnx                                              9841 non-null   int64  
 4   received tnx                                          9841 non-null   int64  
 5   number of created contracts                           9841 non-null   int64  
 6   unique received from addresses                        9841 non-null   int64  
 7   unique sent to addresses                              9841 non-null   int64  
 8   min value received                                    9841

In [31]:
class PipeSteps(BaseEstimator, TransformerMixin):
    """Base class for the custom pipeline steps defined in this notebook.

    Stores the column names a step works on and provides pass-through ``fit``
    and ``transform`` implementations that subclasses override as needed.
    """

    # NOTE(review): the default list is one object shared by every instance
    # created without ``columns``; nothing in the code shown here mutates it.
    def __init__(self, columns=[]):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return the step itself so calls can be chained."""
        return self

    def transform(self, X):
        """Return a copy of ``X``, leaving the input untouched."""
        return X.copy()
    
class SelectColumns(PipeSteps):
    """Pipeline step that keeps only the configured columns."""

    def transform(self, X):
        """Return a new frame holding just ``self.columns`` of ``X``."""
        selected = X.loc[:, self.columns]
        return selected.copy()
    
class FillMissingData(PipeSteps):
    """Pipeline step that imputes missing values with per-column means."""

    def fit(self, X, y=None):
        """Record the mean of every configured column of ``X`` in ``self.means``."""
        self.means = {}
        for col in self.columns:
            self.means[col] = X.loc[:, col].mean()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced by the fitted means."""
        filled = X.copy()
        for col in self.columns:
            filled[col] = filled[col].fillna(self.means[col])
        return filled
    
class StandardizesData(PipeSteps):
    """Pipeline step that standardises the configured columns.

    ``fit`` trains a ``StandardScaler`` on ``self.columns``; ``transform``
    returns a copy of the frame with those columns replaced by their scaled
    values.
    """

    def fit(self, X, y=None):
        """Fit the scaler on the configured columns of ``X`` and return the step."""
        self.scaler = StandardScaler()
        self.scaler.fit(X.loc[:, self.columns])

        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns are standardised."""
        X = X.copy()
        scaled = self.scaler.transform(X.loc[:, self.columns])

        # Replace the columns rather than writing into them through
        # ``X.loc[:, cols] = ...``: several features are int64, and setting
        # float values into an integer column in place depends on a silent
        # upcast that pandas has deprecated.
        X[list(self.columns)] = scaled

        return X
    
class GetData(PipeSteps):
    """Final pipeline step that hands back the underlying array of the frame."""

    def transform(self, X):
        """Return the ``.values`` array of a copy of ``X``."""
        return X.copy().values

In [32]:
# Preprocessing chain: select features -> impute means -> standardise -> array.
preprocessing_pipe = Pipeline(
    steps=[
        ("feature_selection", SelectColumns(columns=features)),
        ("fill_missing", FillMissingData(columns=features)),
        ("standard_scaling", StandardizesData(columns=features)),
        ("returnValues", GetData()),
    ]
)

In [34]:
# Feature frame and one-hot encoded target built from the "flag" column.
X = data[features]
y = to_categorical(data["flag"].to_numpy())

In [36]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [37]:
# Fit the preprocessing steps on the training split only, then apply the
# already-fitted steps to the test split so its statistics never influence
# the imputation means or the scaler.
X_train_preprocessed = preprocessing_pipe.fit_transform(X_train)
X_test_preprocessed  = preprocessing_pipe.transform(X_test)

In [38]:
# Inspect the first preprocessed training row.
X_train_preprocessed[0]

array([-0.23259911, -0.35751318, -0.67960935, -0.15850238, -0.17742011,
       -0.02949086, -0.10752043, -0.09576188, -0.12799126, -0.04205048,
       -0.03300198, -0.04812766, -0.0635131 , -0.19730908,  0.        ,
       -0.01204994, -0.01204994, -0.21338291, -0.03013806, -0.03642245,
       -0.01204994, -0.01933439, -0.08228942, -0.06411348, -0.01425745,
       -0.02258837, -0.05367989, -0.09036273, -0.05319145, -0.22783703,
       -0.0278394 , -0.05511047, -0.0240207 , -0.01364824, -0.0135741 ,
       -0.01331307, -0.21172682, -0.2313969 ])

In [40]:
## modeling

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable when the notebook is re-run from a fresh kernel.
set_random_seed(42)

### simple sequential model
# Fully connected classifier: one input per feature, three ReLU hidden layers
# (len(features), 20 and 5 units) and a 2-unit softmax output, one unit per
# column of the one-hot encoded target.
model = Sequential([
    Input(shape=(len(features),)),
    Dense(len(features), activation="relu"),
    Dense(20, activation="relu"),
    Dense(5, activation="relu"),
    Dense(2, activation="softmax"),
])

In [41]:
# Configure training: Adam optimiser, categorical cross-entropy loss (the
# target is one-hot encoded) and accuracy as the reported metric.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [42]:
# Print the layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 38)                1482      
                                                                 
 dense_1 (Dense)             (None, 20)                780       
                                                                 
 dense_2 (Dense)             (None, 5)                 105       
                                                                 
 dense_3 (Dense)             (None, 2)                 12        
                                                                 
Total params: 2379 (9.29 KB)
Trainable params: 2379 (9.29 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
%%time
# Train for 10 epochs and keep the returned History object so the per-epoch
# metrics can be inspected afterwards; assigning it also stops the cell from
# echoing the object's repr as its output.
# NOTE(review): the test split doubles as validation data here. It is only
# monitored, but a separate validation split would keep the test set untouched.
history = model.fit(
    X_train_preprocessed,
    y_train,
    validation_data=(X_test_preprocessed, y_test),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 26 s, sys: 2.81 s, total: 28.8 s
Wall time: 16.2 s


<keras.src.callbacks.History at 0x7ff1a4782f70>

In [48]:
# Predicted class per test row: the index of the largest output probability.
predictions = list(np.argmax(model.predict(X_test_preprocessed), axis=1))



In [49]:
# Inspect the one-hot encoded test labels.
y_test

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [50]:
# Accuracy on the test split. scikit-learn metrics expect (y_true, y_pred) in
# that order, so the true labels (one-hot rows decoded back to class indices)
# go first and the model's predictions second.
acc = metrics.accuracy_score(np.argmax(y_test, axis=1), predictions)
acc

0.9576701659329495

In [52]:
# ROC AUC on the test split, scored with the predicted probability of class 1.
auc = metrics.roc_auc_score(
    y_test.argmax(axis=1),
    model.predict(X_test_preprocessed)[:, 1],
)
auc



0.9802698820935902

In [59]:
## "deploy"

# Load the new data and normalise its column names (lower-case, trimmed) so
# they line up with the feature names used during training.
new_data = pd.read_csv("./data/test_data.csv")

new_data.columns = new_data.columns.str.lower().str.strip()

In [60]:
# Preview the feature columns of the new data.
new_data.loc[:, features].head()

Unnamed: 0,avg min between sent tnx,avg min between received tnx,time diff between first and last (mins),sent tnx,received tnx,number of created contracts,unique received from addresses,unique sent to addresses,min value received,max value received,...,erc20 uniq sent addr.1,erc20 uniq rec contract addr,erc20 min val rec,erc20 max val rec,erc20 avg val rec,erc20 min val sent,erc20 max val sent,erc20 avg val sent,erc20 uniq sent token name,erc20 uniq rec token name
0,2570.59,3336.01,30572.7,8,3,0,2,4,0.1,40.0,...,0.0,1.0,600.0,600.0,600.0,0.0,0.0,0.0,0.0,1.0


In [61]:

# Apply the already-fitted preprocessing pipeline to the new rows (no refitting).
# NOTE(review): this rebinds `X`, which earlier held the training feature frame;
# re-running the split cell after this one would therefore use the wrong object.
X = preprocessing_pipe.transform(new_data)

In [62]:
# Predicted class for each new row: the index of the largest output probability.
predictions = list(np.argmax(model.predict(X), axis=1))



In [64]:
# Show the predicted class indices for the new data.
predictions

[0]

In [63]:
# Result
# Report the verdict for the first new row: class 0 is printed as "Not fraud.",
# anything else as "Fraud!".
print("Not fraud." if predictions[0] == 0 else "Fraud!")

Not fraud.
