In [None]:
import numpy as np
import pandas as pd
import os
from sklearn import datasets
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Read the combined address-level dataset from disk and preview its first rows.
csv_path = 'data/address_data_combined.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Unique Received From Addresses,min value received,max value received,avg val received,min val sent,avg val sent,total transactions (including tnx to create contract,total ether received,total ether balance
0,0x87d884aaa6ff9e9b6014631b0abae80b53953fb8,1,5151.68,15159.08,71235.62,1,0.01,0.02,0.013367,0.0,0.0,8,0.0401,0.0401
1,0xd42393df90d582bd8a5493171f0173e3a017d391,1,1179.02,1124.89,25126.45,13,0.0,0.75,0.176667,0.145,0.41927,22,2.65,-0.284889
2,0x3025c36d8a9620d3df89e9e9b1acbdfd639a6f37,1,361.73,0.0,723.47,1,4.999916,4.999916,4.999916,2.49,2.499538,3,4.999916,0.00084
3,0x6309f709faad518fc158af4c14edfa7b06424770,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
4,0x3d020954e30c3d40b7f0c533cf198bc10dd45a49,1,14280.6,1479.86,45357.57,21,0.035,0.2,0.099286,2.084658,2.084658,22,2.085,0.000342


In [None]:
# Target is the FLAG column; features are everything except FLAG and the Address identifier.
y = df['FLAG']
X = df.drop(['Address', 'FLAG'], axis=1)

# Hold out 30% of the rows for testing, stratified on the target, with a fixed seed.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
print(X_train_full.shape)
X_train_full.head()

(9908, 12)


Unnamed: 0,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Unique Received From Addresses,min value received,max value received,avg val received,min val sent,avg val sent,total transactions (including tnx to create contract,total ether received,total ether balance
6709,17.06,0.0,51.17,1,101.0,101.0,101.0,10.80478,33.666149,4,101.0,0.001554
13808,2679.43,2841.02,275214.32,21,0.0001,15.0,2.29585,0.0,1.877316,100,103.313259,0.060902
8333,103299.49,37551.75,826452.87,6,0.0,52.532971,23.290155,0.0,16.6275,15,163.031084,30.011085
6981,4756.98,11283.84,46352.6,1,11.0,20.0,15.5,9.975152,10.194942,7,31.0,-19.974711
676,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [None]:
# Feature Engineering
# Log-transform the skewed numeric features, then min-max scale them.
# NOTE: this cell rebinds X_train_full / X_test to the scaler's output, so re-run the
# train/test split cell before running this cell a second time.

# NOTE(review): 'max value received ' carries a trailing space; presumably this matches the
# CSV header exactly - confirm before "tidying" it.
columns = ['Avg min between sent tnx', 'Avg min between received tnx',
       'Time Diff between first and last (Mins)',
       'Unique Received From Addresses', 'min value received',
       'max value received ', 'avg val received', 'min val sent',
       'avg val sent', 'total transactions (including tnx to create contract',
       'total ether received', 'total ether balance']

scaler = MinMaxScaler()


def _log_positive(values):
    """Return log(v) where v > 0 and 0 everywhere else (non-positive and NaN entries)."""
    # Substituting 1 for every entry that is not > 0 yields log(1) == 0 for it, so log is
    # never evaluated on zero or negative input.
    return np.log(values.where(values > 0, 1))


# Log for Skewed Data
# log on both train and test data
# Work on explicit copies: the frames come straight from train_test_split and may be slices
# of X, which can trigger pandas' SettingWithCopyWarning on column assignment.
X_train_full = X_train_full.copy()
X_test = X_test.copy()
for c in columns:
    X_train_full[c] = _log_positive(X_train_full[c])
    X_test[c] = _log_positive(X_test[c])

# Scaling
# only use training data to fit, to avoid data leakage
# Not required for Naive Bayes but performed to ensure all variables are standardized through all ML models
X_train_full = scaler.fit_transform(X_train_full)
X_test = scaler.transform(X_test)

# Sanity check: number of NaNs per feature column after the transforms.
np.isnan(X_train_full).sum(axis=0)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
### Baseline Gaussian Naive Bayes classifier
# Fit on the training split (fit returns the estimator itself), then score on the test split.
GNB = GaussianNB(var_smoothing=2e-9).fit(X_train_full, y_train_full)
y_pr = GNB.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_true=y_test, y_pred=y_pr))
print('F1 Score: ', f1_score(y_true=y_test, y_pred=y_pr))

Accuracy Score:  0.7732517070873558
F1 Score:  0.768231046931408


In [None]:
### Doing Grid Search to find best hyperparameters
# Candidate var_smoothing values: 100 points log-spaced from 1e0 down to 1e-9.
param_grid_nb = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# 5-fold cross-validation scored on F1.
# verbose=1 keeps the one-line "Fitting ... fits" summary but drops the per-fit
# "[CV] END ..." lines that verbose=2 emits for each of the 500 fits.
grid_search_nb = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=5, scoring='f1')

In [None]:
# Run the cross-validated search on the training split and report the winning estimator.
grid_search_nb.fit(X_train_full, y=y_train_full)
best_candidate = grid_search_nb.best_estimator_
print(best_candidate)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ....................var_smoothing=0.657933224657568; total time=   0.0s
[CV] END ....................var_smoothing=0.6

In [None]:
# Score the grid search's predictions on the held-out test split.
best_y_pr = grid_search_nb.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_true=y_test, y_pred=best_y_pr))
print('F1 Score: ', f1_score(y_true=y_test, y_pred=best_y_pr))

Accuracy Score:  0.7854956439839887
F1 Score:  0.7796856106408706


In [None]:
# Fit a standalone GaussianNB using the var_smoothing chosen by the grid search, instead of
# a hand-copied literal that silently goes stale whenever the search is re-run.
# NOTE(review): presumably the previously hard-coded 0.0533669923120631 was this search's
# winning value - confirm the printed estimator below still matches.
# float() keeps the estimator repr a plain number rather than a NumPy scalar.
best_nb = GaussianNB(var_smoothing=float(grid_search_nb.best_params_['var_smoothing']))
best_nb.fit(X_train_full,y_train_full)

GaussianNB(var_smoothing=0.0533669923120631)

In [None]:
# Fitted means: one row per class, one column per feature (recorded as 'mean' in the summary table cell).
best_nb.theta_

array([[0.3753114 , 0.54892822, 0.68676335, 0.11545696, 0.81067569,
        0.55676798, 0.57059979, 0.5635936 , 0.65220652, 0.30497745,
        0.55926168, 0.30943354],
       [0.38906671, 0.46369528, 0.5386103 , 0.14103064, 0.77928058,
        0.47821933, 0.50062382, 0.58831371, 0.60499555, 0.19274126,
        0.46400865, 0.3211177 ]])

In [None]:
# Fitted variances: one row per class, one column per feature (recorded as 'variance' in the summary table cell).
best_nb.var_

array([[0.05071803, 0.0672638 , 0.06402842, 0.02110447, 0.00898148,
        0.01563075, 0.01503675, 0.01420475, 0.01553808, 0.04751733,
        0.0147941 , 0.02812505],
       [0.06648541, 0.04586197, 0.09118215, 0.03047831, 0.00814782,
        0.0103426 , 0.00890123, 0.01260221, 0.01124974, 0.03573011,
        0.01082393, 0.01877809]])

In [None]:
# Class labels known to the fitted model.
best_nb.classes_

array([0, 1])

In [None]:
# Summarise the fitted per-class parameters as a long table: one row per (class, feature)
# holding the feature name, its fitted mean and variance, and the class label ('fraud').
feature_dict = {'feature_name': [], 'mean': [], 'variance': [], 'fraud': []}

# zip() stops at the shortest input, so a mismatch between the feature-name list and the
# fitted parameter width would silently drop features - fail loudly instead.
assert len(columns) == best_nb.theta_.shape[1], "feature name list does not match fitted feature count"

# Rows of theta_ / var_ are paired with classes_ so 'fraud' stores the actual class label
# rather than the row position.
for label, class_means, class_vars in zip(best_nb.classes_, best_nb.theta_, best_nb.var_):
    for mean, var, col_name in zip(class_means, class_vars, columns):
        feature_dict['feature_name'].append(col_name)
        feature_dict['mean'].append(mean)
        feature_dict['variance'].append(var)
        feature_dict['fraud'].append(label)

df2 = pd.DataFrame(feature_dict)

In [None]:
# Display the per-class feature mean/variance summary table.
df2

Unnamed: 0,feature_name,mean,variance,fraud
0,Avg min between sent tnx,0.375311,0.050718,0
1,Avg min between received tnx,0.548928,0.067264,0
2,Time Diff between first and last (Mins),0.686763,0.064028,0
3,Unique Received From Addresses,0.115457,0.021104,0
4,min value received,0.810676,0.008981,0
5,max value received,0.556768,0.015631,0
6,avg val received,0.5706,0.015037,0
7,min val sent,0.563594,0.014205,0
8,avg val sent,0.652207,0.015538,0
9,total transactions (including tnx to create co...,0.304977,0.047517,0


In [None]:
# Show the hyperparameters the final model was constructed with.
best_nb.get_params()

{'priors': None, 'var_smoothing': 0.0533669923120631}