In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Read the combined address dataset from disk and preview its first rows.
data_path = '../Data/address_data_combined.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Unique Received From Addresses,min value received,max value received,avg val received,min val sent,avg val sent,total transactions (including tnx to create contract,total ether received,total ether balance
0,0x87d884aaa6ff9e9b6014631b0abae80b53953fb8,1,5151.68,15159.08,71235.62,1,0.01,0.02,0.013367,0.0,0.0,8,0.0401,0.0401
1,0xd42393df90d582bd8a5493171f0173e3a017d391,1,1179.02,1124.89,25126.45,13,0.0,0.75,0.176667,0.145,0.41927,22,2.65,-0.284889
2,0x3025c36d8a9620d3df89e9e9b1acbdfd639a6f37,1,361.73,0.0,723.47,1,4.999916,4.999916,4.999916,2.49,2.499538,3,4.999916,0.00084
3,0x6309f709faad518fc158af4c14edfa7b06424770,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
4,0x3d020954e30c3d40b7f0c533cf198bc10dd45a49,1,14280.6,1479.86,45357.57,21,0.035,0.2,0.099286,2.084658,2.084658,22,2.085,0.000342


In [3]:
# Target vector: the FLAG column.
y = df['FLAG']
# Feature matrix: every column except the Address string and the target itself.
X = df.drop(['Address', 'FLAG'], axis=1)

# Report both shapes (features first, then target).
for part in (X, y):
    print(part.shape)

(14180, 12)
(14180,)


In [4]:
# Hold out 30% of the rows for testing; the split is stratified on the target
# and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Report the shape of each feature partition (train first, then test).
for features in (X_train, X_test):
    print(features.shape)

(9926, 12)
(4254, 12)


In [5]:
# Feature columns that receive the log transform before scaling.
# NOTE: 'max value received ' really does end with a space in this list.
columns = ['Avg min between sent tnx', 'Avg min between received tnx',
       'Time Diff between first and last (Mins)',
       'Unique Received From Addresses', 'min value received',
       'max value received ', 'avg val received', 'min val sent',
       'avg val sent', 'total transactions (including tnx to create contract',
       'total ether received', 'total ether balance']


def log_positive(frame, cols):
    """Return a copy of ``frame`` with a guarded natural log applied to ``cols``.

    Strictly positive entries become ``np.log(x)``; every other entry
    (zero, negative or NaN) becomes 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input features. It is not modified; the work happens on a copy, which
        also avoids writing into a frame derived from another frame.
    cols : list of str
        Names of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformed columns; other columns are untouched.
    """
    out = frame.copy()
    for col in cols:
        values = out[col]
        # Replace non-positive/NaN entries by 1 first, so log() yields exactly 0
        # for them and never sees an invalid input.
        out[col] = np.log(values.where(values > 0, 1.0))
    return out


scaler = MinMaxScaler()

# Log for skewed data.
# NOTE(review): zeros and negatives end up at 0 -- the same value as x == 1 --
# while values in (0, 1) end up below 0. 'total ether balance' does contain
# negative values (see the data preview), so confirm this ordering is intended.
X_train_log = log_positive(X_train, columns)
X_test_log = log_positive(X_test, columns)

# Scaling: the min/max statistics are learned from the training data only and
# then re-used for the test data.
X_train = scaler.fit_transform(X_train_log)
X_test = scaler.transform(X_test_log)

# Sanity check: neither scaled matrix may contain NaN.
assert not np.isnan(X_train).any(), "NaN values in scaled training features"
assert not np.isnan(X_test).any(), "NaN values in scaled test features"

np.isnan(X_train)

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [6]:
# Search space: each kernel is combined with the same gamma and C candidates
# (3 kernels x 4 gammas x 4 Cs = 48 candidates).
kernels = ("rbf", "poly", "sigmoid")
gamma_grid = [1, 0.1, 0.01, 0.001]
c_grid = [1, 10, 100, 1000]

tuned_parameters = [
    {"kernel": [kernel], "gamma": gamma_grid, "C": c_grid}
    for kernel in kernels
]

# Exhaustive 5-fold cross-validated search over the grid; refit=True asks
# GridSearchCV to refit an estimator with the best parameters afterwards.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=tuned_parameters,
    cv=5,
    refit=True,
    verbose=2,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=   1.6s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=   1.7s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=   1.6s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=   1.6s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=   1.7s
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=   2.5s
[CV] END ........................C=1, gamma=0.0

In [8]:
# Display the hyper-parameter combination selected by the fitted grid search `grid`.
grid.best_params_

{'C': 1000, 'gamma': 1, 'kernel': 'rbf'}