# Modeling - ethereum - v1

# Setup

## Library import
We import all the required Python libraries

In [1]:
import os

# Data manipulation
from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
import pandas as pd
import numpy as np

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
# Initialise Plotly's offline notebook mode so figures can be rendered in cell outputs.
plotly.offline.init_notebook_mode(connected=True)
# `plt` must point at the pyplot interface; the bare `matplotlib` package does not
# expose the plotting functions (plot, subplots, show, ...) expected under this alias.
import matplotlib.pyplot as plt

import lightgbm as lgb
import missingno as msno
from pycaret.classification import ClassificationExperiment
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from ydata_profiling import ProfileReport

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas
# pd.options.display.max_columns = None
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:.5f}'.format)
# pd.options.display.float_format = '{:.5f}'.format
# pd.options.display.max_rows = 120

## Local library import
We import all the required local libraries.

In [2]:
# Import the project's helper functions from the `src` package one level above the notebook.
# The working directory is moved up only for the duration of the import and is always
# restored to where it started (even if the import raises), so re-running this cell
# cannot drift the working directory and break the relative data paths used later.
_notebook_dir = os.getcwd()
os.chdir('../')
try:
    from src.utils.data_describe import breve_descricao, serie_nulos, cardinalidade, check_for_equal_columns
finally:
    os.chdir(_notebook_dir)

# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [3]:
# Data and report locations, relative to the notebook's working directory.
RAW_FOLDER = '../data/raw/'
INTERIM_FOLDER = '../data/interim/'
PROCESSED_FOLDER = '../data/processed/'
REPORTS_FOLDER = '../reports/'

# Seed handed to PyCaret as `session_id` in every experiment setup.
RANDOM_STATE = 42

# Fraction of rows handed to PyCaret's `setup(train_size=...)`.
# NOTE(review): with 0.15 the recorded runs train on 596 rows and hold out 3380 of 3976;
# confirm this small training share is intended and was not meant as a test-set fraction.
TRAIN_SIZE = 0.15

# Lowercase alias kept because the experiment cells below reference `train_size`.
train_size = TRAIN_SIZE

# Experiment 01 - Only numerical attributes

## Data import

In [None]:
# Read the experiment-01 train and test splits from the interim data folder.
df_train_exp_01, df_test_exp_01 = (
    pd.read_parquet(INTERIM_FOLDER + file_name)
    for file_name in ('df_train_exp_01.pqt', 'df_test_exp_01.pqt')
)

# Preview the first three rows of the training split.
train_preview_exp_01 = df_train_exp_01.head(3)
display(train_preview_exp_01)

## Testing with pycaret

In [None]:
# Model on every column except 'Address'; 'FLAG' is the target.
train_features_exp_01 = df_train_exp_01.drop(columns=['Address'])

# Fresh PyCaret experiment configured with the notebook-wide split size and seed.
exp = ClassificationExperiment()
exp.setup(
    train_features_exp_01,
    target='FLAG',
    train_size=train_size,
    session_id=RANDOM_STATE,
);

In [None]:
# Let PyCaret compare its candidate classifiers on the experiment-01 setup and keep
# the model it returns; `best` is used by the feature plot in the next cell.
best = exp.compare_models()

In [None]:
# Plot PyCaret's 'feature' chart for the model selected by compare_models().
exp.plot_model(best, plot='feature')

# Experiment 02 - Categorical features encoded by frequency

## Data import

In [4]:
# Read the experiment-02 train and test splits from the interim data folder.
df_train_exp_02, df_test_exp_02 = (
    pd.read_parquet(INTERIM_FOLDER + file_name)
    for file_name in ('df_train_exp_02.pqt', 'df_test_exp_02.pqt')
)

# Preview the first three rows of the test split.
test_preview_exp_02 = df_test_exp_02.head(3)
display(test_preview_exp_02)

Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
2493,0x3f023af0a857d0a1591d887194e045301ea2d585,0,17491.39,30810.71,259001.92,6,5,0,5,2,0.7095,199.99958,45.16721,0.00371,199.99516,37.63876,11,225.83257,225.83608,0.00351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89009,0.51434
33,0x046e3a705d9bcd0e53d2b45161b48e39f4bc4090,1,140.5,0.0,281.0,2,1,0,1,2,5.0,5.0,5.0,2.49694,2.5,2.49847,3,4.99694,5.0,0.00306,1.0,13.37,0.0,0.0,0.0,1.0,1.0,0.0,13.37,13.37,13.37,0.0,0.0,0.0,0.0,1.0,0.89009,0.167
3683,0x350d038b70e4acea961427347c6bed44ad8a3d50,0,0.0,2031.71,201139.0,0,99,1,7,0,0.0,8.999,0.9554,0.0,0.0,0.0,100,0.0,94.58509,94.58509,2.0,2.06025,0.0,0.0,0.0,2.0,2.0,0.0,0.74222,1.31803,1.03012,0.0,0.0,0.0,0.0,2.0,0.89009,0.31866


## Testing with pycaret

In [None]:
# Model on every column except 'Address'; 'FLAG' is the target.
train_features_exp_02 = df_train_exp_02.drop(columns=['Address'])

# Fresh PyCaret experiment configured with the notebook-wide split size and seed.
exp = ClassificationExperiment()
exp.setup(
    train_features_exp_02,
    target='FLAG',
    train_size=train_size,
    session_id=RANDOM_STATE,
);

In [None]:
# Let PyCaret compare its candidate classifiers on the experiment-02 setup and keep
# the model it returns; `best` is used by the feature plot in the next cell.
best = exp.compare_models()

In [None]:
# Plot PyCaret's 'feature' chart for the model selected by compare_models().
exp.plot_model(best, plot='feature')

## Random Forest Classifier

In [11]:
# Random forest on the experiment-02 data: every column except 'Address', target 'FLAG'.
train_features_exp_02 = df_train_exp_02.drop(columns=['Address'])

# Re-create the experiment so this section starts from its own setup.
exp = ClassificationExperiment()
exp.setup(
    train_features_exp_02,
    target='FLAG',
    train_size=train_size,
    session_id=RANDOM_STATE,
)

# 'rf' is the model ID requested from PyCaret; the per-fold metrics are shown in the cell output.
classifier = exp.create_model('rf')

Unnamed: 0,Description,Value
0,Session id,42
1,Target,FLAG
2,Target type,Binary
3,Original data shape,"(3976, 37)"
4,Transformed data shape,"(3976, 37)"
5,Transformed train set shape,"(596, 37)"
6,Transformed test set shape,"(3380, 37)"
7,Numeric features,36
8,Rows with missing values,17.9%
9,Preprocess,True


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9333,0.9799,0.9643,0.9,0.931,0.8667,0.8686
1,0.95,0.9939,0.9286,0.963,0.9455,0.8993,0.8998
2,0.9167,0.9905,0.9286,0.8966,0.9123,0.833,0.8334
3,0.9833,0.9989,0.9643,1.0,0.9818,0.9664,0.967
4,0.9667,1.0,0.9286,1.0,0.963,0.9327,0.9349
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.8814,0.9827,0.8571,0.8889,0.8727,0.7617,0.7621
7,0.9831,1.0,0.9643,1.0,0.9818,0.966,0.9665
8,0.9153,0.9942,0.9643,0.871,0.9153,0.8309,0.8353
9,0.9831,0.9988,0.963,1.0,0.9811,0.9658,0.9663


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
# Hyper-parameter search for the random forest: 25 candidates, each scored with
# 10-fold cross-validation, ranked by AUC.
tuned_classifier = exp.tune_model(
    classifier,
    n_iter=25,
    fold=10,
    optimize='AUC',
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9167,0.9866,0.9286,0.8966,0.9123,0.833,0.8334
1,0.9333,0.9888,0.8929,0.9615,0.9259,0.8655,0.8674
2,0.95,0.9877,0.9643,0.931,0.9474,0.8998,0.9003
3,0.95,0.9955,0.9643,0.931,0.9474,0.8998,0.9003
4,0.95,0.99,0.8929,1.0,0.9434,0.8989,0.9035
5,0.9833,1.0,0.9643,1.0,0.9818,0.9664,0.967
6,0.8814,0.9839,0.8214,0.92,0.8679,0.7609,0.7649
7,0.9661,0.9977,0.9643,0.9643,0.9643,0.932,0.932
8,0.9322,0.9896,0.9643,0.9,0.931,0.8645,0.8665
9,0.9322,0.9965,0.8889,0.96,0.9231,0.8626,0.8647


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 25 candidates, totalling 250 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


## Light Gradient Boosting Machine

In [13]:
# LightGBM on the experiment-02 data: every column except 'Address', target 'FLAG'.
train_features_exp_02 = df_train_exp_02.drop(columns=['Address'])

# Re-create the experiment so this section starts from its own setup.
exp = ClassificationExperiment()
exp.setup(
    train_features_exp_02,
    target='FLAG',
    train_size=train_size,
    session_id=RANDOM_STATE,
)

# 'lightgbm' is the model ID requested from PyCaret; the per-fold metrics are shown in the cell output.
classifier = exp.create_model('lightgbm')

Unnamed: 0,Description,Value
0,Session id,42
1,Target,FLAG
2,Target type,Binary
3,Original data shape,"(3976, 37)"
4,Transformed data shape,"(3976, 37)"
5,Transformed train set shape,"(596, 37)"
6,Transformed test set shape,"(3380, 37)"
7,Numeric features,36
8,Rows with missing values,17.9%
9,Preprocess,True


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.95,0.9844,1.0,0.9032,0.9492,0.9002,0.9047
1,0.9667,0.9978,0.9643,0.9643,0.9643,0.933,0.933
2,0.95,0.9922,1.0,0.9032,0.9492,0.9002,0.9047
3,0.9667,0.9967,0.9643,0.9643,0.9643,0.933,0.933
4,0.9833,1.0,0.9643,1.0,0.9818,0.9664,0.967
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9322,0.9896,0.9286,0.9286,0.9286,0.8641,0.8641
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.9492,1.0,0.8889,1.0,0.9412,0.8967,0.9015


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
# Hyper-parameter search for the LightGBM model: 25 candidates, each scored with
# 10-fold cross-validation, ranked by AUC.
tuned_classifier = exp.tune_model(
    classifier,
    n_iter=25,
    fold=10,
    optimize='AUC',
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.95,0.9888,1.0,0.9032,0.9492,0.9002,0.9047
1,0.9667,0.9944,0.9643,0.9643,0.9643,0.933,0.933
2,0.9167,0.9877,1.0,0.8485,0.918,0.8344,0.8461
3,0.9833,0.9967,0.9643,1.0,0.9818,0.9664,0.967
4,0.9667,0.9989,0.9286,1.0,0.963,0.9327,0.9349
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.8983,0.9804,0.8929,0.8929,0.8929,0.7961,0.7961
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,0.9661,0.9988,0.9643,0.9643,0.9643,0.932,0.932
9,0.9661,1.0,0.9259,1.0,0.9615,0.9313,0.9335


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 25 candidates, totalling 250 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
