In [3]:
import os

# Data manipulation
from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import (
    LGBMClassifier, plot_importance, create_tree_digraph, plot_tree
)
import missingno as msno
# from pycaret.classification import ClassificationExperiment
from sklearn.experimental import enable_halving_search_cv
from sklearn.feature_selection import RFECV
from sklearn.model_selection import (
    train_test_split, RandomizedSearchCV, GridSearchCV, HalvingGridSearchCV, cross_validate, KFold
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn import set_config

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas
set_config(transform_output = "pandas")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.5f}'.format)
# pd.options.display.float_format = '{:.5f}'.format
# pd.options.display.max_rows = 120

sns.set_context(context='paper', font_scale=2, rc=None)
sns.set_style("ticks")
sns.set_palette(sns.color_palette())


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [4]:
# Import the project-local helpers from the parent directory.
# The import is done from the parent directory (presumably because `src`
# is resolved relative to the current working directory — confirm).
# The starting directory is remembered and restored in `finally`, so the
# kernel always ends up where it began: no assumption that the notebook
# folder is called 'notebooks', and no drifting one level up on every
# re-run if the import fails.
_notebook_cwd = os.getcwd()
os.chdir('../')
try:
    from src.utils.data_describe import (
        breve_descricao, serie_nulos, cardinalidade, check_for_equal_columns
    )
finally:
    os.chdir(_notebook_cwd)

def round_4(x):
    """Round ``x`` to four decimal places using its own ``round`` method.

    Parameters
    ----------
    x : object exposing ``round(decimals)``
        For example a pandas Series/DataFrame or a NumPy scalar/array.

    Returns
    -------
    Whatever ``x.round(4)`` returns.
    """
    decimals = 4
    rounded = x.round(decimals)
    return rounded

In [5]:
# Project folders, relative to the kernel's current working directory.
# The trailing slash matters: file names are appended with plain `+`
# string concatenation in the loading cells.
RAW_FOLDER = '../data/raw/'
INTERIM_FOLDER = '../data/interim/'
PROCESSED_FOLDER = '../data/processed/'
REPORTS_FOLDER = '../reports/'
# Presumably the shared seed for any stochastic step — not used in the
# cells visible here; verify against later cells.
RANDOM_STATE = 42

# NOTE(review): the name suggests the fraction of data used for training;
# 0.15 would be unusually small for that — confirm it is not a test-size value.
train_size = 0.15

# Alternative palette kept for reference:
# palette = sns.color_palette("Spectral", as_cmap=True)
# Ten colours from seaborn's "husl" palette.
palette = sns.color_palette("husl", 10)
# Bare last expression so the notebook renders the palette as cell output.
palette

In [6]:
# Load the spreadsheet of misclassified wallets from the processed folder.
df_wallets_misclassified = pd.read_excel(
    io=PROCESSED_FOLDER + 'wallets_misclassified.xlsx'
)

# Preview the first five rows.
df_wallets_misclassified.head(n=5)

Unnamed: 0,Resultado,Address
0,Falso positivo,0x6d57fe045dcced8b289db59f66cd4354b6483d63
1,Falso positivo,0x1c3f580daeaac2f540c998c8ae3e4b18440f7c45
2,Falso negativo,0xd9cd7461f960e56364a294f124aac77b25e2b784
3,Falso negativo,0x684ede6645f1b71d77e0aeac519114ee8be3c410
4,Falso negativo,0x9f4562c9be26c7020909b50ccde3447f1b8c4b21


In [8]:
# Load the complete Ethereum dataset from the interim folder.
df_raw = pd.read_parquet(
    path=INTERIM_FOLDER + 'ethereum_complete.pqt'
)
# Preview the first five rows.
df_raw.head(n=5)

Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_sent_tnx,ERC20_avg_time_between_rec_tnx,ERC20_avg_time_between_rec_2_tnx,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_min_val_sent_contract,ERC20_max_val_sent_contract,ERC20_avg_val_sent_contract,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
0,0x0020731604c882cf7bf8c444be97d17b19ea4316,1,1457.31,34.12,4815.43,3,13,0,10,3,1.0,2.50105,1.34844,1.00087,11.27787,5.84292,0,0,0,16,17.52875,17.52978,0,0.00104,,,,,,,,,,,,,,,,,,,,,,,,,
1,0x002bf459dc58584d58886169ea0e80f3ca95ffaf,1,3976.5,834.77,9622.53,2,2,0,1,2,0.58627,0.94751,0.76689,0.58541,0.94728,0.76635,0,0,0,4,1.53269,1.53378,0,0.00109,1.0,1.337,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU
2,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,1,112.9,31.87,321.42,2,3,0,3,1,0.00102,0.8178,0.43961,0.50039,0.81751,0.65895,0,0,0,5,1.3179,1.31882,0,0.00092,1.0,1.337,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU
3,0x0059b14e35dab1b4eee1e2926c7a5660da66f747,1,2300.37,65.1,73091.0,29,98,0,89,26,0.00078,15.72907,0.38322,0.0,36.7,1.31496,0,0,0,127,38.13377,37.55605,0,-0.57772,96.0,142677.3829,120354.7684,0.0,6.0,55.0,0.0,37.0,0.0,0.0,0.0,0.0,0.0,26436.081,1954.4847,0.0,81324.0746,5232.81602,0.0,0.0,0.0,22.0,37.0,OCoin,OCoin
4,0x005b9f4516f8e640bbe48136901738b323c53b00,1,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# Attach the columns of df_raw to every misclassified wallet, matching on
# 'Address'. A left join keeps all rows of df_wallets_misclassified.
# NOTE(review): if df_raw has repeated 'Address' values, the result will
# contain more rows than df_wallets_misclassified — verify key uniqueness.
df_merged_wallets_misclassified = pd.merge(
    df_wallets_misclassified,
    df_raw,
    how='left',
    on='Address',
)

# Preview the first five rows.
df_merged_wallets_misclassified.head(n=5)

Unnamed: 0,Resultado,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_sent_tnx,ERC20_avg_time_between_rec_tnx,ERC20_avg_time_between_rec_2_tnx,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_min_val_sent_contract,ERC20_max_val_sent_contract,ERC20_avg_val_sent_contract,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
0,Falso positivo,0x6d57fe045dcced8b289db59f66cd4354b6483d63,0,2077.07,976.44,125690.6,45,33,0,11,24,0.001,25.61688,3.48814,0.0,35.0,2.5575,0,0,0,78,115.0876,115.10856,0,0.02096,56.0,183980.2117,183959.5854,0.0,23.0,14.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,144500.0,6132.67372,0.0,114499.0,7075.36867,0.0,0.0,0.0,13.0,20.0,DGD,Aragon
1,Falso positivo,0x1c3f580daeaac2f540c998c8ae3e4b18440f7c45,0,0.0,0.62,6173.43,1,9999,0,9999,1,0.02395,284.1994,6.00801,12000.0,12000.0,12000.0,0,0,0,10000,12000.0,60074.08233,0,48074.08233,1.0,1.21955,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.21955,1.21955,1.21955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,OmiseGO
2,Falso negativo,0xd9cd7461f960e56364a294f124aac77b25e2b784,1,1292.62,15.92,15591.0,12,5,0,4,9,0.00057,16.65104,3.79867,0.0,17.94768,1.61986,0,0,0,17,19.43836,18.99336,0,-0.445,11.0,31358.75008,31358.75008,0.0,2.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,505.0,23052.29258,7839.68752,505.0,23052.29258,4479.82144,0.0,0.0,0.0,4.0,4.0,VeChain,Data
3,Falso negativo,0x684ede6645f1b71d77e0aeac519114ee8be3c410,1,1525.53,34602.99,287502.62,7,8,0,7,7,0.00585,1.85,0.57333,0.0,3.44129,0.64847,0,0,0,15,4.53932,4.58662,0,0.0473,7.0,397236.8967,397136.8967,0.0,1.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,325756.271,99309.22418,0.0,325756.271,132378.9656,0.0,0.0,0.0,3.0,4.0,Pundi X Token,blockwell.ai KYC Casper Token
4,Falso negativo,0x9f4562c9be26c7020909b50ccde3447f1b8c4b21,1,10428.11,6539.4,473852.33,26,31,0,10,15,0.0,0.15002,0.0396,0.0,0.14859,0.02192,0,0,0,57,0.56993,1.22768,0,0.65775,14.0,698.37518,93.03737,0.0,4.0,8.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,365.0,69.83752,0.0,67.84,23.25934,0.0,0.0,0.0,3.0,9.0,CarTaxi,Authoreon
