# Data preparation - ethereum - v1

# Setup

## Library import
We import all the required Python libraries

In [42]:
import os

# Data manipulation
from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
from feature_engine.imputation import MeanMedianImputer
import pandas as pd
import numpy as np

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)
import matplotlib as plt

from tqdm import tqdm
from lightgbm import (
    LGBMClassifier, plot_importance, create_tree_digraph, plot_tree
)
import missingno as msno
from pycaret.classification import ClassificationExperiment
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
    train_test_split
    , cross_val_score
    , StratifiedKFold
    , StratifiedShuffleSplit
    , cross_validate
    , GridSearchCV
)
from ydata_profiling import ProfileReport
from sklearn.metrics import (
    confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve
)

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas
# pd.options.display.max_columns = None
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:.5f}'.format)
# pd.options.display.float_format = '{:.5f}'.format
# pd.options.display.max_rows = 120

## Local library import
We import all the required local libraries libraries

In [2]:
os.chdir('../')
from src.utils.data_describe import breve_descricao, serie_nulos, cardinalidade, check_for_equal_columns
os.chdir('./notebooks/')

# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [3]:
RAW_FOLDER = '../data/raw/'
INTERIM_FOLDER = '../data/interim/'
PROCESSED_FOLDER = '../data/processed/'
REPORTS_FOLDER = '../reports/'
RANDOM_STATE = 42


# Data import
We retrieve all the required data for the analysis.

In [4]:
try:
    df_raw = pd.read_parquet(INTERIM_FOLDER + 'ethereum_complete.pqt')
    print(f'PARQUET file loaded. Shape: {df_raw.shape}')
except FileNotFoundError as e:
    df_raw = pd.read_csv(RAW_FOLDER + 'ethereum_complete.csv')
    df_raw.drop(columns=['Index'], inplace=True)
   
    df_raw.to_parquet(INTERIM_FOLDER +  'ethereum_complete.pqt', index=False)
    print(f'EXCEL file loaded and PARQUET created. Shape: {df_raw.shape}')

PARQUET file loaded. Shape: (4681, 49)


In [5]:
display(df_raw.head(3))

Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_sent_tnx,ERC20_avg_time_between_rec_tnx,ERC20_avg_time_between_rec_2_tnx,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_min_val_sent_contract,ERC20_max_val_sent_contract,ERC20_avg_val_sent_contract,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
0,0x0020731604c882cf7bf8c444be97d17b19ea4316,1,1457.31,34.12,4815.43,3,13,0,10,3,1.0,2.50105,1.34844,1.00087,11.27787,5.84292,0,0,0,16,17.52875,17.52978,0,0.00104,,,,,,,,,,,,,,,,,,,,,,,,,
1,0x002bf459dc58584d58886169ea0e80f3ca95ffaf,1,3976.5,834.77,9622.53,2,2,0,1,2,0.58627,0.94751,0.76689,0.58541,0.94728,0.76635,0,0,0,4,1.53269,1.53378,0,0.00109,1.0,1.337,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU
2,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,1,112.9,31.87,321.42,2,3,0,3,1,0.00102,0.8178,0.43961,0.50039,0.81751,0.65895,0,0,0,5,1.3179,1.31882,0,0.00092,1.0,1.337,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU


# Data cleaning

1. Remove duplicated registries
2. Remove duplicated columns
3. Remove constant columns
4. Remove equal columns
5. Transforming categorical features

## Removing duplicated registries

In [6]:
df_cleaned = df_raw.copy()

In [7]:
df_cleaned.drop_duplicates(inplace=True)

print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated rows: {df_cleaned.shape}')

Shape before data cleasing: (4681, 49)
After dropping duplicated rows: (4677, 49)


### Removing duplicated wallet with different 'FLAG' 

In [8]:
print(f'Shape before dropping duplicated wallet: {df_cleaned.shape}')

df_cleaned.drop(df_cleaned.loc[(df_cleaned['Address']=='0xd624d046edbdef805c5e4140dce5fb5ec1b39a3c') &
    (df_cleaned['FLAG']==0)].index, inplace=True)

print(f'After dropping dropping duplicated wallet: {df_cleaned.shape}')

Shape before dropping duplicated wallet: (4677, 49)
After dropping dropping duplicated wallet: (4676, 49)


## Removing duplicated columns

In [9]:
lst_duplicated_columns = [
    'ERC20_uniq_sent_addr.1', 'ERC20_avg_time_between_rec_2_tnx', 'ERC20_avg_val_sent_contract',
    'ERC20_max_val_sent_contract', 'ERC20_min_val_sent_contract',
]

df_cleaned.drop(columns=lst_duplicated_columns, inplace=True)

print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated columns: {df_cleaned.shape}')

Shape before data cleasing: (4681, 49)
After dropping duplicated columns: (4676, 44)


## Removing constant columns

In [10]:
lst_unitary_columns = [
    'total_ether_sent_contracts', 'min_value_sent_to_contract',
    'max_val_sent_to_contract', 'avg_value_sent_to_contract'
]

df_cleaned.drop(columns=lst_unitary_columns, inplace=True)

print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated columns: {df_cleaned.shape}')

Shape before data cleasing: (4681, 49)
After dropping duplicated columns: (4676, 40)


## Removing equal columns

In [11]:
lst_equal_columns = [
    'min_value_sent_to_contract', 'max_val_sent_to_contract', 'avg_value_sent_to_contract',
    'ERC20_avg_time_between_sent_tnx', 'ERC20_avg_time_between_rec_tnx'
]

for column in lst_equal_columns:
    try:
        df_cleaned.drop(columns=column, inplace=True)
    except:
        print(f"'{column}' was exluded before.")

print(30*'-')
print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated columns: {df_cleaned.shape}')

'min_value_sent_to_contract' was exluded before.
'max_val_sent_to_contract' was exluded before.
'avg_value_sent_to_contract' was exluded before.
------------------------------
Shape before data cleasing: (4681, 49)
After dropping duplicated columns: (4676, 38)


## Pre-processing categorical features

### Replacing "    " and "0" to "no information"

In [12]:
for column in ['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']:
    df_cleaned.loc[df_cleaned[column].isnull(), column] = "0"
    for category in [" ", "", "0"]:
        df_cleaned[column] = df_cleaned[column].str.strip()
        df_cleaned[column].replace(category, "no information", inplace=True)
    

    display(
        pd.DataFrame(
            100*df_cleaned.loc[:, column].value_counts()/df_cleaned.loc[:, column].value_counts().sum()
        ).head(10)
    )

Unnamed: 0,ERC20_most_sent_token_type
no information,89.11463
blockwell.ai KYC Casper Token,1.39008
OmiseGO,0.98375
EOS,0.85543
Golem,0.57742
StatusNetwork,0.47049
Tronix,0.32079
Qtum,0.25663
TenXPay,0.23524
BAT,0.23524


Unnamed: 0,ERC20_most_rec_token_type
no information,51.41146
Blockwell say NOTSAFU,16.65954
OmiseGO,7.59196
VIU,1.73225
GSENetwork,1.71086
INS Promo,1.15483
Livepeer Token,1.13345
blockwell.ai KYC Casper Token,1.00513
AICRYPTO,0.96236
EOS,0.79127


# Experiment 01 - Only numerical features

In [None]:
df_cleaned_exp_01 = df_cleaned.drop(columns=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']).copy()

## Splitting data set

In [None]:
test_size = 0.15
train_size = 1 - test_size

lst_test_sample = df_cleaned_exp_01.sample(int(test_size*len(df_cleaned_exp_01)), random_state=RANDOM_STATE).index.values
lst_train_sample = df_cleaned_exp_01.drop(index=lst_test_sample).index.values

print(f"""
test_size: {test_size}
test_sample (registries): {len(lst_test_sample)}

train_size: {train_size}
train_sample (registries): {len(lst_train_sample)}
""")

df_cleaned_exp_01.loc[lst_train_sample, :].head()

## Exporting

In [None]:
df_cleaned_exp_01.loc[lst_train_sample, :].to_parquet(INTERIM_FOLDER + 'df_train_exp_01.pqt')
df_cleaned_exp_01.loc[lst_test_sample, :].to_parquet(INTERIM_FOLDER + 'df_test_exp_01.pqt')

# Experiment 02 - Categorical features encoded by frequency

In [None]:
df_cleaned_exp_02 = df_cleaned.copy()
df_cleaned_exp_02.shape

In [None]:
df_temp = df_cleaned_exp_02.drop(columns=['Address', 'FLAG'])

df_temp = cardinalidade(df_temp.select_dtypes(include=[float, int]))
df_temp['Proporção Nulos'].value_counts()

## Splitting data set

In [None]:
test_size = 0.15
train_size = 1 - test_size

X = df_cleaned_exp_02.drop(columns=['Address', 'FLAG']).copy()
y = df_cleaned_exp_02[['FLAG']]

df_cleaned_train_exp_02, df_cleaned_test_exp_02 = train_test_split(
    df_cleaned_exp_02.drop(columns=['Address']), train_size=train_size, 
    random_state=RANDOM_STATE,
)

print(f"""
test_size: {test_size}
test_sample (registries): {len(df_cleaned_test_exp_02)}
test_sample's target's proportion:
{df_cleaned_test_exp_02['FLAG'].value_counts(normalize=True)}

train_size: {train_size}
train_sample (registries): {len(df_cleaned_train_exp_02)}
train_sample's target's proportion:
{df_cleaned_train_exp_02['FLAG'].value_counts(normalize=True)}
""")

df_cleaned_train_exp_02.head()

## Preprocessing

In [None]:

lst_numerical_columns = df_cleaned_train_exp_02.drop(columns=[
    'FLAG', 'ERC20_most_sent_token_type', 'ERC20_most_rec_token_type'
]).columns.tolist()

lst_categorical_columns = ['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']

pipe = Pipeline([
    ('rare', RareLabelEncoder(
        n_categories=2, max_n_categories=3, replace_with='rare', tol=0.1,
        variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
    )),
    ('frq', CountFrequencyEncoder(
        encoding_method='frequency',
        variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
    )),
    ('imputer', MeanMedianImputer(
        imputation_method='median',
        variables=lst_numerical_columns
    ))
])

df_cleaned_train_exp_02_transformed = pipe.fit_transform(df_cleaned_train_exp_02)
df_cleaned_test_exp_02_transformed = pipe.transform(df_cleaned_test_exp_02)

display(df_cleaned_train_exp_02_transformed['ERC20_most_rec_token_type'].value_counts())
display(df_cleaned_train_exp_02_transformed['ERC20_most_sent_token_type'].value_counts())

### Effect of transformation over numerical features

#### Training dataset

In [None]:

df_num_features_before_transf = df_cleaned_train_exp_02.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]
df_num_features_after_transf = df_cleaned_train_exp_02_transformed.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]

pd.concat([df_num_features_before_transf, df_num_features_after_transf], axis=1)
# .to_excel(PROCESSED_FOLDER + 'df_after_preprocessing.xlsx')

#### Testing dataset

In [None]:

df_num_features_before_transf = df_cleaned_test_exp_02.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]
df_num_features_after_transf = df_cleaned_test_exp_02_transformed.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]

pd.concat([df_num_features_before_transf, df_num_features_after_transf], axis=1).to_excel(PROCESSED_FOLDER + 'df_after_preprocessing.xlsx')

In [None]:
lst_numeric_columns = df_cleaned_train_exp_02.drop(columns=['FLAG']).select_dtypes(exclude=object).columns.tolist()

pd.DataFrame(
    df_cleaned_train_exp_02_transformed.loc[:, lst_numeric_columns].median(axis=0), columns=['Mediana']
).to_excel(PROCESSED_FOLDER + 'preprocessing_numerical.xlsx')

### Effect of transformation over categorical features

#### Training dataset

In [None]:
column = 'ERC20_most_rec_token_type'

# Before
df_temp = pd.DataFrame(
    100*df_cleaned_train_exp_02[[column]].value_counts() /\
    df_cleaned_train_exp_02[[column]].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
df_temp.to_excel(PROCESSED_FOLDER + f'Before_{column}.xlsx', engine='xlsxwriter')
display(df_temp.head())

# After

display(df_cleaned_train_exp_02_transformed[column].unique())

In [None]:
column = 'ERC20_most_sent_token_type'

# Before
df_temp = pd.DataFrame(
    100*df_cleaned_train_exp_02[[column]].value_counts() /\
    df_cleaned_train_exp_02[[column]].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
df_temp.to_excel(PROCESSED_FOLDER + f'Before_{column}.xlsx', engine='xlsxwriter')
display(df_temp.head())

# After

display(df_cleaned_train_exp_02_transformed[column].unique())

### Testing dataset

In [None]:
column = 'ERC20_most_rec_token_type'

# Before
df_temp = pd.DataFrame(
    100*df_cleaned_test_exp_02[[column]].value_counts() /\
    df_cleaned_test_exp_02[[column]].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
display(df_temp.head())

# After
display(df_cleaned_test_exp_02_transformed[column].unique())

In [None]:
# Before
df_temp = pd.DataFrame(
    100*df_cleaned_test_exp_02[['ERC20_most_sent_token_type']].value_counts() /\
    df_cleaned_test_exp_02[['ERC20_most_sent_token_type']].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
display(df_temp.head())

# After
display(df_cleaned_test_exp_02_transformed['ERC20_most_sent_token_type'].unique())

## Exporting

In [None]:
df_cleaned_train_exp_02_transformed.to_parquet(INTERIM_FOLDER + 'df_cleaned_train_exp_02_transformed.pqt')
df_cleaned_test_exp_02_transformed.to_parquet(INTERIM_FOLDER + 'df_cleaned_test_exp_02_transformed.pqt')

# Experiment 03 - Train-test proportion

In [13]:
df_cleaned_exp_03 = df_cleaned.copy()
df_cleaned_exp_03.shape

(4676, 38)

In [14]:
df_temp = df_cleaned_exp_03.drop(columns=['Address', 'FLAG']).copy()

df_temp = cardinalidade(df_temp.select_dtypes(include=[float, int]))
df_temp['Proporção Nulos'].value_counts()

0.00000    18
0.17729    16
Name: Proporção Nulos, dtype: int64

In [18]:
lst_numerical_columns = df_cleaned_exp_03.drop(columns=[
    'Address', 'FLAG', 'ERC20_most_sent_token_type', 'ERC20_most_rec_token_type'
]).columns.tolist()

lst_categorical_columns = ['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']

## Splitting data set

In [24]:
df_cleaned_train_exp_03_transformed

Unnamed: 0,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
2086,1,0.00000,0.00000,0.00000,0,0,0,0,0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0,0.00000,0.00000,0.00000,1.00000,1.33700,0.00000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,1.33700,0.75435,0.00000,0.00000,0.00000,0.00000,1.00000,0.89332,0.51684
3106,0,5.65000,10140.71000,385561.47000,38,38,0,2,38,0.20002,0.80000,0.22188,0.19922,0.79956,0.22136,76,8.41169,8.43133,0.01965,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.89332,0.51684
2865,0,6.27000,6710.88000,26868.58000,4,4,0,3,4,0.49000,1.99000,1.27614,0.48951,1.98950,1.27567,8,5.10268,5.10456,0.00188,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.89332,0.51684
4249,0,50.98000,2066.45000,1197704.03000,551,566,0,1,1,0.42579,1.12725,0.99727,0.51160,5.03699,1.02332,1117,563.85013,564.45631,0.60618,4.00000,302.30583,0.00000,0.00000,0.00000,4.00000,4.00000,0.00000,0.00000,202.30583,75.57646,0.00000,0.00000,0.00000,0.00000,4.00000,0.89332,0.31631
831,1,0.00000,0.00000,23012.52000,1,1,0,1,1,0.40000,0.40000,0.40000,0.39990,0.39990,0.39990,2,0.39990,0.40000,0.00010,1.00000,1.33700,0.00000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,1.33700,0.75435,0.00000,0.00000,0.00000,0.00000,1.00000,0.89332,0.51684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4431,0,0.00000,0.00000,452.63000,1,1,0,1,1,5.00000,5.00000,5.00000,4.99938,4.99938,4.99938,2,4.99938,5.00000,0.00062,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.89332,0.51684
466,1,3645.85000,482.76000,20160.32000,5,4,0,4,4,0.50000,1.28182,0.71883,0.00250,1.86000,0.57450,9,2.87250,2.87530,0.00280,1.00000,1.33700,0.00000,0.00000,0.00000,1.00000,1.00000,0.00000,1.33700,1.33700,1.33700,0.00000,0.00000,0.00000,0.00000,1.00000,0.89332,0.16684
3092,0,23.19000,0.33000,70.25000,3,2,0,2,3,16.83425,84.16575,50.50000,8.77963,47.15525,33.66615,5,100.99845,101.00000,0.00155,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.89332,0.51684
3773,0,7.28000,0.00000,21.83000,3,2,0,2,3,467.50181,1533.49819,1000.50000,0.17413,1373.29161,666.99953,5,2000.99859,2001.00000,0.00141,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.89332,0.51684


In [52]:
dct_results = {}
i = 0

for test_size in tqdm(np.arange(0.1, 0.525, 0.05)):
    for random_state in np.arange(RANDOM_STATE, RANDOM_STATE+10):
        train_size = 1 - test_size

        df_cleaned_train_exp_03, df_cleaned_test_exp_03 = train_test_split(
            df_cleaned_exp_03.drop(columns=['Address']), train_size=train_size, 
            random_state=random_state,
        )

        pipe = Pipeline([
            ('rare', RareLabelEncoder(
                n_categories=2, max_n_categories=3, replace_with='rare', tol=0.1,
                variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
            )),
            ('frq', CountFrequencyEncoder(
                encoding_method='frequency',
                variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
            )),
            ('imputer', MeanMedianImputer(
                imputation_method='median',
                variables=lst_numerical_columns
            ))
        ])

        df_cleaned_train_exp_03_transformed = pipe.fit_transform(df_cleaned_train_exp_03)
        df_cleaned_test_exp_03_transformed = pipe.transform(df_cleaned_test_exp_03)

        X_train_exp_03, y_train_exp_03 = df_cleaned_train_exp_03_transformed.drop(columns=['FLAG']), df_cleaned_train_exp_03_transformed['FLAG'] 
        X_test_exp_03, y_test_exp_03 = df_cleaned_test_exp_03_transformed.drop(columns=['FLAG']), df_cleaned_test_exp_03_transformed['FLAG'] 

        classifier_lgbm = LGBMClassifier(
            random_state=RANDOM_STATE, n_jobs=-1, silent='warn', objective='binary', importance_type='gain'
        )

        classifier_lgbm.fit(X_train_exp_03, y_train_exp_03)

        y_pred = classifier_lgbm.predict(X_test_exp_03)
        
        dct_results[i] = {'test_size': test_size, 'random_state': random_state, 'roc_auc': roc_auc_score(y_pred, y_test_exp_03)}
        i += 1
        
df_results = pd.DataFrame.from_dict(dct_results, orient='index')

df_results_agg = df_results.groupby(by=['test_size']).agg({'roc_auc': ['mean', 'std']}).reset_index()
df_results_agg.columns = ['test_size', 'auc - mean', 'auc - std']
df_results_agg.sort_values(by=['auc - mean'], ascending=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:18<00:00,  2.06s/it]


Unnamed: 0,test_size,auc - mean,auc - std
1,0.15,0.98138,0.00378
0,0.1,0.98135,0.00435
3,0.25,0.98125,0.00337
4,0.3,0.98094,0.00417
5,0.35,0.98048,0.0026
2,0.2,0.9804,0.00372
7,0.45,0.98015,0.00303
6,0.4,0.97935,0.00206
8,0.5,0.97909,0.00207


In [48]:
dct_results = {}
i = 0

for test_size in tqdm(np.arange(0.1, 0.225, 0.05)):
    for random_state in np.arange(RANDOM_STATE, RANDOM_STATE+10):
        train_size = 1 - test_size

        df_cleaned_train_exp_03, df_cleaned_test_exp_03 = train_test_split(
            df_cleaned_exp_03.drop(columns=['Address']), train_size=train_size, 
            random_state=random_state,
        )

        pipe = Pipeline([
            ('rare', RareLabelEncoder(
                n_categories=2, max_n_categories=3, replace_with='rare', tol=0.1,
                variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
            )),
            ('frq', CountFrequencyEncoder(
                encoding_method='frequency',
                variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
            )),
            ('imputer', MeanMedianImputer(
                imputation_method='mean',
                variables=lst_numerical_columns
            ))
        ])

        df_cleaned_train_exp_03_transformed = pipe.fit_transform(df_cleaned_train_exp_03)
        df_cleaned_test_exp_03_transformed = pipe.transform(df_cleaned_test_exp_03)

        X_train_exp_03, y_train_exp_03 = df_cleaned_train_exp_03_transformed.drop(columns=['FLAG']), df_cleaned_train_exp_03_transformed['FLAG'] 
        X_test_exp_03, y_test_exp_03 = df_cleaned_test_exp_03_transformed.drop(columns=['FLAG']), df_cleaned_test_exp_03_transformed['FLAG'] 

        classifier_lgbm = LGBMClassifier(
            random_state=RANDOM_STATE, n_jobs=-1, silent='warn', objective='binary', importance_type='gain'
        )

        classifier_lgbm.fit(X_train_exp_03, y_train_exp_03)

        y_pred = classifier_lgbm.predict(X_test_exp_03)
        
        dct_results[i] = {'test_size': test_size, 'random_state': random_state, 'roc_auc': roc_auc_score(y_pred, y_test_exp_03)}
        i += 1
        
df_results = pd.DataFrame.from_dict(dct_results, orient='index')

df_results_agg = df_results.groupby(by=['test_size']).agg({'roc_auc': ['mean', 'std', 'median']}).reset_index()
df_results_agg.columns = ['test_size', 'mean', 'std', 'median']
df_results_agg.sort_values(by=['mean'], ascending=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:08<00:00,  2.97s/it]


Unnamed: 0,test_size,mean,std,median
0,0.1,0.9828,0.00318,0.98334
1,0.15,0.98198,0.0044,0.98347
2,0.2,0.98126,0.00344,0.98012
