# Data preparation - ethereum - v1

# Setup

## Library import
We import all the required Python libraries

In [1]:
import os

# Data manipulation
from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
from feature_engine.imputation import MeanMedianImputer
import pandas as pd
import numpy as np

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)
import matplotlib as plt

from tqdm import tqdm
from lightgbm import (
    LGBMClassifier, plot_importance, create_tree_digraph, plot_tree
)
import missingno as msno
# from pycaret.classification import ClassificationExperiment
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
    train_test_split
    , cross_val_score
    , StratifiedKFold
    , StratifiedShuffleSplit
    , cross_validate
    , GridSearchCV
)
from ydata_profiling import ProfileReport
from sklearn.metrics import (
    confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve
)

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas
# pd.options.display.max_columns = None
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:.5f}'.format)
# pd.options.display.float_format = '{:.5f}'.format
# pd.options.display.max_rows = 120


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



## Local library import
We import all the required local libraries libraries

In [2]:
os.chdir('../')
from src.utils.data_describe import breve_descricao, serie_nulos, cardinalidade, check_for_equal_columns
os.chdir('./notebooks/')

# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [3]:
RAW_FOLDER = '../data/raw/'
INTERIM_FOLDER = '../data/interim/'
PROCESSED_FOLDER = '../data/processed/'
REPORTS_FOLDER = '../reports/'
RANDOM_STATE = 42


# Data import
We retrieve all the required data for the analysis.

In [4]:
try:
    df_raw = pd.read_parquet(INTERIM_FOLDER + 'ethereum_complete.pqt')
    print(f'PARQUET file loaded. Shape: {df_raw.shape}')
except FileNotFoundError as e:
    df_raw = pd.read_csv(RAW_FOLDER + 'ethereum_complete.csv')
    df_raw.drop(columns=['Index'], inplace=True)
   
    df_raw.to_parquet(INTERIM_FOLDER +  'ethereum_complete.pqt', index=False)
    print(f'EXCEL file loaded and PARQUET created. Shape: {df_raw.shape}')

PARQUET file loaded. Shape: (4681, 49)


In [5]:
display(df_raw.head(3))

Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_sent_tnx,ERC20_avg_time_between_rec_tnx,ERC20_avg_time_between_rec_2_tnx,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_min_val_sent_contract,ERC20_max_val_sent_contract,ERC20_avg_val_sent_contract,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
0,0x0020731604c882cf7bf8c444be97d17b19ea4316,1,1457.31,34.12,4815.43,3,13,0,10,3,1.0,2.50105,1.34844,1.00087,11.27787,5.84292,0,0,0,16,17.52875,17.52978,0,0.00104,,,,,,,,,,,,,,,,,,,,,,,,,
1,0x002bf459dc58584d58886169ea0e80f3ca95ffaf,1,3976.5,834.77,9622.53,2,2,0,1,2,0.58627,0.94751,0.76689,0.58541,0.94728,0.76635,0,0,0,4,1.53269,1.53378,0,0.00109,1.0,1.337,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU
2,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,1,112.9,31.87,321.42,2,3,0,3,1,0.00102,0.8178,0.43961,0.50039,0.81751,0.65895,0,0,0,5,1.3179,1.31882,0,0.00092,1.0,1.337,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU


# Data cleaning

1. Remove duplicated registries
2. Remove duplicated columns
3. Remove constant columns
4. Remove equal columns
5. Transforming categorical features

## Removing duplicated registries

In [6]:
df_cleaned = df_raw.copy()

In [7]:
df_cleaned.drop_duplicates(inplace=True)

print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated rows: {df_cleaned.shape}')

Shape before data cleasing: (4681, 49)
After dropping duplicated rows: (4677, 49)


### Removing duplicated wallet with different 'FLAG' 

In [8]:
print(f'Shape before dropping duplicated wallet: {df_cleaned.shape}')

df_cleaned.drop(df_cleaned.loc[(df_cleaned['Address']=='0xd624d046edbdef805c5e4140dce5fb5ec1b39a3c') &
    (df_cleaned['FLAG']==0)].index, inplace=True)

print(f'After dropping dropping duplicated wallet: {df_cleaned.shape}')

Shape before dropping duplicated wallet: (4677, 49)
After dropping dropping duplicated wallet: (4676, 49)


## Removing duplicated columns

In [9]:
lst_duplicated_columns = [
    'ERC20_uniq_sent_addr.1', 'ERC20_avg_time_between_rec_2_tnx', 'ERC20_avg_val_sent_contract',
    'ERC20_max_val_sent_contract', 'ERC20_min_val_sent_contract',
]

df_cleaned.drop(columns=lst_duplicated_columns, inplace=True)

print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated columns: {df_cleaned.shape}')

Shape before data cleasing: (4681, 49)
After dropping duplicated columns: (4676, 44)


## Removing constant columns

In [10]:
lst_unitary_columns = [
    'total_ether_sent_contracts', 'min_value_sent_to_contract',
    'max_val_sent_to_contract', 'avg_value_sent_to_contract'
]

df_cleaned.drop(columns=lst_unitary_columns, inplace=True)

print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated columns: {df_cleaned.shape}')

Shape before data cleasing: (4681, 49)
After dropping duplicated columns: (4676, 40)


## Removing equal columns

In [11]:
lst_equal_columns = [
    'min_value_sent_to_contract', 'max_val_sent_to_contract', 'avg_value_sent_to_contract',
    'ERC20_avg_time_between_sent_tnx', 'ERC20_avg_time_between_rec_tnx'
]

for column in lst_equal_columns:
    try:
        df_cleaned.drop(columns=column, inplace=True)
    except:
        print(f"'{column}' was exluded before.")

print(30*'-')
print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated columns: {df_cleaned.shape}')

'min_value_sent_to_contract' was exluded before.
'max_val_sent_to_contract' was exluded before.
'avg_value_sent_to_contract' was exluded before.
------------------------------
Shape before data cleasing: (4681, 49)
After dropping duplicated columns: (4676, 38)


## Pre-processing categorical features

### Replacing "    " and "0" to "no information"

In [12]:
for column in ['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']:
    df_cleaned.loc[df_cleaned[column].isnull(), column] = "0"
    for category in [" ", "", "0"]:
        df_cleaned[column] = df_cleaned[column].str.strip()
        df_cleaned[column].replace(category, "no information", inplace=True)
    

    display(
        pd.DataFrame(
            100*df_cleaned.loc[:, column].value_counts()/df_cleaned.loc[:, column].value_counts().sum()
        ).head(10)
    )


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Unnamed: 0_level_0,count
ERC20_most_sent_token_type,Unnamed: 1_level_1
no information,89.11463
blockwell.ai KYC Casper Token,1.39008
OmiseGO,0.98375
EOS,0.85543
Golem,0.57742
StatusNetwork,0.47049
Tronix,0.32079
Qtum,0.25663
TenXPay,0.23524
BAT,0.23524



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Unnamed: 0_level_0,count
ERC20_most_rec_token_type,Unnamed: 1_level_1
no information,51.41146
Blockwell say NOTSAFU,16.65954
OmiseGO,7.59196
VIU,1.73225
GSENetwork,1.71086
INS Promo,1.15483
Livepeer Token,1.13345
blockwell.ai KYC Casper Token,1.00513
AICRYPTO,0.96236
EOS,0.79127


# Experiment 01 - Only numerical features

In [13]:
df_cleaned_exp_01 = df_cleaned.drop(columns=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']).copy()

## Splitting data set

In [14]:
test_size = 0.15
train_size = 1 - test_size

lst_test_sample = df_cleaned_exp_01.sample(int(test_size*len(df_cleaned_exp_01)), random_state=RANDOM_STATE).index.values
lst_train_sample = df_cleaned_exp_01.drop(index=lst_test_sample).index.values

print(f"""
test_size: {test_size}
test_sample (registries): {len(lst_test_sample)}

train_size: {train_size}
train_sample (registries): {len(lst_train_sample)}
""")

df_cleaned_exp_01.loc[lst_train_sample, :].head()


test_size: 0.15
test_sample (registries): 701

train_size: 0.85
train_sample (registries): 3975



Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name
0,0x0020731604c882cf7bf8c444be97d17b19ea4316,1,1457.31,34.12,4815.43,3,13,0,10,3,1.0,2.50105,1.34844,1.00087,11.27787,5.84292,16,17.52875,17.52978,0.00104,,,,,,,,,,,,,,,,
1,0x002bf459dc58584d58886169ea0e80f3ca95ffaf,1,3976.5,834.77,9622.53,2,2,0,1,2,0.58627,0.94751,0.76689,0.58541,0.94728,0.76635,4,1.53269,1.53378,0.00109,1.0,1.337,0.0,0.0,0.0,1.0,1.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,1.0
2,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,1,112.9,31.87,321.42,2,3,0,3,1,0.00102,0.8178,0.43961,0.50039,0.81751,0.65895,5,1.3179,1.31882,0.00092,1.0,1.337,0.0,0.0,0.0,1.0,1.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,1.0
3,0x0059b14e35dab1b4eee1e2926c7a5660da66f747,1,2300.37,65.1,73091.0,29,98,0,89,26,0.00078,15.72907,0.38322,0.0,36.7,1.31496,127,38.13377,37.55605,-0.57772,96.0,142677.3829,120354.7684,0.0,6.0,55.0,37.0,0.0,0.0,26436.081,1954.4847,0.0,81324.0746,5232.81602,22.0,37.0
4,0x005b9f4516f8e640bbe48136901738b323c53b00,1,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,,,,,,,,,,,,,,,,


## Exporting

In [15]:
df_cleaned_exp_01.loc[lst_train_sample, :].to_parquet(INTERIM_FOLDER + 'df_train_exp_01.pqt')
df_cleaned_exp_01.loc[lst_test_sample, :].to_parquet(INTERIM_FOLDER + 'df_test_exp_01.pqt')

# Experiment 02 - Categorical features encoded by frequency

In [16]:
df_cleaned_exp_02 = df_cleaned.copy()
df_cleaned_exp_02.shape

(4676, 38)

In [17]:
df_temp = df_cleaned_exp_02.drop(columns=['Address', 'FLAG'])

df_temp = cardinalidade(df_temp.select_dtypes(include=[float, int]))
df_temp['Proporção Nulos'].value_counts()


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explic

Proporção Nulos
0.00000    18
0.17729    16
Name: count, dtype: int64

## Splitting data set

In [50]:
test_size = 0.15
train_size = 1 - test_size

X = df_cleaned_exp_02.drop(columns=['Address', 'FLAG']).copy()
y = df_cleaned_exp_02[['FLAG']]

df_cleaned_train_exp_02, df_cleaned_test_exp_02 = train_test_split(
    df_cleaned_exp_02.drop(columns=['Address']), train_size=train_size, 
    random_state=RANDOM_STATE,
)

print(f"""
test_size: {test_size}
test_sample (registries): {len(df_cleaned_test_exp_02)}
test_sample's target's proportion:
{df_cleaned_test_exp_02['FLAG'].value_counts(normalize=True)}

train_size: {train_size}
train_sample (registries): {len(df_cleaned_train_exp_02)}
train_sample's target's proportion:
{df_cleaned_train_exp_02['FLAG'].value_counts(normalize=True)}
""")

df_cleaned_train_exp_02.head()


test_size: 0.15
test_sample (registries): 702
test_sample's target's proportion:
FLAG
0   0.54986
1   0.45014
Name: proportion, dtype: float64

train_size: 0.85
train_sample (registries): 3974
train_sample's target's proportion:
FLAG
0   0.53120
1   0.46880
Name: proportion, dtype: float64



Unnamed: 0,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
3692,0,0.0,0.0,537.6,1,1,0,1,1,1.99,1.99,1.99,1.98975,1.98975,1.98975,2,1.98975,1.99,0.00025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no information,no information
152,1,23488.73,33.36,48178.53,2,36,0,26,2,0.01038,9.99,1.33005,0.07579,48.30435,24.19007,38,48.38014,47.88187,-0.49828,2.0,1.337,0.0,0.0,0.0,2.0,2.0,0.0,0.0,1.337,0.6685,0.0,0.0,0.0,0.0,2.0,no information,Blockwell say NOTSAFU
1175,1,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,no information,no information
3874,0,54666.62,2308.98,223284.43,4,2,0,1,4,0.13384,0.39856,0.2662,0.0,0.50877,0.12994,6,0.51978,0.5324,0.01262,10.0,9837.41839,0.0,0.0,1.0,6.0,9.0,0.0,0.0,8888.0,1093.04649,0.0,0.0,0.0,1.0,9.0,Trustcoin,Intelion
3011,0,22.18,0.0,66.55,3,1,0,1,3,101.0,101.0,101.0,0.61,93.11405,33.66615,4,100.99845,101.0,0.00155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no information,no information


## Preprocessing

In [51]:

lst_numerical_columns = df_cleaned_train_exp_02.drop(columns=[
    'FLAG', 'ERC20_most_sent_token_type', 'ERC20_most_rec_token_type'
]).columns.tolist()

lst_categorical_columns = ['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']

pipe = Pipeline([
    ('rare', RareLabelEncoder(
        n_categories=2, max_n_categories=3, replace_with='rare', tol=0.1,
        variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
    )),
    ('frq', CountFrequencyEncoder(
        encoding_method='frequency',
        variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
    )),
    ('imputer', MeanMedianImputer(
        imputation_method='median',
        variables=lst_numerical_columns
    ))
])

df_cleaned_train_exp_02_transformed = pipe.fit_transform(df_cleaned_train_exp_02)
df_cleaned_test_exp_02_transformed = pipe.transform(df_cleaned_test_exp_02)

display(df_cleaned_train_exp_02_transformed['ERC20_most_rec_token_type'].value_counts())
display(df_cleaned_train_exp_02_transformed['ERC20_most_sent_token_type'].value_counts())

ERC20_most_rec_token_type
0.51636    2052
0.31656    1258
0.16709     664
Name: count, dtype: int64

ERC20_most_sent_token_type
0.89155    3543
0.10845     431
Name: count, dtype: int64

### Effect of transformation over numerical features

#### Training dataset

In [52]:

df_num_features_before_transf = df_cleaned_train_exp_02.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]
df_num_features_after_transf = df_cleaned_train_exp_02_transformed.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]

pd.concat([df_num_features_before_transf, df_num_features_after_transf], axis=1)
# .to_excel(PROCESSED_FOLDER + 'df_after_preprocessing.xlsx')

Unnamed: 0,count,mean,std,50%,count.1,mean.1,std.1,50%.1
Avg_min_between_sent_tnx,3974.0,3297.65844,15376.97486,3.11,3974.0,3297.65844,15376.97486,3.11
Avg_min_between_received_tnx,3974.0,5070.66055,17245.81655,95.365,3974.0,5070.66055,17245.81655,95.365
Time_Diff_between_first_and_last_(Mins),3974.0,131233.13116,242549.63525,10655.115,3974.0,131233.13116,242549.63525,10655.115
Sent_tnx,3974.0,91.36814,738.85187,2.0,3974.0,91.36814,738.85187,2.0
Received_Tnx,3974.0,173.90916,1080.82837,3.0,3974.0,173.90916,1080.82837,3.0
Number_of_Created_Contracts,3974.0,5.81002,201.31172,0.0,3974.0,5.81002,201.31172,0.0
Unique_Received_From_Addresses,3974.0,40.58505,334.19898,2.0,3974.0,40.58505,334.19898,2.0
Unique_Sent_To_Addresses,3974.0,25.64821,256.35317,1.0,3974.0,25.64821,256.35317,1.0
min_value_received,3974.0,49.99333,463.52696,0.04434,3974.0,49.99333,463.52696,0.04434
max_value_received,3974.0,697.46284,16330.23027,4.0,3974.0,697.46284,16330.23027,4.0


#### Testing dataset

In [53]:

df_num_features_before_transf = df_cleaned_test_exp_02.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]
df_num_features_after_transf = df_cleaned_test_exp_02_transformed.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]

pd.concat([df_num_features_before_transf, df_num_features_after_transf], axis=1).to_excel(PROCESSED_FOLDER + 'df_after_preprocessing.xlsx')

In [54]:
lst_numeric_columns = df_cleaned_train_exp_02.drop(columns=['FLAG']).select_dtypes(exclude=object).columns.tolist()

pd.DataFrame(
    df_cleaned_train_exp_02_transformed.loc[:, lst_numeric_columns].median(axis=0), columns=['Mediana']
).to_excel(PROCESSED_FOLDER + 'preprocessing_numerical.xlsx')

### Effect of transformation over categorical features

#### Training dataset

In [55]:
column = 'ERC20_most_rec_token_type'

# Before
df_temp = pd.DataFrame(
    100*df_cleaned_train_exp_02[[column]].value_counts() /\
    df_cleaned_train_exp_02[[column]].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
df_temp.to_excel(PROCESSED_FOLDER + f'Before_{column}.xlsx', engine='xlsxwriter')
display(df_temp.head())

# After

display(df_cleaned_train_exp_02_transformed[column].unique())

Unnamed: 0,Proporção,cumsum


array([0.51635632, 0.16708606, 0.31655762])

In [56]:
column = 'ERC20_most_sent_token_type'

# Before
df_temp = pd.DataFrame(
    100*df_cleaned_train_exp_02[[column]].value_counts() /\
    df_cleaned_train_exp_02[[column]].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
df_temp.to_excel(PROCESSED_FOLDER + f'Before_{column}.xlsx', engine='xlsxwriter')
display(df_temp.head())

# After

display(df_cleaned_train_exp_02_transformed[column].unique())

Unnamed: 0,Proporção,cumsum


array([0.89154504, 0.10845496])

### Testing dataset

In [57]:
column = 'ERC20_most_rec_token_type'

# Before
df_temp = pd.DataFrame(
    100*df_cleaned_test_exp_02[[column]].value_counts() /\
    df_cleaned_test_exp_02[[column]].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
display(df_temp.head())

# After
display(df_cleaned_test_exp_02_transformed[column].unique())

Unnamed: 0,Proporção,cumsum


array([0.51635632, 0.16708606, 0.31655762])

In [58]:
# Before
df_temp = pd.DataFrame(
    100*df_cleaned_test_exp_02[['ERC20_most_sent_token_type']].value_counts() /\
    df_cleaned_test_exp_02[['ERC20_most_sent_token_type']].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
display(df_temp.head())

# After
display(df_cleaned_test_exp_02_transformed['ERC20_most_sent_token_type'].unique())

Unnamed: 0,Proporção,cumsum


array([0.89154504, 0.10845496])

## Exporting

In [59]:
df_cleaned_train_exp_02_transformed.to_parquet(INTERIM_FOLDER + 'df_cleaned_train_exp_02_transformed.pqt')
df_cleaned_test_exp_02_transformed.to_parquet(INTERIM_FOLDER + 'df_cleaned_test_exp_02_transformed.pqt')

# Experiment 03 - Train-test proportion

In [60]:
df_cleaned_exp_03 = df_cleaned.copy()
df_cleaned_exp_03.shape

(4676, 38)

In [61]:
df_temp = df_cleaned_exp_03.drop(columns=['Address', 'FLAG']).copy()

df_temp = cardinalidade(df_temp.select_dtypes(include=[float, int]))
df_temp['Proporção Nulos'].value_counts()


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explic

Proporção Nulos
0.00000    18
0.17729    16
Name: count, dtype: int64

In [62]:
lst_numerical_columns = df_cleaned_exp_03.drop(columns=[
    'Address', 'FLAG', 'ERC20_most_sent_token_type', 'ERC20_most_rec_token_type'
]).columns.tolist()

lst_categorical_columns = ['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']

## Splitting data set

In [63]:
# df_cleaned_train_exp_03_transformed

In [64]:
dct_results = {}
i = 0

for test_size in tqdm(np.arange(0.1, 0.525, 0.05)):
    for random_state in np.arange(RANDOM_STATE, RANDOM_STATE+10):
        train_size = 1 - test_size

        df_cleaned_train_exp_03, df_cleaned_test_exp_03 = train_test_split(
            df_cleaned_exp_03.drop(columns=['Address']), train_size=train_size, 
            random_state=random_state,
        )

        pipe = Pipeline([
            ('rare', RareLabelEncoder(
                n_categories=2, max_n_categories=3, replace_with='rare', tol=0.1,
                variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
            )),
            ('frq', CountFrequencyEncoder(
                encoding_method='frequency',
                variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
            )),
            ('imputer', MeanMedianImputer(
                imputation_method='median',
                variables=lst_numerical_columns
            ))
        ])

        df_cleaned_train_exp_03_transformed = pipe.fit_transform(df_cleaned_train_exp_03)
        df_cleaned_test_exp_03_transformed = pipe.transform(df_cleaned_test_exp_03)

        X_train_exp_03, y_train_exp_03 = df_cleaned_train_exp_03_transformed.drop(columns=['FLAG']), df_cleaned_train_exp_03_transformed['FLAG'] 
        X_test_exp_03, y_test_exp_03 = df_cleaned_test_exp_03_transformed.drop(columns=['FLAG']), df_cleaned_test_exp_03_transformed['FLAG'] 

        classifier_lgbm = LGBMClassifier(
            random_state=RANDOM_STATE, n_jobs=-1, silent='warn', objective='binary', importance_type='gain'
        )

        classifier_lgbm.fit(X_train_exp_03, y_train_exp_03)

        y_pred = classifier_lgbm.predict(X_test_exp_03)
        
        dct_results[i] = {'test_size': test_size, 'random_state': random_state, 'roc_auc': roc_auc_score(y_pred, y_test_exp_03)}
        i += 1
        
df_results = pd.DataFrame.from_dict(dct_results, orient='index')

df_results_agg = df_results.groupby(by=['test_size']).agg({'roc_auc': ['mean', 'std']}).reset_index()
df_results_agg.columns = ['test_size', 'auc - mean', 'auc - std']
df_results_agg.sort_values(by=['auc - mean'], ascending=False)

  0%|          | 0/9 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 1961, number of negative: 2247
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5905
[LightGBM] [Info] Number of data points in the train set: 4208, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466017 -> initscore=-0.136141
[LightGBM] [Info] Start training from score -0.136141
[LightGBM] [Info] Number of positive: 1958, number of negative: 2250
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5903
[LightGBM] [Info] Number of data points in the train set: 4208, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.465304 -> initscore=-0.139007
[LightGB

 11%|█         | 1/9 [00:02<00:18,  2.25s/it]

[LightGBM] [Info] Number of positive: 1863, number of negative: 2111
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5847
[LightGBM] [Info] Number of data points in the train set: 3974, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468797 -> initscore=-0.124974
[LightGBM] [Info] Start training from score -0.124974
[LightGBM] [Info] Number of positive: 1852, number of negative: 2122
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5856
[LightGBM] [Info] Number of data points in the train set: 3974, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466029 -> initscore=-0.136093
[LightGBM] [Info] Start training from score -0.136093
[LightGBM] [Info] 

 22%|██▏       | 2/9 [00:04<00:16,  2.30s/it]

[LightGBM] [Info] Number of positive: 1745, number of negative: 1995
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000799 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5782
[LightGBM] [Info] Number of data points in the train set: 3740, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466578 -> initscore=-0.133889
[LightGBM] [Info] Start training from score -0.133889
[LightGBM] [Info] Number of positive: 1735, number of negative: 2005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000905 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5802
[LightGBM] [Info] Number of data points in the train set: 3740, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.463904 -> initscore=-0.144637
[LightGBM] [Info] Start training from score -0.144637
[LightGBM] [Info] 

 33%|███▎      | 3/9 [00:06<00:13,  2.27s/it]

[LightGBM] [Info] Number of positive: 1642, number of negative: 1865
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000819 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5735
[LightGBM] [Info] Number of data points in the train set: 3507, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468206 -> initscore=-0.127346
[LightGBM] [Info] Start training from score -0.127346
[LightGBM] [Info] Number of positive: 1619, number of negative: 1888
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000701 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5633
[LightGBM] [Info] Number of data points in the train set: 3507, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.461648 -> initscore=-0.153709
[LightGBM] [Info] Start training from score -0.153709
[LightGBM] [Info] 

 44%|████▍     | 4/9 [00:08<00:10,  2.19s/it]

[LightGBM] [Info] Number of positive: 1648, number of negative: 1859
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5759
[LightGBM] [Info] Number of data points in the train set: 3507, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.469917 -> initscore=-0.120476
[LightGBM] [Info] Start training from score -0.120476
[LightGBM] [Info] Number of positive: 1529, number of negative: 1744
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5575
[LightGBM] [Info] Number of data points in the train set: 3273, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.467156 -> initscore=-0.131567
[LightGBM] [Info] Start training from score -0.131567
[LightGBM] [Info] 

 56%|█████▌    | 5/9 [00:11<00:08,  2.23s/it]

[LightGBM] [Info] Number of positive: 1413, number of negative: 1626
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000712 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5502
[LightGBM] [Info] Number of data points in the train set: 3039, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.464956 -> initscore=-0.140408
[LightGBM] [Info] Start training from score -0.140408
[LightGBM] [Info] Number of positive: 1403, number of negative: 1636
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000855 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5496
[LightGBM] [Info] Number of data points in the train set: 3039, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.461665 -> initscore=-0.153641
[LightGBM] [Info] Start training from score -0.153641
[LightGBM] [Info] 

 67%|██████▋   | 6/9 [00:13<00:06,  2.22s/it]

[LightGBM] [Info] Number of positive: 1429, number of negative: 1610
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000762 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5532
[LightGBM] [Info] Number of data points in the train set: 3039, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470220 -> initscore=-0.119259
[LightGBM] [Info] Start training from score -0.119259
[LightGBM] [Info] Number of positive: 1304, number of negative: 1501
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5438
[LightGBM] [Info] Number of data points in the train set: 2805, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.464884 -> initscore=-0.140695
[LightGBM] [Info] Start training from score -0.140695
[LightGBM] [Info] 

 78%|███████▊  | 7/9 [00:14<00:03,  1.98s/it]

[LightGBM] [Info] Number of positive: 1204, number of negative: 1367
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5374
[LightGBM] [Info] Number of data points in the train set: 2571, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468300 -> initscore=-0.126969
[LightGBM] [Info] Start training from score -0.126969
[LightGBM] [Info] Number of positive: 1193, number of negative: 1378
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5336
[LightGBM] [Info] Number of data points in the train set: 2571, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.464022 -> initscore=-0.144162
[LightGBM] [Info] Start training from score -0.144162
[LightGBM] [Info] 

 89%|████████▉ | 8/9 [00:16<00:01,  1.90s/it]

[LightGBM] [Info] Number of positive: 1204, number of negative: 1367
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5391
[LightGBM] [Info] Number of data points in the train set: 2571, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468300 -> initscore=-0.126969
[LightGBM] [Info] Start training from score -0.126969
[LightGBM] [Info] Number of positive: 1094, number of negative: 1243
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5207
[LightGBM] [Info] Number of data points in the train set: 2337, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468122 -> initscore=-0.127687
[LightGBM] [Info] Start training from score -0.127687
[LightGBM] [Info] 

100%|██████████| 9/9 [00:18<00:00,  2.10s/it]






Unnamed: 0,test_size,auc - mean,auc - std
1,0.15,0.98138,0.00378
0,0.1,0.98135,0.00435
3,0.25,0.98125,0.00337
4,0.3,0.98094,0.00417
5,0.35,0.98048,0.0026
2,0.2,0.9804,0.00372
7,0.45,0.98015,0.00303
6,0.4,0.97935,0.00206
8,0.5,0.97909,0.00207


In [65]:
dct_results = {}
i = 0

for test_size in tqdm(np.arange(0.1, 0.225, 0.05)):
    for random_state in np.arange(RANDOM_STATE, RANDOM_STATE+10):
        train_size = 1 - test_size

        df_cleaned_train_exp_03, df_cleaned_test_exp_03 = train_test_split(
            df_cleaned_exp_03.drop(columns=['Address']), train_size=train_size, 
            random_state=random_state,
        )

        pipe = Pipeline([
            ('rare', RareLabelEncoder(
                n_categories=2, max_n_categories=3, replace_with='rare', tol=0.1,
                variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
            )),
            ('frq', CountFrequencyEncoder(
                encoding_method='frequency',
                variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
            )),
            ('imputer', MeanMedianImputer(
                imputation_method='mean',
                variables=lst_numerical_columns
            ))
        ])

        df_cleaned_train_exp_03_transformed = pipe.fit_transform(df_cleaned_train_exp_03)
        df_cleaned_test_exp_03_transformed = pipe.transform(df_cleaned_test_exp_03)

        X_train_exp_03, y_train_exp_03 = df_cleaned_train_exp_03_transformed.drop(columns=['FLAG']), df_cleaned_train_exp_03_transformed['FLAG'] 
        X_test_exp_03, y_test_exp_03 = df_cleaned_test_exp_03_transformed.drop(columns=['FLAG']), df_cleaned_test_exp_03_transformed['FLAG'] 

        classifier_lgbm = LGBMClassifier(
            random_state=RANDOM_STATE, n_jobs=-1, silent='warn', objective='binary', importance_type='gain'
        )

        classifier_lgbm.fit(X_train_exp_03, y_train_exp_03)

        y_pred = classifier_lgbm.predict(X_test_exp_03)
        
        dct_results[i] = {'test_size': test_size, 'random_state': random_state, 'roc_auc': roc_auc_score(y_pred, y_test_exp_03)}
        i += 1
        
df_results = pd.DataFrame.from_dict(dct_results, orient='index')

df_results_agg = df_results.groupby(by=['test_size']).agg({'roc_auc': ['mean', 'std', 'median']}).reset_index()
df_results_agg.columns = ['test_size', 'mean', 'std', 'median']
df_results_agg.sort_values(by=['mean'], ascending=False)

  0%|          | 0/3 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 1961, number of negative: 2247
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001736 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6226
[LightGBM] [Info] Number of data points in the train set: 4208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466017 -> initscore=-0.136141
[LightGBM] [Info] Start training from score -0.136141


[LightGBM] [Info] Number of positive: 1958, number of negative: 2250
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000828 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6227
[LightGBM] [Info] Number of data points in the train set: 4208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.465304 -> initscore=-0.139007
[LightGBM] [Info] Start training from score -0.139007
[LightGBM] [Info] Number of positive: 1964, number of negative: 2244
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6224
[LightGBM] [Info] Number of data points in the train set: 4208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466730 -> initscore=-0.133277
[LightGBM] [Info] Start training from score -0.133277
[LightGBM] [Info] 

 33%|███▎      | 1/3 [00:03<00:07,  3.67s/it]

[LightGBM] [Info] Number of positive: 1971, number of negative: 2237
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000850 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6236
[LightGBM] [Info] Number of data points in the train set: 4208, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468394 -> initscore=-0.126595
[LightGBM] [Info] Start training from score -0.126595
[LightGBM] [Info] Number of positive: 1863, number of negative: 2111
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6194
[LightGBM] [Info] Number of data points in the train set: 3974, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468797 -> initscore=-0.124974
[LightGBM] [Info] Start training from score -0.124974
[LightGBM] [Info] 

 67%|██████▋   | 2/3 [00:06<00:03,  3.16s/it]

[LightGBM] [Info] Number of positive: 1745, number of negative: 1995
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6157
[LightGBM] [Info] Number of data points in the train set: 3740, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466578 -> initscore=-0.133889
[LightGBM] [Info] Start training from score -0.133889
[LightGBM] [Info] Number of positive: 1735, number of negative: 2005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000708 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6170
[LightGBM] [Info] Number of data points in the train set: 3740, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.463904 -> initscore=-0.144637
[LightGBM] [Info] Start training from score -0.144637
[LightGBM] [Info] 

100%|██████████| 3/3 [00:08<00:00,  2.98s/it]

[LightGBM] [Info] Number of positive: 1770, number of negative: 1970
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000910 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6175
[LightGBM] [Info] Number of data points in the train set: 3740, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473262 -> initscore=-0.107054
[LightGBM] [Info] Start training from score -0.107054





Unnamed: 0,test_size,mean,std,median
0,0.1,0.9828,0.00318,0.98334
1,0.15,0.98198,0.0044,0.98347
2,0.2,0.98126,0.00344,0.98012
