# Data preparation - ethereum - v1

# Setup

## Library import
We import all the required Python libraries

In [182]:
import os

# Data manipulation
from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
from feature_engine.imputation import MeanMedianImputer
import pandas as pd
import numpy as np

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)
import matplotlib as plt

from tqdm import tqdm
from lightgbm import (
    LGBMClassifier, plot_importance, create_tree_digraph, plot_tree
)
import missingno as msno
# from pycaret.classification import ClassificationExperiment
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
    train_test_split
    , cross_val_score
    , StratifiedKFold
    , StratifiedShuffleSplit
    , cross_validate
    , GridSearchCV
)
# from ydata_profiling import ProfileReport
from sklearn.metrics import (
    confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve
)

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas
# pd.options.display.max_columns = None
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:.5f}'.format)
# pd.options.display.float_format = '{:.5f}'.format
# pd.options.display.max_rows = 120

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Local library import
We import all the required local libraries libraries

In [183]:
os.chdir('../')
from src.utils.data_describe import breve_descricao, serie_nulos, cardinalidade, check_for_equal_columns
os.chdir('./notebooks/')

# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [184]:
RAW_FOLDER = '../data/raw/'
INTERIM_FOLDER = '../data/interim/'
PROCESSED_FOLDER = '../data/processed/'
REPORTS_FOLDER = '../reports/'
RANDOM_STATE = 42

## Custom functions

In [None]:
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    df_temp = df.copy()
    lst_columns = [column.strip().replace(' ', '_') for column in df_temp.columns]
    df_temp.columns = lst_columns
    return df_temp


def load_file_into_df(file_path: str) -> pd.DataFrame:
    filename_with_ext = file_path.split('/')[-1].split('\\')[-1]

    if '.' in filename_with_ext:
        file_name = filename_with_ext.split('.')[-2]
        extension = filename_with_ext.split('.')[-1]
    else:
        file_name = filename_with_ext
        extension = ''

    try:
        df_raw = pd.read_parquet(INTERIM_FOLDER + f'{file_name}.pqt')
        print(f'PARQUET file loaded. Shape: {df_raw.shape}')
    except FileNotFoundError as e:
        df_raw = pd.read_csv(RAW_FOLDER + f'{file_name}.csv', index_col=0)
        df_raw.drop(columns=['Index'], inplace=True)
        df_raw = rename_columns(df_raw)
        df_raw.to_parquet(INTERIM_FOLDER +  f'{file_name}.pqt', index=False)
        print(f'EXCEL file loaded and PARQUET created. Shape: {df_raw.shape}')
    return df_raw


def list_equal_columns(df: pd. DataFrame, verbose: bool = False) -> tuple[list, dict]:
    dct_equal_columns = {}
    lst_equal_columns = []

    for i, column_to_test in enumerate(df.columns, start=1):
        lst_equal_columns_temp = []
        for column in [column for column in df.columns[i:] if column not in lst_equal_columns]:
            is_equal = df[column_to_test].equals(df[column])
            if is_equal:
                lst_equal_columns_temp.append(column)
                lst_equal_columns.append(column)
                if verbose:
                    print(f'{column_to_test} is EQUAL to {column}.')
        if len(lst_equal_columns_temp) > 0:
            dct_equal_columns[column_to_test] = lst_equal_columns_temp

    return lst_equal_columns, dct_equal_columns


def drop_unitary_columns(df: pd.DataFrame, verbose: bool = False) -> tuple[pd.DataFrame, list]:
    lst_unitary_columns = []
    df_temp = df.copy()

    for column in df_temp.columns:
        if len(df_temp[column].value_counts()) == 1:
            df_temp.drop(columns=[column], inplace=True)
            lst_unitary_columns.append(column)

    if verbose:
        print(f'Removed columns: {lst_unitary_columns}\n')

    return df_temp, lst_unitary_columns


# Data import
We retrieve all the required data for the analysis.

In [186]:
df_raw = load_file_into_df(file_path='transaction_dataset.csv')

display(df_raw.head(3))

PARQUET file loaded. Shape: (9841, 49)


Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract,total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_sent_tnx,ERC20_avg_time_between_rec_tnx,ERC20_avg_time_between_rec_2_tnx,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_min_val_sent_contract,ERC20_max_val_sent_contract,ERC20_avg_val_sent_contract,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
0,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,40,118,0.0,45.80678,6.58951,0.0,31.22,1.20068,0.0,0.0,0.0,810,865.69109,586.46667,0.0,-279.22442,265.0,35588543.78,35603169.52,0.0,30.0,54.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,15000000.0,265586.1476,0.0,16830998.35,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,2.61327,0.38569,0.0,1.8,0.03284,0.0,0.0,0.0,102,3.0873,3.08548,0.0,-0.00182,8.0,403.42831,2.26081,0.0,1.0,5.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,365.0,57.63262,2.26081,2.26081,2.26081,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,10,2,0.11312,1.16545,0.35891,0.05,3.53862,1.79431,0.0,0.0,0.0,12,3.58862,3.58906,0.0,0.00044,8.0,521.51207,0.0,0.0,0.0,7.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,442.81984,65.18901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON


# Data cleaning

1. Remove duplicated registries
2. Remove duplicated columns
3. Remove constant columns
4. Remove equal columns
5. Transforming categorical features

## Removing duplicated registries

In [187]:
df_cleaned = df_raw.copy()

df_cleaned.drop_duplicates(inplace=True)

print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated rows: {df_cleaned.shape}')

Shape before data cleasing: (9841, 49)
After dropping duplicated rows: (9823, 49)


### Removing duplicated wallet with different 'FLAG' 

In [188]:
print(f'Shape before dropping duplicated wallet: {df_cleaned.shape}')

df_cleaned.sort_values(by=['Address', 'FLAG'], ascending=[True, False])

df_cleaned.drop_duplicates(subset=['Address', 'FLAG'], keep='first', inplace=True)

print(f'After dropping dropping duplicated wallet: {df_cleaned.shape}')

Shape before dropping duplicated wallet: (9823, 49)
After dropping dropping duplicated wallet: (9816, 49)


## Removing duplicated columns

In [189]:
print(f'Shape before dropping duplicated columns: {df_cleaned.shape}')

dct_equal_columns = list_equal_columns(df_cleaned, verbose=True)[1]

for key, lst_to_drop in dct_equal_columns.items():
    df_cleaned.drop(columns=lst_to_drop, inplace=True)

print(f'Shape AFTER dropping duplicated columns: {df_cleaned.shape}')

Shape before dropping duplicated columns: (9816, 49)
ERC20_avg_time_between_sent_tnx is EQUAL to ERC20_avg_time_between_rec_tnx.
ERC20_avg_time_between_sent_tnx is EQUAL to ERC20_avg_time_between_rec_2_tnx.
ERC20_avg_time_between_sent_tnx is EQUAL to ERC20_avg_time_between_contract_tnx.
ERC20_avg_time_between_sent_tnx is EQUAL to ERC20_min_val_sent_contract.
ERC20_avg_time_between_sent_tnx is EQUAL to ERC20_max_val_sent_contract.
ERC20_avg_time_between_sent_tnx is EQUAL to ERC20_avg_val_sent_contract.
Shape AFTER dropping duplicated columns: (9816, 43)


## Removing constant columns

In [191]:
df_cleaned = drop_unitary_columns(df_cleaned, verbose=True)[0]

print(f'Shape before data cleasing: {df_raw.shape}')
print(f'After dropping duplicated columns: {df_cleaned.shape}')

Removed columns: ['ERC20_avg_time_between_sent_tnx']

Shape before data cleasing: (9841, 49)
After dropping duplicated columns: (9816, 42)


## Pre-processing categorical features

### Replacing "    " and "0" to "no information"

In [None]:
for column in ['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']:
    df_cleaned.loc[df_cleaned[column].isnull(), column] = "0"
    for category in [" ", "", "0"]:
        df_cleaned[column] = df_cleaned[column].str.strip()
        df_cleaned[column].replace(category, "no information", inplace=True)
    

    display(
        pd.DataFrame(
            100*df_cleaned.loc[:, column].value_counts(normalize=True)
        ).head(10)
    )


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Unnamed: 0_level_0,proportion
ERC20_most_sent_token_type,Unnamed: 1_level_1
no information,84.22983
EOS,1.40587
OmiseGO,1.38549
Golem,1.31418
blockwell.ai KYC Casper Token,1.29381
StatusNetwork,0.62143
BAT,0.38712
Qtum,0.34637
Bancor,0.326
Reputation,0.26487



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Unnamed: 0_level_0,proportion
ERC20_most_rec_token_type,Unnamed: 1_level_1
no information,53.78973
OmiseGO,8.85289
Blockwell say NOTSAFU,7.93602
DATAcoin,3.63692
Livepeer Token,2.07824
EOS,1.64018
XENON,1.47718
Golem,1.28362
GSENetwork,0.815
Tronix,0.77425


# Experiment 01 - Only numerical features

In [194]:
df_cleaned_exp_01 = df_cleaned.drop(columns=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']).copy()

## Splitting data set

In [195]:
test_size = 0.15
train_size = 1 - test_size

lst_test_sample = df_cleaned_exp_01.sample(int(test_size*len(df_cleaned_exp_01)), random_state=RANDOM_STATE).index.values
lst_train_sample = df_cleaned_exp_01.drop(index=lst_test_sample).index.values

print(f"""
test_size: {test_size}
test_sample (registries): {len(lst_test_sample)}

train_size: {train_size}
train_sample (registries): {len(lst_train_sample)}
""")

df_cleaned_exp_01.loc[lst_train_sample, :].head()


test_size: 0.15
test_sample (registries): 1472

train_size: 0.85
train_sample (registries): 8344



Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract,total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name
1,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,2.61327,0.38569,0.0,1.8,0.03284,0.0,0.0,0.0,102,3.0873,3.08548,0.0,-0.00182,8.0,403.42831,2.26081,0.0,1.0,5.0,0.0,7.0,0.0,365.0,57.63262,2.26081,2.26081,2.26081,1.0,7.0
2,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,10,2,0.11312,1.16545,0.35891,0.05,3.53862,1.79431,0.0,0.0,0.0,12,3.58862,3.58906,0.0,0.00044,8.0,521.51207,0.0,0.0,0.0,7.0,0.0,8.0,0.0,442.81984,65.18901,0.0,0.0,0.0,0.0,8.0
4,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,7,19,0.0,12.80241,2.6711,0.0,9.0,0.02269,0.0,0.0,0.0,4619,104.31888,53.4219,0.0,-50.89699,42.0,162829.6609,123539.9329,0.0,4.0,23.0,0.0,27.0,0.0,90000.0,4934.23215,0.0,45000.0,13726.65922,6.0,27.0
5,0x000895ad78f4403ecd9468900e68d6ee506136fd,0,9900.12,375.48,20926.68,2,3,0,2,1,0.72415,4.81378,3.23491,4.1668,5.53692,4.85186,0.0,0.0,0.0,5,9.70372,9.70472,0.0,0.00101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0x000d63fc5df52b0204374c2f5a3249779805d5d1,0,69.46,629.44,8660.35,25,11,0,9,20,0.049,2.65,1.09811,0.00946,4.28479,0.4825,0.0,0.0,0.0,36,12.06239,12.07927,0.0,0.01687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Exporting

In [196]:
df_cleaned_exp_01.loc[lst_train_sample, :].to_parquet(INTERIM_FOLDER + 'artigo_df_train_exp_01.pqt')
df_cleaned_exp_01.loc[lst_test_sample, :].to_parquet(INTERIM_FOLDER + 'artigo_df_test_exp_01.pqt')

# Experiment 02 - Categorical features encoded by frequency

In [197]:
df_cleaned_exp_02 = df_cleaned.copy()
df_cleaned_exp_02.shape

(9816, 42)

In [198]:
df_temp = df_cleaned_exp_02.drop(columns=['Address', 'FLAG'])

df_temp = cardinalidade(df_temp.select_dtypes(include=[float, int]))
df_temp['Proporção Nulos'].value_counts()


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'NaN' has dtype incompatible with int64, please explic

Proporção Nulos
0.00000    22
0.08445    16
Name: count, dtype: int64

## Splitting data set

In [199]:
test_size = 0.15
train_size = 1 - test_size

X = df_cleaned_exp_02.drop(columns=['Address', 'FLAG']).copy()
y = df_cleaned_exp_02[['FLAG']]

df_cleaned_train_exp_02, df_cleaned_test_exp_02 = train_test_split(
    df_cleaned_exp_02.drop(columns=['Address']), train_size=train_size, 
    random_state=RANDOM_STATE,
)

print(f"""
test_size: {test_size}
test_sample (registries): {len(df_cleaned_test_exp_02)}
test_sample's target's proportion:
{df_cleaned_test_exp_02['FLAG'].value_counts(normalize=True)}

train_size: {train_size}
train_sample (registries): {len(df_cleaned_train_exp_02)}
train_sample's target's proportion:
{df_cleaned_train_exp_02['FLAG'].value_counts(normalize=True)}
""")

df_cleaned_train_exp_02.head()


test_size: 0.15
test_sample (registries): 1473
test_sample's target's proportion:
FLAG
0   0.77733
1   0.22267
Name: proportion, dtype: float64

train_size: 0.85
train_sample (registries): 8343
train_sample's target's proportion:
FLAG
0   0.77814
1   0.22186
Name: proportion, dtype: float64



Unnamed: 0,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract,total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
2859,0,163.07,0.17,326.47,2,2,0,2,2,35.49665,65.50334,50.5,3.0,97.99902,50.49951,0.0,0.0,0.0,4,100.99902,101.0,0.0,0.00098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no information,no information
2149,0,0.0,4.64,2726.18,1,2,0,1,1,0.015,6.90985,3.46242,6.92436,6.92436,6.92436,0.0,0.0,0.0,3,6.92436,6.92485,0.0,0.00049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no information,no information
2979,0,1050.5,887.86,1393223.92,689,754,0,405,448,0.0,81.8288,1.66417,0.0,59.98,1.52056,0.0,0.0,0.0,1443,1047.66474,1254.78214,0.0,207.1174,31.0,2547558.025,387.73596,0.0,1.0,19.0,0.0,23.0,0.0,2537935.0,87846.82844,0.7,387.03596,193.86798,1.0,23.0,Golem,Golem
185,0,136.47,482175.49,964896.88,4,2,0,2,3,0.12225,30.0,15.06113,0.00069,29.39944,7.53027,0.0,0.0,0.0,6,30.12108,30.12225,0.0,0.00117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no information,no information
1002,0,0.0,1417.28,226764.5,0,160,1,4,0,0.0,6.87895,2.69873,0.0,0.0,0.0,0.0,0.0,0.0,161,0.0,431.79653,0.0,431.79653,3.0,14.09318,0.0,0.0,0.0,3.0,0.0,3.0,1.07236,11.46613,4.69773,0.0,0.0,0.0,0.0,3.0,no information,DATAcoin


## Preprocessing

In [200]:

lst_numerical_columns = df_cleaned_train_exp_02.drop(columns=[
    'FLAG', 'ERC20_most_sent_token_type', 'ERC20_most_rec_token_type'
]).columns.tolist()

lst_categorical_columns = ['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']

pipe = Pipeline([
    ('rare', RareLabelEncoder(
        n_categories=2, max_n_categories=3, replace_with='rare', tol=0.1,
        variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
    )),
    ('frq', CountFrequencyEncoder(
        encoding_method='frequency',
        variables=['ERC20_most_sent_token_type', 'ERC20_most_rec_token_type']
    )),
    ('imputer', MeanMedianImputer(
        imputation_method='median',
        variables=lst_numerical_columns
    ))
])

df_cleaned_train_exp_02_transformed = pipe.fit_transform(df_cleaned_train_exp_02)
df_cleaned_test_exp_02_transformed = pipe.transform(df_cleaned_test_exp_02)

display(df_cleaned_train_exp_02_transformed['ERC20_most_rec_token_type'].value_counts())
display(df_cleaned_train_exp_02_transformed['ERC20_most_sent_token_type'].value_counts())

ERC20_most_rec_token_type
0.53506    4464
0.46494    3879
Name: count, dtype: int64

ERC20_most_sent_token_type
0.84238    7028
0.15762    1315
Name: count, dtype: int64

### Effect of transformation over numerical features

#### Training dataset

In [201]:

df_num_features_before_transf = df_cleaned_train_exp_02.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]
df_num_features_after_transf = df_cleaned_train_exp_02_transformed.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]

pd.concat([df_num_features_before_transf, df_num_features_after_transf], axis=1)
# .to_excel(PROCESSED_FOLDER + 'df_after_preprocessing.xlsx')

Unnamed: 0,count,mean,std,50%,count.1,mean.1,std.1,50%.1
Avg_min_between_sent_tnx,8343.0,5030.61067,21578.15147,16.88,8343.0,5030.61067,21578.15147,16.88
Avg_min_between_received_tnx,8343.0,7989.67142,23008.38888,503.66,8343.0,7989.67142,23008.38888,503.66
Time_Diff_between_first_and_last_(Mins),8343.0,219932.64152,325301.55593,46352.6,8343.0,219932.64152,325301.55593,46352.6
Sent_tnx,8343.0,120.88541,771.23614,3.0,8343.0,120.88541,771.23614,3.0
Received_Tnx,8343.0,166.82776,941.89285,4.0,8343.0,166.82776,941.89285,4.0
Number_of_Created_Contracts,8343.0,3.78006,148.75941,0.0,8343.0,3.78006,148.75941,0.0
Unique_Received_From_Addresses,8343.0,31.34939,306.65191,2.0,8343.0,31.34939,306.65191,2.0
Unique_Sent_To_Addresses,8343.0,27.1008,278.24392,2.0,8343.0,27.1008,278.24392,2.0
min_value_received,8343.0,45.00561,343.07167,0.095,8343.0,45.00561,343.07167,0.095
max_value_received,8343.0,556.19442,13854.83002,5.99954,8343.0,556.19442,13854.83002,5.99954


#### Testing dataset

In [202]:

df_num_features_before_transf = df_cleaned_test_exp_02.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]
df_num_features_after_transf = df_cleaned_test_exp_02_transformed.loc[:, lst_numerical_columns].describe().T[['count', 'mean', 'std', '50%']]

pd.concat([df_num_features_before_transf, df_num_features_after_transf], axis=1).to_excel(PROCESSED_FOLDER + 'artigo_df_after_preprocessing.xlsx')

In [203]:
lst_numeric_columns = df_cleaned_train_exp_02.drop(columns=['FLAG']).select_dtypes(exclude=object).columns.tolist()

pd.DataFrame(
    df_cleaned_train_exp_02_transformed.loc[:, lst_numeric_columns].median(axis=0), columns=['Mediana']
).to_excel(PROCESSED_FOLDER + 'artigo_preprocessing_numerical.xlsx')

### Effect of transformation over categorical features

#### Training dataset

In [204]:
column = 'ERC20_most_rec_token_type'

# Before
df_temp = pd.DataFrame(
    100*df_cleaned_train_exp_02[[column]].value_counts() /\
    df_cleaned_train_exp_02[[column]].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
df_temp.to_excel(PROCESSED_FOLDER + f'artigo_before_{column}.xlsx', engine='xlsxwriter')
display(df_temp.head())

# After

display(df_cleaned_train_exp_02_transformed[column].unique())

Unnamed: 0,Proporção,cumsum


array([0.53505933, 0.46494067])

In [56]:
column = 'ERC20_most_sent_token_type'

# Before
df_temp = pd.DataFrame(
    100*df_cleaned_train_exp_02[[column]].value_counts() /\
    df_cleaned_train_exp_02[[column]].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
df_temp.to_excel(PROCESSED_FOLDER + f'Before_{column}.xlsx', engine='xlsxwriter')
display(df_temp.head())

# After

display(df_cleaned_train_exp_02_transformed[column].unique())

Unnamed: 0,Proporção,cumsum


array([0.89154504, 0.10845496])

### Testing dataset

In [205]:
column = 'ERC20_most_rec_token_type'

# Before
df_temp = pd.DataFrame(
    100*df_cleaned_test_exp_02[[column]].value_counts() /\
    df_cleaned_test_exp_02[[column]].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
display(df_temp.head())

# After
display(df_cleaned_test_exp_02_transformed[column].unique())

Unnamed: 0,Proporção,cumsum


array([0.46494067, 0.53505933])

In [58]:
# Before
df_temp = pd.DataFrame(
    100*df_cleaned_test_exp_02[['ERC20_most_sent_token_type']].value_counts() /\
    df_cleaned_test_exp_02[['ERC20_most_sent_token_type']].value_counts().sum(),
    columns=['Proporção']
)

df_temp['cumsum'] = df_temp['Proporção'].cumsum()
display(df_temp.head())

# After
display(df_cleaned_test_exp_02_transformed['ERC20_most_sent_token_type'].unique())

Unnamed: 0,Proporção,cumsum


array([0.89154504, 0.10845496])

## Exporting

In [206]:
df_cleaned_train_exp_02_transformed.to_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_train_exp_02_transformed.pqt')
df_cleaned_test_exp_02_transformed.to_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_test_exp_02_transformed.pqt')