The only difference for this notebook is that we use alternative dataset

The test period is now shorter

- If features were calculated with the time limitation, they would not change
- But if they used some of the future data, they will change because this data is no longer available

In [1]:
DIR = '../data/pump-fun-graduation-february-2025-shorter'

In [2]:
IS_DEBUG = False

In [3]:
!ls {DIR}

chunk_1.csv   chunk_18.csv  chunk_26.csv  chunk_8.csv
chunk_10.csv  chunk_19.csv  chunk_27.csv  chunk_9.csv
chunk_11.csv  chunk_2.csv   chunk_28.csv  dune_token_info.csv
chunk_12.csv  chunk_20.csv  chunk_29.csv  dune_token_info_v2.csv
chunk_13.csv  chunk_21.csv  chunk_3.csv   pump-fun-graduation-february-2025.zip
chunk_14.csv  chunk_22.csv  chunk_4.csv   test_unlabeled.csv
chunk_15.csv  chunk_23.csv  chunk_5.csv   token_info_onchain_divers.csv
chunk_16.csv  chunk_24.csv  chunk_6.csv   token_info_onchain_divers_v2.csv
chunk_17.csv  chunk_25.csv  chunk_7.csv   train.csv


In [4]:
import pandas as pd
import os
import catboost

In [5]:
train = pd.read_csv(os.path.join(DIR, 'train.csv'))

train.shape

(639557, 6)

In [6]:
train.columns

Index(['Unnamed: 0', 'mint', 'slot_min', 'slot_graduated', 'has_graduated',
       'is_valid'],
      dtype='object')

In [7]:
filenames = !ls {DIR}/chunk*.csv

In [8]:
from tqdm.auto import tqdm

def generate_features(filenames):
    all_data = []
    for chunk_filename in tqdm(filenames):
        all_data.append(
            pd.read_csv(chunk_filename)
        )
    data = pd.concat(all_data)
    data.info()
    features = data.groupby('base_coin').agg({
        'quote_coin_amount': ['sum', 'min', 'max', 'count', 'last', 'first', 'median', 'std'], 
        'virtual_sol_balance_after': ['sum', 'min', 'max', 'last', 'first', 'median', 'std'],
        'base_coin_amount': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'provided_gas_fee': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'provided_gas_limit': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'fee': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'consumed_gas': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'direction': ['last',],
        'slot': ['min', 'max', 'median', 'mean']
    })
    features.columns = ['_'.join(f) for f in features.columns]
    features['slot_max_offset'] = features['slot_max'] - features['slot_min']
    features['slot_mean_offset'] = features['slot_mean'] - features['slot_min']
    features['slot_median_offset'] = features['slot_median'] - features['slot_min']
    features = features.drop(columns=['slot_min', 'slot_max', 'slot_median', 'slot_mean'])
    
    return features

if IS_DEBUG:
    filenames = filenames[:2]+filenames[-2:]
features = generate_features(filenames)

  0%|          | 0/29 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 12344685 entries, 0 to 430066
Data columns (total 15 columns):
 #   Column                       Dtype 
---  ------                       ----- 
 0   block_time                   object
 1   slot                         int64 
 2   tx_idx                       int64 
 3   signing_wallet               object
 4   direction                    object
 5   base_coin                    object
 6   base_coin_amount             int64 
 7   quote_coin_amount            int64 
 8   virtual_token_balance_after  int64 
 9   virtual_sol_balance_after    int64 
 10  signature                    object
 11  provided_gas_fee             int64 
 12  provided_gas_limit           int64 
 13  fee                          int64 
 14  consumed_gas                 int64 
dtypes: int64(10), object(5)
memory usage: 1.5+ GB


In [9]:
feature_names = list(features.columns)

In [10]:
Xy = train[['mint', 'has_graduated']].merge(features, left_on='mint', right_on='base_coin', how='left')

In [11]:
onchain_divers = pd.read_csv(os.path.join(DIR, 'token_info_onchain_divers_v2.csv'))

In [12]:
onchain_divers.columns

Index(['block_time', 'slot', 'tx_idx', 'creator', 'name', 'symbol', 'url',
       'mint', 'bundle_size', 'gas_used', 'amount_of_instructions',
       'amount_of_lookup_reads', 'amount_of_lookup_writes', 'bundle_structure',
       'bundled_buys', 'bundled_buys_count', 'dev_balance',
       'creation_ix_index', 'curve_address', 'pf_program_index',
       'direct_pf_invocation', 'version'],
      dtype='object')

In [13]:
onchain_divers = onchain_divers.drop_duplicates(subset=['mint'])

In [14]:
for c in ['name', 'creator']:
    onchain_divers[c] = onchain_divers[c].str.lower()

In [15]:
name_count = onchain_divers['name'].value_counts()
creator_count = onchain_divers['creator'].value_counts()

In [16]:
name_count_train = onchain_divers.loc[onchain_divers['mint'].isin(train.mint), 'name'].value_counts()
creator_count_train = onchain_divers.loc[onchain_divers['mint'].isin(train.mint), 'creator'].value_counts()

## Introducing two similar feature groups (one with a leak)

`duplicate_count` has leak

`train_duplicate_count` only uses train and therefore has no leak

In [17]:
name_count.name = 'name_duplicate_count'
creator_count.name = 'creator_duplicate_count'

In [18]:
name_count_train.name = 'name_train_duplicate_count'
creator_count_train.name = 'creator_train_duplicate_count'

In [19]:
onchain_divers = onchain_divers.merge(name_count, on='name', how='left')
onchain_divers = onchain_divers.merge(creator_count, on='creator', how='left')

In [20]:
onchain_divers = onchain_divers.merge(name_count_train, on='name', how='left')
onchain_divers = onchain_divers.merge(creator_count_train, on='creator', how='left')

In [21]:
onchain_divers = onchain_divers.drop(columns=['curve_address', 'pf_program_index', 'slot', 'block_time', 'name', 'url', 'symbol']).set_index('mint')

In [22]:
Xy = Xy.merge(onchain_divers, on='mint', how='left')

In [23]:
if IS_DEBUG:
    model = catboost.CatBoostClassifier(task_type="GPU", iterations=50)
else:
    model = catboost.CatBoostClassifier(task_type="GPU", iterations=1000)

In [24]:
feature_names += list(onchain_divers.columns)

In [25]:
cat_features = [c for c in feature_names if Xy[c].dtype == 'object' or Xy[c].nunique() < 10]
cat_features

['direction_last',
 'creator',
 'bundle_size',
 'amount_of_lookup_reads',
 'amount_of_lookup_writes',
 'bundle_structure',
 'bundled_buys_count',
 'creation_ix_index',
 'direct_pf_invocation',
 'version']

In [26]:
Xy[cat_features] = Xy[cat_features].astype(str)

In [27]:
model.fit(Xy[feature_names], Xy['has_graduated'], cat_features=cat_features, metric_period=100)

Learning rate set to 0.023629
0:	learn: 0.6370664	total: 28.1ms	remaining: 28.1s
100:	learn: 0.0396857	total: 2.39s	remaining: 21.3s
200:	learn: 0.0375752	total: 4.78s	remaining: 19s
300:	learn: 0.0369738	total: 7.18s	remaining: 16.7s
400:	learn: 0.0365977	total: 9.55s	remaining: 14.3s
500:	learn: 0.0362699	total: 11.9s	remaining: 11.9s
600:	learn: 0.0359991	total: 14.3s	remaining: 9.52s
700:	learn: 0.0357525	total: 16.7s	remaining: 7.14s
800:	learn: 0.0355344	total: 19.1s	remaining: 4.75s
900:	learn: 0.0353237	total: 21.5s	remaining: 2.36s
999:	learn: 0.0351115	total: 23.8s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7c64eacff730>

In [28]:
test = pd.read_csv(os.path.join(DIR, 'test_unlabeled.csv'))

In [29]:
X_test = test[['mint']].merge(features, left_on='mint', right_on='base_coin', how='left')

In [30]:
X_test = X_test.merge(onchain_divers, on='mint', how='left')

In [31]:
X_test[cat_features] = X_test[cat_features].astype(str)

In [32]:
p = model.predict_proba(X_test[feature_names])[:, 1]

In [33]:
X_test.set_index('mint', drop=True)[feature_names].to_parquet('shorter_features.parquet')

In [34]:
pd.Series(p).value_counts()

0.001119    21
0.001079    20
0.001060    16
0.000919    16
0.001076    10
            ..
0.003725     1
0.001142     1
0.000473     1
0.000606     1
0.001109     1
Name: count, Length: 177692, dtype: int64

In [35]:
submission = X_test[['mint']]

In [36]:
submission['has_graduated'] = p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['has_graduated'] = p


In [37]:
submission

Unnamed: 0,mint,has_graduated
0,9Wt3N7etKMX9cioTdEJ5S4b8A9nK3M66n9RFVgBGpump,0.002552
1,9q5y2X2P8ZEKTjyXBVcS5q2EZM7HbNV8DURY2qnvqi2f,0.001634
2,HL2di8dcQ7eYDmkcFoZ4zJyHX5SbRZXAJxTegL3JPfx2,0.000537
3,7iAFj9Pc5QH9jbGmHwYe8T6yzNVbjhL13PNJXVTspump,0.012567
4,F7U1Rdgz2KFpneKpAnYytWF2jggnsrLScfi2A668pump,0.001052
...,...,...
178813,8wSwkksQj6tgyiXKTSgnXegJFHdRkmvU9ku5tG1Xpump,0.001759
178814,GXu6DZzEE9n7d7micEPmrE8AXZjwhPAXbyEx28FMpump,0.002262
178815,4ur7rxDE6gWrC6w6BW2FUKFspR5h8Em38dJ9V3j5pump,0.002151
178816,6T72nMS1vV5quTuaxQBYTVH3KGvb3iQTGjyJhCRfpump,0.007799


In [38]:
assert submission.shape[0] == test.shape[0]

In [39]:
submission.to_csv('leak_submission_shorter.csv', index=False)

In [40]:
sorted(zip(map(float, model.get_feature_importance()), feature_names))

[(0.0015310745940092917, 'version'),
 (0.007380465826386668, 'bundle_size'),
 (0.03042287383781125, 'direct_pf_invocation'),
 (0.050339111444171004, 'amount_of_lookup_writes'),
 (0.061559126132716566, 'provided_gas_fee_min'),
 (0.09416662494711821, 'amount_of_lookup_reads'),
 (0.09948853673403896, 'provided_gas_limit_max'),
 (0.10243485121272328, 'fee_min'),
 (0.135507204493868, 'fee_last'),
 (0.14558436552925375, 'consumed_gas_first'),
 (0.16402563306060167, 'fee_std'),
 (0.17988546951642215, 'gas_used'),
 (0.1824397518242916, 'provided_gas_limit_min'),
 (0.22318181889740635, 'base_coin_amount_first'),
 (0.24020447851113702, 'provided_gas_fee_last'),
 (0.2414345954334332, 'fee_max'),
 (0.24897412673351857, 'direction_last'),
 (0.2490605752238472, 'quote_coin_amount_std'),
 (0.26358539038864987, 'consumed_gas_max'),
 (0.2692249709318303, 'quote_coin_amount_max'),
 (0.2831094811695359, 'consumed_gas_mean'),
 (0.2864797735005906, 'bundled_buys_count'),
 (0.2988906621322408, 'provided_gas