In [1]:
DIR = '../data/pump-fun-graduation-february-2025'

In [2]:
IS_DEBUG = False

In [3]:
!ls {DIR}

chunk_1.csv   chunk_20.csv  chunk_31.csv  chunk_5.csv
chunk_10.csv  chunk_21.csv  chunk_32.csv  chunk_6.csv
chunk_11.csv  chunk_22.csv  chunk_33.csv  chunk_7.csv
chunk_12.csv  chunk_23.csv  chunk_34.csv  chunk_8.csv
chunk_13.csv  chunk_24.csv  chunk_35.csv  chunk_9.csv
chunk_14.csv  chunk_25.csv  chunk_36.csv  dune_token_info.csv
chunk_15.csv  chunk_26.csv  chunk_37.csv  dune_token_info_v2.csv
chunk_16.csv  chunk_27.csv  chunk_38.csv  pump-fun-graduation-february-2025.zip
chunk_17.csv  chunk_28.csv  chunk_39.csv  test_unlabeled.csv
chunk_18.csv  chunk_29.csv  chunk_4.csv   token_info_onchain_divers.csv
chunk_19.csv  chunk_3.csv   chunk_40.csv  token_info_onchain_divers_v2.csv
chunk_2.csv   chunk_30.csv  chunk_41.csv  train.csv


In [4]:
import pandas as pd
import os
import catboost

In [5]:
train = pd.read_csv(os.path.join(DIR, 'train.csv'))

train.shape

(639557, 6)

In [6]:
train.columns

Index(['Unnamed: 0', 'mint', 'slot_min', 'slot_graduated', 'has_graduated',
       'is_valid'],
      dtype='object')

In [7]:
filenames = !ls {DIR}/chunk*.csv

In [8]:
from tqdm.auto import tqdm

def generate_features(filenames):
    all_data = []
    for chunk_filename in tqdm(filenames):
        all_data.append(
            pd.read_csv(chunk_filename)
        )
    data = pd.concat(all_data)
    data.info()
    features = data.groupby('base_coin').agg({
        'quote_coin_amount': ['sum', 'min', 'max', 'count', 'last', 'first', 'median', 'std'], 
        'virtual_sol_balance_after': ['sum', 'min', 'max', 'last', 'first', 'median', 'std'],
        'base_coin_amount': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'provided_gas_fee': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'provided_gas_limit': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'fee': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'consumed_gas': ['sum', 'min', 'max', 'last', 'first', 'median', 'std', 'mean'],
        'direction': ['last',],
        'slot': ['min', 'max', 'median', 'mean']
    })
    features.columns = ['_'.join(f) for f in features.columns]
    features['slot_max_offset'] = features['slot_max'] - features['slot_min']
    features['slot_mean_offset'] = features['slot_mean'] - features['slot_min']
    features['slot_median_offset'] = features['slot_median'] - features['slot_min']
    features = features.drop(columns=['slot_min', 'slot_max', 'slot_median', 'slot_mean'])
    
    return features

if IS_DEBUG:
    filenames = filenames[:2]+filenames[-2:]
features = generate_features(filenames)

  0%|          | 0/41 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 17033442 entries, 0 to 430066
Data columns (total 15 columns):
 #   Column                       Dtype 
---  ------                       ----- 
 0   block_time                   object
 1   slot                         int64 
 2   tx_idx                       int64 
 3   signing_wallet               object
 4   direction                    object
 5   base_coin                    object
 6   base_coin_amount             int64 
 7   quote_coin_amount            int64 
 8   virtual_token_balance_after  int64 
 9   virtual_sol_balance_after    int64 
 10  signature                    object
 11  provided_gas_fee             int64 
 12  provided_gas_limit           int64 
 13  fee                          int64 
 14  consumed_gas                 int64 
dtypes: int64(10), object(5)
memory usage: 2.0+ GB


In [9]:
feature_names = list(features.columns)

In [10]:
Xy = train[['mint', 'has_graduated']].merge(features, left_on='mint', right_on='base_coin', how='left')

In [11]:
onchain_divers = pd.read_csv(os.path.join(DIR, 'token_info_onchain_divers_v2.csv'))

In [12]:
onchain_divers.columns

Index(['block_time', 'slot', 'tx_idx', 'creator', 'name', 'symbol', 'url',
       'mint', 'bundle_size', 'gas_used', 'amount_of_instructions',
       'amount_of_lookup_reads', 'amount_of_lookup_writes', 'bundle_structure',
       'bundled_buys', 'bundled_buys_count', 'dev_balance',
       'creation_ix_index', 'curve_address', 'pf_program_index',
       'direct_pf_invocation', 'version'],
      dtype='object')

In [13]:
onchain_divers = onchain_divers.drop_duplicates(subset=['mint'])

In [14]:
for c in ['name', 'creator']:
    onchain_divers[c] = onchain_divers[c].str.lower()

In [15]:
name_count = onchain_divers['name'].value_counts()
creator_count = onchain_divers['creator'].value_counts()

In [16]:
name_count_train = onchain_divers.loc[onchain_divers['mint'].isin(train.mint), 'name'].value_counts()
creator_count_train = onchain_divers.loc[onchain_divers['mint'].isin(train.mint), 'creator'].value_counts()

## Introducing two similar feature groups (one with a leak)

`duplicate_count` has leak

`train_duplicate_count` only uses train and therefore has no leak

In [17]:
name_count.name = 'name_duplicate_count'
creator_count.name = 'creator_duplicate_count'

In [18]:
name_count_train.name = 'name_train_duplicate_count'
creator_count_train.name = 'creator_train_duplicate_count'

In [19]:
onchain_divers = onchain_divers.merge(name_count, on='name', how='left')
onchain_divers = onchain_divers.merge(creator_count, on='creator', how='left')

In [20]:
onchain_divers = onchain_divers.merge(name_count_train, on='name', how='left')
onchain_divers = onchain_divers.merge(creator_count_train, on='creator', how='left')

In [21]:
onchain_divers = onchain_divers.drop(columns=['curve_address', 'pf_program_index', 'slot', 'block_time', 'name', 'url', 'symbol']).set_index('mint')

In [22]:
Xy = Xy.merge(onchain_divers, on='mint', how='left')

In [23]:
if IS_DEBUG:
    model = catboost.CatBoostClassifier(task_type="GPU", iterations=50)
else:
    model = catboost.CatBoostClassifier(task_type="GPU", iterations=1000)

In [24]:
feature_names += list(onchain_divers.columns)

In [25]:
cat_features = [c for c in feature_names if Xy[c].dtype == 'object' or Xy[c].nunique() < 10]
cat_features

['direction_last',
 'creator',
 'bundle_size',
 'amount_of_lookup_reads',
 'amount_of_lookup_writes',
 'bundle_structure',
 'bundled_buys_count',
 'creation_ix_index',
 'direct_pf_invocation',
 'version']

In [26]:
Xy[cat_features] = Xy[cat_features].astype(str)

In [27]:
model.fit(Xy[feature_names], Xy['has_graduated'], cat_features=cat_features, metric_period=100)

Learning rate set to 0.023629
0:	learn: 0.6370664	total: 28.1ms	remaining: 28s
100:	learn: 0.0397182	total: 2.38s	remaining: 21.2s
200:	learn: 0.0375819	total: 4.76s	remaining: 18.9s
300:	learn: 0.0369761	total: 7.15s	remaining: 16.6s
400:	learn: 0.0365851	total: 9.5s	remaining: 14.2s
500:	learn: 0.0362607	total: 11.9s	remaining: 11.9s
600:	learn: 0.0359920	total: 14.3s	remaining: 9.51s
700:	learn: 0.0357368	total: 16.7s	remaining: 7.13s
800:	learn: 0.0355091	total: 19.2s	remaining: 4.76s
900:	learn: 0.0352944	total: 21.6s	remaining: 2.38s
999:	learn: 0.0350785	total: 24s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x70f47e5d5bd0>

In [28]:
test = pd.read_csv(os.path.join(DIR, 'test_unlabeled.csv'))

In [29]:
X_test = test[['mint']].merge(features, left_on='mint', right_on='base_coin', how='left')

In [30]:
X_test = X_test.merge(onchain_divers, on='mint', how='left')

In [31]:
X_test[cat_features] = X_test[cat_features].astype(str)

In [32]:
p = model.predict_proba(X_test[feature_names])[:, 1]

In [33]:
X_test.set_index('mint', drop=True)[feature_names].to_parquet('full_features.parquet')

In [34]:
pd.Series(p).value_counts()

0.001729    37
0.001728    27
0.000996    21
0.001008    20
0.001589    19
            ..
0.006791     1
0.004032     1
0.003023     1
0.007272     1
0.001114     1
Name: count, Length: 473931, dtype: int64

In [35]:
submission = X_test[['mint']]

In [36]:
submission['has_graduated'] = p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['has_graduated'] = p


In [37]:
submission

Unnamed: 0,mint,has_graduated
0,9Wt3N7etKMX9cioTdEJ5S4b8A9nK3M66n9RFVgBGpump,0.002509
1,9q5y2X2P8ZEKTjyXBVcS5q2EZM7HbNV8DURY2qnvqi2f,0.001397
2,HL2di8dcQ7eYDmkcFoZ4zJyHX5SbRZXAJxTegL3JPfx2,0.000482
3,7iAFj9Pc5QH9jbGmHwYe8T6yzNVbjhL13PNJXVTspump,0.013539
4,F7U1Rdgz2KFpneKpAnYytWF2jggnsrLScfi2A668pump,0.000976
...,...,...
478827,BRDCfyWZaZURhCwDyst2UufM1yJ7r2hiBQurp323pump,0.003541
478828,Dt5gdJqfAnn6EiimukYwpEkg5VnT7s4CGuPjM7ahpump,0.001938
478829,2tGi3egEvr7gH5VD3DC8BWrWB4sem3XetcRzdh6gpump,0.004187
478830,6Lyy15Hondyj4XTWf6SzviioAuxoYs1UiMEvL74Gpump,0.001826


In [38]:
assert submission.shape[0] == test.shape[0]

In [39]:
submission.to_csv('leak_submission.csv', index=False)

In [40]:
sorted(zip(map(float, model.get_feature_importance()), feature_names))

[(0.0015236473655254774, 'version'),
 (0.00751444042587431, 'bundle_size'),
 (0.03125955207157971, 'direct_pf_invocation'),
 (0.05353506357585184, 'amount_of_lookup_writes'),
 (0.05688969646121022, 'provided_gas_fee_min'),
 (0.07952312684430067, 'fee_min'),
 (0.08182620114703883, 'provided_gas_limit_max'),
 (0.11212252872705837, 'amount_of_lookup_reads'),
 (0.12033304900082235, 'consumed_gas_first'),
 (0.1580021331402418, 'fee_last'),
 (0.15952319206016854, 'provided_gas_limit_min'),
 (0.17862122079743797, 'gas_used'),
 (0.20046967422472822, 'provided_gas_fee_max'),
 (0.21200478831955527, 'fee_std'),
 (0.21269806695821825, 'base_coin_amount_first'),
 (0.23402020194944026, 'provided_gas_fee_last'),
 (0.2342259952757878, 'consumed_gas_max'),
 (0.24454890018735273, 'fee_max'),
 (0.24560049619007948, 'direction_last'),
 (0.27226287623153284, 'provided_gas_limit_last'),
 (0.27553722463528124, 'consumed_gas_median'),
 (0.2795208378258291, 'quote_coin_amount_max'),
 (0.2833531368531489, 'tx_i