In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pump-fun-graduation-february-2025/chunk_40.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_23.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_18.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_21.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_38.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_37.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_36.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_14.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_6.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_35.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_34.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_32.csv
/kaggle/input/pump-fun-graduation-february-2025/token_info_onchain_divers.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_17.csv
/kaggle/input/pump-fun-graduation-february-2025/test_unlabeled.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_8.csv
/ka

In [2]:
import os
import glob
import pandas as pd
import polars as pl
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split


BASE_API  = "/kaggle/input/pump-fun-api-solana-tokens-info"
BASE_GRAD = "/kaggle/input/pump-fun-graduation-february-2025"
BASE_COMP = "/kaggle/input/solana-skill-sprint-memcoin-graduation"

INFO1_FILE = os.path.join(BASE_API, "pump_fun_api_info.parquet")
TRAIN_FILE = os.path.join(BASE_COMP, "train.csv")
TEST_FILE  = os.path.join(BASE_COMP, "test_unlabeled.csv")
SAMPLE_SUB = os.path.join(BASE_COMP, "sample_submission.csv")


info1 = pl.read_parquet(INFO1_FILE, columns=["mint", "created_timestamp"])


agg_df = None
for path in glob.glob(os.path.join(BASE_GRAD, "chunk_*.csv")):
    for chunk in pd.read_csv(
        path,
        usecols=["base_coin", "base_coin_amount", "quote_coin_amount", "fee"],
        chunksize=500_000
    ):
        chunk.rename(columns={"base_coin": "mint"}, inplace=True)
        grp = chunk.groupby("mint").agg(
            tx_count=("mint", "size"),
            sum_base=("base_coin_amount", "sum"),
            sum_quote=("quote_coin_amount", "sum"),
            sum_fee=("fee", "sum"),
        )
        agg_df = grp if agg_df is None else agg_df.add(grp, fill_value=0)

agg_df["mean_base"]  = agg_df["sum_base"]  / agg_df["tx_count"]
agg_df["mean_quote"] = agg_df["sum_quote"] / agg_df["tx_count"]
agg_df["mean_fee"]   = agg_df["sum_fee"]   / agg_df["tx_count"]
info2_agg = pl.from_pandas(agg_df.reset_index())


train = pl.read_csv(TRAIN_FILE).drop("")
test  = pl.read_csv(TEST_FILE).drop("")

def enrich(df: pl.DataFrame) -> pl.DataFrame:
    return (
        df.lazy()
          .join(info1.lazy(),    on="mint", how="left")
          .join(info2_agg.lazy(), on="mint", how="left")
          .fill_null(0)
          .collect(streaming=True)
    )

train_enriched = enrich(train)
test_enriched  = enrich(test)


y = train_enriched["has_graduated"]
X = train_enriched.drop(["mint", "has_graduated", "slot_graduated"])


float_cols = [c for c, dt in zip(X.columns, X.dtypes) if dt == pl.Float64]
X = X.with_columns([pl.col(c).cast(pl.Float32) for c in float_cols])


X_train, X_val, y_train, y_val = train_test_split(
    X.to_pandas(), y.to_pandas(),
    test_size=0.2, random_state=42
)


model = LGBMClassifier(
    learning_rate=0.05,
    num_leaves=64,
    n_estimators=1000,
    random_state=42
)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="binary_logloss",
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)


sample_pd = pd.read_csv(SAMPLE_SUB)
X_test    = test_enriched.drop(["mint"]).to_pandas()
preds     = model.predict_proba(X_test)[:, 1]

sample_pd["has_graduated"] = preds
sample_pd.to_csv("submission.csv", index=False)
print("submission.csv saved!")


[LightGBM] [Info] Number of positive: 5932, number of negative: 505713
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 511645, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011594 -> initscore=-4.445608
[LightGBM] [Info] Start training from score -4.445608
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.0427593
Early stopping, best iteration is:
[80]	valid_0's binary_logloss: 0.0427383
submission.csv saved!
