## Data Preparation

In [None]:
# data_preparation.py

"""
Part 1: Data Preparation
------------------------
Pulls raw data (TVL, metadata) from DuckDB, merges protocol info with chain alignment,
handles token mappings, and computes misrepresentation flags. Exports 'protocol_data_cleaned.csv'.
"""

import pandas as pd
import numpy as np
import re
from datetime import timedelta
import urllib3
import warnings

from op_analytics.cli.subcommands.pulls.defillama.dataaccess import DefiLlama
from op_analytics.coreutils.duckdb_inmem.client import init_client as init_duckdb

urllib3.disable_warnings()
warnings.filterwarnings("ignore")

# Initialize DuckDB client
duckdb_client = init_client = init_duckdb()

# Minimum date for data pull
MINIMUM_DATE = "2024-01-01"

# 1) Pull protocol TVL data from DuckDB
view_tvl_data = DefiLlama.PROTOCOLS_TOKEN_TVL.read(min_date=MINIMUM_DATE)
df_protocol_tvl_data = duckdb_client.client.sql(
    f"""
    SELECT
        dt,
        protocol_slug,
        chain,
        token,
        app_token_tvl,
        app_token_tvl_usd
    FROM {view_tvl_data}
    """
).to_df()

# 2) Pull protocol metadata from DuckDB
view_meta_data = DefiLlama.PROTOCOLS_METADATA.read(min_date=MINIMUM_DATE)
df_protocol_metadata = duckdb_client.client.sql(
    f"""
    SELECT
        protocol_name,
        protocol_slug,
        protocol_category,
        parent_protocol,
        misrepresented_tokens
    FROM {view_meta_data}
    """
).to_df()

# Convert misrepresented_tokens to int
df_protocol_metadata["misrepresented_tokens"] = df_protocol_metadata["misrepresented_tokens"].map(
    {"True": 1, "False": 0}
).fillna(0).astype(int)

# 3) Token-level mapping stored as a list of dicts
source_tokens_data = [
    {"token":"WEETH","project":"Ether-fi","source_protocol":"ether-fi"},
    {"token":"GRT","project":"The Graph","source_protocol":"the-graph"},
    {"token":"PT-SUSDE-27MAR2025","project":"Pendle","source_protocol":"pendle"},
    {"token":"EURC","project":"Circle","source_protocol":"circle"},
    {"token":"FBOMB","project":"Millennium Club","source_protocol":"millenium-club"},
    {"token":"WETH-USDC-GMX-V2","project":"GMX","source_protocol":"gmx"},
    {"token":"USD0++","project":"Usual","source_protocol":"usual-money"},
    {"token":"WSTETH","project":"Lido","source_protocol":"lido"},
    {"token":"MGLP","project":"Abracadabra Money","source_protocol":"abracadabra"},
    {"token":"USDT","project":"Tether","source_protocol":"tether"},
    {"token":"AIXBT","project":"Virtuals Protocol","source_protocol":"virtuals-protocol"},
    {"token":"VAWSTETH","project":"Metronome","source_protocol":"metronome"},
    {"token":"MBTC","project":"Merlin","source_protocol":"merlin"},
    {"token":"FRAX","project":"Frax Finance","source_protocol":"frax-finance"},
    {"token":"AETHWSTETH","project":"Ankr","source_protocol":"ankr"},
    {"token":"USX","project":"dForce","source_protocol":"dforce"},
    {"token":"BEAN","project":"Beanstalk","source_protocol":"beanstalk"},
    {"token":"PENDLE","project":"Pendle","source_protocol":"pendle"},
    {"token":"GLP","project":"GMX","source_protocol":"gmx"},
    {"token":"OP","project":"Optimism","source_protocol":"optimism"},
    {"token":"WSUPEROETHB","project":"Origin Protocol","source_protocol":"origin-defi"},
    {"token":"WETH","project":"Ethereum","source_protocol":"ethereum"},
    {"token":"SOLVBTC.BBN","project":"Solv","source_protocol":"solv-protocol"},
    {"token":"LPT","project":"Livepeer","source_protocol":"livepeer"},
    {"token":"MIM","project":"Abracadabra Money","source_protocol":"abracadabra"},
    {"token":"LOTUS","project":"Lotus","source_protocol":"lotus"},
    {"token":"EBTC","project":"eBTC","source_protocol":"ebtc-protocol"},
    {"token":"SEAM","project":"Seamless Protocol","source_protocol":"seamless-protocol"},
    {"token":"SWETH","project":"Swell Network","source_protocol":"swell"},
    {"token":"PAXG","project":"Paxos","source_protocol":"paxos-gold"},
    {"token":"ARB","project":"Arbitrum","source_protocol":"abritrum"},
    {"token":"SUPEROETHB","project":"Origin Protocol","source_protocol":"origin-defi"},
    {"token":"NORMIE","project":"Normie","source_protocol":"normie"},
    {"token":"ETHX","project":"Stader Labs","source_protocol":"stader"},
    {"token":"LQDR","project":"Abacus","source_protocol":"abacus"},
    {"token":"GMX","project":"GMX","source_protocol":"gms"},
    {"token":"EETH","project":"Ether-fi","source_protocol":"ether-fi"},
    {"token":"SUSD","project":"Synthetix","source_protocol":"synthetix"},
    {"token":"PT-SUSDE-26DEC2024","project":"Pendle","source_protocol":"pendle"},
    {"token":"SOLVBTC","project":"Solv","source_protocol":"solv-protocol"},
    {"token":"DEGEN","project":"Degen","source_protocol":"degen"},
    {"token":"EUSD","project":"Reserve Protocol","source_protocol":"reserve-protocol"},
    {"token":"VELO","project":"Velodrome Finance","source_protocol":"velodrome"},
    {"token":"BPT-ETHTRI","project":"Balancer","source_protocol":"balancer"},
    {"token":"USDBC","project":"Circle","source_protocol":"circle"},
    {"token":"GRAIL","project":"Camelot","source_protocol":"camelot"},
    {"token":"USDC+","project":"Overnight Finance","source_protocol":"overnight-finance"},
    {"token":"AAVE","project":"Aave","source_protocol":"aave"},
    {"token":"RDNT","project":"Radiant Capital","source_protocol":"radiant"},
    {"token":"DEXE","project":"Dexe.network","source_protocol":"dexe"},
    {"token":"AMP","project":"Amp","source_protocol":"amp"},
    {"token":"RNDR","project":"Render Network","source_protocol":"render"},
    {"token":"ETH","project":"Ethereum","source_protocol":"ethereum"},
    {"token":"WSOL","project":"Solana","source_protocol":"solana"},
    {"token":"STETH","project":"Lido","source_protocol":"lido"},
    {"token":"ALUSD","project":"Alchemix","source_protocol":"alchemix"},
    {"token":"USDC.E","project":"Circle","source_protocol":"circle"},
    {"token":"RETH","project":"Rocket Pool","source_protocol":"rocket-pool"},
    {"token":"METH","project":"Mantle","source_protocol":"mantle-restaking"},
    {"token":"UNI","project":"Uniswap","source_protocol":"uniswap"},
    {"token":"WRSETH/WETH","project":"Balancer","source_protocol":"balancer"},
    {"token":"TBTC","project":"Threshold Network","source_protocol":"threshold-network"},
    {"token":"BPT-RETH-ETH","project":"Balancer","source_protocol":"balancer"},
    {"token":"CBBTC","project":"Coinbase","source_protocol":"coinbase"},
    {"token":"UNIBTC","project":"Bedrock","source_protocol":"bedrock"},
    {"token":"HEGIC","project":"Hegic","source_protocol":"hegic"},
    {"token":"SUSDE","project":"Ethena","source_protocol":"ethena"},
    {"token":"DAI","project":"MakerDAO","source_protocol":"maker"},
    {"token":"VIRTUAL","project":"Virtuals Protocol","source_protocol":"virtuals-protocol"},
    {"token":"USDM","project":"Mountain Protocol","source_protocol":"mountain-protocol"},
    {"token":"PZETH","project":"Pendle","source_protocol":"pendle"},
    {"token":"MSTETH","project":"Magpie","source_protocol":"magpie-ecosystem"},
    {"token":"WBTC-WBTC-GMX-V2","project":"GMX","source_protocol":"gmx"},
    {"token":"D2","project":"D2 Finance","source_protocol":"d2-finance"},
    {"token":"CRV","project":"Curve Finance","source_protocol":"curve-finance"},
    {"token":"MATIC","project":"Polygon","source_protocol":"polygon"},
    {"token":"LFBTC-AVALON-ETH","project":"Avalon Finance","source_protocol":"avalon-labs"},
    {"token":"EGETH","project":"Magpie","source_protocol":"magpie-ecosystem"},
    {"token":"LUNA","project":"Terra","source_protocol":"terra"},
    {"token":"AERO","project":"Aerodrome Finance","source_protocol":"aerodrome"},
    {"token":"CDXUSD","project":"cod3x","source_protocol":"cod3x"},
    {"token":"OGN","project":"Origin Protocol","source_protocol":"origin-defi"},
    {"token":"BSDETH","project":"Reserve Protocol","source_protocol":"reserve-protocol"},
    {"token":"PENDLE-LPT","project":"Pendle","source_protocol":"pendle"},
    {"token":"WETH-WETH-GMX-V2","project":"GMX","source_protocol":"gmx"},
    {"token":"EPENDLE","project":"Equilibria Finance","source_protocol":"equilibria"},
    {"token":"STAR","project":"Preon","source_protocol":"preon"},
    {"token":"MPENDLE","project":"Magpie","source_protocol":"magpie-ecosystem"},
    {"token":"PUMPBTC","project":"Pump BTC","source_protocol":"pumpbtc"},
    {"token":"WBTC","project":"Wrapped Bitcoin","source_protocol":"wrapped-bitcoin"},
    {"token":"XAUT","project":"Tether Gold","source_protocol":"tether-gold"},
    {"token":"RSETH","project":"KelpDAO","source_protocol":"kelp-dao"},
    {"token":"USDC","project":"Circle","source_protocol":"circle"},
    {"token":"USDZ","project":"Anzen","source_protocol":"anzen-finance"},
    {"token":"BRETT","project":"Brett","source_protocol":"brett"},
    {"token":"(=ↀΩↀ=)","project":"Nekodex","source_protocol":"nekodex"},
    {"token":"OUSG","project":"Ondo Finance","source_protocol":"ondo-finance"},
    {"token":"CBETH","project":"Coinbase","source_protocol":"coinbase"},
    {"token":"MAI","project":"Mai Finance","source_protocol":"mai-finance"},
    {"token":"LBTC","project":"Lombard","source_protocol":"lombard-finance"},
    {"token":"BALD","project":"Bald","source_protocol":"balancer"},
    {"token":"USDE","project":"Ethena","source_protocol":"ethena"},
    {"token":"FSGLP","project":"GMX","source_protocol":"gmx"},
    {"token":"USYC","project":"Hashnote","source_protocol":"hashnote-usyc"},
    {"token":"USDY","project":"Ondo Finance","source_protocol":"ondo-finance"},
    {"token":"WBTC-USDC-GMX-V2","project":"GMX","source_protocol":"gmx"},
    {"token":"ENA","project":"Ethena","source_protocol":"ethena"},
    {"token":"WEETHS","project":"Ether-fi","source_protocol":"ether-fi"},
    {"token":"WRSETH","project":"KelpDAO","source_protocol":"kelp-dao"},
    {"token":"LINK","project":"Chainlink","source_protocol":"chainlink"},
    {"token":"DOLA","project":"Inverse Finance","source_protocol":"inverse-finance"},
    {"token":"SPEC","project":"Spectral Labs","source_protocol":"spectral-labs"},
    {"token":"WEETH.BASE","project":"Ether-fi","source_protocol":"ether-fi"},
    {"token":"TWETH","project":"Twittereum","source_protocol":"twittereum"},
    {"token":"SFRXETH","project":"Frax Finance","source_protocol":"frax-finance"},
    {"token":"WBETH","project":"Binance","source_protocol":"binance"},
    {"token":"EZETH","project":"Renzo","source_protocol":"renzo"},
    {"token":"USD+","project":"Overnight Finance","source_protocol":"overnight-finance"},
    {"token":"SNX","project":"Synthetix","source_protocol":"synthetix"}
]
df_source_tokens = pd.DataFrame(source_tokens_data)

# 4) Extra mappings and helper data
UNWANTED_PATTERNS = [
    "-borrowed", "-vesting", "-staking", "-pool2", "-treasury", "-cex",
    "^treasury$", "^borrowed$", "^staking$", "^pool2$", "polygon-bridge-&-staking", ".*-cex$"
]
UNWANTED_CATEGORIES = ["CEX", "Chain"]

chain_alignment_map = {
    "Metis": "OP Stack Fork",
    "Blast": "OP Stack Fork",
    "Mantle": "OP Stack Fork",
    "Zircuit": "OP Stack Fork",
    "RSS3": "OP Stack Fork",
    "Rollux": "OP Stack Fork",
    "Ancient8": "OP Stack Fork",
    "Manta": "OP Stack Fork",
    "Cyber": "OP Chain",
    "Mint": "OP Chain",
    "Ham": "OP Chain",
    "Polynomial": "OP Chain",
    "Lisk": "OP Chain",
    "BOB": "OP Chain",
    "Mode": "OP Chain",
    "World Chain": "OP Chain",
    "Base": "OP Chain",
    "Kroma": "OP Chain",
    "Boba": "OP Chain",
    "Fraxtal": "OP Chain",
    "Optimism": "OP Chain",
    "Shape": "OP Chain",
    "Zora": "OP Chain"
}

token_category_data = [
    {"token": "ETH", "token_category": "Native Asset"},
    {"token": "WETH", "token_category": "Native Asset"},
    {"token": "SOL", "token_category": "Native Asset"},
    {"token": "WBTC", "token_category": "Wrapped Assets"}
]

protocol_category_map = {
    "Dexes": "Trading",
    "Liquidity manager": "Yield",
    "Derivatives": "Derivatives",
    "Yield Aggregator": "Yield",
    "Indexes": "Yield",
    "Bridge": "Trading",
    "Leveraged Farming": "Yield",
    "Cross Chain": "Trading",
    "CDP": "Lending",
    "Farm": "Yield",
    "Options": "Trading",
    "DCA Tools": "Trading",
    "Services": "TradFi/Fintech",
    "Chain": "TradFi/Fintech",
    "Privacy": "TradFi/Fintech",
    "RWA": "TradFi/Fintech",
    "Payments": "TradFi/Fintech",
    "Launchpad": "TradFi/Fintech",
    "Synthetics": "Derivatives",
    "SoFi": "TradFi/Fintech",
    "Prediction Market": "Trading",
    "Token Locker": "Yield",
    "Yield Lottery": "Yield",
    "Algo-Stables": "Stablecoins",
    "DEX Aggregator": "Trading",
    "Liquid Restaking": "Restaking/Liquid Restaking",
    "Governance Incentives": "Yield",
    "Restaking": "Restaking/Liquid Restaking",
    "Liquid Staking": "Liquid Staking",
    "Uncollateralized Lending": "Lending",
    "Managed Token Pools": "Trading",
    "Insurance": "TradFi/Fintech",
    "NFT Marketplace": "Trading",
    "NFT Lending": "Lending",
    "Options Vault": "Trading",
    "NftFi": "Trading",
    "Basis Trading": "Trading",
    "Bug Bounty": "TradFi/Fintech",
    "OTC Marketplace": "Trading",
    "Reserve Currency": "Stablecoins",
    "Gaming": "Other",
    "AI Agents": "TradFi/Fintech",
    "Treasury Manager": "TradFi/Fintech",
    "CDP Manager": "Lending",
    "Decentralized Stablecoin": "Stablecoins",
    "Restaked BTC": "Restaking/Liquid Restaking",
    "RWA Lending": "Lending",
    "Staking Pool": "Staking/Liquid Staking",
    "CeDeFi": "TradFi/Fintech",
    "Staking": "Staking/Liquid Staking",
    "Oracle": "Other",
    "Ponzi": "Other",
    "Anchor BTC": "Other",
    "Decentralized BTC": "Other",
    "CEX": "Other",
    "Lending": "Lending"
}

### Calculate cleaned protocol_data

In [None]:

# 5) Merge metadata + TVL
df_merged = pd.merge(
    df_protocol_metadata.drop_duplicates(),
    df_protocol_tvl_data.drop_duplicates(),
    on="protocol_slug",
    how="left"
)

# 6) Merge chain alignment info
df_chain_alignment = pd.DataFrame(list(chain_alignment_map.items()), columns=["chain", "alignment"])
df_merged = pd.merge(df_merged, df_chain_alignment, on="chain", how="left")
df_merged["alignment"] = df_merged["alignment"].fillna("Other")

# 7) Merge token category info
df_token_lookup = pd.DataFrame(token_category_data)
df_token_lookup["token"] = df_token_lookup["token"].str.upper()
df_merged["token"] = df_merged["token"].str.upper()
df_merged = pd.merge(df_merged, df_token_lookup, on="token", how="left")
df_merged["token_category"] = df_merged["token_category"].fillna("Other")

# 8) Compute chain-level misrepresentation
def calculate_misrepresentation_flags(df_input):
    ref_date = df_input["dt"].max() - pd.Timedelta(days=1)
    df_flags = (
        df_input[df_input.dt == ref_date]
        [["protocol_slug", "chain", "misrepresented_tokens", "token"]]
        .groupby(["protocol_slug", "chain", "misrepresented_tokens"])
        .agg(
            token_count=("token", "nunique"),
            has_usdt=("token", lambda x: 1 if "USDT" in x.values else 0)
        )
        .reset_index()
    )
    df_flags["chain_misrepresented_tokens"] = (
        (df_flags["misrepresented_tokens"] == 1)
        & (df_flags["token_count"] == 1)
        & (df_flags["has_usdt"] == 1)
    ).astype(int)

    return df_flags

df_chain_misrep = calculate_misrepresentation_flags(df_merged)
df_merged = pd.merge(
    df_merged,
    df_chain_misrep[["protocol_slug", "chain", "chain_misrepresented_tokens"]],
    on=["protocol_slug", "chain"],
    how="left"
)

df_merged["dt"] = pd.to_datetime(df_merged["dt"])
df_merged["parent_protocol"] = df_merged["parent_protocol"].astype(str).str.replace("parent#", "", regex=False)

# 9) If chain-level misrep is true, override token category
df_merged["token_category_misrep"] = np.where(
    (df_merged["chain_misrepresented_tokens"] == 1),
    "Misrepresented TVL",
    df_merged["token_category"]
)

# 10) Map protocol categories to simpler classification
df_merged["protocol_category_mapped"] = df_merged["protocol_category"].map(protocol_category_map, na_action="ignore")
df_merged.loc[df_merged["protocol_category_mapped"].isna(), "protocol_category_mapped"] = df_merged["protocol_category"]

# 11) Merge in source token info
df_merged = pd.merge(
    df_merged,
    df_source_tokens[["token", "project", "source_protocol"]].drop_duplicates(),
    on="token",
    how="left"
)

# 12) Export final CSV
cleaned_export_name = "protocol_data_cleaned.csv"
df_merged.to_csv(cleaned_export_name, index=False)
print(f"Exported {df_merged.shape[0]} rows to {cleaned_export_name}")

## Data Filtering & Aggregation for the Growth measures

In [18]:
# data_filter_aggregate.py

"""
Part 2: Data Filtering & Aggregation
------------------------------------
Loads 'protocol_data_cleaned.csv', applies pattern/category filters, 
excludes protocols under 500k TVL in the final week, and aggregates 
cross-chain TVL by parent_protocol. Exports 'final_tvl_aggregate.csv'.
"""

import pandas as pd
import numpy as np
import re
import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# File path to the cleaned data
CLEANED_DATA_PATH = "protocol_data_cleaned.csv"

ANALYSIS_START = "2024-01-01"
ANALYSIS_END = "2024-12-01"

# Patterns and categories from data_preparation.py
UNWANTED_PATTERNS = [
    "-borrowed", "-vesting", "-staking", "-pool2", "-treasury", "-cex",
    "^treasury$", "^borrowed$", "^staking$", "^pool2$", "polygon-bridge-&-staking", ".*-cex$"
]
UNWANTED_CATEGORIES = ["CEX", "Chain"]

# Suppose you unify or map your protocol categories:
SIMPLE_CATEGORY_MAP = {
    # Core categories from the original snippet
    "Dexes": "Trading",
    "Liquidity manager": "Yield",
    "Derivatives": "Derivatives",
    "CEX": "Other",
    "Staking": "Staking/Liquid Staking",
    "Farm": "Yield",
    "CDP": "Lending",
    "Bridge": "Trading",
    "Chain": "Other",
    "Lending": "Lending",
    "Cross Chain": "Trading",
    "Algo-Stables": "Stablecoins",
    "Synthetics": "Derivatives",
    "Stablecoins": "Stablecoins",
    "Yield": "Yield",
    "Trading": "Trading",
    "TradFi/Fintech": "TradFi/Fintech",
    "Other": "Other",
    "Restaking/Liquid Restaking": "Restaking/Liquid Restaking",
    "Liquid Staking": "Liquid Staking",
    "Staking/Liquid Staking": "Staking/Liquid Staking",

    # Additional sub-categories appearing in df_tvl_agg/final_growth_df
    "Yield Aggregator": "Yield",
    "RWA": "TradFi/Fintech",
    "Indexes": "Yield",
    "Liquid Restaking": "Restaking/Liquid Restaking",  # same bucket as above
    "Insurance": "TradFi/Fintech",
    "NFT Lending": "Lending",
    "Options": "Trading",
    "Privacy": "TradFi/Fintech",
    "Leveraged Farming": "Yield",
    "Prediction Market": "Trading",
    "Payments": "TradFi/Fintech",
    "Uncollateralized Lending": "Lending",
    "NFT Marketplace": "Trading",
    "Launchpad": "TradFi/Fintech",
    "Token Locker": "Yield",
    "Restaking": "Restaking/Liquid Restaking",         # same bucket as above
    "Basis Trading": "Trading",
    "DEX Aggregator": "Trading",
    "Options Vault": "Trading",
    "CDP Manager": "Lending",
    "Reserve Currency": "Stablecoins",
    "Managed Token Pools": "Trading",
    "Yield Lottery": "Yield",
    "Liquidity Automation": "Yield",
    "RWA Lending": "Lending",
    "Bug Bounty": "TradFi/Fintech",
    "Governance Incentives": "Yield",
    "Treasury Manager": "TradFi/Fintech",
    "Decentralized BTC": "Other",
    "AI Agents": "TradFi/Fintech",
    "OTC Marketplace": "Trading",
    "SoFi": "TradFi/Fintech",
    "Anchor BTC": "Other",
    "Staking Pool": "Staking/Liquid Staking"
}

def does_match_unwanted_pattern(s: str) -> bool:
    """Check if a string hits any of the undesired patterns."""
    return any(re.search(pattern, s, re.IGNORECASE) for pattern in UNWANTED_PATTERNS)

def exclude_small_tvl_protocols(df: pd.DataFrame, analysis_start: str, analysis_end: str, min_tvl: int = 500000):
    """
    Removes protocols that remain under 500k TVL in the last 7 days
    of the given analysis period.
    """
    df["dt"] = pd.to_datetime(df["dt"], errors="coerce")
    start = pd.to_datetime(analysis_start)
    end = pd.to_datetime(analysis_end)
    protocol_names = df["protocol_name"].unique()

    kept_records = []
    excluded_list = []

    for proto in tqdm(protocol_names):
        df_single = df[df["protocol_name"] == proto].copy()
        if df_single.empty:
            excluded_list.append({"protocol": proto, "reason": "No data"})
            continue

        df_single = df_single[(df_single["dt"] >= start) & (df_single["dt"] <= end)]
        if df_single.empty:
            excluded_list.append({"protocol": proto, "reason": "No data in period"})
            continue

        df_single.set_index("dt", inplace=True)
        daily_tvl_series = df_single["app_token_tvl_usd"].resample("D").last()
        last_week = daily_tvl_series.last("7D")

        if last_week.mean() < min_tvl:
            excluded_list.append({"protocol": proto, "reason": "<500k TVL last 7 days"})
            continue

        kept_records.append(df_single.reset_index())

    if kept_records:
        final_filtered_df = pd.concat(kept_records, ignore_index=True)
    else:
        final_filtered_df = pd.DataFrame(columns=df.columns)

    return final_filtered_df, excluded_list

# 1. Load cleaned data
df_main_cleaned = pd.read_csv(CLEANED_DATA_PATH)
print(f"Loaded df_main_cleaned with shape {df_main_cleaned.shape}")

# 2. Convert dt to datetime
df_main_cleaned["dt"] = pd.to_datetime(df_main_cleaned["dt"], errors="coerce")
df_main_cleaned["chain"] = df_main_cleaned["chain"].astype(str)

# 3. Map protocol_category to a simpler classification
df_main_cleaned["protocol_category_mapped"] = (
    df_main_cleaned["protocol_category"].map(SIMPLE_CATEGORY_MAP, na_action="ignore")
)
df_main_cleaned.loc[
    df_main_cleaned["protocol_category_mapped"].isna(), 
    "protocol_category_mapped"
] = df_main_cleaned["protocol_category"]

# 4. Create a helper DataFrame to mark undesired protocols
df_chain_proto_helper = df_main_cleaned[["chain", "protocol_slug", "protocol_category"]].drop_duplicates()
df_chain_proto_helper["unwanted_mark"] = (
    df_chain_proto_helper["chain"].apply(does_match_unwanted_pattern)
    | df_chain_proto_helper["protocol_slug"].str.endswith("-cex")
    | df_chain_proto_helper["protocol_slug"].eq("polygon-bridge-&-staking")
    | df_chain_proto_helper["protocol_category"].isin(UNWANTED_CATEGORIES)
).astype(int)

# 5. Example chain selection logic: keep only Ethereum 
df_chain_proto_helper["chain_selected"] = (df_chain_proto_helper["chain"] == "Ethereum").astype(int)

# 6. Final filter
final_chain_mask = (
    (df_chain_proto_helper["unwanted_mark"] == 0)
    & (df_chain_proto_helper["chain_selected"] == 1)
)

# 7. Apply the filter
df_postfilter = pd.merge(
    df_main_cleaned,
    df_chain_proto_helper[final_chain_mask][["chain","protocol_slug","protocol_category"]],
    on=["chain","protocol_slug","protocol_category"],
    how="inner"
)
print(f"df_postfilter shape: {df_postfilter.shape}")

# 8. Aggregate by (parent_protocol, dt) => cross-chain TVL
df_rollup = (
    df_postfilter.groupby(["parent_protocol","dt"], as_index=False)
    .agg(
        app_token_tvl_usd=("app_token_tvl_usd","sum"),
        app_token_tvl=("app_token_tvl","sum"),
        chains_count=("chain","nunique"),
        protocol_category=("protocol_category_mapped", lambda x: x.mode()[0] if not x.mode().empty else x.iloc[0])
    )
    .sort_values(by="app_token_tvl_usd", ascending=False)
)
df_rollup.rename(columns={"parent_protocol":"protocol_name"}, inplace=True)
print(f"df_rollup shape: {df_rollup.shape}")

# 9. Exclude protocols < 500k TVL in final week
df_final_tvl, excluded_list = exclude_small_tvl_protocols(
    df=df_rollup,
    analysis_start=ANALYSIS_START,
    analysis_end=ANALYSIS_END
)
print(f"Final df_final_tvl shape: {df_final_tvl.shape}")
print(f"Excluded {len(excluded_list)} protocols due to filters or low TVL.")

# 10. Save final aggregated dataset
df_final_tvl.to_csv("final_tvl_aggregate.csv", index=False)
print("Saved final_tvl_aggregate.csv.")

Loaded df_main_cleaned with shape (28928746, 17)
df_postfilter shape: (5111464, 17)
df_rollup shape: (289429, 6)


100%|██████████| 916/916 [00:07<00:00, 121.24it/s]


Final df_final_tvl shape: (129791, 6)
Excluded 467 protocols due to filters or low TVL.
Saved final_tvl_aggregate.csv.


## Growth Calculations

In [19]:
# growth_calculations.py

"""
Part 3: Growth Calculations
---------------------------
Loads 'final_tvl_aggregate.csv', computes monthly logistic growth rate, 
MoM growth, and overall percent growth. Exports 'protocol_growth_metrics.csv'.
"""

import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.optimize import curve_fit

FINAL_AGGREGATE_PATH = "final_tvl_aggregate.csv"
GROWTH_OUTPUT_PATH = "protocol_growth_metrics.csv"

df_tvl_agg = pd.read_csv(FINAL_AGGREGATE_PATH)
print(f"Loaded df_tvl_agg: {df_tvl_agg.shape}")

def extract_monthly_tvl(df_protocol_slice, start_period, months_count):
    """
    Resamples TVL at monthly intervals. 
    Returns a Series indexed by each month.
    """
    df_protocol_slice = df_protocol_slice.set_index("dt")
    monthly_series = df_protocol_slice["app_token_tvl_usd"].resample("M").last()
    full_month_range = pd.date_range(start=start_period, periods=months_count, freq="M")
    monthly_series = monthly_series.reindex(full_month_range)
    return monthly_series

def fit_logistic_growth(monthly_values):
    """
    Fits logistic curve: K / (1 + exp(-r(x - t0))) 
    and returns the 'r' (growth rate).
    """
    monthly_values = monthly_values.dropna()
    if len(monthly_values) < 3:
        return None
    time_index = np.arange(len(monthly_values))
    y_vals = monthly_values.values

    guess_k = max(y_vals)
    guess_r = 1.0
    guess_t0 = np.median(time_index)
    try:
        popt, _ = curve_fit(
            lambda x, K, r, t0: K / (1 + np.exp(-r*(x - t0))),
            time_index, y_vals,
            p0=[guess_k, guess_r, guess_t0],
            maxfev=10000
        )
        return popt[1]
    except:
        return None

def calculate_percent_growth(monthly_values):
    monthly_values = monthly_values.dropna()
    if len(monthly_values) < 2 or monthly_values.iloc[0] == 0:
        return None
    return (monthly_values.iloc[-1] - monthly_values.iloc[0]) / monthly_values.iloc[0]

def calculate_average_mom_growth(monthly_values):
    monthly_values = monthly_values.dropna()
    if len(monthly_values) < 2:
        return None, None
    diffs = monthly_values.diff().dropna()
    avg_abs_diff = diffs.mean()
    avg_pct_diff = (diffs / monthly_values.shift(1)).mean()
    return avg_abs_diff, avg_pct_diff

def compute_weighted_ranks(monthly_data_dict):
    monthly_df = pd.DataFrame(monthly_data_dict)
    rank_df = monthly_df.rank(axis=1, method="min", ascending=False)
    weights = np.log1p(np.arange(1, len(rank_df)+1))
    weights = weights / weights.sum()
    weighted_ranks = (rank_df * weights[:, None]).sum()
    return weighted_ranks

def analyze_growth_for_category(category_name, df_input, months_window=4, cutoff_date=None):
    """
    For each protocol within a given category, resample monthly,
    compute logistic growth, percent growth, MoM growth, etc.
    """
    df_cat_slice = df_input[df_input["protocol_category"] == category_name].copy()
    df_cat_slice["dt"] = pd.to_datetime(df_cat_slice["dt"], errors="coerce")
    protocol_list = df_cat_slice["protocol_name"].unique()

    if cutoff_date:
        cutoff_date = pd.to_datetime(cutoff_date)

    excluded_info = []
    results_data = []
    monthly_tvl_records = {}

    for proto in tqdm(protocol_list):
        df_proto = df_cat_slice[df_cat_slice["protocol_name"] == proto]
        if df_proto.empty:
            excluded_info.append({"protocol": proto, "reason": "No data"})
            continue

        if cutoff_date:
            df_proto = df_proto[df_proto["dt"] <= cutoff_date]
        latest_date = df_proto["dt"].max()
        if pd.isna(latest_date):
            excluded_info.append({"protocol": proto, "reason": "No valid dt"})
            continue

        end_period = latest_date.replace(day=1)
        start_period = end_period - pd.DateOffset(months=months_window-1)
        monthly_series = extract_monthly_tvl(df_proto, start_period, months_window)

        if monthly_series.isna().any():
            excluded_info.append({"protocol": proto, "reason": "NaN in monthly TVL"})
            continue

        monthly_tvl_records[proto] = monthly_series

    # Weighted rank approach
    average_ranks = compute_weighted_ranks(monthly_tvl_records)

    for proto_name, series_vals in monthly_tvl_records.items():
        percent_growth_val = calculate_percent_growth(series_vals)
        if percent_growth_val is None:
            excluded_info.append({"protocol": proto_name, "reason": "Invalid percent growth"})
            continue

        logistic_r_val = fit_logistic_growth(series_vals)
        if logistic_r_val is None:
            excluded_info.append({"protocol": proto_name, "reason": "Invalid logistic fit"})
            continue

        abs_mom_val, pct_mom_val = calculate_average_mom_growth(series_vals)
        df_proto_slice = df_cat_slice[df_cat_slice["protocol_name"] == proto_name]
        chain_count_val = df_proto_slice["chains_count"].iloc[0] if not df_proto_slice.empty else 1

        results_data.append({
            "protocol_name": proto_name,
            "chains_count": chain_count_val,
            "tvl_percent_growth": percent_growth_val,
            "avg_tvl_rank": average_ranks[proto_name],
            "logistic_growth_rate": logistic_r_val,
            "avg_mom_growth_abs": abs_mom_val,
            "avg_mom_growth_percent": pct_mom_val
        })

    return pd.DataFrame(results_data), pd.DataFrame(excluded_info)

def process_growth_for_all_categories(df_input, months_span=6, cutoff_date=None):
    """
    Iterates through all protocol categories, collects growth stats 
    into a combined DataFrame.
    """
    df_input["dt"] = pd.to_datetime(df_input["dt"], errors="coerce")
    if cutoff_date:
        cutoff_date = pd.to_datetime(cutoff_date)
    else:
        cutoff_date = df_input["dt"].max()

    unique_categories = df_input["protocol_category"].unique()
    print("Computing growth measures for categories:", unique_categories)

    combined_results_list = []
    combined_excluded_list = []

    for cat_item in unique_categories:
        print(f"\nCategory => {cat_item}")
        cat_df, cat_excluded = analyze_growth_for_category(
            category_name=cat_item,
            df_input=df_input,
            months_window=months_span,
            cutoff_date=cutoff_date
        )
        cat_df["protocol_category"] = cat_item

        if not cat_df.empty:
            combined_results_list.append(cat_df)
        if not cat_excluded.empty:
            combined_excluded_list.append(cat_excluded)

    if combined_results_list:
        df_growth = pd.concat(combined_results_list, ignore_index=True)
    else:
        df_growth = pd.DataFrame()

    if combined_excluded_list:
        df_excluded = pd.concat(combined_excluded_list, ignore_index=True)
    else:
        df_excluded = pd.DataFrame()

    return df_growth, df_excluded

# def main():
df_tvl_agg["dt"] = pd.to_datetime(df_tvl_agg["dt"], errors="coerce")
final_growth_df, excluded_df = process_growth_for_all_categories(
    df_input=df_tvl_agg,
    months_span=6,
    cutoff_date="2024-12-01"
)
print("Calculated growth measures. Sample:")
print(final_growth_df.head())

# Save
final_growth_df.to_csv(GROWTH_OUTPUT_PATH, index=False)
print(f"Saved '{GROWTH_OUTPUT_PATH}' with shape:", final_growth_df.shape)

# if __name__ == "__main__":
#     main()

Loaded df_tvl_agg: (129791, 6)
Computing growth measures for categories: ['Liquid Staking' 'Restaking/Liquid Restaking' 'Lending' 'Trading' 'Yield'
 'TradFi/Fintech' 'Stablecoins' 'Derivatives' 'Other'
 'Staking/Liquid Staking' 'Services']

Category => Liquid Staking


100%|██████████| 29/29 [00:00<00:00, 761.39it/s]



Category => Restaking/Liquid Restaking


100%|██████████| 21/21 [00:00<00:00, 840.71it/s]



Category => Lending


100%|██████████| 82/82 [00:00<00:00, 613.09it/s]



Category => Trading


100%|██████████| 126/126 [00:00<00:00, 461.90it/s]



Category => Yield


100%|██████████| 121/121 [00:00<00:00, 648.31it/s]



Category => TradFi/Fintech


100%|██████████| 55/55 [00:00<00:00, 863.76it/s]



Category => Stablecoins


100%|██████████| 7/7 [00:00<00:00, 676.25it/s]



Category => Derivatives


100%|██████████| 26/26 [00:00<00:00, 494.83it/s]



Category => Other


100%|██████████| 3/3 [00:00<00:00, 883.45it/s]



Category => Staking/Liquid Staking


100%|██████████| 1/1 [00:00<00:00, 845.80it/s]



Category => Services


100%|██████████| 1/1 [00:00<00:00, 528.58it/s]


Calculated growth measures. Sample:
        protocol_name  chains_count  tvl_percent_growth  avg_tvl_rank  \
0                lido             1               0.133         0.860   
1  binance-staked-eth             1               1.399         2.094   
2         rocket-pool             1              -0.303         2.206   
3       meth-protocol             1               0.112         3.440   
4          stakestone             1              -0.233         3.296   

   logistic_growth_rate  avg_mom_growth_abs  avg_mom_growth_percent  \
0                 0.055       859712684.403                   0.042   
1                 0.244       665169255.675                   0.202   
2                -0.065      -243103369.212                  -0.061   
3                 0.045        35720830.444                   0.035   
4                -0.420       -25749239.840                  -0.044   

  protocol_category  
0    Liquid Staking  
1    Liquid Staking  
2    Liquid Staking  
3    Liqui

In [20]:
df_tvl_agg["protocol_category"].value_counts()

protocol_category
Trading                       35666
Yield                         33325
Lending                       23948
TradFi/Fintech                14205
Derivatives                    7873
Liquid Staking                 7673
Restaking/Liquid Restaking     4665
Stablecoins                    1826
Services                        336
Other                           269
Staking/Liquid Staking            5
Name: count, dtype: int64

In [21]:
final_growth_df["protocol_category"].value_counts()

protocol_category
Trading                       110
Yield                         106
Lending                        76
TradFi/Fintech                 47
Derivatives                    24
Liquid Staking                 23
Restaking/Liquid Restaking     17
Stablecoins                     5
Other                           1
Services                        1
Name: count, dtype: int64