## Data Preparation

In [None]:
# data_preparation.py

"""
Part 1: Data Preparation
------------------------
Pulls raw data (TVL, metadata) from DuckDB, merges protocol info with chain alignment,
handles token mappings, and computes misrepresentation flags. Exports 'protocol_data_cleaned.csv'.
"""

import pandas as pd
import numpy as np
import urllib3
import warnings

from op_analytics.cli.subcommands.pulls.defillama.dataaccess import DefiLlama
from op_analytics.coreutils.duckdb_inmem.client import init_client as init_duckdb

urllib3.disable_warnings()
warnings.filterwarnings("ignore")

# Initialize DuckDB client
# NOTE(review): the chained assignment also binds the same object to the
# module-level name `init_client`; no visible code reads that name.
duckdb_client = init_client = init_duckdb()

# Minimum date for data pull
# NOTE(review): the later cells use ANALYSIS_START = "2024-01-01" and a growth
# cutoff of "2024-12-01", both earlier than this date - confirm the intended range.
MINIMUM_DATE = "2025-01-01"

# 1) Pull protocol TVL data from DuckDB
# The value returned by .read() is interpolated into the FROM clause below
# (presumably a view/table name registered with the client - confirm).
view_tvl_data = DefiLlama.PROTOCOLS_TOKEN_TVL.read(min_date=MINIMUM_DATE)
df_protocol_tvl_data = duckdb_client.client.sql(
    f"""
    SELECT
        dt,
        protocol_slug,
        chain,
        token,
        app_token_tvl,
        app_token_tvl_usd
    FROM {view_tvl_data}
    """
).to_df()

# 2) Pull protocol metadata from DuckDB
# `dt` is not selected here, so the frame can hold repeated rows per protocol;
# step 5 calls drop_duplicates() before merging.
view_meta_data = DefiLlama.PROTOCOLS_METADATA.read(min_date=MINIMUM_DATE)
df_protocol_metadata = duckdb_client.client.sql(
    f"""
    SELECT
        protocol_name,
        protocol_slug,
        protocol_category,
        parent_protocol,
        misrepresented_tokens
    FROM {view_meta_data}
    """
).to_df()

# Convert misrepresented_tokens to a 0/1 integer flag.
# Values are stringified and lower-cased first so that real booleans
# (True/False) and their text forms ("True"/"False") are both recognised;
# mapping only the text forms would silently turn every boolean into 0.
# Anything unrecognised, including missing values, becomes 0.
df_protocol_metadata["misrepresented_tokens"] = (
    df_protocol_metadata["misrepresented_tokens"]
    .astype(str)
    .str.strip()
    .str.lower()
    .map({"true": 1, "false": 0})
    .fillna(0)
    .astype(int)
)

# 3) Load the token mapping (token -> project / source_protocol) merged in step 11
df_source_tokens = pd.read_csv("token_mapping.csv")

# 4) Extra mappings and helper data
# Regular-expression fragments and category names marking entries to exclude.
# NOTE(review): neither constant is used in this preparation cell; the same
# values are declared again in the filtering cell, which is where they are applied.
UNWANTED_PATTERNS = [
    "-borrowed", "-vesting", "-staking", "-pool2", "-treasury", "-cex",
    "^treasury$", "^borrowed$", "^staking$", "^pool2$", "polygon-bridge-&-staking", ".*-cex$"
]
UNWANTED_CATEGORIES = ["CEX", "Chain"]

# Chain name -> alignment label. Chains absent from this lookup are labelled
# "Other" when the map is merged into the data in step 6.
_OP_STACK_FORK_CHAINS = (
    "Metis", "Blast", "Mantle", "Zircuit",
    "RSS3", "Rollux", "Ancient8", "Manta",
)
_OP_CHAINS = (
    "Cyber", "Mint", "Ham", "Polynomial", "Lisk",
    "BOB", "Mode", "World Chain", "Base", "Kroma",
    "Boba", "Fraxtal", "Optimism", "Shape", "Zora",
)
chain_alignment_map = {
    **dict.fromkeys(_OP_STACK_FORK_CHAINS, "OP Stack Fork"),
    **dict.fromkeys(_OP_CHAINS, "OP Chain"),
}

# Token symbol -> token category records; turned into a lookup frame in step 7,
# where tokens without an entry are labelled "Other".
token_category_data = [
    {"token": symbol, "token_category": category}
    for symbol, category in (
        ("ETH", "Native Asset"),
        ("WETH", "Native Asset"),
        ("SOL", "Native Asset"),
        ("WBTC", "Wrapped Assets"),
    )
]

# Raw protocol_category value -> simplified bucket, applied in step 10.
# Categories missing from this map keep their raw value (see the fallback in step 10).
# NOTE(review): "Liquid Staking" maps to its own bucket while "Staking" and
# "Staking Pool" map to "Staking/Liquid Staking" - confirm this split is intended.
protocol_category_map = {
    "Dexes": "Dexes",
    "Liquidity manager": "Yield",
    "Derivatives": "Derivatives",
    "Yield Aggregator": "Yield",
    "Indexes": "Yield",
    "Bridge": "Bridge",
    "Leveraged Farming": "Yield",
    "Cross Chain": "Bridge",
    "CDP": "Lending",
    "Farm": "Yield",
    "Options": "Other Trading",
    "DCA Tools": "Other Trading",
    "Services": "TradFi/Fintech",
    "Chain": "TradFi/Fintech",
    "Privacy": "TradFi/Fintech",
    "RWA": "TradFi/Fintech",
    "Payments": "TradFi/Fintech",
    "Launchpad": "TradFi/Fintech",
    "Synthetics": "Derivatives",
    "SoFi": "TradFi/Fintech",
    "Prediction Market": "Other Trading",
    "Token Locker": "Yield",
    "Yield Lottery": "Yield",
    "Algo-Stables": "Stablecoins",
    "DEX Aggregator": "Dexes",
    "Liquid Restaking": "Restaking/Liquid Restaking",
    "Governance Incentives": "Yield",
    "Restaking": "Restaking/Liquid Restaking",
    "Liquid Staking": "Liquid Staking",
    "Uncollateralized Lending": "Lending",
    "Managed Token Pools": "Other Trading",
    "Insurance": "TradFi/Fintech",
    "NFT Marketplace": "Other Trading",
    "NFT Lending": "Lending",
    "Options Vault": "Other Trading",
    "NftFi": "Other Trading",
    "Basis Trading": "Other Trading",
    "Bug Bounty": "TradFi/Fintech",
    "OTC Marketplace": "Other Trading",
    "Reserve Currency": "Stablecoins",
    "Gaming": "Other",
    "AI Agents": "TradFi/Fintech",
    "Treasury Manager": "TradFi/Fintech",
    "CDP Manager": "Lending",
    "Decentralized Stablecoin": "Stablecoins",
    "Restaked BTC": "Restaking/Liquid Restaking",
    "RWA Lending": "Lending",
    "Staking Pool": "Staking/Liquid Staking",
    "CeDeFi": "TradFi/Fintech",
    "Staking": "Staking/Liquid Staking",
    "Oracle": "Other",
    "Ponzi": "Other",
    "Anchor BTC": "Other",
    "Decentralized BTC": "Other",
    "CEX": "Other",
    "Lending": "Lending"
}

In [None]:
df_source_tokens

### Calculate cleaned protocol_data

In [None]:

# 5) Attach TVL rows to protocol metadata. Metadata is the left side of the
#    join, so a protocol without TVL rows is kept with empty TVL columns.
df_merged = df_protocol_metadata.drop_duplicates().merge(
    df_protocol_tvl_data.drop_duplicates(),
    on="protocol_slug",
    how="left",
)

# 6) Label every chain with its alignment; chains not in the map become "Other"
df_chain_alignment = pd.DataFrame(
    {
        "chain": list(chain_alignment_map.keys()),
        "alignment": list(chain_alignment_map.values()),
    }
)
df_merged = df_merged.merge(df_chain_alignment, on="chain", how="left")
df_merged["alignment"] = df_merged["alignment"].fillna("Other")

# 7) Label every token with its category; both sides are upper-cased first so
#    the join is case-insensitive, and unknown tokens become "Other"
df_token_lookup = pd.DataFrame(token_category_data).assign(
    token=lambda lookup: lookup["token"].str.upper()
)
df_merged["token"] = df_merged["token"].str.upper()
df_merged = df_merged.merge(df_token_lookup, on="token", how="left")
df_merged["token_category"] = df_merged["token_category"].fillna("Other")
# 8) Compute chain-level misrepresentation
def calculate_misrepresentation_flags(df_input):
    """
    Flag (protocol_slug, chain) pairs whose TVL looks misrepresented.

    Only rows dated one day before the latest ``dt`` in ``df_input`` are
    inspected. A pair is flagged when its protocol is marked
    ``misrepresented_tokens == 1`` and, on that reference day, the pair holds
    exactly one distinct token and that token is "USDT".

    Args:
        df_input: Frame with ``dt``, ``protocol_slug``, ``chain``,
            ``misrepresented_tokens`` and ``token`` columns. ``dt`` may hold
            datetimes or date strings; it is parsed locally and the input
            frame is not modified.

    Returns:
        One row per (protocol_slug, chain, misrepresented_tokens) with
        ``token_count``, ``has_usdt`` and the 0/1 column
        ``chain_misrepresented_tokens``.
    """
    # Parse dates here: the script calls this function before it converts
    # df_merged["dt"], and string dates cannot take part in date arithmetic.
    dates = pd.to_datetime(df_input["dt"], errors="coerce")
    ref_date = dates.max() - pd.Timedelta(days=1)

    snapshot = df_input.loc[
        dates == ref_date,
        ["protocol_slug", "chain", "misrepresented_tokens", "token"],
    ]
    df_flags = (
        snapshot
        .groupby(["protocol_slug", "chain", "misrepresented_tokens"])
        .agg(
            token_count=("token", "nunique"),
            has_usdt=("token", lambda tokens: int((tokens == "USDT").any())),
        )
        .reset_index()
    )
    df_flags["chain_misrepresented_tokens"] = (
        (df_flags["misrepresented_tokens"] == 1)
        & (df_flags["token_count"] == 1)
        & (df_flags["has_usdt"] == 1)
    ).astype(int)

    return df_flags

# 8, continued) Bring the chain-level flag back onto every row. Pairs absent
#    from the reference-day snapshot get NaN here.
df_chain_misrep = calculate_misrepresentation_flags(df_merged)
df_merged = df_merged.merge(
    df_chain_misrep[["protocol_slug", "chain", "chain_misrepresented_tokens"]],
    on=["protocol_slug", "chain"],
    how="left",
)

df_merged["dt"] = pd.to_datetime(df_merged["dt"])
# astype(str) also stringifies missing parents, so they are no longer NaN after this line
df_merged["parent_protocol"] = (
    df_merged["parent_protocol"].astype(str).str.replace("parent#", "", regex=False)
)

# 9) Rows on a flagged chain get the "Misrepresented TVL" label; every other
#    row keeps its regular token category
df_merged["token_category_misrep"] = df_merged["token_category"].mask(
    df_merged["chain_misrepresented_tokens"] == 1,
    "Misrepresented TVL",
)

# 10) Simplify protocol categories; a category without a mapping keeps its raw value
df_merged["protocol_category_mapped"] = (
    df_merged["protocol_category"]
    .map(protocol_category_map, na_action="ignore")
    .fillna(df_merged["protocol_category"])
)

# 11) Add project / source_protocol for each token
# NOTE(review): df_merged["token"] was upper-cased in step 7 but the mapping's
# token column is used as loaded - confirm token_mapping.csv is upper-case and
# holds one row per token, otherwise rows go unmatched or get duplicated.
df_merged = df_merged.merge(
    df_source_tokens[["token", "project", "source_protocol"]].drop_duplicates(),
    on="token",
    how="left",
)

# 12) Export final CSV
cleaned_export_name = "protocol_data_cleaned.csv"
df_merged.to_csv(cleaned_export_name, index=False)
print(f"Exported {df_merged.shape[0]} rows to {cleaned_export_name}")


## Optimize the final csv Dataframe
# Shrink the in-memory frame (categoricals, int8, float32) and save a compact
# parquet copy. `df` is an alias of df_merged, not a copy, so the dtype
# changes below apply to df_merged as well.
df = df_merged
df["dt"] = pd.to_datetime(df["dt"], errors="coerce")
df["dt"] = df["dt"].astype("category")

categorical_cols = [
    "protocol_name",
    "protocol_slug",
    "protocol_category",
    "parent_protocol",
    "alignment",
    "token_category",
    "token_category_misrep",
    "protocol_category_mapped",
    "project",
    "source_protocol",
    "chain",
    "token",
]
for c in categorical_cols:
    if c in df.columns:
        df[c] = df[c].astype("category")

df["misrepresented_tokens"] = df["misrepresented_tokens"].astype("int8")
# Pairs without a flag are NaN after the left merge; treat them as "not flagged"
df["chain_misrepresented_tokens"] = df["chain_misrepresented_tokens"].fillna(0).astype("int8")

float_cols = ["app_token_tvl", "app_token_tvl_usd"]
for fc in float_cols:
    if fc in df.columns:
        df[fc] = df[fc].astype("float32")

df.info()
# Write the parquet copy under its own name. Saving it as
# "protocol_data_cleaned.csv" would overwrite the CSV exported in step 12,
# which the filtering cell loads with pd.read_csv.
df.to_parquet("protocol_data_cleaned.parquet", index=False)


## Data Filtering & Aggregation for the Growth measures

In [None]:
# data_filter_aggregate.py

"""
Part 2: Data Filtering & Aggregation
------------------------------------
Loads 'protocol_data_cleaned.csv', applies pattern/category filters and the
chain selection, aggregates TVL by parent_protocol, and then excludes
protocols under 500k TVL in their final week. Exports 'final_tvl_aggregate.csv'.
"""

import pandas as pd
import numpy as np
import re
import warnings
from tqdm import tqdm

# Silence all warnings and print floats with three decimals in this cell's output
warnings.filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# File path to the cleaned data
CLEANED_DATA_PATH = "protocol_data_cleaned.csv"

# Inclusive date window passed to exclude_small_tvl_protocols
ANALYSIS_START = "2024-01-01"
ANALYSIS_END = "2025-01-13"

# Patterns and categories from data_preparation.py
# The patterns are regular expressions, matched case-insensitively by
# does_match_unwanted_pattern.
UNWANTED_PATTERNS = [
    "-borrowed", "-vesting", "-staking", "-pool2", "-treasury", "-cex",
    "^treasury$", "^borrowed$", "^staking$", "^pool2$", "polygon-bridge-&-staking", ".*-cex$"
]
UNWANTED_CATEGORIES = ["CEX", "Chain"]

# Raw or already-simplified protocol category -> final reporting bucket.
# Categories missing from this map keep their raw value (see step 3 below).
# NOTE(review): this map differs from protocol_category_map in the preparation
# cell (e.g. "Dexes" -> "Trading" here) - confirm which one is authoritative.
SIMPLE_CATEGORY_MAP = {
    # Core categories from the original snippet
    "Dexes": "Trading",
    "Liquidity manager": "Yield",
    "Derivatives": "Derivatives",
    "CEX": "Other",
    "Staking": "Staking/Liquid Staking",
    "Farm": "Yield",
    "CDP": "Lending",
    "Bridge": "Trading",
    "Chain": "Other",
    "Lending": "Lending",
    "Cross Chain": "Trading",
    "Algo-Stables": "Stablecoins",
    "Synthetics": "Derivatives",
    "Stablecoins": "Stablecoins",
    "Yield": "Yield",
    "Trading": "Trading",
    "TradFi/Fintech": "TradFi/Fintech",
    "Other": "Other",
    "Restaking/Liquid Restaking": "Restaking/Liquid Restaking",
    "Liquid Staking": "Liquid Staking",
    "Staking/Liquid Staking": "Staking/Liquid Staking",

    # Additional sub-categories appearing in df_tvl_agg/final_growth_df
    "Yield Aggregator": "Yield",
    "RWA": "TradFi/Fintech",
    "Indexes": "Yield",
    "Liquid Restaking": "Restaking/Liquid Restaking",  # same bucket as above
    "Insurance": "TradFi/Fintech",
    "NFT Lending": "Lending",
    "Options": "Trading",
    "Privacy": "TradFi/Fintech",
    "Leveraged Farming": "Yield",
    "Prediction Market": "Trading",
    "Payments": "TradFi/Fintech",
    "Uncollateralized Lending": "Lending",
    "NFT Marketplace": "Trading",
    "Launchpad": "TradFi/Fintech",
    "Token Locker": "Yield",
    "Restaking": "Restaking/Liquid Restaking",         # same bucket as above
    "Basis Trading": "Trading",
    "DEX Aggregator": "Trading",
    "Options Vault": "Trading",
    "CDP Manager": "Lending",
    "Reserve Currency": "Stablecoins",
    "Managed Token Pools": "Trading",
    "Yield Lottery": "Yield",
    "Liquidity Automation": "Yield",
    "RWA Lending": "Lending",
    "Bug Bounty": "TradFi/Fintech",
    "Governance Incentives": "Yield",
    "Treasury Manager": "TradFi/Fintech",
    "Decentralized BTC": "Other",
    "AI Agents": "TradFi/Fintech",
    "OTC Marketplace": "Trading",
    "SoFi": "TradFi/Fintech",
    "Anchor BTC": "Other",
    "Staking Pool": "Staking/Liquid Staking"
}

def does_match_unwanted_pattern(s: str, patterns=None) -> bool:
    """Return True when ``s`` matches any undesired pattern (case-insensitive).

    Args:
        s: Text to test, e.g. a chain name. Non-string values (such as the
            NaN pandas uses for missing text) never match.
        patterns: Optional iterable of regular-expression strings. When
            omitted, the module-level ``UNWANTED_PATTERNS`` is used.

    Returns:
        True if ``re.search`` finds at least one of the patterns in ``s``.
    """
    if not isinstance(s, str):
        return False
    if patterns is None:
        patterns = UNWANTED_PATTERNS
    return any(re.search(pattern, s, re.IGNORECASE) for pattern in patterns)

def exclude_small_tvl_protocols(df: pd.DataFrame, analysis_start: str, analysis_end: str, min_tvl: int = 500000):
    """
    Drop protocols whose recent average TVL is below ``min_tvl``.

    Rows are first limited to ``analysis_start <= dt <= analysis_end``. For
    each protocol the USD TVL is resampled to daily values (last value per
    day) and the mean over the final 7 days of that protocol's own series is
    compared with ``min_tvl``. The window therefore ends at the protocol's
    last in-period date, which can be earlier than ``analysis_end``.

    Args:
        df: Frame with ``protocol_name``, ``dt`` and ``app_token_tvl_usd``
            columns. It is not modified.
        analysis_start: First date of the analysis period (inclusive).
        analysis_end: Last date of the analysis period (inclusive).
        min_tvl: Minimum mean TVL over the final 7 days. The exclusion reason
            text always reads "<500k", whatever value is passed.

    Returns:
        Tuple ``(kept_rows, excluded)``: the in-period rows of every kept
        protocol (``dt`` is the first column), and a list of
        ``{"protocol", "reason"}`` dicts for the protocols left out.
    """
    # Work on a converted copy so the caller's frame is left untouched.
    frame = df.assign(dt=pd.to_datetime(df["dt"], errors="coerce"))
    start = pd.to_datetime(analysis_start)
    end = pd.to_datetime(analysis_end)

    # Apply the period filter once and split by protocol in a single pass,
    # instead of rescanning the whole frame for every protocol.
    in_period = frame[(frame["dt"] >= start) & (frame["dt"] <= end)]
    rows_by_protocol = {
        name: rows for name, rows in in_period.groupby("protocol_name", sort=False)
    }

    kept_records = []
    excluded_list = []

    for proto in tqdm(frame["protocol_name"].unique()):
        if pd.isna(proto):
            # A missing name never compares equal to anything, so it owns no rows.
            excluded_list.append({"protocol": proto, "reason": "No data"})
            continue

        df_single = rows_by_protocol.get(proto)
        if df_single is None:
            excluded_list.append({"protocol": proto, "reason": "No data in period"})
            continue

        df_single = df_single.set_index("dt")
        daily_tvl_series = df_single["app_token_tvl_usd"].resample("D").last()
        # Final 7 daily slots; explicit slicing replaces the deprecated Series.last("7D").
        window_start = daily_tvl_series.index[-1] - pd.Timedelta(days=7)
        last_week = daily_tvl_series[daily_tvl_series.index > window_start]

        if last_week.mean() < min_tvl:
            excluded_list.append({"protocol": proto, "reason": "<500k TVL last 7 days"})
            continue

        kept_records.append(df_single.reset_index())

    if kept_records:
        final_filtered_df = pd.concat(kept_records, ignore_index=True)
    else:
        final_filtered_df = pd.DataFrame(columns=df.columns)

    return final_filtered_df, excluded_list

# 1. Read the cleaned export produced by the preparation cell
df_main_cleaned = pd.read_csv(CLEANED_DATA_PATH)
print(f"Loaded df_main_cleaned with shape {df_main_cleaned.shape}")

# 2. Parse dates (unparseable values become NaT) and make chain names plain strings
df_main_cleaned["dt"] = pd.to_datetime(df_main_cleaned["dt"], errors="coerce")
df_main_cleaned["chain"] = df_main_cleaned["chain"].astype(str)

# 3. Re-derive the simplified category from the raw one; a category without an
#    entry in SIMPLE_CATEGORY_MAP keeps its raw value
df_main_cleaned["protocol_category_mapped"] = (
    df_main_cleaned["protocol_category"]
    .map(SIMPLE_CATEGORY_MAP, na_action="ignore")
    .fillna(df_main_cleaned["protocol_category"])
)

# 4. One row per (chain, protocol, category) combination, marked 1 when the
#    chain name hits an unwanted pattern, the slug is a CEX / the Polygon
#    bridge entry, or the category is unwanted
_FILTER_KEYS = ["chain", "protocol_slug", "protocol_category"]
df_chain_proto_helper = df_main_cleaned[_FILTER_KEYS].drop_duplicates()
df_chain_proto_helper["unwanted_mark"] = (
    df_chain_proto_helper["chain"].apply(does_match_unwanted_pattern)
    | df_chain_proto_helper["protocol_slug"].str.endswith("-cex")
    | df_chain_proto_helper["protocol_slug"].eq("polygon-bridge-&-staking")
    | df_chain_proto_helper["protocol_category"].isin(UNWANTED_CATEGORIES)
).astype(int)

# 5. Example chain selection logic: only Ethereum is selected
df_chain_proto_helper["chain_selected"] = (
    df_chain_proto_helper["chain"] == "Ethereum"
).astype(int)

# 6. Keep combinations that are selected and not marked unwanted
final_chain_mask = (
    (df_chain_proto_helper["unwanted_mark"] == 0)
    & (df_chain_proto_helper["chain_selected"] == 1)
)

# 7. Inner-join the surviving combinations back onto the full data
df_postfilter = df_main_cleaned.merge(
    df_chain_proto_helper.loc[final_chain_mask, _FILTER_KEYS],
    on=_FILTER_KEYS,
    how="inner",
)
print(f"df_postfilter shape: {df_postfilter.shape}")

# 8. Aggregate by (parent_protocol, dt) => cross-chain TVL
def _most_common_category(values):
    """Return the most frequent category in ``values`` (the first value if there is no mode)."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else values.iloc[0]


# groupby drops rows whose key is missing. A protocol without a parent would
# vanish from the rollup, so such rows are grouped under their own slug.
# read_csv loads the text "nan"/"None" as missing by default, which is how a
# parent that was stringified before export comes back as NaN here.
df_rollup = (
    df_postfilter
    .assign(
        parent_protocol=df_postfilter["parent_protocol"].fillna(df_postfilter["protocol_slug"])
    )
    .groupby(["parent_protocol", "dt"], as_index=False)
    .agg(
        app_token_tvl_usd=("app_token_tvl_usd", "sum"),
        app_token_tvl=("app_token_tvl", "sum"),
        chains_count=("chain", "nunique"),
        protocol_category=("protocol_category_mapped", _most_common_category),
    )
    .sort_values(by="app_token_tvl_usd", ascending=False)
)
# Downstream code identifies each aggregated series by "protocol_name"
df_rollup.rename(columns={"parent_protocol": "protocol_name"}, inplace=True)
print(f"df_rollup shape: {df_rollup.shape}")

# 9. Drop protocols whose mean TVL over their final week is under the default threshold
df_final_tvl, excluded_list = exclude_small_tvl_protocols(
    df_rollup, ANALYSIS_START, ANALYSIS_END
)
print(f"Final df_final_tvl shape: {df_final_tvl.shape}")
print(f"Excluded {len(excluded_list)} protocols due to filters or low TVL.")

# 10. Persist the aggregated result for the growth calculations
df_final_tvl.to_csv("final_tvl_aggregate.csv", index=False)
print("Saved final_tvl_aggregate.csv.")

## Growth Calculations

In [None]:
# growth_calculations.py

"""
Part 3: Growth Calculations
---------------------------
Loads 'final_tvl_aggregate.csv', computes monthly logistic growth rate, 
MoM growth, and overall percent growth. Exports 'protocol_growth_metrics.csv'.
"""

import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.optimize import curve_fit

# Input written by the filtering/aggregation cell and output of this cell
FINAL_AGGREGATE_PATH = "final_tvl_aggregate.csv"
GROWTH_OUTPUT_PATH = "protocol_growth_metrics.csv"

# `dt` is loaded as text here; it is converted to datetime further down
df_tvl_agg = pd.read_csv(FINAL_AGGREGATE_PATH)
print(f"Loaded df_tvl_agg: {df_tvl_agg.shape}")

def extract_monthly_tvl(df_protocol_slice, start_period, months_count):
    """
    Resample a protocol's USD TVL to month-end values.

    Args:
        df_protocol_slice: Frame with a datetime ``dt`` column and an
            ``app_token_tvl_usd`` column. It is not modified.
        start_period: Start of the window; the first returned label is the
            first month end on or after this date.
        months_count: Number of consecutive month ends to return.

    Returns:
        Series indexed by ``months_count`` month-end dates holding the last
        observed TVL of each month, with NaN for months that have no data.
    """
    # An offset object is used instead of the "M" alias, which newer pandas
    # versions deprecate; the object is accepted by old and new versions alike.
    month_end = pd.offsets.MonthEnd()
    indexed = df_protocol_slice.set_index("dt")
    monthly_series = indexed["app_token_tvl_usd"].resample(month_end).last()
    full_month_range = pd.date_range(start=start_period, periods=months_count, freq=month_end)
    return monthly_series.reindex(full_month_range)

def fit_logistic_growth(monthly_values):
    """
    Fit the logistic curve ``K / (1 + exp(-r * (x - t0)))`` to a series.

    The x values are the positions 0..n-1 of the non-missing observations.

    Args:
        monthly_values: Series of TVL values; missing values are dropped.

    Returns:
        The fitted growth rate ``r``, or None when fewer than three values
        remain or the fit fails.
    """
    monthly_values = monthly_values.dropna()
    if len(monthly_values) < 3:
        return None
    time_index = np.arange(len(monthly_values))
    y_vals = monthly_values.values

    def _logistic(x, K, r, t0):
        return K / (1 + np.exp(-r * (x - t0)))

    # Starting point: plateau at the largest observation, unit growth rate,
    # midpoint in the middle of the window.
    initial_guess = [y_vals.max(), 1.0, np.median(time_index)]
    try:
        popt, _ = curve_fit(
            _logistic,
            time_index,
            y_vals,
            p0=initial_guess,
            maxfev=10000,
        )
    except Exception:
        # A failed fit is an expected outcome and is reported as None. Unlike
        # a bare ``except``, this lets KeyboardInterrupt/SystemExit through.
        return None
    return popt[1]

def calculate_percent_growth(monthly_values):
    """
    Relative change from the first to the last non-missing value.

    Returns None when fewer than two values remain after dropping missing
    ones, or when the first value is zero.
    """
    observed = monthly_values.dropna()
    if len(observed) < 2:
        return None
    first = observed.iloc[0]
    if first == 0:
        return None
    last = observed.iloc[-1]
    return (last - first) / first

def calculate_average_mom_growth(monthly_values):
    """
    Average month-over-month change of a series, absolute and relative.

    Missing values are dropped first. Returns ``(None, None)`` when fewer than
    two values remain; otherwise ``(mean absolute change, mean relative
    change)``, where each relative change is divided by the previous value.
    """
    observed = monthly_values.dropna()
    if len(observed) < 2:
        return None, None
    previous = observed.shift(1)
    deltas = (observed - previous).dropna()
    relative = deltas / previous
    return deltas.mean(), relative.mean()

def compute_weighted_ranks(monthly_data_dict):
    """
    Combine per-month TVL ranks into one weighted rank per protocol.

    ``monthly_data_dict`` maps protocol name -> monthly Series. Within each
    month protocols are ranked by TVL (largest = rank 1, ties share the lowest
    rank). Months are weighted by ``log1p(position)`` normalised to sum to 1,
    so later months weigh more. Returns a Series indexed by protocol name.
    """
    tvl_by_month = pd.DataFrame(monthly_data_dict)
    ranks = tvl_by_month.rank(axis=1, method="min", ascending=False)
    raw_weights = np.log1p(np.arange(1, len(ranks) + 1))
    month_weights = raw_weights / raw_weights.sum()
    return ranks.mul(month_weights, axis=0).sum()

def analyze_growth_for_category(category_name, df_input, months_window=4, cutoff_date=None):
    """
    Build growth metrics for every protocol of one category.

    Each protocol's TVL is reduced to ``months_window`` month-end values ending
    in the month of its latest date (on or before ``cutoff_date`` when given).
    Protocols with a missing month are skipped. The rest are ranked against
    each other, then percent growth, logistic growth rate and average
    month-over-month growth are computed per protocol.

    Returns a pair of DataFrames: the metrics, and the skipped protocols with
    the reason for each.
    """
    category_rows = df_input[df_input["protocol_category"] == category_name].copy()
    category_rows["dt"] = pd.to_datetime(category_rows["dt"], errors="coerce")
    names = category_rows["protocol_name"].unique()

    if cutoff_date:
        cutoff_date = pd.to_datetime(cutoff_date)

    skipped = []
    metrics = []
    series_by_protocol = {}

    def skip(name, reason):
        skipped.append({"protocol": name, "reason": reason})

    # Pass 1: collect a complete monthly series for each protocol
    for name in tqdm(names):
        rows = category_rows[category_rows["protocol_name"] == name]
        if rows.empty:
            skip(name, "No data")
            continue

        if cutoff_date:
            rows = rows[rows["dt"] <= cutoff_date]
        newest = rows["dt"].max()
        if pd.isna(newest):
            skip(name, "No valid dt")
            continue

        # The window spans the month of the newest row and the months before it
        window_end = newest.replace(day=1)
        window_start = window_end - pd.DateOffset(months=months_window - 1)
        series = extract_monthly_tvl(rows, window_start, months_window)

        if series.isna().any():
            skip(name, "NaN in monthly TVL")
            continue

        series_by_protocol[name] = series

    # Ranks are computed across all protocols that passed pass 1
    ranks = compute_weighted_ranks(series_by_protocol)

    # Pass 2: per-protocol growth measures
    for name, series in series_by_protocol.items():
        growth = calculate_percent_growth(series)
        if growth is None:
            skip(name, "Invalid percent growth")
            continue

        rate = fit_logistic_growth(series)
        if rate is None:
            skip(name, "Invalid logistic fit")
            continue

        abs_mom, pct_mom = calculate_average_mom_growth(series)
        rows = category_rows[category_rows["protocol_name"] == name]
        # chains_count is read from the protocol's first row in the category slice
        chains = rows["chains_count"].iloc[0] if not rows.empty else 1

        metrics.append({
            "protocol_name": name,
            "chains_count": chains,
            "tvl_percent_growth": growth,
            "avg_tvl_rank": ranks[name],
            "logistic_growth_rate": rate,
            "avg_mom_growth_abs": abs_mom,
            "avg_mom_growth_percent": pct_mom
        })

    return pd.DataFrame(metrics), pd.DataFrame(skipped)

def process_growth_for_all_categories(df_input, months_span=6, cutoff_date=None):
    """
    Run the per-category growth analysis for every category in ``df_input``.

    ``df_input["dt"]`` is converted to datetime in place. When ``cutoff_date``
    is not given, the latest ``dt`` in the data is used. Returns
    ``(df_growth, df_excluded)``: the stacked metrics of all categories (with
    a ``protocol_category`` column) and the stacked exclusion records. Either
    one is an empty DataFrame when nothing was collected.
    """
    df_input["dt"] = pd.to_datetime(df_input["dt"], errors="coerce")
    cutoff_date = pd.to_datetime(cutoff_date) if cutoff_date else df_input["dt"].max()

    unique_categories = df_input["protocol_category"].unique()
    print("Computing growth measures for categories:", unique_categories)

    growth_frames = []
    excluded_frames = []

    for cat_item in unique_categories:
        print(f"\nCategory => {cat_item}")
        growth, excluded = analyze_growth_for_category(
            category_name=cat_item,
            df_input=df_input,
            months_window=months_span,
            cutoff_date=cutoff_date,
        )
        growth["protocol_category"] = cat_item

        # Only non-empty frames are stacked
        if not growth.empty:
            growth_frames.append(growth)
        if not excluded.empty:
            excluded_frames.append(excluded)

    df_growth = pd.concat(growth_frames, ignore_index=True) if growth_frames else pd.DataFrame()
    df_excluded = pd.concat(excluded_frames, ignore_index=True) if excluded_frames else pd.DataFrame()

    return df_growth, df_excluded

# The cell runs at top level; the main() wrapper is left commented out.
# def main():
df_tvl_agg["dt"] = pd.to_datetime(df_tvl_agg["dt"], errors="coerce")
# Six monthly points per protocol, using data up to and including 2024-12-01
final_growth_df, excluded_df = process_growth_for_all_categories(
    df_input=df_tvl_agg,
    months_span=6,
    cutoff_date="2024-12-01"
)
print("Calculated growth measures. Sample:")
print(final_growth_df.head())

# Save
final_growth_df.to_csv(GROWTH_OUTPUT_PATH, index=False)
print(f"Saved '{GROWTH_OUTPUT_PATH}' with shape:", final_growth_df.shape)

# if __name__ == "__main__":
#     main()

In [None]:
df_tvl_agg["protocol_category"].value_counts()

In [None]:
final_growth_df["protocol_category"].value_counts()