In [40]:
import pandas as pd
import json
import ast
from tqdm import tqdm
from joblib import Parallel, delayed
import numpy as np
import gc

# This line is important to make tqdm work with pandas' apply function
tqdm.pandas()

file_path = 'crypto_merged.parquet'
df = pd.read_parquet(file_path)

nested_columns = ['chainTvls', 'tokens', 'tokensInUsd', 'tvl']
# protocols_df = df.drop(columns=nested_columns)

def extract_chain_tvl(row):
    try:
        chain_info = json.loads(row['chainTvls'])
        return [{
            "id": row['id'],
            "chain_name": chain_name,
            "date": tvl_entry['date'],
            "totalLiquidityUSD": tvl_entry['totalLiquidityUSD']
        } for chain_name, chain_data in chain_info.items() for tvl_entry in chain_data['tvl']]
    except Exception as e:
        print(f"Error in extract_chain_tvl for id {row['id']}: {e}")
        return [None]

# Define the function to apply
def extract_token_tvl(row):
    results = []
    try:
        # Parsing chainTvls
        chains_str = row['chainTvls']
        chains = {}
        if isinstance(chains_str, str):
            chains = json.loads(chains_str.replace('null', '0').replace('NULL', '0'))

        # Parsing tokens
        tokens_str = row['tokens']
        tokens_list = []
        if isinstance(tokens_str, str):
            tokens_list = ast.literal_eval(tokens_str.replace('null', '0').replace('NULL', '0'))

        # Parsing tokensInUsd and creating a lookup dictionary
        tokens_in_usd_str = row['tokensInUsd']
        tokens_in_usd_list = []
        if isinstance(tokens_in_usd_str, str):
            tokens_in_usd_list = ast.literal_eval(tokens_in_usd_str.replace('null', '0').replace('NULL', '0'))

        tokens_in_usd_dict = {item['date']: item['tokens'] for item in tokens_in_usd_list}

        for entry in tokens_list:
            date = entry['date']
            for chain_name, chain_data in chains.items():
                for token_name, quantity in entry['tokens'].items():
                    value_usd = tokens_in_usd_dict.get(date, {}).get(token_name, 0)
                    results.append([row['id'], chain_name, date, token_name, quantity, value_usd])
    except Exception as e:
        print(f"Error in extract_token_tvl for id {row['id']}: {e}")
        results.append([row['id'], None, None, None, None, None])
    return results

def extract_tvl(row):
    try:
        tvl_list = ast.literal_eval(row['tvl'].replace('null', '0').replace('NULL', '0'))
        return [[row['id'], entry['date'], entry['totalLiquidityUSD']] for entry in tvl_list]
    except Exception as e:
        print(f"Error in extract_tvl for id {row['id']}: {e}")
        return [[row['id'], None, None]]

# protocol_chain_tvl_data = df.progress_apply(extract_chain_tvl, axis=1).explode().dropna().tolist()
# protocol_chain_tvl_df = pd.DataFrame(protocol_chain_tvl_data)

protocol_token_tvl_data = df.progress_apply(extract_token_tvl, axis=1).explode().dropna().tolist()
del df
gc.collect()

protocol_token_tvl_df = pd.DataFrame(protocol_token_tvl_data, columns=['id', 'chain_name', 'date', 'token_name', 'quantity', 'value_usd'])

protocol_token_tvl_df.to_csv('protocol_token_tvl.csv', index=False)

100%|██████████| 3993/3993 [00:00<00:00, 15212.24it/s]

Error in extract_token_tvl for id 3777: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error in extract_token_tvl for id 2286: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error in extract_token_tvl for id 3732: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error in extract_token_tvl for id 240: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error in extract_token_tvl for id 2269: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error in extract_token_tvl for id 2: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error in extract_token_tvl for id 2275: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error in extract_token_tvl for id 2272: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error in extract_token_tvl for id 3107: Expecting property name enclosed in 




In [79]:
import pandas as pd
import json
import glob

def extract_token_tvl_from_json_to_df(file_path):
    results = []
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        chains = data['chainTvls']
        tokens_list = data['tokens']
        tokens_in_usd_list = data['tokensInUsd']
        tokens_in_usd_dict = {item['date']: item['tokens'] for item in tokens_in_usd_list}
        for entry in tokens_list:
            date = entry['date']
            for chain_name, chain_data in chains.items():
                for token_name, quantity in entry['tokens'].items():
                    value_usd = tokens_in_usd_dict.get(date, {}).get(token_name, 0)
                    results.append([data['id'], chain_name, date, token_name, quantity, value_usd])
    except Exception as e:
        print(f"Error in extract_token_tvl: {e}")
        results.append([data.get('id', None), None, None, None, None, None])
    return pd.DataFrame(results, columns=['id', 'chain_name', 'date', 'token_name', 'quantity', 'value_usd'])

# Path to the llama folder containing JSON files
folder_path = 'data/llama/*.json'

# List all JSON files in the directory
json_files = glob.glob(folder_path)

# Process each JSON file and collect the resulting DataFrames
dfs = [extract_token_tvl_from_json_to_df(file) for file in json_files]

# Concatenate all DataFrames into one
unified_df = pd.concat(dfs, ignore_index=True)

# Now `unified_df` contains the combined data from all JSON files
print(unified_df)

           id chain_name        date token_name      quantity     value_usd
0        3777   Ethereum  1699920000      MAGIC  2.879522e+08  1.791143e+08
1        3777   Ethereum  1699920000       LINK  3.050918e+06  4.384170e+07
2        3777   Ethereum  1699920000       RETH  1.386769e+04  3.085437e+07
3        3777   Ethereum  1699920000     PENDLE  2.161470e+07  2.269544e+07
4        3777   Ethereum  1699920000        WOO  6.906119e+07  1.639209e+07
...       ...        ...         ...        ...           ...           ...
1859487   182     Solana  1709312639      MATIC  1.387950e+08  1.394889e+08
1859488   182     Solana  1709312639       WETH  9.842368e+06  3.357537e+10
1859489   182     Solana  1709312639        DOT  3.858225e+04  3.213901e+05
1859490   182     Solana  1709312639        KSM  4.871648e+03  2.403184e+05
1859491   182     Solana  1709312639        SOL  1.464893e+05  1.904508e+07

[1859492 rows x 6 columns]


In [80]:
import pandas as pd

# Path to your JSON file
json_file_path = 'data/protocol_headers.json'
# Path where you want to save the Parquet file
parquet_file_path = 'data/protocol_headers.parquet'

# Read the JSON file into a DataFrame
df = pd.read_json(json_file_path)

# Write the DataFrame to a Parquet file
df.to_parquet(parquet_file_path, index=False)

print(f"Data from {json_file_path} has been successfully written to {parquet_file_path}")

ArrowTypeError: ("Expected bytes, got a 'bool' object", 'Conversion failed for column methodology with type object')

In [81]:
import pandas as pd

json_file_path = 'data/protocol_headers.json'
parquet_file_path = 'data/protocol_headers.parquet'

# Read the JSON file into a DataFrame
df = pd.read_json(json_file_path)

# Convert all columns of type 'object' to strings to avoid ArrowTypeError
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].astype(str)

# Attempt to write the DataFrame to a Parquet file again
df.to_parquet(parquet_file_path, index=False)

print(f"Data from {json_file_path} has been successfully written to {parquet_file_path}")

Data from data/protocol_headers.json has been successfully written to data/protocol_headers.parquet


In [84]:
pd.read_parquet('data/protocol_headers.parquet')

Unnamed: 0,id,name,address,symbol,url,description,chain,logo,audits,audit_note,...,assetToken,misrepresentedTokens,staking,pool2,language,oraclesByChain,stablecoins,deadUrl,rugged,deadFrom
0,2269,Binance CEX,,-,https://www.binance.com,Binance is a cryptocurrency exchange which is ...,Multi-Chain,https://icons.llama.fi/binance-cex.jpg,0,,...,,,,,,,,,,
1,182,Lido,0x5a98fcbea516cf06857215779fd812ca3bef1b32,LDO,https://lido.fi/,Liquid staking for Ethereum and Polygon. Daily...,Multi-Chain,https://icons.llama.fi/lido.png,2,,...,,,,,,,,,,
2,2272,OKX,,-,https://www.okx.com,"OKX, formerly known as OKEx, is a Seychelles-b...",Multi-Chain,https://icons.llama.fi/okx.jpg,0,,...,,,,,,,,,,
3,2275,Bitfinex,,-,https://www.bitfinex.com,Bitfinex facilitates a graphical trading exper...,Multi-Chain,https://icons.llama.fi/bitfinex.png,0,,...,,,,,,,,,,
4,3732,Robinhood,,-,https://robinhood.com,Democratizing finance for all. Crypto trading:...,Multi-Chain,https://icons.llama.fi/robinhood.jpg,0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3987,4147,0xScans,0x10703ca5e253306e2ababd68e963198be8887c81,SCAN,https://www.0xscans.com,"Leveraging AI and automated testing, this revo...",Ethereum,https://icons.llama.fi/0xscans.jpg,0,,...,,,2.633865e+06,,,,,,,
3988,4160,BlackrockFund,0xbD6323A83b613F668687014E8A5852079494fB68,BTC,https://www.blackrockfund.finance,State of the art Defi Hedge Fund with disrupti...,Ethereum,https://icons.llama.fi/blackrockfund.jpg,0,,...,,,1.875704e+07,1.178388e+06,,,,,,
3989,4169,Metastrike,bsc:0x496cc0b4ee12aa2ac4c42e93067484e7ff50294b,MTS,https://metastrike.io,Metastrike is a Metaverse FPS Blockchain Game ...,Binance,https://icons.llama.fi/metastrike.png,2,,...,,,7.995934e+04,,,,,,,
3990,4192,WOWMAX Exchange,,-,https://wowmax.exchange,WOWMAX is the next generation DEX aggregation ...,,https://icons.llama.fi/wowmax-exchange.jpg,0,,...,,False,,,,,,,,
