In [1]:
# Imported in this cell because the guard runs before the notebook's main import cell;
# without it a fresh kernel fails here with NameError instead of the intended message.
import os

# Fail fast, pointing at the README, when the environment has not been prepared.
# `.get(name, '')` makes an unset variable trigger the assertion message below
# rather than a bare KeyError that hides the hint.
assert len(os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', '')) > 0, \
            "Set GOOGLE_APPLICATION_CREDENTIALS prior to use, see README.md"
assert len(os.environ.get('PYTHONPATH', '')) > 0, \
            "Set PYTHONPATH to include this repository prior to use, see README.md"

# Complete Data Processing Pipeline

In [2]:
import os
from typing import Optional, List

# Table that is exported from BigQuery, used as the bucket sub-path and the local folder name.
BQ_TABLE_NAME: str = 'traces'

# Bucket that receives the BigQuery export and is read back by the download step.
BUCKET_NAME: str = 'eth-aml-data'

# Local target directory for downloaded files (also the input directory of the sort step).
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
DOWNLOAD_DIR: str = f'/media/ponbac/Biggy1/ethereum/{BQ_TABLE_NAME}'

# Columns to keep when downloading; None = download all columns.
COLUMNS_TO_SAVE: Optional[List[str]] = ['block_number', 'from_address', 'to_address', 'value']

# Upper bound on the number of blobs fetched; None = download all blobs to local storage.
MAX_BLOBS: Optional[int] = 2

### BigQuery Ethereum Table &rarr; Google Cloud Storage Bucket

In [3]:
from utils.bigquery import EthereumBigQuery
from pprint import pprint

# Helper object used for the BigQuery side of the pipeline (listing and exporting tables).
query: EthereumBigQuery = EthereumBigQuery()

# List the tables that are available for download.
available_tables = query.get_table_names()
pprint(available_tables)

['bigquery-public-data.crypto_ethereum.amended_tokens',
 'bigquery-public-data.crypto_ethereum.balances',
 'bigquery-public-data.crypto_ethereum.blocks',
 'bigquery-public-data.crypto_ethereum.contracts',
 'bigquery-public-data.crypto_ethereum.logs',
 'bigquery-public-data.crypto_ethereum.sessions',
 'bigquery-public-data.crypto_ethereum.token_transfers',
 'bigquery-public-data.crypto_ethereum.tokens',
 'bigquery-public-data.crypto_ethereum.traces',
 'bigquery-public-data.crypto_ethereum.transactions']


In [4]:
# Export the configured table to `<bucket>/<table>`.
# Output is CSV by default; pass as_json=True or as_parquet=True to export JSON or Parquet instead.
export_destination = f'{BUCKET_NAME}/{BQ_TABLE_NAME}'
query.export_to_bucket(BQ_TABLE_NAME, export_destination)

### Google Cloud Storage Bucket &rarr; Local Storage

In [3]:
from utils.storage.base import EthereumStorage
from utils.storage.google_cloud_storage import GoogleCloudStorage

# Storage backend used for the bucket -> local download step. It is annotated with the
# EthereumStorage base type (presumably GoogleCloudStorage implements it — confirm in utils/storage).
storage: EthereumStorage = GoogleCloudStorage()

In [4]:
# Download at most MAX_BLOBS blobs stored under the table's prefix in the bucket into
# DOWNLOAD_DIR, keeping only COLUMNS_TO_SAVE (None for either setting means "all").
blob_prefix = f'{BQ_TABLE_NAME}/'
storage.download(BUCKET_NAME, blob_prefix, DOWNLOAD_DIR, MAX_BLOBS, use_cols=COLUMNS_TO_SAVE)

100%|██████████| 2/2 [00:00<00:00,  5.53it/s]


### Sort & Merge Downloaded Files
The sorted and merged CSV will be inside `DOWNLOAD_DIR/processed/`.

In [6]:
import scripts.sort_big_csv as sort_big_csv
import argparse

# Options handed to sort_big_csv.main as a Namespace, standing in for parsed command-line arguments.
# NOTE(review): 'transaction_index' is used as a sort key but is not listed in COLUMNS_TO_SAVE,
# so freshly downloaded files may lack that column — confirm before a clean Restart & Run All.
# NOTE(review): the original cell left 'trace_address' commented out, presumably as a
# possible additional sort key.
sort_options = {
    'csv_dir': DOWNLOAD_DIR,
    'merge_only': False,
    'sort_only': False,
    'sort_columns': ['block_number', 'transaction_index'],
    'out_filename': f'{BQ_TABLE_NAME}-sorted.csv',
}
args = argparse.Namespace(**sort_options)
sort_big_csv.main(args)

['block_number', 'transaction_index']
hello
running simple sort...


100%|██████████| 10/10 [00:03<00:00,  3.29it/s]


running merge sort...


100%|██████████| 10/10 [00:09<00:00,  1.05it/s]


removing temp file...
