This notebook generates the Parquet files (`votes.pq` and `proposals.pq`) for a given DAO from the DAO Census dataset.

In [None]:
from typing import Optional
from pathlib import Path
import datetime as dt

import numpy as np
import pandas as pd

## Hparams (DAO name)

In [None]:
# Name of the organization to export. It is compared against the `name` column
# of the `deployments` view, i.e. after the merges in ORGS_DICT are applied.
ORG_NAME: str = 'Decentraland'
# Platform(s) to keep. A single platform name or a list of names is accepted;
# a falsy value (None, '' or []) disables the platform filter.
FILTER_PLATFORMS: str = 'snapshot'
# Kaggle dataset identifier, downloaded only when AUX_PATH does not exist.
KAGGLE_DATASET: str = 'daviddavo/daos-census-tfm'
# Output directory where votes.pq and proposals.pq are written.
EXPORT_PATH: Path = Path('../data/input') / ORG_NAME
# Local directory holding the dataset parquet files.
AUX_PATH: Path = Path('~/Downloads/daos-census-tfm').expanduser()
# Optional ISO-format date string; when set, only votes and proposals whose
# `date` is on or before it are loaded.
CUTOFF_DATE_STR: Optional[str] = None

# This dictionary "merges" organizations: each key is the merged organization
# name, and each value lists the deployment names that are reported under it.
ORGS_DICT: dict[str, list[str]] = {
    'dxDAO - xDXdao': ['dxDAO', 'xDXdao'],
    'Aave - Aavegotchi': ['Aave', 'Aavegotchi', 'AAVE'],
    'MetaCartel - MetaCartel Ventures': ['MetaCartel Ventures', 'MetaCartel xDai', 'MetaCartel DAO'],
}

In [None]:
# Parse the optional cutoff string into a datetime; it stays None when no
# cutoff was configured (CUTOFF_DATE_STR is None or empty).
if CUTOFF_DATE_STR:
    CUTOFF_DATE = dt.datetime.fromisoformat(CUTOFF_DATE_STR)
else:
    CUTOFF_DATE = None

## Downloading the dataset if it does not exist

In [None]:
import kaggle

In [None]:
# Download and unzip the Kaggle dataset only when the local directory is
# missing; an existing directory is reused as-is and never refreshed.
_dataset_is_cached = AUX_PATH.exists()
if not _dataset_is_cached:
    kaggle.api.dataset_download_cli(KAGGLE_DATASET, path=AUX_PATH, unzip=True)

## Processing the dataset

In [None]:
import duckdb

In [None]:
def _list2sql(lst: list[str]) -> str:
    return "".join(["(", ", ".join(map("'{}'".format, lst)), ")"])

def _gen_orgs_query(parquet: Path) -> str:
    """Build the SQL that creates the `deployments` view over a parquet file.

    The view keeps every column of the parquet file, exposes the original
    `name` as `deployment_name`, and redefines `name` through a CASE
    expression: deployments listed in ORGS_DICT take the dictionary key as
    their name, all others keep their own name.

    Args:
        parquet: Path of the deployments parquet file to scan.

    Returns:
        The ``CREATE VIEW deployments`` statement as a string.
    """
    # One WHEN branch per merged organization.
    when_clauses = []
    for merged_name, aliases in ORGS_DICT.items():
        when_clauses.append(
            f"    WHEN name IN {_list2sql(aliases)} THEN '{merged_name}'"
        )

    # Assembled line by line; the leading empty line and the trailing
    # whitespace-only line reproduce the layout of a triple-quoted template.
    query_lines = [
        "",
        "CREATE VIEW deployments AS",
        "SELECT * EXCLUDE (name),",
        "    name AS deployment_name,",
        "    CASE ",
        "\n".join(when_clauses),
        "    ELSE name",
        "    END AS name",
        f"FROM parquet_scan('{parquet}')",
        "    ",
    ]
    return "\n".join(query_lines)

### Import from parquets

In [None]:
# In-memory DuckDB database that exposes the dataset parquet files as views.
db = duckdb.connect(database=':memory:', read_only=False)
db.execute(_gen_orgs_query(AUX_PATH / 'deployments.parquet'))

# Optional WHERE clause keeping only rows dated on or before the cutoff;
# empty when no cutoff is configured.
_cond_date = f"WHERE date <= '{CUTOFF_DATE.isoformat()}'" if CUTOFF_DATE else ""

# The same date filter applies to both the votes and the proposals views.
for _view, _fname in (("votes", "votes.parquet"), ("proposals", "proposals-text.parquet")):
    db.execute(f"CREATE VIEW {_view} AS SELECT * FROM parquet_scan('{AUX_PATH / _fname}') {_cond_date}")

### Get votes table

In [None]:
# WHERE conditions selecting the organization (and optionally the platforms).
# `cond_dfv` is reused later by the proposals query.
# Single quotes in the organization name are doubled so that a name containing
# an apostrophe still forms a valid SQL string literal.
_org_name_sql = ORG_NAME.replace("'", "''")
cond_dfv = [f"name='{_org_name_sql}'"]

if FILTER_PLATFORMS:
    # FILTER_PLATFORMS may be a single platform name or a list of names.
    filter_platforms = FILTER_PLATFORMS
    if isinstance(FILTER_PLATFORMS, str):
        filter_platforms = [FILTER_PLATFORMS]

    cond_dfv.append(f"platform IN {_list2sql(filter_platforms)}")

# The query text is kept in `q` so it can be inspected after the cell runs.
q = f"""
SELECT platform, name, votes.*
FROM deployments
RIGHT JOIN votes ON (deployments.id = votes.deployment_id)
WHERE {" AND ".join(cond_dfv)}
"""
# Drop the '_id' part of column names (e.g. proposal_id -> proposal).
dfv = db.execute(q).fetchdf().rename(columns=lambda x: x.replace('_id', ''))
# Lower-case voter identifiers so the same voter always compares equal.
dfv['voter'] = dfv['voter'].str.lower()
dfv

#### Clean and save votes table

We will only leave the following columns:
- id
- proposal
- voter
- date

In [None]:
# Write the selected vote columns, ordered by date, to EXPORT_PATH/votes.pq.
dfv_cols = ['id', 'proposal', 'voter', 'date']
votes_file = EXPORT_PATH / 'votes.pq'

EXPORT_PATH.mkdir(parents=True, exist_ok=True)
# Refuse to overwrite a previous export: delete the file manually to re-run.
assert not votes_file.exists(), "Cant overwrite files"

_votes_sorted = dfv[dfv_cols].sort_values('date')
_votes_sorted.to_parquet(votes_file, index=False)

### Get proposals table

In [None]:
# Proposals of the selected organization. The stored votes_count column is
# replaced by a count of the rows joined from the `votes` view. The query
# text is kept in `q` so it can be inspected after the cell runs.
q = f"""
SELECT platform, name, platform_deployment_id, proposals.* EXCLUDE (votes_count), count(votes.id) AS votes_count
FROM deployments
RIGHT JOIN proposals ON (deployments.id = proposals.deployment_id)
LEFT JOIN votes ON (proposals.id = votes.proposal_id)
WHERE {" AND ".join(cond_dfv)}
GROUP BY proposals.*
-- HAVING count(votes.id) >= {0}
"""
_dfp_raw = db.execute(q).fetchdf()
# Drop the '_id' part of column names (e.g. platform_proposal_id -> platform_proposal).
dfp = _dfp_raw.rename(columns=lambda col: col.replace('_id', ''))
# Lower-case author identifiers so the same author always compares equal.
dfp['author'] = dfp['author'].str.lower()
dfp

#### Clean and save proposals table

We will keep only the following columns:
- id
- author
- date
- start
- end
- platform_proposal
- title
- description

In [None]:
# Write the selected proposal columns, ordered by date, to EXPORT_PATH/proposals.pq.
proposals_file = EXPORT_PATH / 'proposals.pq'
dfp_cols = ['id', 'author', 'date', 'start', 'end', 'platform_proposal', 'title', 'description']

# Refuse to overwrite a previous export: delete the file manually to re-run.
assert not proposals_file.exists(), "Cant overwrite files"

_proposals_sorted = dfp[dfp_cols].sort_values('date')
_proposals_sorted.to_parquet(proposals_file, index=False)