In this notebook we explore the DAOs Census dataset to select the DAOs against which we want to run the recommender system.

In [None]:
from pathlib import Path
import datetime as dt

import numpy as np
import pandas as pd

import duckdb

%load_ext sql
%config SqlMagic.autopandas = True

In [None]:
# Kaggle dataset identifier handed to the Kaggle API when downloading.
KAGGLE_DATASET: str = 'daviddavo/daos-census-tfm'
# Local folder the dataset is downloaded into and the parquet files are read from.
AUX_PATH: Path = Path('~/Downloads/daos-census-tfm').expanduser()
# Optional cutoff. When set it must expose `.isoformat()` (e.g. a `dt.date` or
# `dt.datetime`); votes and proposals whose `date` is later are filtered out.
# `None` disables the filter.
CUTOFF_DATE = None

# Maps a merged organization name to the deployment names grouped under it.
# Deployments whose name is not listed keep their own name. Names are matched
# with a SQL `IN` on the exact string, hence both 'Aave' and 'AAVE' are listed.
ORGS_DICT: dict[str, list[str]] = {
    'dxDAO - xDXdao': ['dxDAO', 'xDXdao'],
    'Aave - Aavegotchi': ['Aave', 'Aavegotchi', 'AAVE'],
    'MetaCartel - MetaCartel Ventures': ['MetaCartel Ventures', 'MetaCartel xDai', 'MetaCartel DAO'],
}

## Download the dataset if it does not exist locally

In [None]:
import kaggle

In [None]:
# Download and unzip the Kaggle dataset only when the local folder is missing.
_dataset_cached = AUX_PATH.exists()
if not _dataset_cached:
    kaggle.api.dataset_download_cli(KAGGLE_DATASET, path=AUX_PATH, unzip=True)

## Processing the dataset

In [None]:
def _list2sql(lst: list[str]) -> str:
    return "".join(["(", ", ".join(map("'{}'".format, lst)), ")"])

def _gen_orgs_query(parquet: Path) -> str:
    """Build the SQL that creates the ``deployments`` view with merged org names.

    The view scans ``parquet``, keeps the original name as ``deployment_name``
    and exposes as ``name`` the organization that ``ORGS_DICT`` maps the
    deployment to, falling back to the original name when it is not listed.

    Args:
        parquet: Path of the deployments parquet file to scan.

    Returns:
        The ``CREATE VIEW`` statement as a string.
    """
    # One WHEN branch per merged organization.
    when_branches = [
        "    WHEN name IN {caselst} THEN '{orgname}'".format(
            caselst=_list2sql(members),
            orgname=merged_name,
        )
        for merged_name, members in ORGS_DICT.items()
    ]

    # Assembled line by line so the leading/trailing whitespace is explicit.
    sql_lines = [
        "",
        "CREATE VIEW deployments AS",
        "SELECT * EXCLUDE (name),",
        "    name AS deployment_name,",
        "    CASE ",
        "\n".join(when_branches),
        "    ELSE name",
        "    END AS name",
        f"FROM parquet_scan('{parquet}')",
        "    ",
    ]
    return "\n".join(sql_lines)

### Import from the Parquet files

In [None]:
db = duckdb.connect(database=':memory:', read_only=False)
# db.execute(_gen_orgs_query(AUX_PATH / 'deployments.parquet'))
_cond_date = ""
if CUTOFF_DATE:
    _cond_date = f"WHERE date <= '{CUTOFF_DATE.isoformat()}'"
print(_cond_date)

db.execute("CREATE VIEW deployments AS SELECT * FROM parquet_scan('{}')".format(AUX_PATH / "deployments.parquet"))
db.execute("CREATE VIEW votes AS SELECT * FROM parquet_scan('{}') {}".format(AUX_PATH / "votes.parquet", _cond_date))
db.execute("CREATE VIEW proposals AS SELECT * FROM parquet_scan('{}') {}".format(AUX_PATH / "proposals-text.parquet", _cond_date))

%sql db --alias duckdb

In [None]:
# List what is registered on the connection, queried through the Python API.
db.execute("SHOW TABLES").df()

In [None]:
%%sql
-- Same listing as the previous cell, this time issued through the SQL magic.
SHOW TABLES

In [None]:
%%sql
-- Inspect the column metadata that DuckDB reports for the deployments view.
SELECT * FROM duckdb_columns() WHERE table_name IN ('deployments')

In [None]:
# Template for one WHEN branch of the CASE expression that maps a deployment
# name to its merged organization.
_casestr = "WHEN name IN {caselst} THEN '{orgname}'"

# One branch per entry of ORGS_DICT, joined with newlines; the `orgs` view query
# interpolates this string as `_cases`.
_cases = "\n".join(
    [
        _casestr.format(caselst=_list2sql(members), orgname=merged_name)
        for merged_name, members in ORGS_DICT.items()
    ]
)
print(_cases)

In [None]:
%%sql
CREATE OR REPLACE VIEW orgs AS
    -- One row per organization. Deployments listed in ORGS_DICT are merged
    -- under a single name through the CASE branches rendered from _cases.
    WITH Gv AS (
        -- Every deployment paired with its votes. The LEFT JOIN keeps a
        -- deployment without votes as one row whose vote columns are NULL,
        -- vote_deployment_id is NULL exactly on those placeholder rows.
        SELECT *,
            votes.deployment_id AS vote_deployment_id,
            CASE
                {{_cases}}
                ELSE name
            END AS orgname
        FROM deployments
        LEFT JOIN votes ON (deployments.id = votes.deployment_id)
        -- WHERE platform IN ({platforms})
    ),
    G AS (
        -- Per-organization aggregates.
        SELECT
            orgname AS name,
            COUNT(DISTINCT Gv.id) AS n_deploys,
            LIST(DISTINCT Gv.id) AS deploys,
            LIST(DISTINCT Gv.platform) AS platforms,
            LIST(DISTINCT Gv.platform_deployment_id) AS platform_ids,
            COUNT(DISTINCT Gv.proposal_id) AS proposals_count,
            COUNT(DISTINCT Gv.voter) AS voters_count,
            -- Count only rows that matched a vote. COUNT(*) would also count
            -- the placeholder row of every deployment that has no votes.
            COUNT(Gv.vote_deployment_id) AS votes_count,
        FROM Gv
        GROUP BY Gv.orgname
    )
    SELECT
        *,
        -- Voters and proposals are the two node sets of the voting graph.
        proposals_count + voters_count AS nodes,
        -- Density of a simple undirected graph with that many nodes, 2m / (n (n - 1)).
        2*(votes_count)/((voters_count + proposals_count)*(voters_count + proposals_count-1)) AS density_naive,
        -- NOTE(review) the factor 2 lets this exceed 1, the usual bipartite
        -- density is votes / (voters * proposals). Confirm which is intended.
        2*(votes_count)/(voters_count*proposals_count) AS density,
        -- Votes per proposal and votes per voter.
        votes_count/proposals_count AS vpp, 
        votes_count/voters_count AS vpv
    FROM G
    ORDER BY proposals_count DESC

In [None]:
%%sql
-- Spot check of the orgs view, at most 5 organizations whose name contains
-- "dorg" regardless of case.
SELECT *
FROM orgs
WHERE lower(name) LIKE '%dorg%'
LIMIT 5

In [None]:
%%sql df_all <<
-- Select the organizations with more than 300 proposals and a non-NULL name.
-- The result is assigned to the Python variable df_all.
SELECT *
FROM orgs
WHERE proposals_count > 300
    AND name IS NOT NULL

In [None]:
# Display the organizations selected by the previous query.
df_all