## despliegues

In [126]:
# 1. load deployment parquet, remove the platform = realms rows, and then
# re-add the new realms rows

In [127]:
import pandas as pd

# Load the new realms deployments that will replace the old realms rows,
# and preview the first two records.
realms_deployments_csv = '../DATA/realms/realms_deployments.csv'
new_deployments_df = pd.read_csv(realms_deployments_csv)
new_deployments_df.head(2)

Unnamed: 0,id,platform,platform_deployment_id,name,website,additional,votes_count,proposals_count
0,40624f67-57d7-5bfb-baad-fe443ff81318,realms,GovHgfDPyQ1GwazJTDY2avSVY8GGcpmCapmmCsymRaGe_F...,Psy Finance,,,380,149
1,fe035ab9-73f8-58ea-8de2-8e74f92300ea,realms,GovHgfDPyQ1GwazJTDY2avSVY8GGcpmCapmmCsymRaGe_8...,PSY DO Protocol Council,,,22,9


In [128]:
import pyarrow as pa
# Convert the realms deployments DataFrame into an Arrow table (without the
# pandas index) and display the inferred schema so it can be compared with the
# schema of the existing deployments parquet.
new_deployments_table = pa.Table.from_pandas(new_deployments_df, preserve_index=False)
new_deployments_table.schema

id: string
platform: string
platform_deployment_id: string
name: string
website: double
additional: double
votes_count: int64
proposals_count: int64
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1029

In [129]:
# Show which column names sit at positions 4 and 5 of the new deployments table.
new_deployments_table.column_names[4], new_deployments_table.column_names[5]

('website', 'additional')

In [130]:
# Cast the `website` and `additional` columns of new_deployments_table to
# string (they were inferred as double) so the table matches the schema of the
# existing deployments parquet.
# Each column is located by name rather than by a hard-coded position: with a
# fixed index, a change in the CSV column order would silently overwrite and
# rename whatever column happens to sit at that position.
for column_name in ('website', 'additional'):
    column_index = new_deployments_table.schema.get_field_index(column_name)
    if column_index < 0:
        # get_field_index returns -1 when the name is missing or duplicated
        raise KeyError(f"column {column_name!r} not found in new_deployments_table")
    new_deployments_table = new_deployments_table.set_column(
        column_index,
        column_name,
        new_deployments_table.column(column_index).cast('string'),
    )

In [131]:
# Display the schema again to confirm the result of the string casts.
new_deployments_table.schema

id: string
platform: string
platform_deployment_id: string
name: string
website: string
additional: string
votes_count: int64
proposals_count: int64
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1029

In [132]:
import pyarrow.parquet as pq
# Read the existing (version 6) deployments parquet and display its schema;
# this is the schema the new realms table has to match.
deployments_table = pq.read_table('../parquets_version6/deployments.parquet')
deployments_table.schema

id: string
platform: string
platform_deployment_id: string
name: string
website: string
additional: string
votes_count: int64
proposals_count: int64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1215

In [133]:
# Check that the existing deployments schema and the new realms schema agree
# (same comparison as `==` on the two schemas).
deployments_table.schema.equals(new_deployments_table.schema)

True

In [134]:
# Baseline: load the full version 6 deployments parquet and show its shape
# before the old platform = 'realms' rows are removed (the filtered read in the
# following cell does the removal).
deployments_v6 = pq.read_table('../parquets_version6/deployments.parquet')
deployments_v6.shape

(31013, 8)

In [135]:
# Read the version 6 deployments again, this time keeping only the rows whose
# platform is not 'realms', and show the resulting shape.
non_realms_filter = [('platform', '!=', 'realms')]
deployments_v6_no_realms = pq.read_table('../parquets_version6/deployments.parquet', filters=non_realms_filter)
deployments_v6_no_realms.shape

(28848, 8)

In [136]:
# Save deployments_v6_no_realms to a temporary parquet file in the version 7
# folder; it is read back in batches when the final deployments file is built.
pq.write_table(deployments_v6_no_realms, "../parquets_version7/_temp_deployments_v6_no_realms.parquet")

In [137]:
import pyarrow.parquet as pq
import pyarrow as pa
from pyarrow.parquet import ParquetFile

# Build the version 7 deployments parquet: stream the non-realms v6 rows from
# the temporary file in batches of 10000, then append the new realms rows.
deployments_v6_no_realms_pf = ParquetFile("../parquets_version7/_temp_deployments_v6_no_realms.parquet")

# The writer schema is taken from the input file itself instead of from the
# first batch, so the writer exists even if the file yields no batches
# (previously `pqwriter` would have been undefined in that case).
# Using the writer as a context manager guarantees it is closed, and the
# parquet footer written, even if one of the writes raises.
with pq.ParquetWriter(
    '../parquets_version7/deployments.parquet',
    deployments_v6_no_realms_pf.schema_arrow,
) as pqwriter:
    for record_batch in deployments_v6_no_realms_pf.iter_batches(batch_size=10000):
        pqwriter.write_table(pa.Table.from_batches([record_batch]))

    # append the new realms rows after all the existing non-realms rows
    pqwriter.write_table(new_deployments_table)

In [138]:
# Report the row and column counts of '../parquets_version6/deployments.parquet'
# using only the file's parquet metadata.
with pq.ParquetFile('../parquets_version6/deployments.parquet') as pf:
    footer = pf.metadata
    num_rows, num_columns = footer.num_rows, len(footer.schema.names)
    print(f'Number of rows: {num_rows}')
    print(f'Number of columns: {num_columns}')

Number of rows: 31013
Number of columns: 8


In [139]:
# Report the row and column counts of '../parquets_version7/deployments.parquet'
# using only the file's parquet metadata.
with pq.ParquetFile('../parquets_version7/deployments.parquet') as pf:
    footer = pf.metadata
    num_rows, num_columns = footer.num_rows, len(footer.schema.names)
    print(f'Number of rows: {num_rows}')
    print(f'Number of columns: {num_columns}')

Number of rows: 31135
Number of columns: 8


## propuestas

In [140]:
import pandas as pd

# Load the new realms proposals and preview the first two records.
realms_proposals_csv = '../DATA/realms/realms_proposals.csv'
new_proposals_df = pd.read_csv(realms_proposals_csv)
new_proposals_df.head(2)

Unnamed: 0,id,deployment_id,platform_proposal_id,author,date,votes_count
0,7696e042-1620-56b3-9c10-f58f334895d8,a66d8ba4-7b09-59eb-aa7c-4607d9335fa2,GovER5Lthms3bLBqWub97yVrMmEogzX7xNjdXpPPCVZw_G...,,2022-04-19 20:59:11,0
1,d80e17a0-b287-5e2b-a014-15349a9ec440,a66d8ba4-7b09-59eb-aa7c-4607d9335fa2,GovER5Lthms3bLBqWub97yVrMmEogzX7xNjdXpPPCVZw_c...,,2022-04-22 17:54:06,2


In [141]:
import pyarrow as pa
# Convert the realms proposals DataFrame into an Arrow table (without the
# pandas index) and display the inferred schema.
new_proposals_table = pa.Table.from_pandas(new_proposals_df, preserve_index=False)
new_proposals_table.schema

id: string
deployment_id: string
platform_proposal_id: string
author: double
date: string
votes_count: int64
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 785

In [142]:
# Show which column name sits at position 4 of the new proposals table.
new_proposals_table.column_names[4]

'date'

In [143]:
# Cast the `date` column from string to timestamp[us].
# The column is located by name rather than by the hard-coded position 4, so a
# change in the CSV column order cannot silently overwrite and rename a
# different column.
date_index = new_proposals_table.schema.get_field_index('date')
if date_index < 0:
    # get_field_index returns -1 when the name is missing or duplicated
    raise KeyError("column 'date' not found in new_proposals_table")
new_proposals_table = new_proposals_table.set_column(
    date_index,
    'date',
    new_proposals_table.column(date_index).cast(pa.timestamp('us')),
)
new_proposals_table.schema

id: string
deployment_id: string
platform_proposal_id: string
author: double
date: timestamp[us]
votes_count: int64
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 785

In [144]:
# Cast the `author` column (inferred as double) to string.
# The column is located by name rather than by the hard-coded position 3, so a
# change in the CSV column order cannot silently overwrite and rename a
# different column.
author_index = new_proposals_table.schema.get_field_index('author')
if author_index < 0:
    # get_field_index returns -1 when the name is missing or duplicated
    raise KeyError("column 'author' not found in new_proposals_table")
new_proposals_table = new_proposals_table.set_column(
    author_index,
    'author',
    new_proposals_table.column(author_index).cast('string'),
)
new_proposals_table.schema

id: string
deployment_id: string
platform_proposal_id: string
author: string
date: timestamp[us]
votes_count: int64
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 785

In [145]:
# Set up an in-memory DuckDB connection with views over the version 6 parquet
# files; the following cell queries these views to get the proposals that are
# not from platform = 'realms'.
import duckdb

db = duckdb.connect(database=':memory:', read_only=False)
# expose the votes and proposals parquet files as views
db.execute("CREATE VIEW votes AS SELECT * FROM parquet_scan('{}')".format('../parquets_version6/votes.parquet'))
db.execute("CREATE VIEW proposals AS SELECT * FROM parquet_scan('{}')".format('../parquets_version6/proposals.parquet'))
# same for deployments
db.execute("CREATE VIEW deployments AS SELECT * FROM parquet_scan('{}')".format('../parquets_version6/deployments.parquet'))

<duckdb.DuckDBPyConnection at 0x168840d30>

In [146]:
# Select every proposal whose deployment is not on the 'realms' platform and
# fetch the result as an Arrow table.
# NOTE(review): this is an inner join combined with `!=`, so proposals without
# a matching deployment row (or whose deployment has a NULL platform) would
# also be excluded — confirm that no such rows exist in the v6 data.
query = """
SELECT
    p.*
FROM
    proposals as p
JOIN
    deployments
ON
    p.deployment_id = deployments.id
WHERE
    deployments.platform != 'realms'
"""
proposals_v6_no_realms = db.execute(query).fetch_arrow_table()
proposals_v6_no_realms

pyarrow.Table
id: string
deployment_id: string
platform_proposal_id: string
author: string
date: timestamp[us]
votes_count: int64
----
id: [["a1a650e2-1225-5a90-a648-e6eee005cb27","7a520e27-71b6-50b5-9ad6-49bfaed6ecb2","593b9989-aa58-5eaa-8979-20003630dc1c","6c015507-4011-5c67-b9f4-8ab85dcca302","71f254fa-ff1d-56aa-8ef1-ba5291809b54",...,"f77d459e-12d2-54c1-a5d4-9e2be9970f46","fe44c6da-0504-5575-adbb-993bbc7701b9","00f1d3af-e142-5cb8-9d6e-64109e450374","9b8e42f5-6cf4-563c-a259-5e29fa624238","56b52ef1-01c1-5175-853b-1ab88e32bce6"]]
deployment_id: [["1ff410f5-1c80-53cd-9558-d93d1f6de121","1ff410f5-1c80-53cd-9558-d93d1f6de121","1ff410f5-1c80-53cd-9558-d93d1f6de121","d3229595-ac4c-55bf-9808-386fc5603bd1","1ff410f5-1c80-53cd-9558-d93d1f6de121",...,"def86296-256e-514a-a152-7097aede3e55","f52c872a-f851-5a5a-9838-3ce048cc7b4c","c2d8e86b-3632-5db5-a515-b0300fe42ef2","07a28090-895c-558e-bedc-f69040088ca2","63e914eb-380d-59e5-98a7-950351aa4796"]]
platform_proposal_id: [["0x9da21802ef6636402d793d9

In [147]:
# Save the non-realms proposals to a temporary parquet file in the version 7
# folder; it is read back in batches when the final proposals file is built.
pq.write_table(proposals_v6_no_realms, "../parquets_version7/_temp_proposals_v6_no_realms.parquet")

In [148]:
# Schema of the existing non-realms proposals (result of the DuckDB query).
proposals_v6_no_realms.schema

id: string
deployment_id: string
platform_proposal_id: string
author: string
date: timestamp[us]
votes_count: int64

In [149]:
# Schema of the new realms proposals, for comparison with the existing one.
new_proposals_table.schema

id: string
deployment_id: string
platform_proposal_id: string
author: string
date: timestamp[us]
votes_count: int64
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 785

In [150]:
# Check that the existing proposals schema and the new realms schema agree
# (same comparison as `==` on the two schemas).
proposals_v6_no_realms.schema.equals(new_proposals_table.schema)

True

In [151]:
import pyarrow.parquet as pq
import pyarrow as pa
from pyarrow.parquet import ParquetFile

# Build the version 7 proposals parquet: stream the non-realms v6 rows from
# the temporary file in batches of 10000, then append the new realms rows.
proposals_v6_no_realms_pf = ParquetFile('../parquets_version7/_temp_proposals_v6_no_realms.parquet')

# The writer schema is taken from the input file itself instead of from the
# first batch, so the writer exists even if the file yields no batches
# (previously `pqwriter` would have been undefined in that case).
# Using the writer as a context manager guarantees it is closed, and the
# parquet footer written, even if one of the writes raises.
with pq.ParquetWriter(
    '../parquets_version7/proposals.parquet',
    proposals_v6_no_realms_pf.schema_arrow,
) as pqwriter:
    for record_batch in proposals_v6_no_realms_pf.iter_batches(batch_size=10000):
        pqwriter.write_table(pa.Table.from_batches([record_batch]))

    # append the new realms rows after all the existing non-realms rows
    pqwriter.write_table(new_proposals_table)

In [152]:
# Report the row and column counts of '../parquets_version6/proposals.parquet'
# using only the file's parquet metadata.
with pq.ParquetFile('../parquets_version6/proposals.parquet') as pf:
    footer = pf.metadata
    num_rows, num_columns = footer.num_rows, len(footer.schema.names)
    print(f'Number of rows: {num_rows}')
    print(f'Number of columns: {num_columns}')

Number of rows: 206061
Number of columns: 6


In [153]:
# Report the row and column counts of '../parquets_version7/proposals.parquet'
# using only the file's parquet metadata.
with pq.ParquetFile('../parquets_version7/proposals.parquet') as pf:
    footer = pf.metadata
    num_rows, num_columns = footer.num_rows, len(footer.schema.names)
    print(f'Number of rows: {num_rows}')
    print(f'Number of columns: {num_columns}')

Number of rows: 207871
Number of columns: 6
