## votes

In [60]:
import pandas as pd
new_votes_df = pd.read_csv('../DATA/realms/realms_votes.csv')
new_votes_df.head(2)

Unnamed: 0,id,proposal_id,deployment_id,platform_vote_id,voter,date,choice,weight
0,485b00e5-4707-5ecf-b9d9-384c65f19ca6,21f321d1-04a6-5474-b4bd-724c104d908c,d8a985e4-3cc2-542a-9cb4-3e024577fb3c,GovER5Lthms3bLBqWub97yVrMmEogzX7xNjdXpPPCVZw_G...,WabxR2gcdMgovS6Uo5JD4Cv9me7uExRyaH4QDKrp64b,,"[{'rank': 0, 'weightPercentage': 100}]",2000000000.0
1,4ae92050-0563-5126-b7a7-a926d06b1467,21f321d1-04a6-5474-b4bd-724c104d908c,d8a985e4-3cc2-542a-9cb4-3e024577fb3c,GovER5Lthms3bLBqWub97yVrMmEogzX7xNjdXpPPCVZw_A...,6cGTLr9bTCYis6KjsuZPQS7LrcPjyjibr9gFk2JH65Mn,,"[{'rank': 0, 'weightPercentage': 100}]",2000000000.0


In [61]:
# make into type date
new_votes_df['date'] = pd.to_datetime(new_votes_df['date'], format='%Y-%m-%d')
new_votes_df.head(2)

Unnamed: 0,id,proposal_id,deployment_id,platform_vote_id,voter,date,choice,weight
0,485b00e5-4707-5ecf-b9d9-384c65f19ca6,21f321d1-04a6-5474-b4bd-724c104d908c,d8a985e4-3cc2-542a-9cb4-3e024577fb3c,GovER5Lthms3bLBqWub97yVrMmEogzX7xNjdXpPPCVZw_G...,WabxR2gcdMgovS6Uo5JD4Cv9me7uExRyaH4QDKrp64b,NaT,"[{'rank': 0, 'weightPercentage': 100}]",2000000000.0
1,4ae92050-0563-5126-b7a7-a926d06b1467,21f321d1-04a6-5474-b4bd-724c104d908c,d8a985e4-3cc2-542a-9cb4-3e024577fb3c,GovER5Lthms3bLBqWub97yVrMmEogzX7xNjdXpPPCVZw_A...,6cGTLr9bTCYis6KjsuZPQS7LrcPjyjibr9gFk2JH65Mn,NaT,"[{'rank': 0, 'weightPercentage': 100}]",2000000000.0


In [62]:
import pyarrow as pa
new_votes_table = pa.Table.from_pandas(new_votes_df, preserve_index=False)
new_votes_table.schema

id: string
proposal_id: string
deployment_id: string
platform_vote_id: string
voter: string
date: timestamp[ns]
choice: string
weight: double
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1011

In [63]:
new_votes_table.column_names[5], new_votes_table.column_names[7]

('date', 'weight')

In [64]:
new_votes_table = new_votes_table.set_column(5, 'date', new_votes_table.column(5).cast(pa.timestamp('us')))
weights_as_decimal_128 = new_votes_table['weight'].cast(pa.decimal128(38, 4))
new_votes_table = new_votes_table.set_column(7, 'weight', weights_as_decimal_128)
new_votes_table.schema

id: string
proposal_id: string
deployment_id: string
platform_vote_id: string
voter: string
date: timestamp[us]
choice: string
weight: decimal128(38, 4)
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1011

In [65]:
# here we get the proposals that are not from platform = 'realms'
import duckdb

db = duckdb.connect(database=':memory:', read_only=False)
# as view
db.execute("CREATE VIEW votes AS SELECT * FROM parquet_scan('{}')".format('../parquets_version6/votes.parquet'))
db.execute("CREATE VIEW proposals AS SELECT * FROM parquet_scan('{}')".format('../parquets_version6/proposals.parquet'))
# same for deployments
db.execute("CREATE VIEW deployments AS SELECT * FROM parquet_scan('{}')".format('../parquets_version6/deployments.parquet'))

<duckdb.DuckDBPyConnection at 0x169920c70>

In [66]:
query = """
SELECT
    v.*
FROM
    votes as v
JOIN
    deployments
ON
    v.deployment_id = deployments.id
WHERE
    deployments.platform != 'realms'
"""
votes_v6_no_realms = db.execute(query).fetch_arrow_table()
votes_v6_no_realms

pyarrow.Table
id: string
proposal_id: string
deployment_id: string
platform_vote_id: string
voter: string
date: timestamp[us]
choice: string
weight: decimal128(38, 4)
----
id: [["a8b902cf-c6ee-5803-bcc6-a4263d832520","444341a1-178e-5e95-8126-6ad1b7899735","ec7349f9-4a69-5452-a16b-f253c9497211","66822a7a-c8ed-5350-97d7-b4fc4281d5fa","12a310f1-f56b-58b1-8601-216bfb0f1c6c",...,"d141af24-6b5f-5396-82d5-b689c4bc5ff6","f11da313-43ee-5d16-9d0a-f861ef847bb2","4499f48b-c139-5f80-9ae7-d926ade49717","dc4ca9b1-f1f0-5ad2-a992-a9d4a2bfac4a","bcdf68ad-86e7-5ba3-b32c-ecee50309c2c"],["ba91cc54-bf95-5ad0-8eba-91d4b2daaeed","0604d58a-0612-5386-a2a2-6f843b6566f0","f8c2aefd-f98d-5d8e-93b3-5f71dcaa39f4","80bad869-e518-5485-9016-b9330924e02b","98cec328-a1e8-5de2-bfda-2760e4ede020",...,"6158b9b5-6646-5836-9aac-3a8b9128a402","9161a01e-62af-5d34-a13c-9a9d98c87f22","0a8678b1-c068-5526-b277-c13347ab9cc9","2ce4ffe3-dcb7-5300-9c73-5a5aefcb7fc0","ec8bf50d-f259-540a-acac-6870600fe393"],...,["57ec4adf-911c-5629-9f35-3

In [67]:
votes_v6_no_realms.schema == new_votes_table.schema

True

In [68]:
import pyarrow.parquet as pq
# save votes_v6_no_realms
pq.write_table(votes_v6_no_realms, "../parquets_version7/_temp_votes_v6_no_realms.parquet")

In [69]:
import pyarrow.parquet as pq
import pyarrow as pa
from pyarrow.parquet import ParquetFile
votes_v6_no_realms_pf = ParquetFile("../parquets_version7/_temp_votes_v6_no_realms.parquet")
for i, record_batch in enumerate(votes_v6_no_realms_pf.iter_batches(batch_size=10000)):
    # for the first chunk of records
    table = pa.Table.from_batches([record_batch])
    if i == 0:
        # create a parquet write object giving it an output file
        pqwriter = pq.ParquetWriter('../parquets_version7/votes.parquet', table.schema)            
    pqwriter.write_table(table)

pqwriter.write_table(new_votes_table)

# close the parquet writer
pqwriter.close()

In [70]:
# read '../parquets_version6/votes.parquet' size without opening
with pq.ParquetFile('../parquets_version6/votes.parquet') as pf:
    num_rows = pf.metadata.num_rows
    num_columns = len(pf.metadata.schema.names)
    print(f'Number of rows: {num_rows}')
    print(f'Number of columns: {num_columns}')

Number of rows: 22030614
Number of columns: 8


In [71]:
# read '../parquets_version7/votes.parquet' size without opening
with pq.ParquetFile('../parquets_version7/votes.parquet') as pf:
    num_rows = pf.metadata.num_rows
    num_columns = len(pf.metadata.schema.names)
    print(f'Number of rows: {num_rows}')
    print(f'Number of columns: {num_columns}')

Number of rows: 22032447
Number of columns: 8
