In [1]:
# In this notebook we will:
# 1. convert the proposals 'date' field from a string to a timestamp type,
# 2. remove the 'platform' column from the proposals table; it is redundant because
# the deployments table already has this field (platform is a property of
# deployments, not of proposals), and
# 3. reduce the precision of the votes 'date' field to seconds.

In [2]:
# Load the version-5 proposals table from ../parquets_version5/proposals.parquet.
import pyarrow.parquet as pq
import pyarrow as pa

proposals = pq.read_table('../parquets_version5/proposals.parquet')
# Display the table: its schema followed by a preview of each column's values.
proposals

pyarrow.Table
id: string
deployment_id: string
platform_proposal_id: string
author: string
date: string
votes_count: int64
platform: string
----
id: [["5badfaf0-fb6b-5050-85ec-9e4a8b97d759","e4e5343b-4205-5561-9722-60a80e37fc82","924663af-1ba7-534b-adde-d7bec9b6a3af","b4bd6548-2898-5523-aab2-24d40aab3910","d9368ff7-ec0f-5747-a7ca-827a746415ab",...,"ad1a65ea-c153-59c2-8522-5b09a19f3b24","1d9b7985-728f-5cdf-a867-d7b105fb2ed3","24bb50ae-8930-59d9-b80e-fad3aab66bee","398ff2a2-c6df-5ed1-9e94-e2aa2a0e8bb6","6e949a92-0ece-5408-a8d8-bc941dd47eda"],["0755d88e-effb-5e25-a96c-8ee6bbd1c34b","9394ddae-0df4-5f89-8471-3b818f1622a5","a233c03a-50ec-558c-b404-fadfeca8dd9a","bef6d95a-f12a-5311-8eac-76d62331eb36","fa573801-0fec-5864-82ae-4b081325d919",...,"e52a23f2-a3d2-5dee-9905-8b2013f182ac","6d3033a9-10a0-5200-950f-d54d9a75a15c","debca85d-c90e-5e67-88ae-c3829719de92","b50176c5-6339-5c10-8096-5da0132513b9","eba70a13-f55d-5378-be20-c3a709843e2d"],...,["a3387b60-d5eb-53ca-b7fb-f3dae3913cf1","bc4e4941-8a9a

In [3]:
# Remove the redundant 'platform' column (the last of the table's seven columns).
# The name is passed inside a list because Table.drop is documented as taking a
# list of column names; a bare string is only accepted by newer pyarrow releases.
# The membership check makes the cell safe to re-run after the column is gone.
if 'platform' in proposals.column_names:
    proposals = proposals.drop(['platform'])

In [4]:
import pandas as pd

# Go through pandas to parse the string 'date' column into real datetimes.
proposals_df = proposals.to_pandas()
# The stored values carry a time of day (e.g. 2022-10-30 18:30:31), so the
# date-only format '%Y-%m-%d' does not describe them and strict parsers reject
# it with "unconverted data remains". Let pandas parse the full timestamp.
proposals_df['date'] = pd.to_datetime(proposals_df['date'])
proposals_df.head()

Unnamed: 0,id,deployment_id,platform_proposal_id,author,date,votes_count
0,5badfaf0-fb6b-5050-85ec-9e4a8b97d759,167ec1a5-0bec-5fde-9496-5caa7d004487,EUrU9LLfbEqra1TXZXhYKpJfvYhKLJJuVy1weZNUnt4G,,2022-10-30 18:30:31,1
1,e4e5343b-4205-5561-9722-60a80e37fc82,42d9f322-1f55-58a3-83c4-9fc82666559b,DsaGqF8tpiQqwJGxGP6JXAWUo9L623jWBnexHuxEkrZ3,,2022-06-02 07:58:12,1
2,924663af-1ba7-534b-adde-d7bec9b6a3af,881167d8-3707-5640-9716-ab77768b4ebe,5dDX6gBPGfTeuQdV93aNL6PtTMFxCXXiYVFMxzD7kJdc,,2022-03-24 13:42:45,2
3,b4bd6548-2898-5523-aab2-24d40aab3910,881167d8-3707-5640-9716-ab77768b4ebe,8icCe6sAdDQCnMg9a3LQVsRH8vyT2UdYE64ET7Uk232n,,2022-09-22 23:30:14,2
4,d9368ff7-ec0f-5747-a7ca-827a746415ab,fbbc05e2-7149-5eed-9899-c9a83eea5545,HJFtqkKE9sxmMqYaHK8LeoFbibufZMzDn8UWdzJbJWyc,,2022-02-16 04:05:58,1


In [5]:
# Check the column dtypes after parsing; 'date' should no longer be object.
proposals_df.dtypes

id                              object
deployment_id                   object
platform_proposal_id            object
author                          object
date                    datetime64[ns]
votes_count                      int64
dtype: object

In [6]:
# Convert proposals_df back into a pyarrow Table and show the resulting schema.
# (pyarrow is already imported as `pa` in the first code cell; the import is repeated here.)
import pyarrow as pa
proposals_table = pa.Table.from_pandas(proposals_df)
proposals_table.schema

id: string
deployment_id: string
platform_proposal_id: string
author: string
date: timestamp[ns]
votes_count: int64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 982

In [7]:
# Look up the name of the column at position 4 before replacing it below.
proposals_table.column_names[4]

'date'

In [8]:
# Cast the 'date' column down to second precision (timestamp[s]).
# The column position is looked up by name instead of being hard-coded, so the
# cell keeps working if the column order of the table changes.
proposals_date_idx = proposals_table.schema.get_field_index('date')
proposals_table = proposals_table.set_column(
    proposals_date_idx,
    'date',
    proposals_table.column('date').cast(pa.timestamp('s')),
)
proposals_table.schema

id: string
deployment_id: string
platform_proposal_id: string
author: string
date: timestamp[s]
votes_count: int64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 982

In [9]:
# Display the converted table (schema plus a preview of the values) as a visual check.
proposals_table

pyarrow.Table
id: string
deployment_id: string
platform_proposal_id: string
author: string
date: timestamp[s]
votes_count: int64
----
id: [["5badfaf0-fb6b-5050-85ec-9e4a8b97d759","e4e5343b-4205-5561-9722-60a80e37fc82","924663af-1ba7-534b-adde-d7bec9b6a3af","b4bd6548-2898-5523-aab2-24d40aab3910","d9368ff7-ec0f-5747-a7ca-827a746415ab",...,"8ccd108d-4214-5217-98b4-5a9e92253d57","8a490916-a877-5b03-8c00-c09c882cab6a","750131f1-b1e4-53f2-a944-0431de6db5da","5392db96-66b7-5f68-b875-3c70007eb0ca","32b94bdf-9322-5cb4-a8e8-cf4b1dfb2beb"]]
deployment_id: [["167ec1a5-0bec-5fde-9496-5caa7d004487","42d9f322-1f55-58a3-83c4-9fc82666559b","881167d8-3707-5640-9716-ab77768b4ebe","881167d8-3707-5640-9716-ab77768b4ebe","fbbc05e2-7149-5eed-9899-c9a83eea5545",...,"52bc6eec-fcaf-546e-93fa-3c899146f612","d345da16-b4e6-5e54-b806-01c0042f0b52","09987489-519f-5553-87bc-b51245ef8362","cff38513-293c-55b1-862f-7b4b4fa29e5f","2507f8af-6261-549c-b812-c7c91eef4779"]]
platform_proposal_id: [["EUrU9LLfbEqra1TXZXhYKpJfvY

In [10]:
# Save to ../parquets_version6/proposals.parquet.
# Use the `pq` alias imported in the first code cell, matching how the tables are
# read: `pa.parquet` only resolves as a side effect of pyarrow.parquet having
# already been imported, so it is fragile to rely on.
pq.write_table(proposals_table, '../parquets_version6/proposals.parquet')

In [11]:
# Load the version-5 votes table and inspect its schema.
votes = pq.read_table('../parquets_version5/votes.parquet')
votes.schema

id: string
proposal_id: string
deployment_id: string
platform_vote_id: string
voter: string
date: timestamp[us]
choice: string
weight: decimal128(38, 4)

In [12]:
# Look up the name of the column at position 5 before replacing it below.
votes.column_names[5]

'date'

In [13]:
# Reduce the votes 'date' column to second precision (timestamp[s]), locating the
# column by name instead of by a hard-coded position.
# NOTE(review): with the default (safe) cast pyarrow is expected to raise rather
# than silently truncate if any value has a sub-second component - confirm that
# this is the desired behaviour.
votes_date_idx = votes.schema.get_field_index('date')
votes = votes.set_column(
    votes_date_idx,
    'date',
    votes.column('date').cast(pa.timestamp('s')),
)

In [14]:
# Verify the schema after the cast; 'date' should now be timestamp[s].
votes.schema

id: string
proposal_id: string
deployment_id: string
platform_vote_id: string
voter: string
date: timestamp[s]
choice: string
weight: decimal128(38, 4)

In [15]:
# Save the converted votes table to ../parquets_version6/votes.parquet.
# Use the `pq` alias imported in the first code cell, matching how the tables are
# read: `pa.parquet` only resolves as a side effect of pyarrow.parquet having
# already been imported, so it is fragile to rely on.
pq.write_table(votes, '../parquets_version6/votes.parquet')