In [22]:
filename = '../parquets_version2/votes.parquet'

In [23]:
import pyarrow.parquet as pq
votes = pq.read_table(filename)

In [24]:
type(votes)

pyarrow.lib.Table

In [25]:
# keep only 100000 rows
# votes = votes[:100000]

In [26]:
votes_length = votes.shape[0]
votes_length

21867988

In [27]:
# make a pa array that just counts up from 0 to the length of the votes table

import pyarrow as pa
votes_length_array = pa.array(range(votes_length))

In [28]:
import pyarrow as pa
# add a temp uuid column to the votes table
# add as empty column
votes = votes.append_column('temp_id', votes_length_array)
votes

pyarrow.Table
platform: string
platform_deployment_id: string
proposal_id: string
vote_id: string
voter: string
date: timestamp[us]
choice: string
weight: decimal128(38, 4)
temp_id: int64
----
platform: [["realms","realms","realms","realms","realms",...,"tally","tally","tally","tally","tally"],["tally","tally","tally","tally","tally",...,"tally","tally","tally","tally","tally"],...,["snapshot","snapshot","snapshot","snapshot","snapshot",...,"snapshot","snapshot","snapshot","snapshot","snapshot"],["snapshot","snapshot","snapshot","snapshot","snapshot",...,"snapshot","snapshot","snapshot","snapshot","snapshot"]]
platform_deployment_id: [["84pGFuy1Y27ApK67ApethaPvexeDWA66zNV8gm38TVeQ","84pGFuy1Y27ApK67ApethaPvexeDWA66zNV8gm38TVeQ","84pGFuy1Y27ApK67ApethaPvexeDWA66zNV8gm38TVeQ","84pGFuy1Y27ApK67ApethaPvexeDWA66zNV8gm38TVeQ","84pGFuy1Y27ApK67ApethaPvexeDWA66zNV8gm38TVeQ",...,"eip155:1:0x6f3E6272A167e8AcCb32072d08E0957F9c79223d","eip155:1:0x6f3E6272A167e8AcCb32072d08E0957F9c79223d","eip155:1

In [29]:
# use duckdb to remove duplicate votes

query = """
SELECT
    *
FROM
    votes
WHERE
    temp_id IN (
        SELECT
            MIN(temp_id)
        FROM
            votes
        GROUP BY
            platform, platform_deployment_id, proposal_id, vote_id
    )
"""

import duckdb
con = duckdb.connect(database=':memory:', read_only=False)
con.register('votes', votes)
clean = con.execute(query).fetch_arrow_table()

In [30]:
# drop column temp_id
clean = clean.drop('temp_id')

In [31]:
clean.schema

platform: string
platform_deployment_id: string
proposal_id: string
vote_id: string
voter: string
date: timestamp[us]
choice: string
weight: decimal128(38, 4)

In [32]:
# save
pq.write_table(clean, '../parquets_version2/votes_clean.parquet')

In [33]:
clean.shape

(21617900, 8)

In [35]:
21867988
print(21867988 - clean.shape[0], 'duplicate votes removed')
print((21867988 - clean.shape[0]) / 21867988 * 100, 'pct removed')

250088 duplicate votes removed
1.1436260162571883 pct removed
