In [9]:
# Location of the deployments table. The cleaned result is written back to the
# same path, so running this notebook overwrites its own input.
IN_FILE = '../../parquets/deployments.parquet'
OUT_FILE = IN_FILE

In [10]:
import pyarrow.parquet as pq

# Read the parquet file at IN_FILE into a pyarrow Table; the bare name on the
# last line renders the table (schema plus a data preview) as the cell output.
deployments = pq.read_table(IN_FILE)
deployments

pyarrow.Table
platform: string
platform_id: string
name: string
website: string
additional: string
votes_count: int64
proposals_count: int64
----
platform: [["daostack","daostack","daostack","daostack","daostack",...,"daostack","daostack","daostack","daostack","daostack"],["snapshot","snapshot","snapshot","snapshot","snapshot",...,"snapshot","snapshot","snapshot","snapshot","snapshot"],["aragon","aragon","aragon","aragon","aragon",...,"aragon","aragon","aragon","aragon","aragon"],["daohaus","daohaus","daohaus","daohaus","daohaus",...,"daohaus","daohaus","daohaus","daohaus","daohaus"]]
platform_id: [["0x0b93ba560283350d4216f29dc57e15df38d0eace","0x15344ecdc2c4edfcb092e284d93c20f0529fd8a6","0x273d0f686a53a49a0fba2a801566b19f9aaf8fcd","0x294f999356ed03347c7a23bcbcf8d33fa41dc830","0x2b8c70fffda7f3d7667f7cfede1429313886329c",...,"0xe716ec63c5673b3a4732d22909b38d779fa47c3f","0xeed417f80cd918ce65b8e141daa00704aef98a90","0xef25b64d4bdbe23f1bb5e557b5d24f12c46da0bc","0xf902172bd2f34a6cf86d424f04

In [11]:
# Remember the raw row count so later cells can check how many rows were removed.
num_deployments = deployments.num_rows
# Display (rows, columns) of the raw table.
deployments.shape

(35983, 7)

In [12]:
# Convert the Arrow table to a pandas DataFrame; the de-duplication steps in the
# following cells operate on this frame.
deployments_df = deployments.to_pandas()
deployments_df

Unnamed: 0,platform,platform_id,name,website,additional,votes_count,proposals_count
0,daostack,0x0b93ba560283350d4216f29dc57e15df38d0eace,CuraDAO,,,103,49
1,daostack,0x15344ecdc2c4edfcb092e284d93c20f0529fd8a6,dOrg,,,336,127
2,daostack,0x273d0f686a53a49a0fba2a801566b19f9aaf8fcd,PrimeDAO,,,30,37
3,daostack,0x294f999356ed03347c7a23bcbcf8d33fa41dc830,Genesis Alpha,,,2203,387
4,daostack,0x2b8c70fffda7f3d7667f7cfede1429313886329c,ETHGlobal,,,74,19
...,...,...,...,...,...,...,...
35978,daohaus,0xff07943f171b826b3618982a6213f157fbc5e226,,,network: xdai version: 2.1,0,0
35979,daohaus,0xff3f8c0b98454306fb0bda57e5ae38cbfa66cc0d,array.finance CCO,,network: xdai version: 2.1,123,122
35980,daohaus,0xff4b20d48a1e8f93bf6745de388547e731f96fd0,PRIDEdao,,network: xdai version: 2.1,0,0
35981,daohaus,0xffb676765d521518ac5c0d4f4067a9db9b72901b,EmpanaDAO,,network: xdai version: 2.0,0,3


In [13]:
# Count the rows that repeat an earlier row in every column.
is_repeat_row = deployments_df.duplicated()
num_dupes = is_repeat_row.sum()
num_dupes

2599

In [14]:
# Drop the exact-duplicate rows, then confirm the row count fell by exactly the
# number of duplicates counted in the previous cell.
deployments_df = deployments_df.drop_duplicates()
expected_rows = num_deployments - num_dupes
assert len(deployments_df) == expected_rows
deployments_df

Unnamed: 0,platform,platform_id,name,website,additional,votes_count,proposals_count
0,daostack,0x0b93ba560283350d4216f29dc57e15df38d0eace,CuraDAO,,,103,49
1,daostack,0x15344ecdc2c4edfcb092e284d93c20f0529fd8a6,dOrg,,,336,127
2,daostack,0x273d0f686a53a49a0fba2a801566b19f9aaf8fcd,PrimeDAO,,,30,37
3,daostack,0x294f999356ed03347c7a23bcbcf8d33fa41dc830,Genesis Alpha,,,2203,387
4,daostack,0x2b8c70fffda7f3d7667f7cfede1429313886329c,ETHGlobal,,,74,19
...,...,...,...,...,...,...,...
35978,daohaus,0xff07943f171b826b3618982a6213f157fbc5e226,,,network: xdai version: 2.1,0,0
35979,daohaus,0xff3f8c0b98454306fb0bda57e5ae38cbfa66cc0d,array.finance CCO,,network: xdai version: 2.1,123,122
35980,daohaus,0xff4b20d48a1e8f93bf6745de388547e731f96fd0,PRIDEdao,,network: xdai version: 2.1,0,0
35981,daohaus,0xffb676765d521518ac5c0d4f4067a9db9b72901b,EmpanaDAO,,network: xdai version: 2.0,0,3


In [15]:
# are there duplicate platform and platform_id combinations?
# Aggregate once and reuse the result for both the values and the filter mask
# (the original ran the same groupby twice in a single expression).
key_counts = deployments_df.groupby(['platform', 'platform_id']).size()
# Keep only the keys that occur more than once; reset_index turns the
# (platform, platform_id) index back into columns, with the count in column 0.
dupes = key_counts[key_counts > 1].reset_index()
dupes

Unnamed: 0,platform,platform_id,0
0,snapshot,bananagun-snapshot.eth,2
1,snapshot,konpedalnii.eth,2
2,snapshot,mollars.eth,2
3,snapshot,opnxherd.eth,2
4,snapshot,ramsesdex.eth,2
5,snapshot,🐋bluewhale.eth,2


In [16]:
# Show every row whose (platform, platform_id) key occurs more than once, so the
# conflicting records can be inspected before one of each pair is dropped.
# `display` is needed because only a cell's last expression renders on its own.
key_columns = ['platform', 'platform_id']
display(deployments_df[deployments_df.duplicated(subset=key_columns, keep=False)])
# keep the later of the two
deployments_df = deployments_df.drop_duplicates(subset=key_columns, keep='last')
# Validate the result directly instead of comparing against a hard-coded row
# count: every (platform, platform_id) key must now be unique.
assert not deployments_df.duplicated(subset=key_columns).any(), \
    'duplicate (platform, platform_id) keys remain'
deployments_df

Unnamed: 0,platform,platform_id,name,website,additional,votes_count,proposals_count
0,daostack,0x0b93ba560283350d4216f29dc57e15df38d0eace,CuraDAO,,,103,49
1,daostack,0x15344ecdc2c4edfcb092e284d93c20f0529fd8a6,dOrg,,,336,127
2,daostack,0x273d0f686a53a49a0fba2a801566b19f9aaf8fcd,PrimeDAO,,,30,37
3,daostack,0x294f999356ed03347c7a23bcbcf8d33fa41dc830,Genesis Alpha,,,2203,387
4,daostack,0x2b8c70fffda7f3d7667f7cfede1429313886329c,ETHGlobal,,,74,19
...,...,...,...,...,...,...,...
35978,daohaus,0xff07943f171b826b3618982a6213f157fbc5e226,,,network: xdai version: 2.1,0,0
35979,daohaus,0xff3f8c0b98454306fb0bda57e5ae38cbfa66cc0d,array.finance CCO,,network: xdai version: 2.1,123,122
35980,daohaus,0xff4b20d48a1e8f93bf6745de388547e731f96fd0,PRIDEdao,,network: xdai version: 2.1,0,0
35981,daohaus,0xffb676765d521518ac5c0d4f4067a9db9b72901b,EmpanaDAO,,network: xdai version: 2.0,0,3


In [17]:
# save the dataframe as a parquet file deployments.parquet
# NOTE: OUT_FILE is the same path as IN_FILE, so this overwrites the input file.
import pyarrow as pa
# After drop_duplicates the frame's index has gaps and is no longer a plain
# RangeIndex, so from_pandas would by default store it as an extra
# `__index_level_0__` column. preserve_index=False keeps the written schema
# limited to the frame's own columns.
deployments = pa.Table.from_pandas(deployments_df, preserve_index=False)
pq.write_table(deployments, OUT_FILE)