In [1]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import zipfile

In [2]:
# Election selector used by the cells below to build input and output paths.
# `election_kind` names the contest type sub-directory; `election_path` is the
# slash-separated jurisdiction path (it is later flattened with "__").
election_kind = "general"
election_path = "2020/ca/city/san_francisco"

In [3]:
%%time

# Paths: the raw CVR archive to read, plus the directory that processed
# tables are written to (the basename is the election path with "/" -> "__").
input_filename = f"data_raw/{election_path}/{election_kind}/CVR_Export_20201201091840.zip"
election_basename = election_path.replace("/", "__") + f"__{election_kind}"
output_dirname = f"data_processed/{election_path}/{election_basename}"

with zipfile.ZipFile(input_filename, "r") as zip_file:

    # Both manifests are parsed as-is here and turned into DataFrames later.
    with zip_file.open("ContestManifest.json") as contest_manifest:
        contest_data = json.load(contest_manifest)

    with zip_file.open("CandidateManifest.json") as candidate_manifest:
        candidate_data = json.load(candidate_manifest)

    # One row per mark:
    # [filename, ballot_contest_id, contest_id, candidate_id, rank, is_vote, is_ambiguous]
    mark_data = []

    # Running counter bumped once per contest entry on a card, so every mark
    # belonging to the same contest on the same card shares one id. Contest
    # entries with no marks still consume an id.
    ballot_contest_id = 0

    members = zip_file.infolist()
    member_count = len(members)
    for position, member in enumerate(members):

        # Progress is reported against *all* archive members, not just the
        # CvrExport_* files, so the counter also advances on skipped members.
        if position % 1000 == 0:
            print(position, 'of', member_count)

        if member.filename.startswith('CvrExport_'):
            with zip_file.open(member) as cvr_export:
                cvr_payload = json.load(cvr_export)

            for session in cvr_payload['Sessions']:
                # NOTE(review): only the "Original" record of each session is
                # read; if the export also carries a modified/adjudicated
                # record it is ignored here — confirm that is intended.
                for card in session["Original"]['Cards']:
                    for contest in card['Contests']:
                        contest_id = contest['Id']
                        mark_data.extend(
                            [
                                member.filename,
                                ballot_contest_id,
                                contest_id,
                                mark['CandidateId'],
                                mark['Rank'],
                                mark['IsVote'],
                                mark['IsAmbiguous'],
                            ]
                            for mark in contest['Marks']
                        )
                        ballot_contest_id += 1

0 of 24956
1000 of 24956
2000 of 24956
3000 of 24956
4000 of 24956
5000 of 24956
6000 of 24956
7000 of 24956
8000 of 24956
9000 of 24956
10000 of 24956
11000 of 24956
12000 of 24956
13000 of 24956
14000 of 24956
15000 of 24956
16000 of 24956
17000 of 24956
18000 of 24956
19000 of 24956
20000 of 24956
21000 of 24956
22000 of 24956
23000 of 24956
24000 of 24956
CPU times: user 2min 22s, sys: 3.52 s, total: 2min 25s
Wall time: 2min 45s


In [4]:
%%time

# Column layout of the rows collected in `mark_data`, in positional order.
columns = [
    'filename',
    'ballot_contest_id',
    'contest_id',
    'candidate_id',
    'rank',
    'is_vote',
    'is_ambiguous',
]

# Only these two columns are cast explicitly; the rest keep whatever dtype
# pandas infers from the raw rows.
dtypes = {
    'filename': 'category',
    'rank': np.int8,
}

# Contest manifest -> one row per entry of its "List", indexed by contest_id.
df_contests = (
    pd.json_normalize(contest_data, record_path=["List"])
    .rename(columns={'Id': 'contest_id'})
    .set_index('contest_id')
    .sort_index()
)

# Candidate manifest -> one row per entry of its "List", indexed by candidate_id.
df_candidates = (
    pd.json_normalize(candidate_data, record_path=["List"])
    .rename(columns={'Id': 'candidate_id'})
    .set_index('candidate_id')
    .sort_index()
)

# Marks table, sorted on a (contest_id, ballot_contest_id, candidate_id) index.
df_marks = (
    pd.DataFrame(mark_data, columns=columns)
    .astype(dtypes)
    .set_index(['contest_id', 'ballot_contest_id', 'candidate_id'])
    .sort_index()
)

CPU times: user 24.8 s, sys: 5.86 s, total: 30.7 s
Wall time: 34 s


In [5]:
# Persist the three tables as parquet files under the output directory.
# exist_ok=True keeps a re-run from failing when the directory already exists.
os.makedirs(output_dirname, exist_ok=True)
for stem, frame in (
    ("marks", df_marks),
    ("contests", df_contests),
    ("candidates", df_candidates),
):
    frame.to_parquet(f"{output_dirname}/{stem}.pq")