In [1]:
from hashlib import sha3_256

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Extract

In [2]:
# Read the line-delimited JSON file (one JSON record per line); only the
# `ts` column is parsed as a datetime.
df = pd.read_json(
    "../data/scraped/summaries.jsonl",
    lines=True,
    convert_dates=["ts"],
)
df.head()

Unnamed: 0,ts,psu,mode,chart,table,progres,url
0,2024-03-04 04:00:15,Reguler,hhcw,"{'100025': 1936408, '100026': 1956503, '100027...","{'3101': {'psu': 'Reguler', '100025': 7043, '1...","{'total': 30766, 'progres': 22196}",https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...
1,2024-03-31 21:30:15,Reguler,hhcw,"{'100025': 50678, '100026': 830531, '100027': ...","{'5101': {'psu': 'Reguler', '100025': 5548, '1...","{'total': 12809, 'progres': 7520}",https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...
2,2024-04-03 00:00:15,Reguler,hhcw,"{'100025': 1902656, '100026': 3168569, '100027...","{'3601': {'psu': 'Reguler', '100025': 227628, ...","{'total': 33324, 'progres': 26912}",https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...
3,2024-03-13 09:00:15,Reguler,hhcw,"{'100025': 218396, '100026': 844549, '100027':...","{'1701': {'psu': 'Reguler', '100025': 20415, '...","{'total': 6210, 'progres': 5862}",https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...
4,2024-03-20 09:00:15,Reguler,hhcw,"{'100025': 385680, '100026': 1000797, '100027'...","{'3401': {'psu': 'Reguler', '100025': 44057, '...","{'total': 11932, 'progres': 9406}",https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...


In [3]:
# Summarize the raw frame: column dtypes, non-null counts, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91948 entries, 0 to 91947
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   ts       91948 non-null  datetime64[ns]
 1   psu      91948 non-null  object        
 2   mode     73211 non-null  object        
 3   chart    80724 non-null  object        
 4   table    91948 non-null  object        
 5   progres  91948 non-null  object        
 6   url      91948 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 4.9+ MB


# Transform

In [4]:
# Display names of the candidate pairs, keyed by the numeric-string IDs that
# appear as keys in the `chart` column. Insertion order is significant: the
# transform step enumerates these keys to number the vote-count columns.
PASLON_NAMES = dict(
    (
        ("100025", "H. ANIES RASYID BASWEDAN, Ph.D. - Dr. (H.C.) H. A. MUHAIMIN ISKANDAR"),
        ("100026", "H. PRABOWO SUBIANTO - GIBRAN RAKABUMING RAKA"),
        ("100027", "H. GANJAR PRANOWO, S.H., M.I.P. - Prof. Dr. H. M. MAHFUD MD"),
    )
)

In [5]:
def hash_url(s: str) -> str:
    """Return the SHA3-256 digest of ``s`` as a lowercase hex string.

    Args:
        s: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The 64-character hexadecimal digest.
    """
    hasher = sha3_256()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()

In [6]:
# Flatten the per-row `chart` dicts into columns, then rename in one pass:
# each candidate-pair key becomes `suara_paslon_<position>` (position is the
# zero-based insertion order of PASLON_NAMES) and `persen` becomes
# `suara_masuk_persen`.
df_stats = pd.json_normalize(df["chart"]).rename(
    columns={
        **{
            paslon_id: f"suara_paslon_{position}"
            for position, paslon_id in enumerate(PASLON_NAMES)
        },
        "persen": "suara_masuk_persen",
    }
)

df_stats.head()

Unnamed: 0,suara_paslon_0,suara_paslon_1,suara_paslon_2,suara_masuk_persen
0,1936408.0,1956503.0,824937.0,72.14
1,50678.0,830531.0,688356.0,58.71
2,1902656.0,3168569.0,560423.0,80.76
3,218396.0,844549.0,140187.0,94.4
4,385680.0,1000797.0,585930.0,78.83


In [7]:
# Flatten the per-row `progres` dicts and give both fields descriptive names.
# NOTE(review): in the previewed rows, suara_masuk / total_pemilih equals
# suara_masuk_persen, so `total` may count reporting units rather than
# voters -- confirm against the data source before relying on the name.
df_progress = pd.json_normalize(df["progres"]).rename(
    columns=dict(total="total_pemilih", progres="suara_masuk")
)

df_progress.head()

Unnamed: 0,total_pemilih,suara_masuk
0,30766.0,22196.0
1,12809.0,7520.0
2,33324.0,26912.0
3,6210.0,5862.0
4,11932.0,9406.0


In [8]:
# Assemble the cleaned table: the scalar columns of the raw frame, two
# URL-derived columns, and the flattened chart/progress columns. Both joins
# align on the row index shared with `df`.
df_final = (
    df[["ts", "mode", "psu", "url"]]
    .copy()
    .assign(
        # The id is derived from the URL alone, so rows with the same URL
        # share the same id.
        id=df["url"].apply(hash_url),
        # Digits immediately before ".json" in the URL. expand=False makes
        # str.extract return a Series for the single capture group instead of
        # a one-column DataFrame, which is the shape a column assignment expects.
        kode=df["url"].str.extract(r"(\d+)\.json", expand=False),
    )
    .join(df_stats)
    .join(df_progress)
)

df_final.head()

Unnamed: 0,ts,mode,psu,url,id,kode,suara_paslon_0,suara_paslon_1,suara_paslon_2,suara_masuk_persen,total_pemilih,suara_masuk
0,2024-03-04 04:00:15,hhcw,Reguler,https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...,07a490b280a7f775730a66f0a66d4be058bd12810b9eed...,31,1936408.0,1956503.0,824937.0,72.14,30766.0,22196.0
1,2024-03-31 21:30:15,hhcw,Reguler,https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...,615d693b8aea1a2279d927bdc50a6bb184ae7489868385...,51,50678.0,830531.0,688356.0,58.71,12809.0,7520.0
2,2024-04-03 00:00:15,hhcw,Reguler,https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...,292548aeaf3afc8aa00d5cbd80fd5fae6b64cf53d2c5f9...,36,1902656.0,3168569.0,560423.0,80.76,33324.0,26912.0
3,2024-03-13 09:00:15,hhcw,Reguler,https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...,d9969464eb5d975e009b58443887972e6ce927ce3c28b0...,17,218396.0,844549.0,140187.0,94.4,6210.0,5862.0
4,2024-03-20 09:00:15,hhcw,Reguler,https://sirekap-obj-data.kpu.go.id/pemilu/hhcw...,631a110004b966cb022e37314e8c4d86c065f621eee7c6...,34,385680.0,1000797.0,585930.0,78.83,11932.0,9406.0


# Load

In [9]:
# Write the assembled table to a Parquet file.
df_final.to_parquet("../data/clean/summaries.parquet")