In [45]:
import pandas as pd
import numpy as np
from pathlib import Path

import config

import importlib
try:
    # Re-execute the already-imported config module (presumably so that edits
    # to it are picked up without restarting the kernel).
    importlib.reload(config) # reload module
except NameError:
    # Only reachable when the name `config` is not bound; `import config`
    # above binds it, so this branch is a no-op guard.
    pass

In [70]:
# Load the raw 'vote' sheet from the workbook at config.RAW_DATA_PATH.
df_vote = pd.read_excel(config.RAW_DATA_PATH, sheet_name='vote')

# Pattern for the vote count that ends a row: digits with a single '.' or ','
# separator (e.g. "1.234", "1,234") or plain digits, anchored at end of string.
vote_count_regex = r'\d+\.\d+$|\d+\,\d+$|\d+$'

# Party names. The list order is the category order of `partai_no_cat`
# (presumably the official party numbering -- TODO confirm).
partai_no = [
    'Partai Kebangkitan Bangsa',
    'Partai Gerakan Indonesia Raya',
    'Partai Demokrasi Indonesia Perjuangan',
    'Partai Golongan Karya',
    'Partai Nasdem',
    'Partai Gerakan Perubahan Indonesia',
    'Partai Berkarya',
    'Partai Keadilan Sejahtera',
    'Partai Persatuan Indonesia',
    'Partai Persatuan Pembangunan',
    'Partai Solidaritas Indonesia',
    'Partai Amanat Nasional',
    'Partai Hati Nurani Rakyat',
    'Partai Demokrat',
    'Partai Bulan Bintang',
    'Partai Keadilan dan Persatuan Indonesia'
]
partai_no_cat = pd.CategoricalDtype(partai_no, ordered=True)


def clean_df_vote(df):
    """Parse the free-text ``values`` column of the 'vote' sheet into columns.

    Rows whose ``values`` is missing are dropped. For the remaining rows the
    following columns are derived and ``values`` itself is removed:

    * ``tipe``     -- 'Partai' when the raw text starts with 'A.1', else 'Calon'.
    * ``nama``     -- the text with the row prefix, the trailing vote count and
      any other digits removed.
    * ``vote``     -- the trailing vote count as ``int`` ('.' and ',' stripped).
    * ``partai``   -- ``nama`` of the closest preceding 'Partai' row (forward
      filled), cast to ``partai_no_cat``.
    * ``no_urut``  -- 0-based running count within each (``dapil_no``,
      ``partai``) group, so the party row itself gets 0.
    * ``terpilih`` -- the input column with missing values replaced by False.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``values``, ``terpilih`` and ``dapil_no``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    ValueError
        From ``astype(int)`` when a row does not end in a vote count.
    """
    return (df
        .loc[~(df['values'].isna())]
        .assign(
            # Classified on the raw text, before the prefix is stripped below.
            tipe=lambda df_: np.where(df_['values'].str.startswith('A.1'), 'Partai', 'Calon'),
            # NOTE: the dots in the A.1 / A.2 patterns are unescaped, so they
            # match any character, not only a literal '.'.
            values=lambda df_: (df_['values']
                                .str.replace(r'^A.1. |^A.1 ', '', regex=True)  # remove A.1 prefix
                                .str.replace(r'^A.2. |^A.2 ', '', regex=True)  # remove A.2 prefix
                                .str.replace(r'^\d+\.', '', regex=True)  # remove no urut
                                .str.strip()
                                ),
            # From here on `values` is the cleaned text assigned just above
            # (later keywords of one `assign` call see the earlier ones).
            nama=lambda df_: (df_['values']
                              .str.replace(vote_count_regex, '', regex=True)  # remove vote
                              .str.replace(r'\d+\.\d+|\d+', '', regex=True)  # remove digit
                              .str.strip()
                              ),
            vote=lambda df_: (df_['values']
                              .str.extract(f'({vote_count_regex})')[0]
                              .str.replace('.', '', regex=False)
                              .str.replace(',', '', regex=False)
                              .str.strip()
                              .astype(int)
                              ),
            # `np.nan`, not the `np.NaN` alias that NumPy 2.0 removed.
            partai=lambda df_: np.where(df_['tipe'] == 'Partai', df_['nama'], np.nan),
            terpilih=lambda df_: df_['terpilih'].fillna(False),
        )
        .assign(
            # Carry each party name down onto the candidate rows that follow it.
            partai=lambda df_: df_['partai'].ffill().astype(partai_no_cat),
            # cumcount is row-wise, so `observed=True` does not change the
            # result; it only avoids the categorical-groupby FutureWarning.
            no_urut=lambda df_: (df_
                                 .groupby(['dapil_no', 'partai'], observed=True)
                                 .cumcount()
                                 ),
        )
        .drop(columns=['values'])
    )

df_vote_clean = clean_df_vote(df_vote)

# Sanity check (disabled): votes on the party rows only, pivoted to one row
# per party name and one column per dapil.
# (df_vote_clean
#     .loc[df_vote_clean['tipe'] == 'Partai']
#     .groupby(['dapil_no', 'dapil_nama', 'nama'])
#     .agg(vote=('vote', 'sum'))
#     .reset_index()
#     .pivot(index='nama', columns=['dapil_no', 'dapil_nama'], values='vote')
# )

# Sanity check (disabled): total votes per party (party row + candidate rows),
# restricted to non-zero totals in dapil 10.
# (df_vote_clean
#     .groupby(['dapil_no', 'dapil_nama', 'partai'])
#     .agg(vote=('vote', 'sum'))
#     .reset_index()
#     .loc[lambda df_: df_['vote'] > 0]
#     .loc[lambda df_: df_['dapil_no'] == 10]
# )

# Sanity check (disabled): every party row should have no_urut == 0, so this
# sum is expected to be 0.
# (df_vote_clean
#     .loc[df_vote_clean['tipe'] == 'Partai', 'no_urut']
#     .sum()
# )

# Sanity check (disabled): list the rows flagged terpilih in dapil 10 for
# manual review, one dapil at a time.
# (df_vote_clean
#     .loc[(df_vote_clean['terpilih'] == True) & (df_vote_clean['dapil_no'] == 10), ['partai', 'no_urut', 'nama']]
# )

Unnamed: 0,partai,no_urut,nama
1652,Partai Gerakan Indonesia Raya,1,Yudha Permana
1659,Partai Gerakan Indonesia Raya,8,Syarifudin
1666,Partai Demokrasi Indonesia Perjuangan,1,"Merry Hotma, S.H."
1669,Partai Demokrasi Indonesia Perjuangan,4,Ima Mahdiah
1672,Partai Demokrasi Indonesia Perjuangan,7,"Hardiyanto Kenneth, S.H., M.H., M.Si."
1674,Partai Demokrasi Indonesia Perjuangan,8,dr. Stephanie Octavia
1702,Partai Nasdem,7,Jupiter
1728,Partai Keadilan Sejahtera,1,"Drs. H. Nasrullah, M.E."
1729,Partai Keadilan Sejahtera,2,"Ir. Abdul Aziz, S.Si."
1767,Partai Solidaritas Indonesia,1,"Eneng Malianasari, S.Sos."


In [53]:
# Load the raw 'result' sheet from the workbook at config.RAW_DATA_PATH.
df_res = pd.read_excel(config.RAW_DATA_PATH, sheet_name='result')

# Alternation of every party name in `partai_no`, used by clean_df_res to find
# and strip the party name in a row.
# NOTE(review): the names are joined unescaped; that is only safe while none of
# them contains a regex metacharacter.
partai_regex = '|'.join(partai_no)

def clean_df_res(df):
    """Parse the free-text ``values`` column of the 'result' sheet into columns.

    Derives the columns below from ``values`` and then drops ``values``:

    * ``nama``    -- the text left after removing the party name, the leading
      tokens, the trailing number group, every remaining digit and every comma.
    * ``no_urut`` -- the last number of the leading ``"<digits> <digit?>"``
      prefix, as ``int``.
    * ``partai``  -- the party name found in the row (any alternative of the
      module-level ``partai_regex``), forward filled over rows without one.
    * ``vote``    -- the digits that remain once the party name, the leading
      tokens and a trailing single digit are removed, as ``int``.

    The exact row layout is not visible from this function; the patterns are
    kept as they were -- NOTE(review): confirm them against the sheet.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a string column ``values``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    ValueError
        From ``astype(int)`` when a row yields no digits for ``vote`` or no
        leading number for ``no_urut``.
    """
    return (df
        .assign(
            nama=lambda df_: (df_['values']
                              .str.replace(partai_regex, '', regex=True)  # remove partai
                              .str.replace(r'^\w+ ? \w+', '', regex=True)  # remove the two leading tokens
                              .str.replace(r'\d+, ?\d+ \d+$', '', regex=True)  # remove trailing numbers
                              .str.replace(r'\d', '', regex=True)
                              .str.replace(',', '', regex=False)
                              .str.strip()
                              ),
            # expand=False returns a Series rather than a one-column DataFrame.
            no_urut=lambda df_: (df_['values']
                                 .str.extract(r'(^\d+ \d?)', expand=False)  # leading digits
                                 ),
            partai=lambda df_: df_['values'].str.extract(f'({partai_regex})', expand=False),
            vote=lambda df_: (df_['values']
                              .str.replace(partai_regex, '', regex=True)  # remove partai
                              .str.strip()
                              .str.replace(r'^\w+ ? \w+', '', regex=True)  # remove the two leading tokens
                              .str.replace(r' \d{1}$', '', regex=True)  # remove last sole digit
                              .str.replace(r'\D', '', regex=True)  # remove nondigit
                              .astype(int)
                              ),
        )
        .assign(
            # Rows without a party name inherit the one from the row above.
            partai=lambda df_: df_['partai'].ffill(),
            no_urut=lambda df_: (df_['no_urut']
                                 .str.strip()
                                 .str.split(' ')
                                 .str[-1]  # keep the last number of the prefix
                                 .astype(int)
                                 ),
        )
        .drop(columns=["values"])
    )

df_res_clean = clean_df_res(df_res)

# Manual check (disabled): inspect the cleaned result rows one dapil at a time
# (dapil 10 shown here), or display the whole frame.
# (df_res_clean
#     .loc[df_res_clean['dapil_no'] == 10]
# )
# df_res_clean

In [54]:
# (df_vote_clean
#     .merge(
#         right=df_res_clean, 
#         left_on=['dapil_no', 'partai', 'no_urut'],
#         right_on=['dapil_no', 'partai', 'no_urut'],
#         how='left'
#     )
#     .loc[lambda df_: df_['vote_y'] > 0]
#     .head(12)
# )