# Candidate Data Check

Make sure the DCBOE hash IDs were pasted correctly.

In [1]:
import os
# Step up one directory so the `scripts` package import and the relative
# `data/...` paths resolve (presumably the repository root — TODO confirm).
# NOTE: this is not idempotent; re-running this cell without restarting the
# kernel moves the working directory up another level.
os.chdir('../')

import pandas as pd
# Let pandas display up to 500 rows so the comparison tables are not truncated.
pd.set_option('display.max_rows', 500)

from scripts.data_transformations import list_commissioners, list_candidates

In [2]:
# Candidate records from the project helper (presumably the OpenANC side of
# the comparison, as a DataFrame with a dcboe_hash_id column — TODO confirm).
cand = list_candidates()
# DCBOE's candidate list, read from a CSV path relative to the working directory.
dcboe = pd.read_csv('data/dcboe/candidates_dcboe.csv')

In [3]:
# Keep only candidates present in both sources, matched on the DCBOE hash ID.
# Columns that exist in both frames get an _openanc / _dcboe suffix.
cd = cand.merge(
    dcboe,
    on='dcboe_hash_id',
    how='inner',
    suffixes=['_openanc', '_dcboe'],
)

## Candidate Name Comparison

These candidates have a different name in DCBOE than in OpenANC. Compare them and consider changing to match the ballot. 

In [4]:
# Matched candidates whose name differs between the two sources,
# ordered by the OpenANC district ID for side-by-side review.
(
    cd
    .loc[lambda df: df['candidate_name_openanc'].ne(df['candidate_name_dcboe'])]
    [['candidate_name_openanc', 'candidate_name_dcboe', 'smd_id_openanc', 'smd_id_dcboe']]
    .sort_values('smd_id_openanc')
)

Unnamed: 0,candidate_name_openanc,candidate_name_dcboe,smd_id_openanc,smd_id_dcboe
292,J. Swiderski,J.I. Swiderski,smd_2022_1B07,smd_2022_1B07
248,Yasmin Romero-Latin,Yasmin Romero,smd_2022_1D04,smd_2022_1D04
214,Mike McLaughlin,Mike Mclaughlin,smd_2022_1E04,smd_2022_1E04
59,Amanda Farnan,Amanda M Farnan,smd_2022_1E07,smd_2022_1E07
282,Susana Barañano,Susana Bara√Ëano,smd_2022_2A01,smd_2022_2A01
60,"Trupti ""Trip"" J. Patel","Trupti ""Trip"" Patel",smd_2022_2A03,smd_2022_2A03
69,Carson Colton Robb,Carson Robb,smd_2022_2A04,smd_2022_2A04
61,Jeff Rueckgauer,Jeffrey Rueckgauer,smd_2022_2B02,smd_2022_2B02
154,Carole Feld,Carole L. Feld,smd_2022_2D02,smd_2022_2D02
71,Brian J. McCabe,Brian J. Mccabe,smd_2022_2F04,smd_2022_2F04


## SMD Comparison

This should be empty. If the two sources disagree about which district a candidate is running in, resolve it using the candidate's address.

In [5]:
# Matched candidates whose district ID differs between the two sources,
# ordered by the OpenANC district ID.
(
    cd
    .loc[lambda df: df['smd_id_openanc'].ne(df['smd_id_dcboe'])]
    [['candidate_name_openanc', 'candidate_name_dcboe', 'smd_id_openanc', 'smd_id_dcboe']]
    .sort_values('smd_id_openanc')
)

Unnamed: 0,candidate_name_openanc,candidate_name_dcboe,smd_id_openanc,smd_id_dcboe


In [6]:
# Expected result: 0.
# A candidate who already has a dcboe_hash_id should generally not also
# carry a manual status, so count the rows where both are populated.
cd_left = cand.merge(dcboe, on='dcboe_hash_id', how='left')
(cd_left['manual_status'].notnull() & cd_left['dcboe_hash_id'].notnull()).sum()

0

## Candidate Duplicates

Each person_id should appear in the candidate table only once.

In [7]:
# Count candidate rows per person_id once, then keep only the IDs that occur
# more than once. An empty Series means there are no duplicates.
# (The callable passed to .loc receives the counts Series, so the groupby
# aggregation is not computed a second time just to build the mask.)
(
    cand
    .groupby('person_id')
    .size()
    .loc[lambda counts: counts > 1]
)

Series([], dtype: int64)

In [8]:
# Count candidate rows per candidate_name once, then keep only the names that
# occur more than once. An empty Series means no name is repeated.
# (The callable passed to .loc receives the counts Series, so the groupby
# aggregation is not computed a second time just to build the mask.)
(
    cand
    .groupby('candidate_name')
    .size()
    .loc[lambda counts: counts > 1]
)

Series([], dtype: int64)