Compare two versions of the DCBOE candidates file and find the differences between them, matching rows on their `dcboe_hash_id`.

* old_dcboe = the CSV in the current head commit
* new_dcboe = the CSV in the current working directory, not yet committed

In [1]:
import io
import os
import pandas as pd
from git import Repo

# Work from the root of the repo (two directories above this notebook) so the
# relative paths below resolve. The root is looked up through git instead of a
# relative '../..', so re-running this cell does not climb further up the tree.
os.chdir(Repo('.', search_parent_directories=True).working_tree_dir)

In [2]:
# Location of the DCBOE candidates file, relative to the current working directory
csv_file_path = 'data/dcboe/candidates_dcboe.csv'

# Read the committed version of the file straight out of the HEAD commit's tree
oa_repo = Repo('.')
head_blob = oa_repo.head.commit.tree / csv_file_path
committed_bytes = head_blob.data_stream.read()

with io.BytesIO(committed_bytes) as buffer:
    old_dcboe = pd.read_csv(buffer, sep=',', encoding='utf-8')

In [3]:
# Version of the same file as it currently sits on disk
new_dcboe = pd.read_csv(csv_file_path, sep=',')

In [4]:
# Number of rows in the committed (HEAD) file
old_dcboe.shape[0]

411

In [5]:
# Number of rows in the working-directory file
new_dcboe.shape[0]

402

In [6]:
# Net change in the number of active candidates (new row count minus old row count)
new_dcboe.shape[0] - old_dcboe.shape[0]

-9

## Old hashes not in new file

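Rows from the old file whose hash does not appear in the new file: candidates who will no longer be on the ballot, or whose listed details changed. A changed listing (e.g. Joshua Mater → Joshua Ryan Mater in smd_1B08) appears under a different hash, so the same person then also shows up under "New hashes not in old file" below.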

In [7]:
# Boolean mask over old_dcboe: True where the row's hash is absent from the new file
hash_still_present = old_dcboe['dcboe_hash_id'].isin(new_dcboe['dcboe_hash_id'])
old_hashes_not_in_new = ~hash_still_present
old_dcboe.loc[old_hashes_not_in_new, ['smd_id', 'candidate_name']]

Unnamed: 0,smd_id,candidate_name
20,smd_1B07,Marcia Shia
21,smd_1B08,Joshua Mater
26,smd_1B12,Andre Denegri
27,smd_1B12,Michael Singer
49,smd_2A03,Trupti Patel
...,...,...
386,smd_8C04,Regina Pixley
388,smd_8C04,Travon Hawkins
392,smd_8D01,Patricia Carmon
400,smd_8E01,Shekita McBroom


In [8]:
# Label every old row as "<district>: <candidate name>", e.g. "1B07: Marcia Shia"
district_code = old_dcboe['smd_id'].str.replace('smd_', '')
old_dcboe['smd_name'] = district_code + ': ' + old_dcboe['candidate_name']

# Put the labels of the rows whose hash disappeared on the system clipboard
old_dcboe.loc[old_hashes_not_in_new, ['smd_name']].to_clipboard(index=False)

## New hashes not in old file

In [9]:
# Boolean mask over new_dcboe: True where the row's hash is absent from the old file
hash_already_known = new_dcboe['dcboe_hash_id'].isin(old_dcboe['dcboe_hash_id'])
new_hashes_not_in_old = ~hash_already_known
new_dcboe.loc[new_hashes_not_in_old]

Unnamed: 0,dcboe_hash_id,smd_id,candidate_name,pickup_date,filed_date
21,0db14722274a4c02bc71460868357252045f02ff5e9155...,smd_1B08,Joshua Ryan Mater,2020-07-29,2020-08-04
25,fca97272cb92a481e754badb69763c5ceb25fce585ef3b...,smd_1B12,"Michael ""Mike"" Singer",2020-07-15,2020-08-04
47,fe0ed8e2962c47ff06bb27df80e7c440c66019905ca70b...,smd_2A03,"Trupti ""Trip"" J. Patel",2020-07-06,2020-08-05
49,fe1e426543050ed50a06cf28f3d274b34233aa5ebf3562...,smd_2B01,Meg Roggensack,2020-07-08,2020-07-17
65,6aa96a6071c63f6af8e58c941013728ba3f0192ed784dc...,smd_2C02,Will Mascaro,2020-08-03,2020-08-05
...,...,...,...,...,...
377,65d5b028c5c1d4125cb7fe38e4bffcbdb3b9fd3d68bda2...,smd_8C04,Regina Sharlita Pixley,2020-06-26,2020-07-02
379,e805ed63c74a845e8d943f4f96a4e7e6f4f73b8ee75ed8...,smd_8C04,"Travon ""Ward8"" Hawkins",2020-08-05,2020-08-05
383,a512be8fd611558995388673b780a2d2c138dd3d31ebf0...,smd_8D01,"Patricia ""Pat"" Carmon",2020-07-17,2020-08-05
391,bd39a6aed99ae20deca74d79c69f86375eff318c1223d4...,smd_8E01,"Shekita ""Ki-Ki"" McBroom",2020-07-17,2020-08-05


## Same hash, changed info

Have any fields changed on the same hash? 

In [10]:
# Pair up rows whose hash appears in both files; column names shared by the two
# frames are disambiguated with _old / _new suffixes.
df = old_dcboe.merge(
    new_dcboe,
    on='dcboe_hash_id',
    how='inner',
    suffixes=['_old', '_new'],
)

In [11]:
# Every column of the new file except the join key is compared between versions.
columns_to_check = [c for c in new_dcboe.columns if c != 'dcboe_hash_id']

for c in columns_to_check:
    old_col = c + '_old'
    new_col = c + '_new'

    # The merge only adds suffixes to column names present in BOTH frames, so a
    # column that exists in just one file has no _old/_new pair to compare.
    if old_col not in df.columns or new_col not in df.columns:
        print('Skipping {}: column is not present in both files'.format(c))
        continue

    # A plain `!=` reports NaN != NaN as a difference. A value that is missing in
    # both versions has not changed, so those rows are excluded.
    both_missing = df[old_col].isna() & df[new_col].isna()
    changed = (df[old_col] != df[new_col]) & ~both_missing

    num_differences = int(changed.sum())
    if num_differences > 0:
        print(df[changed])

## Compare counts by district

In [12]:
# NOTE(review): this import sits mid-notebook rather than in the first cell; presumably
# the `scripts` package only resolves after the earlier os.chdir to the repo root —
# confirm before moving it.
from scripts.refresh_data import RefreshData
rd = RefreshData()
# Per-SMD table used by the cells below, which read its 'smd_id',
# 'number_of_candidates' and 'list_of_candidates' columns.
new_smd = rd.assemble_smd_info()

In [13]:
# Total of the per-SMD candidate counts in the freshly assembled table
new_smd.loc[:, 'number_of_candidates'].sum()

406

In [14]:
# smd_df.to_csv('smd_df_old.csv', index=False)

In [15]:
# Baseline SMD snapshot to compare against. The location is machine-specific, so it
# can be overridden with the OPENANC_OLD_SMD_CSV environment variable; the original
# hard-coded path remains the default.
old_smd_path = os.environ.get(
    'OPENANC_OLD_SMD_CSV',
    '/Users/devin/Dropbox/OpenANC/smd_df_2020-09-03.csv',
)
old_smd = pd.read_csv(old_smd_path)

In [16]:
# Pair old and new SMD rows on smd_id; column names shared by the two frames are
# disambiguated with _old / _new suffixes.
compare_smd = old_smd.merge(
    new_smd,
    on='smd_id',
    how='inner',
    suffixes=['_old', '_new'],
)

In [17]:
# Short district label: the SMD id with 'smd_' stripped out (e.g. smd_1B07 -> 1B07)
compare_smd['district'] = compare_smd['smd_id'].str.replace(pat='smd_', repl='')

In [18]:
# True for districts whose candidate count differs between the two snapshots
diff = compare_smd['number_of_candidates_old'].ne(compare_smd['number_of_candidates_new'])

In [19]:
# How many districts have a different candidate count
int(diff.sum())

9

In [20]:
# Copy the districts whose candidate count changed, with both candidate lists,
# to the system clipboard
changed_columns = ['district', 'list_of_candidates_old', 'list_of_candidates_new']
compare_smd.loc[diff, changed_columns].to_clipboard(index=False)