Compare two versions of the DCBOE candidates CSV and find the differences between them, matching rows on `dcboe_hash_id`.

* old_dcboe = the CSV in the current head commit
* new_dcboe = the CSV in the current working directory, not yet committed

In [1]:
import io
import os
import pandas as pd
from git import Repo

# Move to the root of the repo (two directories above this notebook) so the relative
# paths used below resolve. Walking up to the directory that contains `.git`, rather
# than blindly going up two levels, makes this cell safe to run more than once.
repo_root = os.getcwd()
while not os.path.exists(os.path.join(repo_root, '.git')):
    parent_dir = os.path.dirname(repo_root)
    if parent_dir == repo_root:  # reached the filesystem root without finding a repo
        raise FileNotFoundError('No git repository found above ' + os.getcwd())
    repo_root = parent_dir
os.chdir(repo_root)

In [2]:
csv_file_path = 'data/dcboe/candidates_dcboe.csv'

# Pull the committed (HEAD) version of the CSV out of git's object store, so the
# copy on disk is not touched or checked out.
oa_repo = Repo('.')
commit = oa_repo.head.commit
targetfile = commit.tree / csv_file_path

# The blob is read fully into memory and parsed from an in-memory buffer.
committed_bytes = targetfile.data_stream.read()
old_dcboe = pd.read_csv(io.BytesIO(committed_bytes), encoding='utf-8')

In [3]:
# Working-directory copy of the same CSV, read from disk (includes any uncommitted changes)
new_dcboe = pd.read_csv(csv_file_path)

In [4]:
# Number of rows in the committed (HEAD) version of the file
len(old_dcboe)

411

In [5]:
# Number of rows in the working-directory version of the file
len(new_dcboe)

402

In [6]:
# Change in number of active candidates: new row count minus old row count
# (a negative value means the new file has fewer rows than the committed one)
len(new_dcboe) - len(old_dcboe)

-9

## Old hashes not in new file

Candidates who will no longer be on the ballot.  

In [7]:
# Boolean mask over the rows of old_dcboe: True where the row's hash does not
# appear anywhere in the new file.
old_hash_still_present = old_dcboe['dcboe_hash_id'].isin(new_dcboe['dcboe_hash_id'])
old_hashes_not_in_new = ~old_hash_still_present

# Show just the district and name for the rows that disappeared.
old_dcboe.loc[old_hashes_not_in_new, ['smd_id', 'candidate_name']]

Unnamed: 0,smd_id,candidate_name
20,smd_1B07,Marcia Shia
23,smd_1B09,James A. Turner
80,smd_2E08,Matias Burdman
108,smd_3C09,Nancy J. MacWood
116,smd_3D05,Sabra Horne
118,smd_3D06,Wendy Lynch
256,smd_6A03,Mike Soderman
286,smd_6C06,Tommy Howard
330,smd_7D04,Cinque Culver


In [8]:
# Build a "<district>: <candidate>" label for every old row, e.g. "1B07: Marcia Shia".
smd_code = old_dcboe['smd_id'].str.replace('smd_', '')
old_dcboe['smd_name'] = smd_code + ': ' + old_dcboe['candidate_name']

# Copy the labels of the rows missing from the new file to the clipboard
# (without the index) so they can be pasted elsewhere.
old_dcboe.loc[old_hashes_not_in_new, ['smd_name']].to_clipboard(index=False)

## New hashes not in old file

In [9]:
# Boolean mask over the rows of new_dcboe: True where the row's hash does not
# appear anywhere in the committed file.
new_hash_already_known = new_dcboe['dcboe_hash_id'].isin(old_dcboe['dcboe_hash_id'])
new_hashes_not_in_old = ~new_hash_already_known

# Show every column for the rows that are new.
new_dcboe.loc[new_hashes_not_in_old]

Unnamed: 0,dcboe_hash_id,smd_id,candidate_name,pickup_date,filed_date


## Same hash, changed info

Have any fields changed on the same hash? 

In [10]:
# Inner join on the hash keeps only rows whose hash appears in both files.
# Columns that exist in both frames get an '_old' / '_new' suffix so the two
# versions of each field sit side by side in one row.
df = pd.merge(old_dcboe, new_dcboe, how='inner', on='dcboe_hash_id', suffixes=['_old', '_new'])

In [11]:
# Every field of the new file except the join key is compared against its old value.
columns_to_check = [c for c in new_dcboe.columns if c != 'dcboe_hash_id']

for c in columns_to_check:
    old_col = c + '_old'
    new_col = c + '_new'

    # merge() only adds suffixes to columns present in both frames, so a column that
    # exists in just one file has no '_old'/'_new' pair to compare.
    if old_col not in df.columns or new_col not in df.columns:
        print('Column {!r} is not present in both files; skipping comparison'.format(c))
        continue

    # NaN != NaN evaluates to True, so a field that is blank in both files would be
    # reported as changed. Exclude rows where both sides are missing.
    both_missing = df[old_col].isna() & df[new_col].isna()
    changed = (df[old_col] != df[new_col]) & ~both_missing

    if changed.any():
        print('{} row(s) differ in {!r}:'.format(int(changed.sum()), c))
        print(df[changed])