Take two DCBOE files and find differences based on the hashes. 

* old_dcboe = the CSV in the current head commit
* new_dcboe = the CSV in the current working directory, not yet committed

In [1]:
import io
import os
import pandas as pd
from git import Repo

os.chdir('../..') # root of repo is two directories above this notebook

In [2]:
csv_file_path = 'data/dcboe/candidates_dcboe.csv'

oa_repo = Repo('.') 
commit = oa_repo.head.commit
targetfile = commit.tree / csv_file_path

with io.BytesIO(targetfile.data_stream.read()) as f:
    old_dcboe = pd.read_table(f, sep=',', encoding='utf-8')

In [3]:
new_dcboe = pd.read_csv(csv_file_path)

In [4]:
len(old_dcboe)

420

In [5]:
len(new_dcboe)

411

## Old hashes not in new file

In [17]:
old_hashes_not_in_new = ~( old_dcboe['dcboe_hash_id'].isin(new_dcboe['dcboe_hash_id']))
old_dcboe[old_hashes_not_in_new][['smd_id', 'candidate_name']]

Unnamed: 0,smd_id,candidate_name
66,smd_2C02,Jamaal Burton
138,smd_3F07,Lisa Cox
295,smd_6E01,Mary A. Sutherland
303,smd_6E05,Katherine Kortum
313,smd_7B04,Nicole Smith-McDermott
366,smd_8A02,Ira L. Lovelace
396,smd_8C06,Rhonda L. Edwards-Hines
404,smd_8D05,Ellen Armstead
405,smd_8D05,Shanquella 'Goldie' Ross


In [18]:
old_dcboe['smd_name'] = old_dcboe['smd_id'].str.replace('smd_', '') + ': ' + old_dcboe['candidate_name']
old_dcboe[old_hashes_not_in_new][['smd_name']].to_clipboard(index=False)

## New hashes not in old file

In [7]:
new_hashes_not_in_old = ~( new_dcboe['dcboe_hash_id'].isin(old_dcboe['dcboe_hash_id']))
new_dcboe[new_hashes_not_in_old]

Unnamed: 0,dcboe_hash_id,smd_id,candidate_name,pickup_date,filed_date


## Same hash, changed info

Have any fields changed on the same hash? 

In [8]:
df = pd.merge(old_dcboe, new_dcboe, how='inner', on='dcboe_hash_id', suffixes=['_old', '_new'])

In [9]:
columns_to_check = [c for c in old_dcboe.columns if c != 'dcboe_hash_id']

for c in columns_to_check:
    num_differences = sum(df[c + '_old'] != df[c + '_new'])
    if num_differences > 0:
        print(df[df[c + '_old'] != df[c + '_new']])

                                         dcboe_hash_id smd_id_old  \
367  a18cc1ecaaf81f445eeb1e80ddc7d01168aa3d278ea46f...   smd_8A06   

    candidate_name_old pickup_date_old filed_date_old smd_id_new  \
367  Kristina Leszczak      2020-08-26            NaN   smd_8A06   

    candidate_name_new pickup_date_new filed_date_new  
367  Kristina Leszczak      2020-08-26            NaN  
