In [1]:
import pandas as pd, os, numpy as np, requests, json
from time import time
%matplotlib inline

In [2]:
# Read the locations table. Values in the ZIP and FIPS columns are passed
# through `str`, so pandas does not infer a numeric type for them.
data_types = dict.fromkeys(['zip', 'block_fips', 'tract_fips'], str)
locations = pd.read_csv(
    'locations-fips.csv',
    encoding='utf-8',
    converters=data_types,
)

In [3]:
# Build two helper key columns used for duplicate detection: the location
# name joined by '+++' to the tract-level / block-level FIPS code.
for key_col, fips_col in (('name_tract', 'tract_fips'), ('name_block', 'block_fips')):
    locations[key_col] = locations['name'] + '+++' + locations[fips_col].astype('string')

In [4]:
# A row is a duplicate when its name+block_fips key occurs on more than one row:
# count each key, then keep only the keys whose count exceeds 1.
# Note: despite its name, `name_tract` holds the duplicated `name_block` keys.
dupes_temp = locations['name_block'].value_counts()
dupes_temp = dupes_temp.loc[dupes_temp.gt(1)]
name_tract = pd.Series(dupes_temp.index)

In [5]:
# Split each duplicated key back into its name and block_fips parts.
# - `expand=True` replaces the removed `return_type='frame'` argument.
# - `rsplit` with a literal '+++' and n=1 splits once, at the last separator,
#   so a name that itself contains '+' characters cannot shift the columns
#   (and no regex escaping is needed).
# - `reindex` guarantees both columns exist even when there are no duplicates.
key_parts = name_tract.str.rsplit('+++', n=1, expand=True).reindex(columns=[0, 1])

duplicates = pd.DataFrame({'name': key_parts[0], 'block_fips': key_parts[1]})
duplicates.head()

Unnamed: 0,name,block_fips
0,Shizmo Brewing Co. (by appt only),60855008001000
1,Buffalo Brewpub,360290096001003
2,Old Boys' Brew House,261390211001025
3,River City Brewing Company,201730043002040
4,Telegraph Brewing Co.,60830009001017


In [6]:
# Keep every row of `locations` whose name+block_fips key is one of the
# duplicated keys collected in `name_tract`.
locations_with_dupes = locations.loc[
    locations['name_block'].isin(name_tract)
]

In [7]:
# Number of rows that share a name+block_fips key with at least one other row.
locations_with_dupes.shape[0]

879

In [44]:
# Collect every ID that occurs on more than one row of the full data set.
dupes_temp = locations['id'].value_counts()
dupes_temp = dupes_temp[dupes_temp > 1]
dupe_IDs = pd.Series(dupes_temp.index)

# Export all rows that share their ID with another row in the full data set,
# without the helper key columns and ordered by ID.
rows_with_dupe_IDs = locations[locations['id'].isin(dupe_IDs)]
rows_with_dupe_IDs = rows_with_dupe_IDs.drop(axis=1, labels=['name_tract', 'name_block'])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
rows_with_dupe_IDs = rows_with_dupe_IDs.sort_values('id')
rows_with_dupe_IDs.to_csv('rows_with_dupe_IDs.csv', encoding='utf-8', index=False)
len(rows_with_dupe_IDs)

676

In [47]:
# Among the name+block_fips duplicates, collect the IDs that occur on more
# than one row.
dupes_temp = locations_with_dupes['id'].value_counts()
dupes_temp = dupes_temp[dupes_temp > 1]
dupe_IDs = pd.Series(dupes_temp.index)

# Export the duplicate rows whose ID is NOT shared with another duplicate row,
# without the helper key columns and ordered by name.
# The filter must use `dupe_IDs` computed just above; the previous `dupe_ID`
# is not defined anywhere in this notebook and fails on a fresh kernel.
non_dupe_IDs = locations_with_dupes[~locations_with_dupes['id'].isin(dupe_IDs)]
non_dupe_IDs = non_dupe_IDs.drop(axis=1, labels=['name_tract', 'name_block'])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
non_dupe_IDs = non_dupe_IDs.sort_values('name')
non_dupe_IDs.to_csv('non-dupe-IDs.csv', encoding='utf-8', index=False)
len(non_dupe_IDs)

731

In [15]:
# Drop the helper key column `name_tract` before export.
# This cell reassigns `locations_with_dupes`, so running it a second time
# would raise KeyError (the column is already gone); `errors='ignore'` keeps
# the cell re-runnable.
# NOTE(review): `name_block` is left in place here, whereas the other exports
# drop both helper columns — confirm that keeping it in dupes.csv is intended.
locations_with_dupes = locations_with_dupes.drop(axis=1, labels=['name_tract'], errors='ignore')

In [16]:
# Write the name+block_fips duplicate rows to disk as UTF-8, omitting the index.
locations_with_dupes.to_csv(
    'dupes.csv',
    index=False,
    encoding='utf-8',
)