In [34]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore") 
from pprint import pprint

In [3]:
HTML('''<script>code_show=true;function code_toggle() {if (code_show){
$('div.input').hide();} else {$('div.input').show();}code_show = !code_show} 
$( document ).ready(code_toggle);</script><a href="javascript:code_toggle()">Toggle Code</a>.''')

In [35]:
censuslocs = pd.read_csv('censuslocs_new_061518.csv')

In [5]:
states = pd.read_csv('states.csv')

In [6]:
levels = ['State','County','City/Town','Tract/BG']

In [7]:
data = pd.concat([censuslocs,states],ignore_index=True).reset_index()

In [8]:
data.name = data.name.astype(str,errors='ignore')
data = data.drop(['Unnamed: 0', 'Unnamed: 0.1','index'],axis=1)

In [9]:
#creating a concatenated field of name and parent-location, since same town name can appear in multiple states
data['full_name'] = data.name + "-" + data['parent-location']

In [10]:
#unique list of state names in the data
states_list = states.name.unique()

In [11]:
#splitting into distinct data sets in case I want them later
states_only = data[data.Division == 'State']
counties_only = data[data.Division == 'County']
cities_towns_only = data[data.Division == 'City/Town']
tracts_bgs_only = data[data.Division == 'Tract/BG']

In [12]:
numStates = len(states_only.name.unique())
numCounties = len(counties_only.full_name.unique())
numCitiesTowns = len(cities_towns_only.full_name.unique())
numTractsBGs = len(tracts_bgs_only.full_name.unique())

In [13]:
#some functions I use
def percentify(numerator,denominator):
    return str(int(numerator/denominator*100))+'%'
def commafy(number):
    return "{:,}".format(int(number))

# <font color='black'> CAT Challenge 1 Gate 1 </font> 

# <font color='#31b1ed'> Fields Added </font> 

#### Division: Created to subdivide data into the appropriate levels (state, county, city/town, tracts/block groups) 
- If name was a state: "State"
- If name contained the word "county": "County"
- If parent name was a state: "City/Town"
- If name contained a number: "Tract/BG"

#### full_name: Concatentation of name and parent-location (because many town names are duplicated across states)

#  <font color='#31b1ed'> Data Validation </font>

## *Ensure the data we have is the data we expect*

##### <font color='#3dad00'>Columns available</font> 

In [14]:
print(data.columns.values)

['Division' 'eviction-filing-rate' 'eviction-filings' 'eviction-rate'
 'evictions' 'imputed' 'low-flag' 'median-gross-rent'
 'median-household-income' 'median-property-value' 'name' 'parent-location'
 'pct-af-am' 'pct-am-ind' 'pct-asian' 'pct-hispanic' 'pct-multiple'
 'pct-nh-pi' 'pct-other' 'pct-renter-occupied' 'pct-white' 'population'
 'poverty-rate' 'rent-burden' 'renter-occupied-households' 'subbed' 'year'
 'full_name']


##### <font color='#3dad00'>All states are included in the data set</font> 

In [15]:
print("Number of unique states: ",numStates,"(50 + DC)")

Number of unique states:  51 (50 + DC)


##### All counties, towns/cities, tracts/block groups are included in the data set

In [16]:
print("Number of unique counties: ",numCounties)
print("Number of unique cities/towns: ",numCitiesTowns)
print("Number of unique tracts/block groups: ",numTractsBGs)

Number of unique counties:  3194
Number of unique cities/towns:  29940
Number of unique tracts/block groups:  293829


In [17]:
print("(Need another data set to validate that these are the correct numbers)")

(Need another data set to validate that these are the correct numbers)


##### <font color='#3dad00'>Max/Min years in data set</font> 

In [18]:
print('Earliest year in data set: ', data.year.min())
print('Latest year in data set: ', data.year.max())

Earliest year in data set:  2000
Latest year in data set:  2016


##### <font color='#ad1100'>All geographic locations at all levels have the same amount of data: </font> 

In [19]:
print('Years of data by state:')
states[['name','year']].groupby(by='name').count()

Years of data by state:


Unnamed: 0_level_0,year
name,Unnamed: 1_level_1
Alabama,17
Alaska,17
Arizona,17
Arkansas,17
California,17
Colorado,17
Connecticut,17
Delaware,17
District of Columbia,17
Florida,17


In [20]:
df17 = pd.DataFrame(None,columns=['# < 17','# 17','# > 17','% < 17','% 17','% > 17'],index=levels)
group17 = data[['Division','full_name','year']].groupby(by=['Division','full_name'],as_index=False).count()
is17 = group17[group17.year == 17]
less17 = group17[group17.year < 17]
more17 = group17[group17.year > 17]
for l in levels: 
    less = len(less17[less17.Division==l]) 
    equal = len(is17[is17.Division==l])
    more = len(more17[more17.Division==l])
    allofit = len(group17[group17.Division==l])
    df17['# < 17'][l] = commafy(less)
    df17['# 17'][l] = commafy(equal)
    df17['# > 17'][l] = commafy(more)
    df17['% < 17'][l] = percentify(less,allofit)
    df17['% 17'][l] = percentify(equal,allofit)
    df17['% > 17'][l] = percentify(more,allofit)
df17

Unnamed: 0,# < 17,# 17,# > 17,% < 17,% 17,% > 17
State,0,51,0,0%,100%,0%
County,185,3007,2,5%,94%,0%
City/Town,5462,24356,122,18%,81%,0%
Tract/BG,7382,285848,599,2%,97%,0%


##### <font color='#3dad00'>Population figures are reasonable</font> 

In [36]:
levels = ['State','County','City/Town','Tract/BG']
pop_df = pd.DataFrame(None, columns=['Max Pop','Max Name','Max Year','Min Pop','Min Name','Min Year'],index=levels)
for l in levels:
    dfs = {'State':states_only, 'County': counties_only,'City/Town':cities_towns_only,'Tract/BG':tracts_bgs_only}
    wax = dfs[l].population[dfs[l].Division==l].max()
    win = dfs[l].population[dfs[l].Division==l].min()
    wax_name = dfs[l].full_name[dfs[l].population == wax].iloc[0]
    win_name = dfs[l].full_name[dfs[l].population == win].iloc[0]
    pop_df['Max Pop'][l] = "{:,}".format(int(wax))
    pop_df['Max Name'][l] = wax_name
    pop_df['Max Year'][l] = dfs[l].year[(dfs[l].population == wax)&(dfs[l].full_name == wax_name)].iloc[0]
    pop_df['Min Pop'][l] = "{:,}".format(int(win))
    pop_df['Min Name'][l] = win_name
    pop_df['Min Year'][l] = dfs[l].year[(dfs[l].population == win)&(dfs[l].full_name == win_name)].iloc[0]

print('Max/Min population numbers by level')
pop_df

Max/Min population numbers by level


Unnamed: 0,Max Pop,Max Name,Max Year,Min Pop,Min Name,Min Year
State,38421464,California-USA,2011,493782,Wyoming-USA,2000
County,10038388,Los Angeles County-California,2011,25,Bear Valley CDP (Alpine County)-California,2011
City/Town,8426743,New York-New York,2011,1,Drummond-Idaho,2011
Tract/BG,53812,"6731.01-Fort Bend County, Texas",2011,0,"2604.03.2-Baltimore city, Maryland",2005


In [22]:
print("(Need another data set to cross-reference and validate that specific population numbers are correct)")

(Need another data set to cross-reference and validate that specific population numbers are correct)


##### <font color='#3dad00'> Fields using percentages do not contain percentages that exceed 100% or fall below 0% </font> 

In [23]:
columns = ['pct-renter-occupied','pct-white','pct-af-am','pct-hispanic','pct-am-ind','pct-asian','pct-nh-pi','pct-multiple','pct-other']
percentage_fields = data[columns]
per_df = pd.DataFrame(None, columns=['max_per','max_name','max_pop','min_per','min_name','min_pop'],index=columns)
for c in columns:
    wax = data[c].max()
    win = data[c].min()
    waxName = data.full_name[data[c] == wax].iloc[0]
    winName = data.full_name[data[c] == win].iloc[0]
    per_df.max_per[c] = wax #max percent
    per_df.max_name[c] = waxName #name of that location
    per_df.max_pop[c] = data.population[data.full_name == waxName].iloc[0] #population of that location
    per_df.min_per[c] = win
    per_df.min_name[c] = winName
    per_df.min_pop[c] = data.population[data.full_name == winName].iloc[0]
per_df

Unnamed: 0,max_per,max_name,max_pop,min_per,min_name,min_pop
pct-renter-occupied,100.0,Fort Rucker-Alabama,6052.0,0,Benton-Alabama,47
pct-white,100.0,Arley-Alabama,290.0,0,Boykin-Alabama,275
pct-af-am,100.0,Boykin-Alabama,275.0,0,Addison-Alabama,723
pct-hispanic,100.0,Thornburg-Iowa,84.0,0,Wilcox County-Alabama,13183
pct-am-ind,100.0,Kickapoo Site 2-Kansas,34.0,0,Choctaw County-Alabama,15922
pct-asian,100.0,"2606.05.6-Baltimore city, Maryland",43.27,0,Clay County-Alabama,14254
pct-nh-pi,85.88,"412.0-Kauai County, Hawaii",160.0,0,Butler County-Alabama,21399
pct-multiple,100.0,"525.18-Orange County, California",3.0,0,Wilcox County-Alabama,13183
pct-other,67.11,"1003.2-Suffolk County, Massachusetts",891.0,0,Bibb County-Alabama,20826


##### <font color='#3dad00'> Sum of demographics fields equals 100% </font> 

In [37]:
demo_cols = ['pct-white','pct-af-am','pct-hispanic','pct-am-ind','pct-asian','pct-nh-pi','pct-multiple','pct-other']
demos = data[demo_cols]
demos_sums = demos.sum(axis=1)
bad_demos = demos_sums[(demos_sums > 100.04) & (demos_sums < 99.96)]
print("Number of fields where demographics do not total 100%: ",len(bad_demos))

Number of fields where demographics do not total 100%:  0


# <font color='#0b80a0'> Data Cleansing </font>

##### If population is zero, delete the record

In [25]:
print("Number of records where population is less than or equal to zero: ", len(data[data.population<=0]))

Number of records where population is less than or equal to zero:  29735


In [26]:
data_c = data[data.population>0]
states_only = data_c[data_c.Division == 'State']
counties_only = data_c[data_c.Division == 'County']
cities_towns_only = data_c[data_c.Division == 'City/Town']
tracts_bgs_only = data_c[data_c.Division == 'Tract/BG']
print('...Processing complete')

...Processing complete


In [27]:
print("Number of records where population is less than or equal to zero: ", len(data_c[data_c.population<=0]))

Number of records where population is less than or equal to zero:  0


##### If duplicate record, delete

In [28]:
print('Current number of records: ',commafy(len(data_c)))

Current number of records:  5,422,851


In [29]:
data_c = data_c.drop_duplicates()

In [30]:
print("...Processing complete")
print('Current number of records: ',commafy(len(data_c)))
print("Number of records removed: ",commafy(len(data)-len(data_c)))

...Processing complete
Current number of records:  5,422,839
Number of records removed:  29,747


In [31]:
df17 = pd.DataFrame(None,columns=['# < 17','# 17','# > 17','% < 17','% 17','% > 17'],index=levels)
group17 = data_c[['Division','full_name','year']].groupby(by=['Division','full_name'],as_index=False).count()
is17 = group17[group17.year == 17]
less17 = group17[group17.year < 17]
more17 = group17[group17.year > 17]
for l in levels: 
    less = len(less17[less17.Division==l]) 
    equal = len(is17[is17.Division==l])
    more = len(more17[more17.Division==l])
    allofit = len(group17[group17.Division==l])
    df17['# < 17'][l] = commafy(less)
    df17['# 17'][l] = commafy(equal)
    df17['# > 17'][l] = commafy(more)
    df17['% < 17'][l] = percentify(less,allofit)
    df17['% 17'][l] = percentify(equal,allofit)
    df17['% > 17'][l] = percentify(more,allofit)
print("Locations with less than 17 years increased likely because of newly incorporated places that had placeholder zeros before")
df17

Locations with less than 17 years increased likely because of newly incorporated places that had placeholder zeros before


Unnamed: 0,# < 17,# 17,# > 17,% < 17,% 17,% > 17
State,0,51,0,0%,100%,0%
County,184,3007,2,5%,94%,0%
City/Town,5562,24234,121,18%,81%,0%
Tract/BG,8434,284604,134,2%,97%,0%


##### There are 63 cities that have the same name as a state

In [33]:
xx = data[(data['name'].isin(states_list)) & (data['parent-location'] != 'USA')]
xx[['name', 'parent-location']].drop_duplicates().sort_values(by='name').reset_index(drop=True)

Unnamed: 0,name,parent-location
0,California,Kentucky
1,California,Missouri
2,California,Maryland
3,California,Pennsylvania
4,Delaware,Ohio
5,Delaware,Oklahoma
6,Delaware,New Jersey
7,Delaware,Iowa
8,District of Columbia,District of Columbia
9,Florida,Missouri
