### What impact does perceived safety have on livability in urban neighborhoods?


### 1. Import necessary modules and load pickled files

In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm, tqdm_pandas

In [3]:
with open('data/pickled/ss_orig_nyc.pkl', 'r') as picklefile:
    sso = pickle.load(picklefile)
    
with open('data/pickled/ss_withzip.pkl', 'r') as picklefile:
    ssp = pickle.load(picklefile)
    
with open('data/pickled/crime_withzip.pkl', 'r') as picklefile:
    crime = pickle.load(picklefile)

In [19]:
# split crime data into relevant years for sso and ssp data
crime2011 = crime[crime['Occurrence Year'] == 2011]
crime2014 = crime[crime['Occurrence Year'] == 2014]

### 2. Identify areas with high differences in perceived vs. actual crime by comparing actual crime rates vs. perceived crime scores

#### Perceived safety/crime scores

Normalize q-scores to 0-10 range.

In [5]:
def normalize(df, score_col, norm_col, range_start = 0, range_end = 10):
    min_score = float(df[score_col].min())
    max_score = float(df[score_col].max())
    norm_func = lambda score: range_start + (float(score) - min_score) * (range_end - range_start) / (max_score - min_score)
    normalized = [norm_func(score) for score in df[score_col].tolist()]
    df.loc[:, norm_col] = normalized

In [9]:
normalize(ssp, 'q-score', 'q_norm')
normalize(sso, 'QS Safer', 'q_norm')

Convert/invert q-score of safety to danger score.

In [10]:
normalize(ssp, 'q-score', 'q_norm_rev', range_start = 10, range_end = 0)
normalize(sso, 'QS Safer', 'q_norm_rev', range_start = 10, range_end = 0)

Get average q-score by zip code.

In [11]:
def agg_qscores(df, geo_col, score_col):
    grouped = df.groupby(geo_col)
    scores = pd.DataFrame(grouped[score_col].mean())
    return scores

In [12]:
ssbyzcta = agg_qscores(ssp, 'zcta', 'q_norm')
ss_orig_byzcta = agg_qscores(sso, 'zcta', 'q_norm')

In [13]:
ssbyz_danger = agg_qscores(ssp, 'zcta', 'q_norm_rev')
ssorigbyz_danger = agg_qscores(sso, 'zcta', 'q_norm_rev')

Get average q-score by Census tract.

In [14]:
# perceived safety
ss_by_ct = agg_qscores(ssp, 'fips_tract', 'q_norm')
ssorig_by_ct = agg_qscores(sso, 'fips_tract', 'q_norm')

In [21]:
# perceived crime
ssp_by_ct = agg_qscores(ssp, 'fips_tract', 'q_norm_rev')
sso_by_ct = agg_qscores(sso, 'fips_tract', 'q_norm_rev')

Map to shape file census tract.

In [111]:
with open('data/pickled/TRACT_DICT.pkl', 'r') as picklefile:
    tract_dict = pickle.load(picklefile)

In [113]:
# get truncated tract numbers
ssp_by_ct['tract_copy'] = ssp_by_ct.index.values
ssp_by_ct['tract_short'] = ssp_by_ct.tract_copy.apply(lambda x: tract_dict[x] if x in tract_dict else '')
ssp_by_ct.head()

Unnamed: 0_level_0,q_norm_rev,tract_short,tract_copy
fips_tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
36005000200,3.726878,2000200,36005000200
36005000400,4.086433,2000400,36005000400
36005001600,3.97163,2001600,36005001600
36005001900,4.954953,2001900,36005001900
36005002000,3.646495,2002000,36005002000


Export for cartodb viz

In [706]:
def zip_to_cartodb(df, path):
    df['country'] = 'United States'
    df.to_csv(path)

In [106]:
def tract_to_cartodb(df, path):
    df.to_csv(path, index = False)

In [115]:
tract_to_cartodb(ssp_by_ct, 'data/for_cartodb/ss_by_ct.csv')

In [707]:
zip_to_cartodb(ssbyzcta, 'data/for_cartodb/ss_by_zcta.csv')
zip_to_cartodb(ss_orig_byzcta, 'data/for_cartodb/ssorig_by_zcta.csv')

In [711]:
zip_to_cartodb(ss_by_ct, 'data/for_cartodb/ss_by_tract.csv')
zip_to_cartodb(ssorig_by_ct, 'data/for_cartodb/ssorig_by_tract.csv')

#### Actual crime scores

In [24]:
def get_crimes_by_geo(df, geo_col):
    """Given original crimes df, return df of counts by crime type by chosen geography."""
    byzip = df.groupby([geo_col, 'Offense'])
    crimesdf = byzip.OBJECTID.count().unstack(level=-1)
    crimesdf.fillna(0, inplace = True)
    crimesdf['total'] = crimesdf.sum(axis = 1)
    crimesdf[geo_col] = crimesdf.index
    return crimesdf

In [60]:
# def clean_acs(df, geo_col, data_col, geo_col_name):
#     """Given ACS zcta data file, return dictionary with key = zcta and values = data."""
#     df[geo_col_name] = df[geo_col].apply(lambda x: int(x.split()[1]) if len(x.split()) > 1 else x)
#     data_dict = dict(zip(df[geo_col_name].values, df[data_col].values.tolist()))
#     return data_dict

def add_acs_data(acs_df, acs_geo_col, acs_data_col, df, df_geo_col, df_new_data_name):
    """Given ACS zcta data file, return dictionary with key = geography and values = data."""
    # create dictionary of {geo: value} from acs data
    data_dict = dict(zip(acs_df[acs_geo_col].values, acs_df[acs_data_col].values.tolist()))
    
    # fetch value for key and add to df 
    df[df_new_data_name] = df.loc[:, df_geo_col].apply(lambda x: int(data_dict[x]) if x in data_dict else 'no data')

In [17]:
# def clean_acs_df(df, zcta_col, data_col):
#     """Given ACS zcta data file, return df of relevant data."""
#     df['zcta'] = df[zcta_col].apply(lambda x: int(x.split()[1]) if len(x.split()) > 1 else x)
#     keep = data_col + ['zcta']
#     return df[keep]

In [None]:
"""normalizing crime:
- crime rate - crime per 100,000 residents
- great post: http://opendata.stackexchange.com/questions/381/how-to-normalize-the-data-when-mapping-crime-reports
"""

#### Normalize data by converting raw crime count to crime rates based on census tract population.

Potential ways of normalizing: 
* by population in that geography
    * problematic for areas with fewer residents
* by area of the geography
 

In [25]:
# get dataframe of crimes by geography
crime2014_ct = get_crimes_by_geo(crime2014, 'fips_tract')

In [27]:
# fetch population data
pop_ct = pd.read_csv('data/dem_data/acs_pop_by_ct_2014.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [61]:
add_acs_data(pop_ct, 'GEO.id2', 'HC01_VC03', crime2014_ct, 'fips_tract', 'population')

In [67]:
crime2014_ct = crime2014_ct[crime2014_ct.population > 5]
crime2014_ct['crime_rate'] = (crime2014_ct.total / crime2014_ct.population) * 1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [81]:
normalize(crime2014_ct, 'crime_rate', 'crime_rate_norm')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [99]:
crime.loc[0, 'fips_detail']

{u'ALAND10': 58636,
 u'AWATER10': 0,
 u'BLKGRPCE10': u'3',
 u'COUNTYFP10': u'047',
 u'FUNCSTAT10': u'S',
 u'GEOID10': u'360470244003',
 u'INTPTLAT10': u'+40.6229769',
 u'INTPTLON10': u'-073.9880886',
 u'MTFCC10': u'G5030',
 u'NAMELSAD10': u'Block Group 3',
 u'STATEFP10': u'36',
 u'TRACTCE10': u'024400'}

In [33]:
pop = pd.read_csv('data/acs-pop-by-zcta.csv')

# create dictionary of key = zip code, and value = population
pop_zcta = clean_acs(pop, 'GEO.display-label', ['HC01_VC03'])

TypeError: clean_acs() takes exactly 4 arguments (3 given)

In [391]:
# map population and divide crime by population
crbyzip2014['population'] = crbyzip2014.zcta.apply(lambda x: int(pop_zcta[x][0]) if x in pop_zcta else np.nan)
crbyzip2014['crime_p_1k'] = (crbyzip2014.total / crbyzip2014.population) * 1000

In [725]:
# drop nans and 0s
crbyzip2014.dropna(subset = ['crime_p_1k'], inplace = True)
crbyzip2014 = crbyzip2014[crbyzip2014.population > 1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [729]:
zip_to_cartodb(crbyzip2014, 'data/for_cartodb/crime_by_zip_2014.csv')

#### Compare crime vs. perceived safety

Potential scores:
* ratio of perceived safety (or perceived crime, inverse) to actual crime rate
* quantile/rank of zip compared to other zip codes? e.g. 5th most "least safe" (dangerous) vs. 100th highest crime rate

In [84]:
ssp_by_ct.head()

Unnamed: 0_level_0,q_norm_rev
fips_tract,Unnamed: 1_level_1
36005000200,3.726878
36005000400,4.086433
36005001600,3.97163
36005001900,4.954953
36005002000,3.646495


In [85]:
crime2014_ct.head()

Offense,BURGLARY,FELONY ASSAULT,GRAND LARCENY,GRAND LARCENY OF MOTOR VEHICLE,MURDER & NON-NEGL. MANSLAUGHTE,RAPE,ROBBERY,total,fips_tract,population,crime_rate,crime_rate_norm
fips_tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
36005000100,0,368,2,0,0,0,5,375,36005000100,8430,44.483986,0.16895
36005000200,9,10,10,2,0,0,5,36,36005000200,5095,7.065751,0.025343
36005000400,6,10,12,6,0,0,4,38,36005000400,5572,6.819813,0.024399
36005001600,7,16,17,7,1,0,14,62,36005001600,5412,11.456024,0.042193
36005001900,15,16,32,19,0,0,10,92,36005001900,2569,35.8116,0.135667


In [86]:
ratios_by_ct = ssp_by_ct.merge(crime2014_ct, left_index = True, right_index = True)

In [94]:
ratios_by_ct2 = ratios_by_ct[['fips_tract', 'q_norm_rev', 'total', 'crime_rate', 'crime_rate_norm']]

In [95]:
ratios_by_ct2['per_act_ratio'] = ratios_by_ct2.q_norm_rev / ratios_by_ct2.crime_rate_norm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [96]:
ratios_by_ct2.sort_values(by = 'per_act_ratio')

Unnamed: 0_level_0,fips_tract,q_norm_rev,total,crime_rate,crime_rate_norm,per_act_ratio
fips_tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
36061009400,36061009400,3.718042,86,2606.060606,10.000000,0.371804
36061010900,36061010900,3.604663,502,2281.818182,8.755593,0.411698
36081060701,36081060701,4.164057,20,2000.000000,7.674006,0.542618
36081038302,36081038302,4.627396,88,2000.000000,7.674006,0.602996
36005043500,36005043500,3.994203,78,1418.181818,5.441052,0.734087
36061009600,36061009600,3.583851,174,1183.673469,4.541034,0.789215
36047085200,36047085200,3.397900,8,1000.000000,3.836116,0.885766
36061011300,36061011300,3.863431,163,1116.438356,4.282993,0.902040
36081005000,36081005000,3.330028,4,571.428571,2.191306,1.519655
36061029700,36061029700,3.815104,26,619.047619,2.374062,1.606994


In [83]:
ssbyz_danger.head()

Unnamed: 0_level_0,q_norm_rev
zcta,Unnamed: 1_level_1
10001,3.619298
10002,3.829057
10003,3.34104
10004,4.566766
10005,4.159614


In [738]:
ratios_by_zip = ssbyz_danger.merge(crbyzip2014, left_index = True, right_index = True)

In [740]:
keep = ['q_norm_rev', 'crime_p_1k']

In [741]:
ratios = ratios_by_zip[keep]

In [743]:
ratios['crime_ratio'] = ratios.q_norm_rev / ratios.crime_p_1k

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [745]:
ratios.sort_values(by = 'crime_ratio', ascending = False)

Unnamed: 0_level_0,q_norm_rev,crime_p_1k,crime_ratio
zcta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10280,4.732504,1.814676,2.607906
10312,3.737261,2.798462,1.335470
10314,3.939777,3.713267,1.061000
11357,3.682912,3.523209,1.045329
11363,4.590532,4.459016,1.029494
10308,3.918249,4.261030,0.919554
11430,4.997403,5.847953,0.854556
10307,3.704024,4.398241,0.842160
10305,4.120639,5.039666,0.817641
11377,4.026929,4.980942,0.808467
