In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Incident-level crime/housing records (one CSV) and the zip-code reference table.
crime_data = pd.read_csv('./crime-housing-austin-2015.csv')

# The two numeric zip-code columns are read as comma-formatted text, so the
# separators are stripped before casting each column to its numeric dtype.
zip_code_data = pd.read_csv('./AustinZipCodes.csv').assign(**{
	'Population': lambda table: table['Population'].str.replace(',', '').astype(int),
	'People / Sq. Mile': lambda table: table['People / Sq. Mile'].str.replace(',', '').astype(float),
})

# Per-incident helper columns, created before the merge:
#   crimes               - constant placeholder, only useful for counting rows
#   Cleared_by_arrest    - Clearance_Status == 'C'
#   Cleared_by_exception - Clearance_Status == 'O'
#   Not_cleared          - Clearance_Status == 'N'
crime_data = crime_data.assign(
	crimes=0,
	Cleared_by_arrest=crime_data['Clearance_Status'].eq('C'),
	Cleared_by_exception=crime_data['Clearance_Status'].eq('O'),
	Not_cleared=crime_data['Clearance_Status'].eq('N'),
)

# Left join keeps every incident, attaching zip-code attributes where the
# incident's zip code matches. reset_index() also materialises the row
# position as an 'index' column.
crime_data = crime_data.merge(
	zip_code_data,
	how='left',
	left_on='Zip_Code_Crime',
	right_on='Zip Code',
).reset_index()

# Offense descriptions that this analysis treats as violent crime.
violent_crimes = ['Robbery', 'Agg Assault', 'Rape', 'Murder']

crime_data['Violent'] = crime_data['Highest_NIBRS_UCR_Offense_Description'].isin(violent_crimes)

grouped_by_zip_code = crime_data.groupby('Zip_Code_Crime')

# One row per zip code. Key order in the spec determines output column order:
#   'first' - take the group's first non-null value
#   'count' - number of incidents in the group
#   'sum'   - number of True flags in the group
crime_data_by_zip_code = grouped_by_zip_code.agg({
	**dict.fromkeys(
		[
			'Populationbelowpovertylevel',
			'Medianhouseholdincome',
			'Populationwithdisability',
			'Unemployment',
			'Medianrent',
			'Medianhomevalue',
			'Percentageofhomeswithin1/4-mioftransitstop',
		],
		'first',
	),
	'crimes': 'count',
	**dict.fromkeys(['Cleared_by_arrest', 'Cleared_by_exception', 'Not_cleared'], 'sum'),
	'Population': 'first',
	'Violent': 'sum',
})

# Drop zip codes with any missing aggregate, then turn the zip code back into
# an ordinary column.
crime_data_by_zip_code = crime_data_by_zip_code.dropna().reset_index()

# Share of incidents cleared either by arrest or by exception.
crime_data_by_zip_code['clearance_rate'] = (
	crime_data_by_zip_code['Cleared_by_arrest']
	.add(crime_data_by_zip_code['Cleared_by_exception'])
	.div(crime_data_by_zip_code['crimes'])
)

# Columns whose values are numbers embedded in text (the .str accessor below
# requires string data) and must be converted before any arithmetic.
fake_number_columns = [
	'Populationbelowpovertylevel',
	'Percentageofhomeswithin1/4-mioftransitstop',
	'Medianhouseholdincome',
	'Populationwithdisability',
	'Unemployment',
	'Medianrent',
	'Medianhomevalue'
]

# Subset of the columns above that is rescaled from whole percents to
# fractions. The transit-stop percentage is not in this list, so it stays on
# the original scale.
percent_columns = [
	'Populationbelowpovertylevel',
	'Populationwithdisability',
	'Unemployment'
]

for column in fake_number_columns:
	crime_data_by_zip_code[column] = (
		crime_data_by_zip_code[column]
		# Remove thousands separators first; otherwise a value such as
		# "54,215" would be cut off at the comma. No-op when none are present.
		.str.replace(',', '', regex=False)
		# Raw string avoids the invalid "\d" escape warning; expand=False
		# yields a Series instead of a one-column DataFrame.
		# NOTE(review): only the first run of digits is kept, so any decimal
		# part is discarded - confirm the source values are whole numbers.
		.str.extract(r'(\d+)', expand=False)
		.astype(int)
	)

for column in percent_columns:
	crime_data_by_zip_code[column] = crime_data_by_zip_code[column] / 100

# Incidents per resident for each zip code.
crime_data_by_zip_code['Crimes per capita'] = (
	crime_data_by_zip_code['crimes'] / crime_data_by_zip_code['Population']
)

# Pull out the outlier: keep only zip codes with fewer than 0.3 crimes per
# resident. The explicit copy makes the result an independent frame, so later
# column assignments on it do not trigger SettingWithCopyWarning.
crime_data_by_zip_code = crime_data_by_zip_code[
	crime_data_by_zip_code['Crimes per capita'] < 0.3
].copy()



In [None]:
# Correlation between median household income and crime rate across zip codes.
display(stats.pearsonr(
	crime_data_by_zip_code['Medianhouseholdincome'],
	crime_data_by_zip_code['Crimes per capita'],
))

# The frame is passed as data= (keyword) because older seaborn releases do not
# accept it as the first positional argument.
sns.regplot(data=crime_data_by_zip_code, x='Medianhouseholdincome', y='Crimes per capita')

# Split the zip codes into two groups at a fixed income cut-off.
# NOTE(review): the origin of 54215 is not shown in this notebook - presumably
# a reference median household income; confirm and cite the source.
income_cutoff = 54215
higher_household_incomes = crime_data_by_zip_code[
	crime_data_by_zip_code['Medianhouseholdincome'] > income_cutoff
]
lower_household_incomes = crime_data_by_zip_code[
	crime_data_by_zip_code['Medianhouseholdincome'] <= income_cutoff
]

# Two-sample t-test on crime rate between the higher- and lower-income groups.
display(stats.ttest_ind(
	higher_household_incomes['Crimes per capita'],
	lower_household_incomes['Crimes per capita'],
))

In [None]:
# Work on a copy so the extra columns do not leak into crime_data_by_zip_code.
violent_crime_data = crime_data_by_zip_code.copy()

transit_column = 'Percentageofhomeswithin1/4-mioftransitstop'

# Violent incidents as a percentage of all incidents, and per resident.
violent_crime_data['percent_violent'] = violent_crime_data['Violent'] / violent_crime_data['crimes'] * 100
violent_crime_data['Violent crimes per capita'] = violent_crime_data['Violent'] / violent_crime_data['Population']

# Both regressions share one explicit axes so they can be compared directly.
fig, ax = plt.subplots()

display(stats.pearsonr(violent_crime_data[transit_column], violent_crime_data['Crimes per capita']))

# data= is passed by keyword because older seaborn releases do not accept the
# frame as the first positional argument.
sns.regplot(
	data=violent_crime_data,
	x=transit_column,
	y='Crimes per capita',
	label='Crimes per capita',
	ax=ax,
)

display(stats.pearsonr(violent_crime_data[transit_column], violent_crime_data['Violent crimes per capita']))

sns.regplot(
	data=violent_crime_data,
	x=transit_column,
	y='Violent crimes per capita',
	label='Violent crimes per capita',
	ax=ax,
)

# regplot labels the y-axis after the last series drawn; use a label that
# covers both series and let the legend tell them apart.
ax.set_ylabel('Crimes per capita (all vs. violent)')
ax.legend()

display(stats.pearsonr(violent_crime_data[transit_column], violent_crime_data['percent_violent']))