In [31]:
# Necessary imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import scipy.stats as stats

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

In [32]:
# Load the county-level dataset (acs2017_county_data.csv); the path is relative to the
# notebook's working directory.
counties = pd.read_csv('data/acs2017_county_data.csv')

In [33]:
# List the column names of the county dataset to see which variables are available.
counties.columns

Index(['CountyId', 'State', 'County', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'VotingAgeCitizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment'],
      dtype='object')

In [8]:
# Sanity check: print every county name that contains the stand-alone token '(city)'.
# Iterating the column directly avoids building a full row Series for each record.
for county_name in counties['County']:
    if '(city)' in county_name.split():
        print(county_name)

In [9]:
# Full state name -> two-letter abbreviation for the 50 states.
# Only the 50 states are listed; District of Columbia and Puerto Rico are not included.
states_dict = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
}

# Parallel lists kept for any later cell that uses them; both follow the dict's insertion order.
states = list(states_dict)
states_abrv = list(states_dict.values())

In [10]:
# Translate full state names into two-letter abbreviations via states_dict.
# Series.map yields NaN for any State value that is not a key in states_dict.
counties['State_abrv'] = counties['State'].map(states_dict)

In [11]:
# Print every county name that contains the stand-alone lowercase token 'city'.
for county_name in counties['County']:
    name_tokens = county_name.split()
    if 'city' in name_tokens:
        print(county_name)

Baltimore city
St. Louis city
Alexandria city
Bristol city
Buena Vista city
Charlottesville city
Chesapeake city
Colonial Heights city
Covington city
Danville city
Emporia city
Fairfax city
Falls Church city
Franklin city
Fredericksburg city
Galax city
Hampton city
Harrisonburg city
Hopewell city
Lexington city
Lynchburg city
Manassas city
Manassas Park city
Martinsville city
Newport News city
Norfolk city
Norton city
Petersburg city
Poquoson city
Portsmouth city
Radford city
Richmond city
Roanoke city
Salem city
Staunton city
Suffolk city
Virginia Beach city
Waynesboro city
Williamsburg city
Winchester city


In [12]:
# Some names end in a lowercase 'city' (e.g. 'Baltimore city'); rewrite that suffix as 'City'.
def _title_city_suffix(name):
    """Return `name` with a final 'city' token replaced by 'City'; other names pass through unchanged."""
    tokens = name.split()
    if tokens[-1] != 'city':
        return name
    # Re-joining the tokens also collapses any repeated whitespace in the names that are rewritten.
    return ' '.join(tokens[:-1]) + ' City'


counties['County'] = counties['County'].apply(_title_city_suffix)

In [13]:
# Re-run the check: nothing should be printed now that the lowercase 'city' suffix was rewritten.
for county_name in counties['County']:
    if 'city' in county_name.split():
        print(county_name)

In [14]:
# Names whose last token is not 'County' get ', County' appended; names already ending
# in 'County' are left untouched.
# NOTE(review): the appended text includes a comma, so e.g. 'Baltimore City' becomes
# 'Baltimore City, County' — confirm this matches the key format used in the shootings data.
def _append_county_suffix(name):
    """Return `name` unchanged if it ends with the token 'County', else append ', County'."""
    tokens = name.split()
    if tokens[-1] == 'County':
        return name
    # Joining the tokens normalises internal whitespace before the suffix is added.
    return ' '.join(tokens) + ', County'


counties['County'] = counties['County'].apply(_append_county_suffix)

In [15]:
# Build the merge key '<county name>, <state abbreviation>'.
# Rows with a missing State_abrv end up with a NaN key (str.cat propagates missing values).
counties['County'] = counties['County'].str.cat(counties['State_abrv'], sep=', ')

In [16]:
# Spot-check 20 random rows to confirm the new 'County' key format.
# No random_state is passed, so the rows shown differ on every run.
counties.sample(20)

Unnamed: 0,CountyId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,State_abrv
2726,48407,Texas,"San Jacinto County, TX",27436,13563,13873,12.5,74.9,10.7,0.5,...,2.9,2.6,41.3,10221,77.3,15.0,7.6,0.1,6.5,TX
784,18175,Indiana,"Washington County, IN",27807,13783,14024,1.3,96.9,0.1,0.3,...,0.9,4.9,31.4,11952,82.8,9.9,7.2,0.1,7.0,IN
984,20193,Kansas,"Thomas County, KS",7847,3856,3991,5.5,91.2,1.4,0.8,...,2.6,6.6,11.1,4393,68.7,19.1,12.0,0.2,1.9,KS
2659,48273,Texas,"Kleberg County, TX",31540,16290,15250,72.3,21.0,3.5,0.1,...,2.3,1.8,17.6,13683,69.1,25.7,4.8,0.3,9.7,TX
1781,34015,New Jersey,"Gloucester County, NJ",291372,141589,149783,5.8,79.1,9.9,0.0,...,1.2,3.3,29.9,145749,80.6,15.5,3.8,0.1,7.4,NJ
1393,27159,Minnesota,"Wadena County, MN",13626,6757,6869,1.8,94.7,1.1,0.7,...,1.2,5.7,21.0,6060,79.8,12.4,7.5,0.2,5.8,MN
1684,31061,Nebraska,"Franklin County, NE",3019,1514,1505,2.1,96.2,0.5,0.0,...,1.0,5.4,18.2,1496,67.3,19.7,12.8,0.3,2.6,NE
2506,47157,Tennessee,"Shelby County, TN",937847,446846,491001,6.1,36.5,53.2,0.1,...,1.4,3.2,22.9,430218,81.4,13.4,5.1,0.1,8.6,TN
595,17001,Illinois,"Adams County, IL",66787,32698,34089,1.5,92.0,3.9,0.2,...,1.5,4.5,17.0,32243,83.1,10.6,6.1,0.1,5.5,IL
1841,36027,New York,"Dutchess County, NY",295685,146993,148692,11.7,72.2,9.6,0.1,...,1.0,5.6,32.2,144934,77.7,16.6,5.6,0.2,6.7,NY


In [17]:
# Load the per-county shooting counts and preview the first rows.
# NOTE(review): read_pickle can execute arbitrary code while unpickling — only load this
# file if it comes from a trusted source.
shootings = pd.read_pickle('data/shootings_by_county_state.pkl')
shootings.head()

Unnamed: 0,County,num_shootings
0,"Los Angeles County, CA",106
1,"Maricopa County, AZ",65
2,"Cook County, IL",33
3,"Harris County, TX",32
4,"San Bernardino County, CA",30


In [18]:
# Trim leading/trailing whitespace from the 'County' join key in both frames so that
# otherwise identical keys compare equal during the merge.
for frame in (shootings, counties):
    frame['County'] = frame['County'].str.strip()

In [19]:
# Left-join the shooting counts onto the county table by 'County': every county row is
# kept, and counties with no match in `shootings` get NaN in the added column(s).
counties2 = counties.merge(shootings, on='County', how='left')

In [20]:
# Inspect the merged frame ordered from most to fewest shootings. Counties without a
# match in `shootings` still have NaN in num_shootings at this point.
counties2.sort_values('num_shootings', ascending = False)

Unnamed: 0,CountyId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,State_abrv,num_shootings
204,6037,California,"Los Angeles County, CA",10105722,4979641,5126081,48.4,26.5,7.9,0.2,...,5.3,30.9,4805817,79.3,11.2,9.3,0.2,7.8,CA,106.0
103,4013,Arizona,"Maricopa County, AZ",4155501,2055464,2100037,30.6,56.3,5.1,1.5,...,6.4,25.8,1929015,82.8,11.1,5.9,0.1,6.0,AZ,65.0
610,17031,Illinois,"Cook County, IL",5238541,2540704,2697837,25.1,42.7,23.4,0.1,...,4.5,32.9,2521437,84.2,11.1,4.6,0.1,8.7,IL,33.0
2623,48201,Texas,"Harris County, TX",4525519,2251060,2274459,42.2,30.6,18.5,0.2,...,3.7,28.9,2180392,83.4,9.8,6.6,0.2,6.4,TX,32.0
221,6071,California,"San Bernardino County, CA",2121220,1055170,1066050,52.3,29.8,8.0,0.3,...,4.7,30.9,869658,77.3,16.2,6.4,0.1,9.9,CA,30.0
1748,32003,Nevada,"Clark County, NV",2112436,1056002,1056434,30.7,44.1,10.8,0.4,...,3.6,24.5,982033,84.5,10.5,4.9,0.1,8.3,NV,28.0
218,6065,California,"Riverside County, CA",2355002,1171711,1183291,48.0,36.6,6.0,0.4,...,5.2,33.1,978726,77.7,14.7,7.5,0.2,9.9,CA,27.0
215,6059,California,"Orange County, CA",3155816,1558245,1597571,34.2,41.4,1.6,0.2,...,5.6,27.4,1560997,82.0,10.1,7.7,0.2,5.8,CA,23.0
362,12086,Florida,"Miami-Dade County, FL",2702602,1311997,1390605,67.5,13.7,16.3,0.1,...,4.9,31.3,1272735,82.4,9.6,7.8,0.2,7.4,FL,23.0
222,6073,California,"San Diego County, CA",3283665,1651147,1632518,33.4,46.2,4.7,0.4,...,7.0,25.7,1536073,77.8,14.2,7.8,0.2,7.1,CA,22.0


In [21]:
# (rows, columns) of the merged frame, before rows with a missing 'County' key are dropped.
counties2.shape

(3220, 39)

In [22]:
# Remove rows whose merge key is missing. 'County' is NaN wherever State had no entry in
# states_dict (per the original note, mainly Puerto Rico).
counties2.dropna(subset=['County'], inplace=True)

In [23]:
# Re-check (rows, columns) to see how many rows the dropna removed.
counties2.shape

(3141, 39)

In [24]:
# Counties with no matching row in `shootings` got NaN from the left merge; treat those
# as having had zero shootings.
counties2['num_shootings'] = counties2['num_shootings'].fillna(0)

In [25]:
# Inspect the frame again, most shootings first, now that missing counts are filled with 0.
counties2.sort_values('num_shootings', ascending = False)

Unnamed: 0,CountyId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,State_abrv,num_shootings
204,6037,California,"Los Angeles County, CA",10105722,4979641,5126081,48.4,26.5,7.9,0.2,...,5.3,30.9,4805817,79.3,11.2,9.3,0.2,7.8,CA,106.0
103,4013,Arizona,"Maricopa County, AZ",4155501,2055464,2100037,30.6,56.3,5.1,1.5,...,6.4,25.8,1929015,82.8,11.1,5.9,0.1,6.0,AZ,65.0
610,17031,Illinois,"Cook County, IL",5238541,2540704,2697837,25.1,42.7,23.4,0.1,...,4.5,32.9,2521437,84.2,11.1,4.6,0.1,8.7,IL,33.0
2623,48201,Texas,"Harris County, TX",4525519,2251060,2274459,42.2,30.6,18.5,0.2,...,3.7,28.9,2180392,83.4,9.8,6.6,0.2,6.4,TX,32.0
221,6071,California,"San Bernardino County, CA",2121220,1055170,1066050,52.3,29.8,8.0,0.3,...,4.7,30.9,869658,77.3,16.2,6.4,0.1,9.9,CA,30.0
1748,32003,Nevada,"Clark County, NV",2112436,1056002,1056434,30.7,44.1,10.8,0.4,...,3.6,24.5,982033,84.5,10.5,4.9,0.1,8.3,NV,28.0
218,6065,California,"Riverside County, CA",2355002,1171711,1183291,48.0,36.6,6.0,0.4,...,5.2,33.1,978726,77.7,14.7,7.5,0.2,9.9,CA,27.0
215,6059,California,"Orange County, CA",3155816,1558245,1597571,34.2,41.4,1.6,0.2,...,5.6,27.4,1560997,82.0,10.1,7.7,0.2,5.8,CA,23.0
362,12086,Florida,"Miami-Dade County, FL",2702602,1311997,1390605,67.5,13.7,16.3,0.1,...,4.9,31.3,1272735,82.4,9.6,7.8,0.2,7.4,FL,23.0
2579,48113,Texas,"Dallas County, TX",2552213,1257751,1294462,39.6,30.2,22.1,0.2,...,4.3,27.2,1252101,84.3,9.1,6.4,0.2,5.9,TX,22.0


In [26]:
# Binary indicator: 1 when num_shootings is at least 0.5, otherwise 0. With the
# whole-number counts shown above, the 0.5 cut-off separates 0 from one-or-more.
counties2['had_shootings'] = counties2['num_shootings'].ge(0.5).astype(int)

In [27]:
# Spot-check 20 random rows, including the new had_shootings flag.
# No random_state is passed, so the sample differs between runs.
counties2.sample(20)

Unnamed: 0,CountyId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,State_abrv,num_shootings,had_shootings
2983,53061,Washington,"Snohomish County, WA",771904,387078,384826,9.8,71.2,2.6,0.8,...,31.8,390186,81.9,12.5,5.4,0.2,5.4,WA,5.0,1
1010,21035,Kentucky,"Calloway County, KY",38616,18594,20022,2.6,89.7,3.7,0.3,...,16.6,17382,72.7,20.6,6.4,0.3,5.7,KY,0.0,0
2189,40117,Oklahoma,"Pawnee County, OK",16448,8214,8234,2.8,77.2,1.0,12.2,...,26.7,6606,75.7,15.9,8.3,0.2,5.2,OK,0.0,0
1210,24037,Maryland,"St. Mary's County, MD",110979,55420,55559,4.8,75.1,14.2,0.2,...,30.9,54121,64.6,31.1,4.2,0.1,4.2,MD,0.0,0
1822,35051,New Mexico,"Sierra County, NM",11254,5513,5741,29.9,65.7,0.6,1.1,...,16.7,3740,58.8,33.3,7.9,0.0,11.4,NM,1.0,1
726,18059,Indiana,"Hancock County, IN",72776,35716,37060,2.2,93.0,1.9,0.1,...,26.1,37374,83.5,12.7,3.8,0.1,4.0,IN,0.0,0
293,8097,Colorado,"Pitkin County, CO",17747,9363,8384,9.8,85.7,1.2,0.2,...,17.5,11435,77.7,10.0,12.3,0.0,4.3,CO,0.0,0
999,21013,Kentucky,"Bell County, KY",27469,13385,14084,0.3,94.5,3.3,0.1,...,20.6,7559,77.8,16.9,5.1,0.1,9.0,KY,0.0,0
1324,27021,Minnesota,"Cass County, MN",28810,14698,14112,2.0,83.3,0.4,10.9,...,23.8,12264,72.7,17.1,9.7,0.5,5.7,MN,0.0,0
1963,37147,North Carolina,"Pitt County, NC",176484,83309,93175,6.0,55.1,34.3,0.2,...,20.6,82142,77.2,18.5,4.2,0.1,10.1,NC,1.0,1


In [30]:
# Save the combined county + shootings frame to disk as a pickle.
counties2.to_pickle('data/combined_data.pkl')

In [29]:
# Final list of columns in the combined frame (original columns plus State_abrv,
# num_shootings and had_shootings).
counties2.columns

Index(['CountyId', 'State', 'County', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'VotingAgeCitizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment', 'State_abrv',
       'num_shootings', 'had_shootings'],
      dtype='object')