cost of living by town <br>
https://www.cityrating.com/costofliving.asp <br> 
crime stats by town <br>
https://www.cityrating.com/crime-statistics/ <br>
income, education, age data by town <br>
https://factfinder.census.gov/faces/nav/jsf/pages/community_facts.xhtml

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import requests
from bs4 import BeautifulSoup

import pickle
import time

In [2]:
inc = pd.read_csv('./data/income_data_copy.csv')

In [3]:
inc.drop(columns = ['Unnamed: 0'], inplace = True)

### Clean poverty stats

In [4]:
pov = pd.read_csv('./data/poverty_rates.csv', encoding = 'ISO-8859-1')

In [5]:
pov.columns = ['state', 'town', 'poverty']

In [6]:
pov.drop([0,1], inplace = True)
pov.reset_index(drop = True, inplace = True)

In [7]:
# Only for Puerto Rico
for i in range(pov.shape[0]):
    flip = pov.loc[i, 'state']
    if '-' not in flip:
        pov.loc[i, 'state'] = f"-{flip}"

In [8]:
pov['state'] = pov['state'].apply(lambda x: x.split('-')[1].strip())

In [9]:
def clean_towns(town):
    if town[-4:] == ' CDP':
        return town[:-4]
    elif town[-5:] == ' town':
        return town[:-5]
    elif town[-5:] == ' city':
        return town[:-5]

In [10]:
pov['town'] = pov['town'].apply(clean_towns)

In [11]:
pov.head()

Unnamed: 0,state,town,poverty
0,Alabama,,18.0
1,Alabama,Abanda,25.9
2,Alabama,Abbeville,20.7
3,Alabama,Adamsville,16.0
4,Alabama,Addison,34.2


### Merge HS completion stats

In [12]:
hs = pd.read_csv('./data/hs_completion.csv', encoding = 'ISO-8859-1')

In [13]:
hs = hs[['GCT_STUB.display-label', 'GCT_STUB.display-label.1', 'HC01']]

In [14]:
hs.columns = ['state', 'town', 'hs_completion']

In [15]:
hs.drop([0,1], inplace = True)
hs.reset_index(drop = True, inplace = True)

hs national average: 87.3 <br>
poverty national average: 14.6

In [16]:
pov.shape, hs.shape

((29636, 3), (29636, 3))

In [17]:
pov['hs_completion'] = hs['hs_completion']

In [18]:
# Both town and state have to match with income data
pov['townstate'] = pov['town'] + ', ' + pov['state']

In [19]:
pov.head()

Unnamed: 0,state,town,poverty,hs_completion,townstate
0,Alabama,,18.0,85.3,
1,Alabama,Abanda,25.9,8.4,"Abanda, Alabama"
2,Alabama,Abbeville,20.7,79.1,"Abbeville, Alabama"
3,Alabama,Adamsville,16.0,83.5,"Adamsville, Alabama"
4,Alabama,Addison,34.2,85.1,"Addison, Alabama"


### Merge poverty & education with income stats

In [20]:
inc.head()

Unnamed: 0,ZCTA,n_households,med_income,population,density,city,state_id,state_name,lat,lng
0,601,5818,11757,17242,111.4,Adjuntas,PR,Puerto Rico,18.18004,-66.75218
1,602,12719,16190,38442,523.5,Aguada,PR,Puerto Rico,18.36073,-67.17517
2,603,19009,16645,48814,667.9,Aguadilla,PR,Puerto Rico,18.45439,-67.12202
3,606,1959,13387,6437,60.4,Maricao,PR,Puerto Rico,18.16724,-66.93828
4,610,9120,18741,27073,312.0,Anasco,PR,Puerto Rico,18.29032,-67.12243


In [21]:
inc.shape

(33099, 10)

In [22]:
inc['townstate'] = inc['city'] + ', ' + inc['state_name']

In [23]:
inc = pd.merge(pov, inc, left_on = 'townstate', right_on = 'townstate')

In [24]:
inc.head()

Unnamed: 0,state,town,poverty,hs_completion,townstate,ZCTA,n_households,med_income,population,density,city,state_id,state_name,lat,lng
0,Alabama,Abbeville,20.7,79.1,"Abbeville, Alabama",36310,2555,40186,6055,12.2,Abbeville,AL,Alabama,31.60296,-85.2163
1,Alabama,Adamsville,16.0,83.5,"Adamsville, Alabama",35005,2792,50271,7528,89.3,Adamsville,AL,Alabama,33.59515,-87.00089
2,Alabama,Addison,34.2,85.1,"Addison, Alabama",35540,1075,45074,2436,12.6,Addison,AL,Alabama,34.23388,-87.18817
3,Alabama,Akron,48.7,62.8,"Akron, Alabama",35441,372,36731,1089,6.1,Akron,AL,Alabama,32.85374,-87.73861
4,Alabama,Alabaster,11.1,88.8,"Alabaster, Alabama",35007,8628,69691,26328,270.8,Alabaster,AL,Alabama,33.21591,-86.79717


In [25]:
inc.shape

(21167, 15)

### Scrape crime stats

In [27]:
# get unique states
states = []
for i in range(inc.shape[0]):
    states.append(inc.loc[i, 'state'])
states = list(set(states))

In [28]:
crime_types = [
    'Aggravated Assault',
    'Arson',
    'Burglary',
    'Larceny and Theft',
    'Motor Vehicle Theft',
    'Murder and Manslaughter',
    'Rape',
    'Robbery',
    'Crime Rate (Total Incidents)',
    'Property Crime',
    'Violent Crime'
]

In [29]:
def get_crime_rate(state, town):
    state = state.replace(' ', '-')
    town = town.replace(' ', '-')
    try:
        url = f'https://www.cityrating.com/crime-statistics/{state}/{town}.html'
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'lxml')
        property_crime = int(soup.find_all('td', text = 'Property Crime')[0].find_next_sibling('td').text)
        violent_crime = int(soup.find_all('td', text = 'Violent Crime')[0].find_next_sibling('td').text)
        return property_crime + violent_crime
    except:
        return np.nan

In [30]:
inc['crime_rate'] = 0

In [32]:
town_crime_rates = {}
output_name = "/Users/elijahcurme/Desktop/GA/capstone/pkl/town_crime_rates.pkl"
done_states = []

In [44]:
# Loop through each town in each state to scrape crime data
for state in states:
    if state not in done_states:
        towns = list(set(inc.loc[inc['state'] == state, 'town']))
        for town in towns:
            #town_crime_rates[(state, town)] = get_crime_rate(state, town)            
            inc.loc[(inc['state'] == state) & (inc['town'] == town), 'crime_rate'] = get_crime_rate(state, town)
            pd.to_pickle(inc, '/Users/elijahcurme/Desktop/GA/capstone/pkl/inc.pkl')
        done_states.append(state)
    print(f"{state} finished")

Alaska finished
Maine finished
New York finished
Alabama finished
Maryland finished
Oklahoma finished
North Dakota finished
New Mexico finished
Texas finished
Utah finished
Kentucky finished
North Carolina finished
Virginia finished
Tennessee finished
Delaware finished
Idaho finished
Wyoming finished
Louisiana finished
Nevada finished
Indiana finished
Montana finished
Washington finished
California finished
Arizona finished
Hawaii finished
Massachusetts finished
Nebraska finished
Kansas finished
Vermont finished
Michigan finished
Mississippi finished
Colorado finished
South Carolina finished
Wisconsin finished
Rhode Island finished
Connecticut finished
Pennsylvania finished
Florida finished
Arkansas finished
New Jersey finished
Iowa finished
Illinois finished
Ohio finished
Oregon finished
Missouri finished
Georgia finished
South Dakota finished
Minnesota finished
District of Columbia finished
West Virginia finished
New Hampshire finished


In [46]:
inc.to_csv('./data/inc.csv')

In [43]:
#test_inc = pd.read_pickle('/Users/elijahcurme/Desktop/GA/capstone/pkl/inc.pkl')

In [None]:
# with open(output_name, "rb") as f:
#     town_crime_rates = pickle.load(f)

In [None]:
# state_towns = town_crime_rates.keys()
# for state, town in state_towns:
#     inc.loc[(inc['state'] == state) & (inc['town'] == town), 'crime_rate'] = town_crime_rates[(state, town)]

```python
new_dict = {}
for state in states:
    new_dict[state] = {}

for state, town in town_crime_rates.keys():
    new_dict[state][town] = town_crime_rates[(state, town)]
```

```python
done_states = list(set([state, _ for state, town in town_crime_rates.keys()]))
```

### EDA

In [48]:
inc.isnull().sum()

state                0
town                 0
poverty              0
hs_completion        0
townstate            0
ZCTA                 0
n_households         0
med_income           0
population           0
density              0
city                 0
state_id             0
state_name           0
lat                  0
lng                  0
crime_rate       14409
dtype: int64

In [56]:
inc.dropna(inplace = True)

In [58]:
inc.shape

(6758, 16)

In [57]:
inc.head()

Unnamed: 0,state,town,poverty,hs_completion,townstate,ZCTA,n_households,med_income,population,density,city,state_id,state_name,lat,lng,crime_rate
0,Alabama,Abbeville,20.7,79.1,"Abbeville, Alabama",36310,2555,40186,6055,12.2,Abbeville,AL,Alabama,31.60296,-85.2163,62.0
1,Alabama,Adamsville,16.0,83.5,"Adamsville, Alabama",35005,2792,50271,7528,89.3,Adamsville,AL,Alabama,33.59515,-87.00089,269.0
2,Alabama,Addison,34.2,85.1,"Addison, Alabama",35540,1075,45074,2436,12.6,Addison,AL,Alabama,34.23388,-87.18817,15.0
4,Alabama,Alabaster,11.1,88.8,"Alabaster, Alabama",35007,8628,69691,26328,270.8,Alabaster,AL,Alabama,33.21591,-86.79717,585.0
5,Alabama,Alabaster,11.1,88.8,"Alabaster, Alabama",35114,3022,80948,8624,223.3,Alabaster,AL,Alabama,33.22306,-86.87245,585.0


In [140]:
inc['crime_rate'].value_counts()

0.0      255
1.0      103
3.0       75
12.0      72
5.0       65
        ... 
911.0      1
935.0      1
458.0      1
776.0      1
795.0      1
Name: crime_rate, Length: 938, dtype: int64

In [128]:
inc['crime_rate_pc'] = inc['crime_rate'] / inc['population']

In [60]:
inc.head()

Unnamed: 0,state,town,poverty,hs_completion,townstate,ZCTA,n_households,med_income,population,density,city,state_id,state_name,lat,lng,crime_rate,crime_rate_pc
0,Alabama,Abbeville,20.7,79.1,"Abbeville, Alabama",36310,2555,40186,6055,12.2,Abbeville,AL,Alabama,31.60296,-85.2163,62.0,0.010239
1,Alabama,Adamsville,16.0,83.5,"Adamsville, Alabama",35005,2792,50271,7528,89.3,Adamsville,AL,Alabama,33.59515,-87.00089,269.0,0.035733
2,Alabama,Addison,34.2,85.1,"Addison, Alabama",35540,1075,45074,2436,12.6,Addison,AL,Alabama,34.23388,-87.18817,15.0,0.006158
4,Alabama,Alabaster,11.1,88.8,"Alabaster, Alabama",35007,8628,69691,26328,270.8,Alabaster,AL,Alabama,33.21591,-86.79717,585.0,0.02222
5,Alabama,Alabaster,11.1,88.8,"Alabaster, Alabama",35114,3022,80948,8624,223.3,Alabaster,AL,Alabama,33.22306,-86.87245,585.0,0.067834


In [113]:
# 15 rows have 0 for population which causes infinity crime rate per capita
inc['population'].value_counts()

0        15
5792      4
2274      4
6572      4
3091      4
         ..
1142      1
10093     1
15481     1
9338      1
10600     1
Name: population, Length: 5891, dtype: int64

In [119]:
# Fill the 15 with the median
inc.loc[inc['population'] == 0, 'population'] = inc['population'].median()

### Modeling

In [129]:
features = [
    'med_income',
    'poverty',
    'hs_completion',
    'density'
]
X = inc[features]
y = inc['crime_rate_pc']

In [130]:
ss = StandardScaler()
X_scaled = ss.fit_transform(X)

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state = 42)

In [137]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.0016794088038287747, 0.013729502142857175)

### Alakazam

In [105]:
url = 'https://www.cityrating.com/crime-statistics/new-york/adams-village.html'
res = requests.get(url)

In [106]:
soup = BeautifulSoup(res.content, 'lxml')

In [108]:
soup.find_all('td')

[<td class="key">Aggravated Assault</td>,
 <td class="value">0</td>,
 <td class="key">Arson</td>,
 <td class="value">0</td>,
 <td class="key">Burglary</td>,
 <td class="value">0</td>,
 <td class="key">Larceny and Theft</td>,
 <td class="value">3</td>,
 <td class="key">Motor Vehicle Theft</td>,
 <td class="value">1</td>,
 <td class="key">Murder and Manslaughter</td>,
 <td class="value">0</td>,
 <td class="key">Rape</td>,
 <td class="value">N/A</td>,
 <td class="key">Robbery</td>,
 <td class="value">0</td>,
 <td class="key"><span class="total">Crime Rate</span> (Total Incidents)</td>,
 <td class="value">4</td>,
 <td class="key"><span class="total">Property Crime</span></td>,
 <td class="value">4</td>,
 <td class="key"><span class="total">Violent Crime</span></td>,
 <td class="value">0</td>,
 <td class="key">Aggravated Assault</td>,
 <td class="value">0</td>,
 <td class="key">Arson</td>,
 <td class="value">0</td>,
 <td class="key">Burglary</td>,
 <td class="value">1</td>,
 <td class="key"

In [22]:
# First half of td's are actual 2016 stats. Second half are 2020 projected.
half = int(len(soup.find_all('td'))/2)