In [46]:
import psycopg2
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline

In [48]:
cd oakland-crime-housing/

/Users/danaezoule/Documents/oakland-crime-housing


In [25]:
execfile("database.py")

Creating Database
Connecting to Database
Creating Crime Table
Loading Shapes
Creating Crime Geom Table
Creating Feature Table


In [49]:
conn = psycopg2.connect("dbname=oakland user=danaezoule")
cur = conn.cursor()
cur.execute("SELECT * FROM area_features;")
df = pd.DataFrame(cur.fetchall())

In [50]:
cdf = df.copy()

In [44]:
'''
cdf.columns = ['Idx', 'OPD_RD', 'Date', 'Time', 'Lat', 'Lng', 'year', 'year_month', 'quality', 'nonviolent', 'car_break_in', 'car_theft', 'violent', 'geom', 'block_group']
cdf['quarter'] = pd.DatetimeIndex(cdf.Date).quarter
cdf['day_of_week'] = pd.DatetimeIndex(cdf.Date).dayofweek
cdf['week'] = pd.DatetimeIndex(cdf.Date).week
cdf['month'] = pd.DatetimeIndex(cdf.Date).month
cdf['day'] = pd.DatetimeIndex(cdf.Date).day
cdf['day_of_year'] = pd.DatetimeIndex(cdf.Date).dayofyear
cdf['week_of_year'] = pd.DatetimeIndex(cdf.Date).weekofyear
cdf['hour'] = [i.hour for i in cdf.Time]
#combine date and time column so I can plot over 'em
cdf['Datetime'] = pd.to_datetime(cdf['Date'].astype(str) + ' ' + cdf['Time'].astype(str))
'''




In [52]:
cdf.columns = ['Group_Block', 'Quality', 'Nonviolent', 'Vehicle_Break_In', 'Vehicle_Theft', 'Violent']

In [56]:
from sklearn.cluster import KMeans
km = KMeans()

In [36]:
clus = km.fit_predict(cdf[['Quality', 'Nonviolent', 'Vehicle_Break_In', 'Vehicle_Theft', 'Violent']])

In [37]:
clus

array([3, 3, 6, 3, 5, 3, 6, 3, 6, 3, 1, 3, 7, 3, 3, 6, 3, 6, 6, 3, 3, 1, 1,
       4, 6, 3, 5, 1, 1, 4, 6, 6, 1, 6, 6, 6, 3, 1, 6, 3, 3, 1, 6, 3, 6, 5,
       6, 6, 6, 3, 3, 1, 6, 3, 1, 0, 6, 1, 1, 1, 0, 1, 1, 3, 6, 6, 1, 1, 6,
       3, 6, 3, 5, 6, 3, 3, 1, 1, 6, 6, 3, 1, 5, 3, 1, 3, 6, 3, 3, 3, 5, 3,
       6, 1, 3, 6, 6, 6, 3, 3, 3, 2, 3, 1, 1, 3, 3, 3, 6, 3, 3, 3, 6, 6, 3,
       3, 5, 3, 3, 1, 1, 1, 6, 6, 3, 3, 1, 6, 6, 3, 1, 3, 3, 3, 6, 3, 3, 3,
       3, 3, 6, 1, 1, 1, 3, 6, 3, 3, 1, 5, 3, 3, 0, 3, 6, 1, 3, 6, 6, 1, 3,
       6, 6, 6, 6, 3, 6, 3, 3, 5, 6, 3, 5, 6, 3, 1, 2, 1, 5, 3, 1, 1, 0, 6,
       3, 1, 1, 6, 6, 1, 6, 1, 1, 6, 6, 1, 1, 3, 3, 3, 6, 1, 3, 3, 1, 3, 1,
       1, 6, 1, 3, 1, 1, 3, 1, 1, 6, 1, 1, 6, 3, 0, 1, 3, 3, 3, 5, 5, 3, 1,
       4, 3, 3, 5, 1, 6, 1, 3, 3, 1, 6, 3, 1, 3, 0, 6, 4, 1, 6, 3, 3, 3, 6,
       1, 5, 3, 6, 3, 3, 0, 0, 1, 3, 3, 3, 1, 6, 3, 3, 1, 1, 3, 1, 1, 3, 3,
       1, 6, 0, 4, 3, 6, 0, 3, 3, 6, 6, 3, 3, 7, 1, 3, 1, 3, 0, 0, 3, 3, 5,
       5, 6,

In [None]:
'''Feature Engineering Brainstorming:
Normalization:
    Normalize by population: none, assume census divisions cover this
    Normalize by geography (square footage or meterage)
    Normalize by total crime count

Geographical:
    Census tracts, group blocks, or blocks

Time Group By:
    Month, quarter, year

Time features:
    Count for weekday or weekend
    Count for time of day (morning, afternoon, eve, night)
        Split by data. First hypothesis:
        Morning: 6am-noon
        Afternoon: noon-6pm
        Eve: 6pm-midnight
        Early: midnight-6am
    
Housing etc:
    Are Trulia neighborhoods census tracts? Can I get block group info from Trulia?
    Will the ACS be helpful? Can I get yearly or quarterly ACS information?
    
Time component:
    Create centroids from earliest data, map all points to same centroids
    Create new centroids for each year (with varied data) as below 
    
Data that varies from year to year:
    If I use it, can I detect similar centroids between years?
    Should I instead ignore this completely, despite losing Lovely connection?
'''

In [57]:
# Splitting data by year, clustering by first year, predicting clusters for following years.
cdf.columns = ['Group_Block', 'Year','Quality', 'Nonviolent', 'Vehicle_Break_In', 'Vehicle_Theft', 'Violent']

In [58]:
cdf.head()

Unnamed: 0,Group_Block,Year,Quality,Nonviolent,Vehicle_Break_In,Vehicle_Theft,Violent
0,17604,2010,8,8,5,8,16
1,12609,2009,10,21,3,12,15
2,3583,2009,17,49,17,23,35
3,9707,2015,1,0,1,2,7
4,2023,2010,11,72,26,16,14


In [70]:
from sklearn.cluster import KMeans
km = KMeans()
columns = ['Quality', 'Nonviolent', 'Vehicle_Break_In', 'Vehicle_Theft', 'Violent']

In [71]:
clus = km.fit_predict(cdf[cdf.Year == 2009][columns])

In [72]:
clus10 = km.predict(cdf[cdf.Year == 2010][columns])

In [78]:
clus11 = km.predict(cdf[cdf.Year == 2011][columns])