# Data Cleaning
Import the data and clean for EDA. Drop columns that don't relate to our analysis, drop rows with unusable data or that are not in our time frame (2015-2019).

In [1]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
### cs109default ### 
import random
random.seed(112358)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import scale
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import GridSearchCV
from random import randint 

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import tree
%matplotlib inline


import statsmodels.api as sm
from statsmodels.api import OLS


import seaborn as sns
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

from sklearn.utils import shuffle

In [3]:
### cs109default ### 
# TensorFlow and tf.keras
import tensorflow as tf

# Show the installed TensorFlow version so a reader can confirm the environment
# matches the one this notebook was run with (recorded output: 2.0.0).
print(tf.__version__)  # You should see a 2.0.0 here!

2.0.0


__Read in Files__ from csv into pandas dataframes.

In [4]:
# Property assessment exports, one file per fiscal year (newest first).
(property_2019_full,
 property_2018_full,
 property_2017_full,
 property_2016_full,
 property_2015_full) = map(
    pd.read_csv,
    [
        'data/property-assessment-fy2019.csv',
        'data/property-assessment-fy2018.csv',
        'data/property-assessment-fy2017.csv',
        'data/property-assessment-fy2016.csv',
        'data/property-assessment-fy2015.csv',
    ],
)

# Streetlight locations and crime incident reports.
streetlights_full, crime_incidents_full = map(
    pd.read_csv,
    [
        'data/streetlight_locations.csv',
        'data/crime_incident_reports.csv',
    ],
)

__Read in 311__ separately, because it takes much longer to load; skip this cell if the 311 data is not needed.

In [5]:
#incident_reports_full = pd.read_csv('data/311.csv')

__Drop Columns__ After careful inspection of the data contained in each dataset, drop the columns that will not help in our modeling. Columns were dropped if they had no effect on the outcome of interest (such as indices or the number of fireplaces in a property) or if the information in them was a duplicate (such as location when we were already given longitude and latitude).

1. from `streetlights` drop everything but `Long` and `Lat`
2. from `property_assessment` we only care where the property is and what it's valued at so drop everything that doesn't relate
3. from `crime_incidents` drop `Location` and the index, since the location information was duplicating `Long` and `Lat` and the index was not useful for analysis

In [6]:
# Remove the geometry, type and object-id columns; the intent is that only the
# Lat / Long coordinates remain.
streetlights = streetlights_full.drop(columns=['the_geom', 'TYPE', 'OBJECTID'])

In [7]:
# Columns to keep from every property assessment file: the address parts and
# the assessed values / tax.
property_cols = [
    'ST_NUM', 'ST_NAME', 'ST_NAME_SUF', 'UNIT_NUM', 'ZIPCODE',
    'AV_LAND', 'AV_BLDG', 'AV_TOTAL', 'GROSS_TAX',
]

# Keep only the columns of each yearly frame that appear in property_cols
# (a column missing from a given year is simply skipped).
# The trailing "_" in the names marks these frames as not fully cleaned yet.
(property_2019_,
 property_2018_,
 property_2017_,
 property_2016_,
 property_2015_) = (
    yearly_full[yearly_full.columns[yearly_full.columns.isin(property_cols)]]
    for yearly_full in (
        property_2019_full,
        property_2018_full,
        property_2017_full,
        property_2016_full,
        property_2015_full,
    )
)

In [8]:
# Columns removed from the crime incident reports.
# OFFENSE_CODE is just a numeric code for the offense description, so it is redundant.
crime_cols_drop = [
    'INCIDENT_NUMBER',
    'UCR_PART',
    'Location',
    'OFFENSE_CODE',
    'OCCURRED_ON_DATE',
]

# Every other column is kept (trailing "_" = not fully cleaned yet).
crime_incidents_ = crime_incidents_full.drop(columns=crime_cols_drop)


In [9]:
# Convert the 'SHOOTING' column into an integer flag: missing values become 0
# and 'Y' becomes 1.
#
# The result is assigned back to the column rather than calling fillna/replace
# with inplace=True on crime_incidents_['SHOOTING']. Mutating a selected column
# in place is chained assignment: pandas warns about it, and with Copy-on-Write
# enabled it no longer updates the dataframe at all.
crime_incidents_['SHOOTING'] = (
    crime_incidents_['SHOOTING']
    .fillna(0)
    .replace('Y', '1')
    .astype(int)
)


__Drop Rows__ that would not be usable in the foreseeable future. This includes rows that have no predictor data or no response variable data, in the form of 'nan', 'none', or in some cases zeros. Careful inspection of each dataset led us to drop the following:
1. the `streetlights` dataset had no rows with immediately visible issues
2. from `property_assessment` we dropped all rows that had 0 in all four of the price variables, no issues with location were immediately visible
3. from `crime_incidents` we dropped if `Lat` and `Long` did not have usable values because it would be hard to get that information just from the street name and it is vital to our analysis

In [10]:
def property_droprows(df):
    """Return the rows of ``df`` that carry at least one non-zero valuation.

    A row is removed only when AV_LAND, AV_BLDG, AV_TOTAL and GROSS_TAX are
    all equal to 0. The input dataframe itself is left unchanged; the result
    keeps the original index labels and columns.
    """
    value_columns = ['AV_LAND', 'AV_BLDG', 'AV_TOTAL', 'GROSS_TAX']
    # True for a row as soon as any of the four value columns differs from 0.
    has_any_value = df[value_columns].ne(0).any(axis=1)
    return df[has_any_value]

In [11]:
# Apply the all-zero-valuation filter to every fiscal year.
(property_2019,
 property_2018,
 property_2017,
 property_2016,
 property_2015) = map(
    property_droprows,
    (property_2019_, property_2018_, property_2017_, property_2016_, property_2015_),
)

# Linking Data with Streetlights

### Deal with NA values in the crime_incidents report

In [12]:
# Deal with NA values for Lat/Long in the crime reports.

# Rows without coordinates cannot be placed relative to a streetlight, so drop them.
crime_incidents = crime_incidents_.dropna(subset=['Lat', 'Long'])

# Stricter variant: drop rows that have a missing value in any column.
# The explicit copy lets later cells add columns to this frame without pandas
# treating it as a view of crime_incidents_ (SettingWithCopyWarning).
crime_incidents_nonull = crime_incidents_.dropna(how='any', axis=0).copy()

# Set any remaining missing SHOOTING flag to 0. This must go through .loc:
# the chained form df[mask]['SHOOTING'] = 0 writes into a temporary copy and
# leaves crime_incidents_ untouched.
crime_incidents_.loc[crime_incidents_['SHOOTING'].isna(), 'SHOOTING'] = 0



### Associating Data

Describe how you will associate the location data in Street Lights with the location data in the Crime Incident Reports Dataset

In [14]:
# Pair up the streetlight coordinates and turn each pair into a
# {'Lat': ..., 'Long': ...} record, in the same order as the dataframe rows.
x = zip(streetlights.Lat.values, streetlights.Long.values)
itemDict = [
    {'Lat': light_lat, 'Long': light_long}
    for light_lat, light_long in x
]

In [34]:
from math import cos, asin, sqrt

def distance(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance between two latitude/longitude points.

    All four arguments are in degrees. The result is scaled by 12742, i.e.
    twice 6371 (the Earth's mean radius in kilometres), so it reads as
    kilometres.
    """
    deg_to_rad = 0.017453292519943295  # pi / 180
    # Latitude contribution: (1 - cos(dlat)) / 2
    lat_term = 0.5 - cos((lat2 - lat1) * deg_to_rad) / 2
    # Longitude contribution, shrunk by the cosine of both latitudes.
    lon_term = (
        cos(lat1 * deg_to_rad) * cos(lat2 * deg_to_rad)
        * (1 - cos((lon2 - lon1) * deg_to_rad)) / 2
    )
    haversine = lat_term + lon_term
    return 12742 * asin(sqrt(haversine))

def closest(data, v):
    """Return the distance from ``v`` to the nearest entry in ``data``.

    Parameters
    ----------
    data : iterable of mappings
        Candidate points; each entry is indexed with 'Lat' and 'Long'.
    v : mapping
        The reference point, also indexed with 'Lat' and 'Long' (for example
        a dataframe row).

    Returns
    -------
    The smallest value of ``distance()`` between ``v`` and any entry.

    Raises
    ------
    ValueError
        If ``data`` is empty (raised by ``min``).
    """
    # Read the reference coordinates once instead of once per candidate, and
    # take the minimum of the distances directly so the winning distance is
    # not computed a second time after the search.
    v_lat, v_long = v['Lat'], v['Long']
    return min(
        distance(v_lat, v_long, candidate['Lat'], candidate['Long'])
        for candidate in data
    )

def closest_mod2(data, v):
    """Return the entry of ``data`` nearest to the point stored at ``v[1]``.

    ``v`` is indexed as ``v[1]['Lat']`` / ``v[1]['Long']``, which fits the
    (index, row) pairs produced by ``DataFrame.iterrows()``. Each entry of
    ``data`` is indexed with 'Lat' and 'Long'. Unlike ``closest``, the entry
    itself is returned rather than its distance.
    """
    def distance_to_row(entry):
        return distance(v[1]['Lat'], v[1]['Long'], entry['Lat'], entry['Long'])

    return min(data, key=distance_to_row)

def closest_mod(data, a, b):
    """Return the entry of ``data`` nearest to latitude ``a``, longitude ``b``.

    Each entry of ``data`` is indexed with 'Lat' and 'Long'; the entry with
    the smallest ``distance()`` to (a, b) is returned.
    """
    def distance_to_point(entry):
        return distance(a, b, entry['Lat'], entry['Long'])

    return min(data, key=distance_to_point)



In [35]:
# Spot check on a single row: distance from the incident at position 1 to its
# nearest streetlight.
closest(itemDict, crime_incidents_nonull.iloc[1])

0.04921646179397781

In [None]:
# Create the streetlight_distance column with a float placeholder; a later cell
# overwrites it with the computed distances.
crime_incidents_nonull['streetlight_distance'] = 0.0

In [None]:
# Distance from every crime incident to its nearest streetlight, in row order.
# closest() scans the full streetlight list for each incident, so this cell is
# slow on large inputs.
x = [
    closest(itemDict, crime_incidents_nonull.iloc[row_position])
    for row_position in range(crime_incidents_nonull.shape[0])
]
                                                                                

In [None]:
# Store the per-incident nearest-streetlight distances (the list x built in the
# previous cell, one value per row in row order) in the dataframe.
crime_incidents_nonull['streetlight_distance'] = x

In [None]:
# Round the streetlight coordinates to six decimal places so they can be
# compared with the rounded crime coordinates.
from collections import Counter

# Keep these as floats. They are matched with .isin() against the float
# Lat / Long columns of the crime data; formatting them as '%.6f' strings would
# make every comparison False, because a float never equals a string.
# Rounding with the same pandas .round(6) on both sides keeps the values
# comparable.
lat = streetlights['Lat'].round(6).tolist()
long = streetlights['Long'].round(6).tolist()

# NOTE(review): despite the "nonull" in the name, this is built from
# crime_incidents (rows with Lat/Long present), not crime_incidents_nonull.
crime_incidents_nonull_streetlamps = crime_incidents.round(6)


In [None]:
# JOIN WITH STREET LAMP DATA
# Flag the crime rows whose rounded latitude / longitude occur in the
# streetlight coordinate lists.
# NOTE(review): Lat and Long are tested independently, so a row is flagged when
# each value matches *some* streetlight, not necessarily the same one — confirm
# this is intended.
crime_incidents_nonull_streetlamps['LatSame'] = (
    crime_incidents_nonull_streetlamps['Lat'].isin(lat)
)
crime_incidents_nonull_streetlamps['LongSame'] = (
    crime_incidents_nonull_streetlamps['Long'].isin(long)
)

# Index labels of the rows where both flags are set.
crime_incidents_nonull_streetlamps_idx = crime_incidents_nonull_streetlamps.index[
    crime_incidents_nonull_streetlamps['LatSame']
    & crime_incidents_nonull_streetlamps['LongSame']
].tolist()

# Carry the flag back to the main crime frame by index label and show the split.
crime_incidents_['in_streetlight'] = crime_incidents_.index.isin(
    crime_incidents_nonull_streetlamps_idx
)
crime_incidents_['in_streetlight'].value_counts()


# Initial Analysis of How Many Crimes are Shooting

In [None]:
# Split the incidents by the in_streetlight flag. in_street / not_in_street were
# used here without being defined anywhere in the notebook, which fails on a
# fresh kernel.
# NOTE(review): assumes the two groups are the True / False halves of
# 'in_streetlight', restricted to rows that have coordinates — confirm against
# the original analysis.
located_incidents = crime_incidents_.dropna(subset=['Lat', 'Long'])
in_street = located_incidents[located_incidents['in_streetlight']]
not_in_street = located_incidents[~located_incidents['in_streetlight']]

# Percentage of incidents in each group that are shootings. SHOOTING is a 0/1
# flag, so its mean is the share of shootings among all incidents in the group.
# This divides by the group total (not by the non-shooting count) and does not
# raise when a group contains no shootings.
print(not_in_street['SHOOTING'].value_counts())
notinstreet = not_in_street['SHOOTING'].mean() * 100
print(notinstreet)

print(in_street['SHOOTING'].value_counts())
instreet = in_street['SHOOTING'].mean() * 100
print(instreet)

# Group sizes.
print(len(in_street))
print(len(not_in_street))
