# Data Cleaning
Import the data and clean for EDA. Drop columns that don't relate to our analysis, drop rows with unusable data or that are not in our time frame (2015-2019).

In [9]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

__Read in Files__ from csv into pandas dataframes.

In [11]:
property_2019_full    = pd.read_csv('data/property-assessment-fy2019.csv')
property_2018_full    = pd.read_csv('data/property-assessment-fy2018.csv')
property_2017_full    = pd.read_csv('data/property-assessment-fy2017.csv')
property_2016_full    = pd.read_csv('data/property-assessment-fy2016.csv')
property_2015_full    = pd.read_csv('data/property-assessment-fy2015.csv')
streetlights_full     = pd.read_csv('data/streetlight_locations.csv')
crime_incidents_full  = pd.read_csv('data/crime_incident_reports.csv')

__Read in 311__ seperately because it takes longer so you don't have to run if not needed.

In [12]:
incident_reports_full = pd.read_csv('data/311.csv')

__Drop Columns__ after careful inspection of the data contained in each dataset, drop columns that will not help in our modeling. Columns were dropped if they had no effect on the outcome of interest (such as indeces or number of fireplaces in a property) or if the information in them was a duplicate (such as location if we were already given longitude and latitude).

1. from `streetlamps` drop everything but `Long` and `Lat`
2. from `property_assessment` we only care where the property is and what it's valued at so drop everything that doesn't relate
3. from `crime_incidents` drop `Location` and the index, since the location information was duplicationg `Long` and `Lat` and the index was not useful for analysis

In [13]:
# drop everything but lat and long
streetlights = streetlights_full.drop(['the_geom','TYPE','OBJECTID'],axis=1)

In [14]:
# list of columns to save for properties
property_cols = ['ST_NUM','ST_NAME','ST_NAME_SUF','UNIT_NUM','ZIPCODE',
                 'AV_LAND','AV_BLDG','AV_TOTAL','GROSS_TAX']

# drop all columns not in list (keep _ at end of name to show not fully clean yet)
property_2019_ = property_2019_full[property_2019_full.columns[property_2019_full.columns.isin(property_cols)]]
property_2018_ = property_2018_full[property_2018_full.columns[property_2018_full.columns.isin(property_cols)]]
property_2017_ = property_2017_full[property_2017_full.columns[property_2017_full.columns.isin(property_cols)]]
property_2016_ = property_2016_full[property_2016_full.columns[property_2016_full.columns.isin(property_cols)]]
property_2015_ = property_2015_full[property_2015_full.columns[property_2015_full.columns.isin(property_cols)]]

In [15]:
# list of columns to drop for crime incidents
crime_cols_drop = ['INCIDENT_NUMBER','UCR_PART','Location']

# drop columns and keep only descriptors of crime, date, and location
crime_incidents_ = crime_incidents_full.drop(crime_cols_drop,axis=1)

__Drop Rows__ that would not be usable in the forseeable future. This includes rows that have no predictor data, or no response variable data, in the form of 'nan' or 'none' or in some cases zeros. Careful inspection of each dataset led us to drop the following:
1. the `streetlights` dataset had no rows with immediately visible issues
2. from `property_assessment` we dropped all rows that had 0 in all four of the price variables, no issues with location were immediately visible
3. from `crime_incidents` we dropped if `Lat` and `Long` did not have usable values because it would be hard to get that information just from the street name and it is vital to our analysis

In [16]:
# drop row if all price values are 0
def property_droprows(df):
    df_new = df[(df.AV_LAND != 0)  | (df.AV_BLDG != 0) | (df.AV_TOTAL != 0) | (df.GROSS_TAX != 0)]
    return(df_new)

In [17]:
# drop property rows for all years
property_2019 = property_droprows(property_2019_)
property_2018 = property_droprows(property_2018_)
property_2017 = property_droprows(property_2017_)
property_2016 = property_droprows(property_2016_)
property_2015 = property_droprows(property_2015_)

In [18]:
# drop rows with nan long and lat 
crime_incidents = crime_incidents_.dropna(subset=['Lat','Long'])

In [20]:
# Split into NumPy arrays
X_train = data_train.iloc[:, data_train.columns != 'class'].values
y_train = data_train['class'].values
X_test = data_test.iloc[:, data_test.columns != 'class'].values
y_test = data_test['class'].values

streetlights.head()

Unnamed: 0,Lat,Long
0,42.353159,-71.076044
1,42.353348,-71.075382
2,42.353521,-71.074775
3,42.353556,-71.074404
4,42.353811,-71.073621


In [39]:
#Deal with NA Values for LAT/LONG in crime reports
import pandas as pd
crime_incidents_.isna().sum()

crime_incidents_nonull = crime_incidents_.dropna(how='any',axis=0) 
crime_incidents_nonull

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,STREET,Lat,Long
875,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,B2,295,Y,2019-09-26 02:38:00,2019,9,Thursday,2,DABNEY ST,42.324098,-71.084830
876,2662,Ballistics,BALLISTICS EVIDENCE/FOUND,B2,295,Y,2019-09-26 02:38:00,2019,9,Thursday,2,DABNEY ST,42.324098,-71.084830
888,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,C11,344,Y,2019-09-25 23:42:00,2019,9,Wednesday,23,WESTVILLE TER,42.300980,-71.068699
2784,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,B3,457,Y,2019-09-19 17:16:00,2019,9,Thursday,17,FOWLER ST,42.300068,-71.083369
3407,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,B2,326,Y,2019-09-17 22:35:00,2019,9,Tuesday,22,CRESTON ST,42.311955,-71.078719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425311,1501,Firearm Violations,"WEAPON - FIREARM - CARRYING / POSSESSING, ETC",B2,238,Y,2015-06-20 00:00:00,2015,6,Saturday,0,DUDLEY ST,42.317808,-71.066850
425506,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,B2,286,Y,2015-06-19 21:20:00,2015,6,Friday,21,KERR WAY,42.333584,-71.085543
426245,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,B2,326,Y,2015-06-16 20:41:00,2015,6,Tuesday,20,LAWRENCE AVE,42.311611,-71.076256
426246,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,B2,326,Y,2015-06-16 20:41:00,2015,6,Tuesday,20,LAWRENCE AVE,42.311611,-71.076256


In [37]:
crime_incidents_[crime_incidents_['Lat'].isnull()]['STREET'].value_counts()
#crime_incidents_[crime_incidents_['Lat'].isnull()]['OFFENSE_DESCRIPTION'].value_counts()

BLUE HILL AVE           1633
WASHINGTON ST           1568
HARRISON AVE             666
BOYLSTON ST              592
MASSACHUSETTS AVE        587
                        ... 
AGANNIS ARENA              1
ISLAND                     1
MARDEN AVE                 1
HOLIDAY ST                 1
PININSULA YATCH CLUB       1
Name: STREET, Length: 2737, dtype: int64