__Data cleaning notebook__

In [10]:
library(tidyverse)
library(geojsonio)

Registered S3 method overwritten by 'geojsonio':
  method         from 
  print.location dplyr


Attaching package: ‘geojsonio’


The following object is masked from ‘package:base’:

    pretty




In [6]:
# Read and wrangle the Boston crime data.
#
# Steps:
#   1. Read the raw CSV, decoding it as Latin-1.
#   2. Keep only the columns used downstream.
#   3. Drop rows whose DISTRICT is the empty string.
#   4. Turn DAY_OF_WEEK and HOUR into factors and translate police district
#      codes into neighbourhood names.
#
# `fileEncoding` (not `encoding`) is what makes read.csv() re-encode the file.
# `encoding` only *marks* strings and accepts "latin1"/"UTF-8"; the previous
# value 'latin-1' was not recognised and therefore silently ignored.
df <- read.csv("data/crime.csv", fileEncoding = "latin1") %>%
  select(
    "OFFENSE_CODE_GROUP", "DISTRICT", "YEAR", "MONTH", "DAY_OF_WEEK", "HOUR"
  ) %>%
  filter(DISTRICT != "") %>%
  mutate(
    # Levels are listed Sunday -> Monday (reverse weekday order) on purpose;
    # NOTE(review): presumably to control axis order in a later plot - confirm.
    DAY_OF_WEEK = factor(
      DAY_OF_WEEK,
      levels = c(
        "Sunday", "Saturday", "Friday", "Thursday",
        "Wednesday", "Tuesday", "Monday"
      )
    ),
    HOUR = factor(HOUR),
    # District codes not listed here become NA (case_when has no fallback).
    DISTRICT = case_when(
      DISTRICT == "A1" ~ "Downtown",
      DISTRICT == "A7" ~ "East Boston",
      DISTRICT == "A15" ~ "Charlestown",
      DISTRICT == "B2" ~ "Roxbury",
      DISTRICT == "B3" ~ "Mattapan",
      DISTRICT == "C6" ~ "South Boston",
      DISTRICT == "C11" ~ "Dorchester",
      DISTRICT == "D4" ~ "South End",
      DISTRICT == "D14" ~ "Brighton",
      DISTRICT == "E5" ~ "West Roxbury",
      DISTRICT == "E13" ~ "Jamaica Plain",
      DISTRICT == "E18" ~ "Hyde Park"
    )
  )

__Find the top 10 crime types by count__

In [4]:
# Rank the offense groups by number of recorded incidents, most frequent
# first, and show only the group names.
# Eleven rows are requested rather than ten; presumably this leaves room for
# the catch-all "Other" group, which the later top-10 filter leaves out.
df %>%
  group_by(OFFENSE_CODE_GROUP) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  select(OFFENSE_CODE_GROUP) %>%
  head(11)

OFFENSE_CODE_GROUP
<fct>
Motor Vehicle Accident Response
Larceny
Medical Assistance
Investigate Person
Other
Drug Violation
Simple Assault
Vandalism
Verbal Disputes
Towed


In [None]:
__Keep only the top 10 crime types (excluding the catch-all "Other" group)__

In [8]:
# Restrict the data to the ten offense groups of interest. Membership is
# tested with %in%, which keeps exactly the rows whose OFFENSE_CODE_GROUP
# equals one of the listed names.
df <- df %>%
  filter(
    OFFENSE_CODE_GROUP %in% c(
      "Motor Vehicle Accident Response",
      "Larceny",
      "Medical Assistance",
      "Investigate Person",
      "Drug Violation",
      "Simple Assault",
      "Vandalism",
      "Verbal Disputes",
      "Towed",
      "Investigate Property"
    )
  )

In [9]:
# Persist the cleaned crime data to disk as CSV.
readr::write_csv(df, "data/crime_clean.csv")

__Import and clean the GeoJSON file__

In [None]:
# Read the Boston neighbourhood boundaries as an sp object.
geodf <- geojson_read('data/Boston_Neighborhoods.geojson', what = "sp")

# Flatten the spatial object into a plain data frame.
# tidy() lives in broom, which library(tidyverse) installs but does not
# attach, so it is called through its namespace. Warnings from this single
# call are suppressed, as in the original code.
geodf_fortified <- suppressWarnings(broom::tidy(geodf))

# Replace polygon ids with the neighbourhood names used in the crime data.
# The ids are matched *exactly*: the earlier chain of str_replace() calls
# matched substrings, so e.g. "8" rewrote "18" to "1Roxbury" and "1" rewrote
# "10", "13", ... - leaving "West Roxbury" and "Hyde Park" never assigned.
# Ids without an entry below are kept unchanged.
geodf_fortified <- geodf_fortified %>%
  mutate(
    id = recode(
      as.character(id),
      "15" = "Downtown",
      "11" = "East Boston",
      "12" = "Charlestown",
      "8" = "Roxbury",
      "20" = "Mattapan",
      "23" = "South Boston",
      "21" = "Dorchester",
      "9" = "South End",
      "17" = "Brighton",
      "18" = "West Roxbury",
      "1" = "Jamaica Plain",
      "19" = "Hyde Park"
    )
  )

# Save the flattened, relabelled boundaries for downstream use.
write_csv(geodf_fortified, 'data/geo_fortified.csv')