# Capstone: Exploratory Prediction Modeling

In [2]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# import project utils
sys.path.append('../src')

import data_utils
from data_utils import Config

import graph_utils


## The Data: San Francisco Police Department Incident Reports

### Read the Data

In [5]:
# Show the random_state that Config exposes for consistent use across the project
seed_banner = f'Project-wide random_state: {Config.RANDOM_STATE}'
print(seed_banner)

Project-wide random_state: 42


In [6]:
# Pick the sampled CSV to work from; SAMPLE_PCT is forwarded as the helper's `pct` argument
SAMPLE_PCT = 10

sample_file = data_utils.select_sample_csv_file(pct=SAMPLE_PCT)
print(f'Selected sample file: {sample_file}')

Selected sample file: ../data/incidents_clean_10_pct.csv


In [7]:
# Load the selected CSV; the helper's result is unpacked into a raw and a clean frame
loaded_frames = data_utils.get_clean_data_from_csv(sample_file)
current_raw_df, current_clean_df = loaded_frames

Reading file: ../data/incidents_clean_10_pct.csv ... Done: 88,717 rows, 37 columns
... Converting datetime to timeseries ... Done
... Setting index to datetime ... Done
Done


In [8]:
# Pre-process a copy so that current_raw_df itself is never handed to the helper.
# NOTE(review): this starts from current_raw_df rather than current_clean_df — confirm that is intended.
raw_snapshot = current_raw_df.copy()
data = data_utils.preprocess_data(raw_snapshot)

Pre-processing ... 
... Dropping unwanted columns ... 
... preprocess_drop_cols: Column Unnamed: 0 dropped
... preprocess_drop_cols: Column esncag_-_boundary_file dropped
... preprocess_drop_cols: Column central_market/tenderloin_boundary_polygon_-_updated dropped
... preprocess_drop_cols: Column civic_center_harm_reduction_project_boundary dropped
... preprocess_drop_cols: Column hsoc_zones_as_of_2018-06-05 dropped
... preprocess_drop_cols: Column invest_in_neighborhoods_(iin)_areas dropped
... preprocess_drop_cols: Column report_type_code dropped
... preprocess_drop_cols: Column report_type_description dropped
... preprocess_drop_cols: Column filed_online dropped
... preprocess_drop_cols: Column intersection dropped
... preprocess_drop_cols: Column cnn dropped
... preprocess_drop_cols: Column point dropped
... preprocess_drop_cols: Column supervisor_district dropped
... preprocess_drop_cols: Column supervisor_district_2012 dropped
... preprocess_drop_cols: Column current_supervisor_d

In [9]:
# Inspect column names, dtypes and non-null counts of the pre-processed frame
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 82242 entries, 2022-02-10 07:59:00 to 2021-07-09 00:22:00
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             82242 non-null  object 
 1   time             82242 non-null  object 
 2   year             82242 non-null  int64  
 3   day_of_week      82242 non-null  object 
 4   category         82242 non-null  object 
 5   resolution       82242 non-null  object 
 6   police_district  82242 non-null  object 
 7   neighborhood     82242 non-null  object 
 8   latitude         82242 non-null  float64
 9   longitude        82242 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 6.9+ MB


## Summary of EDA

After cleaning the data and performing basic EDA, we have established the following:

1. Target variable `category`
   * Evenly spread across time
   * Incident counts are extremely imbalanced across categories: Larceny (29.02%) far outweighs the other top-10 categories, each of which accounts for only a single-digit percentage
2. Features impacting `category`
   * Affected by incident time and date components: date, time, day of week, month, year, etc.
   * Affected by police district
   * Affected by latitude and longitude (TODO: need visualization)
3. We artificially removed nulls (TODO: will come back to impute data later)

## Feature Engineering

In [13]:
# Peek at the first two rows before any feature engineering
data.head(n=2)

Unnamed: 0_level_0,date,time,year,day_of_week,category,resolution,police_district,neighborhood,latitude,longitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-02-10 07:59:00,2022/02/10,07:59,2022,Thursday,Recovered Vehicle,Open or Active,Ingleside,West of Twin Peaks,37.728975,-122.468077
2022-11-17 23:30:00,2022/11/17,23:30,2022,Thursday,Missing Person,Open or Active,Mission,Mission,37.762579,-122.421662


In [14]:
# Baseline schema (column names, dtypes, non-null counts) before feature engineering
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 82242 entries, 2022-02-10 07:59:00 to 2021-07-09 00:22:00
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             82242 non-null  object 
 1   time             82242 non-null  object 
 2   year             82242 non-null  int64  
 3   day_of_week      82242 non-null  object 
 4   category         82242 non-null  object 
 5   resolution       82242 non-null  object 
 6   police_district  82242 non-null  object 
 7   neighborhood     82242 non-null  object 
 8   latitude         82242 non-null  float64
 9   longitude        82242 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 6.9+ MB


### Encoding: Time-based columns

Let's unpack the datetime index into the components that are still missing (`year` is already a column), so there is less to encode:

In [17]:
# Derive the time/calendar parts from the DatetimeIndex using its vectorised
# accessors instead of calling a Python lambda once per row.
# The int64 cast keeps the column dtype identical to what the lambda version
# produced, on pandas versions whose accessors return int32.
timestamps = data.index
data['hour'] = timestamps.hour.astype('int64')
data['minute'] = timestamps.minute.astype('int64')
data['day'] = timestamps.day.astype('int64')
data['month'] = timestamps.month.astype('int64')

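Now let's encode `day_of_week` to numeric values. Note that `LabelEncoder` assigns codes in sorted (alphabetical) order of the labels, not in calendar order (e.g. Thursday becomes 4):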

In [19]:
# Encode the weekday names as integers in a new `dow` column.
# LabelEncoder comes from the notebook's import cell. A single fit_transform
# call learns the classes and produces the codes in one pass; the result is
# the same as fitting on the unique values and then transforming.
# The fitted encoder is kept as enc_dow so codes can be mapped back to names.
enc_dow = LabelEncoder()
data['dow'] = enc_dow.fit_transform(data['day_of_week'])

Let's mark the redundant columns to be dropped after feature engineering:

In [21]:
# Text columns whose information now lives in numeric columns; they are
# collected here and removed in one go once feature engineering is finished.
drop_encoded_cols = [
    'date',         # year / month / day
    'time',         # hour / minute
    'day_of_week',  # dow
]

### Encoding: Resolution

We will also drop the `resolution` column, since it is not useful for predicting the crime category:

In [24]:
# Count how many incidents fall under each resolution value
data['resolution'].value_counts()

resolution
Open or Active          65847
Cite or Arrest Adult    16395
Name: count, dtype: int64

In [25]:
# Queue `resolution` for removal. The membership check keeps a re-run of
# this cell from adding the same name a second time.
if 'resolution' not in drop_encoded_cols:
    drop_encoded_cols.append('resolution')

### Encoding: Category

In [27]:
# Encode the target labels as integers, overwriting `category` in place.
# The dtype guard makes the cell safe to re-run: without it, a second run
# would refit the encoder on the already-encoded integers and replace the
# original label names in enc_cat.classes_ with the codes themselves,
# so the codes could no longer be mapped back to category names.
if not pd.api.types.is_integer_dtype(data['category']):
    enc_cat = LabelEncoder()
    data['category'] = enc_cat.fit_transform(data['category'])

### Encoding: Police District

In [29]:
# Integer codes for the police district go into a new `pd` column; the
# original text column is left untouched by this cell.
district_labels = data['police_district']
enc_pd = LabelEncoder().fit(district_labels.unique())
data['pd'] = enc_pd.transform(district_labels)

### Encoding: Neighborhood

In [31]:
# Encode neighborhood names as integers, overwriting `neighborhood` in place.
# The dtype guard makes the cell safe to re-run: without it, a second run
# would refit the encoder on the already-encoded integers and lose the
# neighborhood names stored in enc_hood.classes_.
if not pd.api.types.is_integer_dtype(data['neighborhood']):
    enc_hood = LabelEncoder()
    data['neighborhood'] = enc_hood.fit_transform(data['neighborhood'])

### Dropping Redundant Columns

We can now drop the encoded columns:

In [34]:
# `police_district` is now represented by the `pd` column, so queue it too
# (checked first so that a re-run does not add it twice).
if 'police_district' not in drop_encoded_cols:
    drop_encoded_cols.append('police_district')

print(f'Dropping encoded columns: {drop_encoded_cols}')
# errors='ignore' skips columns that are already gone, so re-running this
# cell no longer raises KeyError. Rebinding `data` replaces inplace=True.
data = data.drop(columns=drop_encoded_cols, errors='ignore')

Dropping encoded columns: ['date', 'time', 'day_of_week', 'resolution', 'police_district']


In [35]:
# Peek at the first two rows now that the text columns have been dropped
data.head(n=2)

Unnamed: 0_level_0,year,category,neighborhood,latitude,longitude,hour,minute,day,month,dow,pd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-02-10 07:59:00,2022,33,39,37.728975,-122.468077,7,59,10,2,4,2
2022-11-17 23:30:00,2022,23,18,37.762579,-122.421662,23,30,17,11,4,3


In [36]:
# Check the column names, dtypes and non-null counts after encoding and dropping
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 82242 entries, 2022-02-10 07:59:00 to 2021-07-09 00:22:00
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          82242 non-null  int64  
 1   category      82242 non-null  int64  
 2   neighborhood  82242 non-null  int64  
 3   latitude      82242 non-null  float64
 4   longitude     82242 non-null  float64
 5   hour          82242 non-null  int64  
 6   minute        82242 non-null  int64  
 7   day           82242 non-null  int64  
 8   month         82242 non-null  int64  
 9   dow           82242 non-null  int64  
 10  pd            82242 non-null  int64  
dtypes: float64(2), int64(9)
memory usage: 7.5 MB
