# Capstone: Exploratory Prediction Modeling

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Import project utils.
# '../src' is a relative path, so it is resolved against the kernel's current
# working directory when the imports below run; presumably data_utils and
# graph_utils live in that directory (TODO confirm).
import sys
sys.path.append('../src')

# data_utils supplies the file-selection, loading and preprocessing helpers
# called in the cells below; Config carries the project-wide RANDOM_STATE.
import data_utils
from data_utils import Config

import graph_utils


## The Data: San Francisco Police Department Incident Reports

### Read the Data

In [5]:
# Consistent random_state for the project: echo Config.RANDOM_STATE so the
# value in effect for this run is recorded in the notebook output.
print(f'Project-wide random_state: {Config.RANDOM_STATE}')

Project-wide random_state: 42


In [6]:
# Which dataset to work from?
# Ask data_utils for the sample CSV that corresponds to pct=10 (presumably a
# 10% sample of the full incident data; confirm against
# select_sample_csv_file) and echo the returned path so the run records which
# file was used.

sample_file = data_utils.select_sample_csv_file(pct=10)
print(f'Selected sample file: {sample_file}')

Selected sample file: ../data/incidents_clean_10_pct.csv


In [7]:
# Load the selected sample CSV. The helper returns two values, unpacked here as
# the raw and the cleaned data; what distinguishes them is defined in
# data_utils.get_clean_data_from_csv.
current_raw_df, current_clean_df = data_utils.get_clean_data_from_csv(sample_file)

Reading file: ../data/incidents_clean_10_pct.csv ... Done: 88,717 rows, 38 columns
... Converting datetime and date to timeseries ... Done
... Setting index to datetime ... Done
Done


In [8]:
# Run the project's preprocessing on a copy, so that current_raw_df itself is
# not the object handed to preprocess_data.
# NOTE(review): this starts from current_raw_df rather than current_clean_df;
# confirm that is intended.
data = data_utils.preprocess_data(current_raw_df.copy())

Pre-processing ... 
... Dropping unwanted columns ... 
... preprocess_drop_cols: Column Unnamed: 0 dropped
... preprocess_drop_cols: Column esncag_-_boundary_file dropped
... preprocess_drop_cols: Column central_market/tenderloin_boundary_polygon_-_updated dropped
... preprocess_drop_cols: Column civic_center_harm_reduction_project_boundary dropped
... preprocess_drop_cols: Column hsoc_zones_as_of_2018-06-05 dropped
... preprocess_drop_cols: Column invest_in_neighborhoods_(iin)_areas dropped
... preprocess_drop_cols: Column report_type_code dropped
... preprocess_drop_cols: Column report_type_description dropped
... preprocess_drop_cols: Column filed_online dropped
... preprocess_drop_cols: Column intersection dropped
... preprocess_drop_cols: Column cnn dropped
... preprocess_drop_cols: Column point dropped
... preprocess_drop_cols: Column supervisor_district dropped
... preprocess_drop_cols: Column supervisor_district_2012 dropped
... preprocess_drop_cols: Column current_supervisor_d

In [9]:
# Summarize the preprocessed frame: index type and range, column names,
# non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 80670 entries, 2022-02-10 07:59:00 to 2021-07-09 00:22:00
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   incident_date          80670 non-null  object        
 1   incident_time          80670 non-null  object        
 2   incident_year          80670 non-null  int64         
 3   incident_day_of_week   80670 non-null  object        
 4   incident_category      80670 non-null  object        
 5   resolution             80670 non-null  object        
 6   police_district        80670 non-null  object        
 7   analysis_neighborhood  80670 non-null  object        
 8   latitude               80670 non-null  float64       
 9   longitude              80670 non-null  float64       
 10  neighborhoods          80670 non-null  float64       
 11  date                   80670 non-null  datetime64[ns]
dtypes: datetime64[ns](1), flo