# Capstone: Exploratory Prediction Modeling

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# import project utils
import sys
sys.path.append('../src')

import data_utils
from data_utils import Config

import graph_utils


## The Data: San Francisco Police Department Incident Reports

### Read the Data

In [5]:
# Consistent random_state for the project.
# Echo the value held on data_utils.Config so the setting in effect for this
# run is recorded in the notebook output.
print(f'Project-wide random_state: {Config.RANDOM_STATE}')

Project-wide random_state: 42


In [6]:
# Which dataset to work from?
# Ask the project helper for a sample CSV path and echo the choice.
# NOTE(review): pct=10 presumably selects the 10-percent sample (the recorded
# run resolved to ../data/incidents_clean_10_pct.csv) - confirm in data_utils.

sample_file = data_utils.select_sample_csv_file(pct=10)
print(f'Selected sample file: {sample_file}')

Selected sample file: ../data/incidents_clean_10_pct.csv


In [7]:
# Load the selected sample file; the helper hands back two objects, bound here
# as the raw and the clean frame. Per the helper's log output it reads the CSV,
# converts the datetime column and sets it as the index.
# NOTE(review): which of the two frames carries those conversions is not
# visible from this notebook - check data_utils.get_clean_data_from_csv.
current_raw_df, current_clean_df = data_utils.get_clean_data_from_csv(sample_file)

Reading file: ../data/incidents_clean_10_pct.csv ... Done: 88,717 rows, 37 columns
... Converting datetime to timeseries ... Done
... Setting index to datetime ... Done
Done


In [8]:
# Run the project pre-processing step and keep the result as `data`.
# A copy is passed in, so whatever preprocess_data does to its argument is not
# applied to current_raw_df (assuming a pandas DataFrame, whose .copy() is deep
# by default).
# NOTE(review): the *raw* frame is pre-processed here, not current_clean_df -
# confirm that is intended.
data = data_utils.preprocess_data(current_raw_df.copy())

Pre-processing ... 
... Dropping unwanted columns ... 
... preprocess_drop_cols: Column Unnamed: 0 dropped
... preprocess_drop_cols: Column esncag_-_boundary_file dropped
... preprocess_drop_cols: Column central_market/tenderloin_boundary_polygon_-_updated dropped
... preprocess_drop_cols: Column civic_center_harm_reduction_project_boundary dropped
... preprocess_drop_cols: Column hsoc_zones_as_of_2018-06-05 dropped
... preprocess_drop_cols: Column invest_in_neighborhoods_(iin)_areas dropped
... preprocess_drop_cols: Column report_type_code dropped
... preprocess_drop_cols: Column report_type_description dropped
... preprocess_drop_cols: Column filed_online dropped
... preprocess_drop_cols: Column intersection dropped
... preprocess_drop_cols: Column cnn dropped
... preprocess_drop_cols: Column point dropped
... preprocess_drop_cols: Column supervisor_district dropped
... preprocess_drop_cols: Column supervisor_district_2012 dropped
... preprocess_drop_cols: Column current_supervisor_d

In [9]:
# Summarise the pre-processed frame: index type, columns, non-null counts,
# dtypes and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 82242 entries, 2022-02-10 07:59:00 to 2021-07-09 00:22:00
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             82242 non-null  object 
 1   time             82242 non-null  object 
 2   year             82242 non-null  int64  
 3   day_of_week      82242 non-null  object 
 4   category         82242 non-null  object 
 5   resolution       82242 non-null  object 
 6   police_district  82242 non-null  object 
 7   neighborhood     82242 non-null  object 
 8   latitude         82242 non-null  float64
 9   longitude        82242 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 6.9+ MB


In [10]:
# Peek at the first two rows of the pre-processed frame.
data.head(2)

Unnamed: 0_level_0,date,time,year,day_of_week,category,resolution,police_district,neighborhood,latitude,longitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-02-10 07:59:00,2022/02/10,07:59,2022,Thursday,Recovered Vehicle,Open or Active,Ingleside,West of Twin Peaks,37.728975,-122.468077
2022-11-17 23:30:00,2022/11/17,23:30,2022,Thursday,Missing Person,Open or Active,Mission,Mission,37.762579,-122.421662
