## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import folium
from folium import Circle, Marker, Map
from folium.plugins import MarkerCluster

## Read in Combined Boston Crime Data (2015-2022)

In [None]:
# Show every column when a dataframe is displayed
pd.set_option('display.max_columns', None)
# Load the combined incident reports; low_memory=False parses the file in one pass
# rather than in chunks
df = pd.read_csv('crime_incidents_boston_2015_2022.csv', low_memory=False)

## Boston Crime Exploratory Data Analysis (EDA)

In [None]:
# Dimensions of the crime dataframe, unpacked as (rows, columns)
row_count, column_count = df.shape
print('Number of Rows: ', row_count)
print('Number of Columns: ', column_count)

In [None]:
# Boolean mask marking rows that exactly repeat an earlier row; summing it counts them
duplicate_mask = df.duplicated()
print("Shape of the duplicated values in the DataFrame: ", duplicate_mask.sum())
# Our capstone group will need to decide how to treat these, because we do not know
# whether the duplicates come from multiple victims being tied to one incident report

In [None]:
# Peek at the first record to see the column headers alongside a sample of row values
df.head(1)

In [None]:
# Count of missing values per field. Note how many Lat/Long records are null.
df.isna().sum()

In [None]:
# Concise per-column summary of the dataframe (dtype and non-null count for each field)
df.info()

In [None]:
# Summary statistics for the numeric fields. Most of them are codes or date parts (offense code, year, month, etc.),
# so the statistics themselves are not very informative. However, the min and max of the Lat/Long fields
# give a glimpse of the coordinate range. Notice that they fall outside the City of Boston.
df.describe()

In [None]:
# Identify the first and last observed crime incident by date/time. Notice the first observed
# incident was in mid-2015, so we don't have a complete year's worth of data for 2015.
occurred_on = df['OCCURRED_ON_DATE']
print("First Observed Date: ", occurred_on.min())
print("Last Observed Date: ", occurred_on.max())

In [None]:
# Number of incidents recorded in each month
sns.countplot(x='MONTH', data=df, color='blue');

In [None]:
# Number of incidents recorded in each hour of the day
sns.countplot(x='HOUR', data=df, color='red');

In [None]:
# Number of incidents recorded under each UCR_PART value
sns.countplot(x='UCR_PART', data=df, color='orange');

In [None]:
# Number of incidents per day of the week, drawn as a bar chart
incidents_by_weekday = df.groupby(['DAY_OF_WEEK'])['INCIDENT_NUMBER'].count()
incidents_by_weekday.plot.bar();

In [None]:
# Heatmap of offense counts by day of the week (x axis) and hour of the day (y axis)
x = df['DAY_OF_WEEK']
y = df['HOUR']
fig = go.Figure(go.Histogram2d(x=x, y=y))
# Axis title fonts are set through title.font: the legacy `titlefont` shortcut is deprecated
# and is no longer accepted by newer Plotly releases. The x tick font size is given once
# (12, matching the y axis) instead of twice with conflicting values (10 and 12).
fig.update_layout(
    title_text='Heatmap plot on number of offenses by hour and day of the week',
    xaxis=dict(title=dict(text='Day of the Week', font=dict(size=15)), tickfont=dict(size=12)),
    yaxis=dict(title=dict(text='Hour', font=dict(size=15)), tickfont=dict(size=12)),
)
fig.show()

In [None]:
# One row per year with the number of incident reports recorded in that year
df_year = df.groupby(['YEAR'])['INCIDENT_NUMBER'].count().reset_index(name='Incident_Count')

In [None]:
# Derive per-year comparison metrics relative to the busiest year
yearly_counts = df_year['Incident_Count']
peak_count = yearly_counts.max()
# Shortfall of each year versus the busiest year
df_year['Incident_Count_Diff'] = peak_count - yearly_counts
# Each year's count as a percentage of the busiest year
df_year['Percent_Count'] = yearly_counts / peak_count * 100
# NOTE(review): dividing by 365 treats every year as a complete, non-leap year; the data
# starts in mid-2015, so the daily figure for partial years is understated - confirm this is acceptable
df_year['Crimes_Per_Day'] = yearly_counts / 365
df_year.head(8)

In [None]:
# Count of incidents per year, drawn as a wide bar chart with enlarged labels
sns.catplot(data=df, x='YEAR', kind='count', color='blue', height=9.00, aspect=3.00)
plt.title("Number of Crimes per Year in the City of Boston", size='35')
plt.xlabel('Year', fontsize=30)
plt.ylabel('Count', fontsize=30)
plt.xticks(size=20)
plt.yticks(size=20);

In [None]:
# Incident count per street, ordered from most to fewest incidents with a fresh 0..n-1 index
street_counts = df.groupby(['STREET'])['INCIDENT_NUMBER'].agg('count')
df_street = (
    street_counts.reset_index(name='Incident_Count')
    .sort_values(by='Incident_Count', ascending=False)
    .reset_index(drop=True)
)
df_street.head(10)

In [None]:
# Keep only the ten streets with the highest incident counts
df_street = df_street.iloc[:10]

In [None]:
# Pie chart of how offenses are distributed across the streets held in df_street.
# Columns are referenced by name so Plotly Express reads them from the frame passed in,
# and the misspelled chart title ("distrubution") is corrected.
fig = px.pie(
    df_street,
    names='STREET',
    values='Incident_Count',
    title='Percent distribution of offenses by the top 10 highest crime streets',
)
fig.show()

In [None]:
# Count of incidents per district. Notice that a small number of records fall outside
# the City of Boston, since their district was tagged as "External".
sns.catplot(data=df, x='DISTRICT', kind='count', color='blue', height=8.00, aspect=2.75)
plt.title("Number of Crimes per District in the City of Boston", size='35')
plt.xlabel('District', fontsize=30)
plt.ylabel('Count', fontsize=30)
plt.xticks(size=20)
plt.yticks(size=20);

In [None]:
# One histogram of incident years per district, with the same axis labels on every panel
ax = df.hist(column='YEAR', by='DISTRICT', bins=25, figsize=(16,18), grid=False)
for panel in ax.flatten():
    panel.set(xlabel="Year", ylabel="Number of Offenses")

In [None]:
# One histogram of UCR_PART values per district, with the same axis labels on every panel
ax = df.hist(column='UCR_PART', by='DISTRICT', bins=25, figsize=(16,18), grid=False)
for panel in ax.flatten():
    panel.set(xlabel="Uniform Crime Report Type", ylabel="Uniform Crime Report Count")

## Read in City of Boston Boundary Data to clip the crime data within the city limits

In [None]:
# GeoJSON download of the City of Boston boundary (URL split across lines for readability;
# the pieces concatenate to the exact original address)
cob_url = (
    'https://bostonopendata-boston.opendata.arcgis.com/datasets/'
    'boston::city-of-boston-boundary.geojson'
    '?outSR=%7B%22latestWkid%22%3A2249%2C%22wkid%22%3A102686%7D.geojson'
)
cob = gpd.read_file(cob_url)

In [None]:
# Display the boundary layer's coordinate reference system; the crime points must use the same CRS before clipping
cob.crs

In [None]:
# Quick visual check of the boundary geometry
cob.plot();

In [None]:
# Turn each incident's Long/Lat pair into a point geometry (x = longitude, y = latitude)
incident_points = gpd.points_from_xy(x=df['Long'], y=df['Lat'])
# The coordinate reference system (crs) must match the value shown by cob.crs. In this case CRS: EPSG:4326
gdf = gpd.GeoDataFrame(df, geometry=incident_points, crs='EPSG:4326')

In [None]:
# Keep only the incidents whose geometry lies within the City of Boston boundary
gdf_j = gpd.clip(gdf, mask=cob)
# Dimensions of the geodataframe within Boston city limits, unpacked as (rows, columns)
clipped_rows, clipped_columns = gdf_j.shape
print('Number of Rows: ', clipped_rows)
print('Number of Columns: ', clipped_columns)

In [None]:
# Frequency of each value recorded in the SHOOTING field for the clipped incidents
gdf_j['SHOOTING'].value_counts()

In [None]:
# Column data types of the clipped geodataframe (shows how SHOOTING is stored before it is filtered on)
gdf_j.dtypes

In [None]:
# Create a separate geodataframe with only those records that involved a shooting incident.
# A record counts as a shooting when SHOOTING holds either of the two flag values '1' or 'Y'.
oh_shoot = gdf_j[gdf_j['SHOOTING'].isin(['1', 'Y'])]
# Shape of the Boston shooting incidents geodataframe. Report oh_shoot here: the original
# printed gdf_j's shape, which is the full clipped dataset rather than the shooting subset.
print('Number of Rows: ', oh_shoot.shape[0])
print('Number of Columns: ', oh_shoot.shape[1])

In [None]:
# Scatter plot of every shooting incident at its longitude/latitude
plt.figure(figsize=(10, 8))
plt.scatter(x="Long", y="Lat", data=oh_shoot, s=2, c='y', alpha=0.4, edgecolor='black')
plt.title('City of Boston Shooting Locations')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid()
plt.tight_layout()
plt.axis('tight')

plt.show()

In [None]:
# Base map for the City of Boston shootings; popup markers with several data elements are added to it
boston_center = [42.361145, -71.057083]
cmap = folium.Map(location=boston_center, zoom_start=11)

def generate_popup(a,b,c,d,e):
    """Build the HTML shown in a marker popup for one incident.

    Every argument is converted with ``str`` and HTML-escaped before being
    placed in the markup, so values containing ``&``, ``<`` or ``>`` cannot
    break the popup.

    Args:
        a: Incident number.
        b: Crime description.
        c: Date the incident occurred.
        d: Police district. The marker loop in this notebook passes
            ``row['DISTRICT']`` here, so the field is labelled "District"
            rather than the original, mismatched "Time".
        e: Street address.

    Returns:
        str: The five labelled fields joined by ``<br>`` tags.
    """
    # Local import: ``html`` is not among the notebook's top-level imports.
    from html import escape

    a, b, c, d, e = (escape(str(value)) for value in (a, b, c, d, e))
    return (
        f'<strong>Incident Number:</strong> {a}'
        f'<br><strong>Crime Description:</strong> {b}'
        f'<br><strong>Date:</strong> {c}'
        f'<br><strong>District:</strong> {d}'
        f'<br><strong>Address:</strong> {e}'
    )

# Cluster layer that groups nearby markers on the map
cluster = MarkerCluster().add_to(cmap)

# One marker per shooting incident, with a popup built from that row's fields
for _, incident in oh_shoot.iterrows():
    popup_html = generate_popup(incident['INCIDENT_NUMBER'],
                                incident['OFFENSE_DESCRIPTION'],
                                incident['OCCURRED_ON_DATE'],
                                incident['DISTRICT'],
                                incident['STREET'])
    folium.Marker(location=[incident['Lat'], incident['Long']],
                  popup=popup_html).add_to(cluster)

# Create a layer control object and add it to our map instance. It has to be attached to
# the map itself: the original attached it to the marker cluster, which is not a map.
folium.LayerControl().add_to(cmap)

#Show map
cmap