In [55]:
# Import dependencies
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from pylab import rcParams
import plotly.express as px
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import re
from sklearn.model_selection import train_test_split

import warnings
# Install a global "ignore" filter so no warnings are printed by later cells.
# NOTE(review): no category or module is given, so this also hides deprecation
# and dtype warnings from pandas / scikit-learn — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [56]:
# Load the raw Boston crime export; the file is decoded as ISO-8859-1.
Boston_df = pd.read_csv(
    filepath_or_buffer="Boston_Crime.csv",
    encoding="ISO-8859-1",
)

# Preview the first ten records
Boston_df.head(n=10)

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,PLTEST005,520,,BURGLARY - RESIDENTIAL,B2,289.0,1,2021-10-13 00:00:00,2021,10,Wednesday,0,,SCHROEDER PLZ,42.333679,-71.091878,"(42.33367921810846, -71.09187754618458)"
1,PLTEST003,3114,,INVESTIGATE PROPERTY,B2,289.0,0,2021-05-12 00:00:00,2021,5,Wednesday,0,,SCHROEDER PLZ,42.333679,-71.091878,"(42.33367921810846, -71.09187754618458)"
2,PLTEST002,3114,,INVESTIGATE PROPERTY,B2,289.0,0,2021-05-12 00:00:00,2021,5,Wednesday,0,,SCHROEDER PLZ,42.333679,-71.091878,"(42.33367921810846, -71.09187754618458)"
3,PLTEST001,3114,,INVESTIGATE PROPERTY,B2,,0,2021-03-29 00:00:00,2021,3,Monday,0,,SCHROEDER PLAZA,0.0,0.0,"(0, 0)"
4,312030979,3201,,PROPERTY - LOST/ MISSING,E13,465.0,0,2021-03-22 00:00:00,2021,3,Monday,0,,BLUE HILL AVE,42.284826,-71.091374,"(42.28482576580488, -71.09137368938802)"
5,292152228,3301,,VERBAL DISPUTE,C11,347.0,0,2021-07-28 20:45:00,2021,7,Wednesday,20,,LYON ST,42.306383,-71.060212,"(42.30638322801941, -71.06021217719662)"
6,282080177,3801,,M/V ACCIDENT - OTHER,A1,,0,2021-11-02 13:23:00,2021,11,Tuesday,13,,CAMBRIDGE ST & BOWDOIN ST\nBOSTON MA 02114\nU...,42.36119,-71.06288,"(42.361189964341946, -71.06287997127626)"
7,222000453,617,,LARCENY THEFT FROM BUILDING,E13,,0,2021-12-30 12:30:00,2021,12,Thursday,12,,MONTEBELLO RD,42.311768,-71.099716,"(42.31176765436927, -71.09971563708591)"
8,222000599,1106,,FRAUD - CREDIT CARD / ATM FRAUD,B2,282.0,0,2021-12-29 22:40:00,2021,12,Wednesday,22,,WASHINGTON ST,42.328663,-71.085634,"(42.32866283555608, -71.08563401183545)"
9,222000592,1102,,FRAUD - FALSE PRETENSE / SCHEME,A1,98.0,0,2021-11-23 00:00:00,2021,11,Tuesday,0,,MILK ST,42.356733,-71.057559,"(42.356733021573184, -71.05755938440176)"


In [57]:
# Report how many missing values each column contains.
# The per-column totals are computed once and then printed one line per column.
null_counts = Boston_df.isnull().sum()
for column_name, null_total in null_counts.items():
    print(f"{column_name} has {null_total} null values")

INCIDENT_NUMBER has 0 null values
OFFENSE_CODE has 0 null values
OFFENSE_CODE_GROUP has 71721 null values
OFFENSE_DESCRIPTION has 0 null values
DISTRICT has 993 null values
REPORTING_AREA has 0 null values
SHOOTING has 0 null values
OCCURRED_ON_DATE has 0 null values
YEAR has 0 null values
MONTH has 0 null values
DAY_OF_WEEK has 0 null values
HOUR has 0 null values
UCR_PART has 71721 null values
STREET has 679 null values
Lat has 0 null values
Long has 0 null values
Location has 0 null values


In [58]:
# Count rows that are exact repeats of an earlier row
duplicate_total = Boston_df.duplicated().sum()
print(f"Duplicate entries: {duplicate_total}")

Duplicate entries: 0


In [59]:
# Summarise the raw frame: column names, non-null counts and dtypes
Boston_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71721 entries, 0 to 71720
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   INCIDENT_NUMBER      71721 non-null  object 
 1   OFFENSE_CODE         71721 non-null  int64  
 2   OFFENSE_CODE_GROUP   0 non-null      float64
 3   OFFENSE_DESCRIPTION  71721 non-null  object 
 4   DISTRICT             70728 non-null  object 
 5   REPORTING_AREA       71721 non-null  object 
 6   SHOOTING             71721 non-null  int64  
 7   OCCURRED_ON_DATE     71721 non-null  object 
 8   YEAR                 71721 non-null  int64  
 9   MONTH                71721 non-null  int64  
 10  DAY_OF_WEEK          71721 non-null  object 
 11  HOUR                 71721 non-null  int64  
 12  UCR_PART             0 non-null      float64
 13  STREET               71042 non-null  object 
 14  Lat                  71721 non-null  float64
 15  Long                 71721 non-null 

In [60]:
# Build the working frame by removing columns that are not carried forward:
#   - OFFENSE_CODE_GROUP and UCR_PART contain no non-null values at all
#   - INCIDENT_NUMBER, REPORTING_AREA, STREET and YEAR were judged unnecessary
#     for this analysis
# Boston_df itself is left untouched; drop() returns a new frame.
Crime_df = Boston_df.drop(
    labels=[
        "OFFENSE_CODE_GROUP",
        "UCR_PART",
        "INCIDENT_NUMBER",
        "REPORTING_AREA",
        "STREET",
        "YEAR",
    ],
    axis="columns",
)


# Preview the trimmed DataFrame
Crime_df.head(n=5)

Unnamed: 0,OFFENSE_CODE,OFFENSE_DESCRIPTION,DISTRICT,SHOOTING,OCCURRED_ON_DATE,MONTH,DAY_OF_WEEK,HOUR,Lat,Long,Location
0,520,BURGLARY - RESIDENTIAL,B2,1,2021-10-13 00:00:00,10,Wednesday,0,42.333679,-71.091878,"(42.33367921810846, -71.09187754618458)"
1,3114,INVESTIGATE PROPERTY,B2,0,2021-05-12 00:00:00,5,Wednesday,0,42.333679,-71.091878,"(42.33367921810846, -71.09187754618458)"
2,3114,INVESTIGATE PROPERTY,B2,0,2021-05-12 00:00:00,5,Wednesday,0,42.333679,-71.091878,"(42.33367921810846, -71.09187754618458)"
3,3114,INVESTIGATE PROPERTY,B2,0,2021-03-29 00:00:00,3,Monday,0,0.0,0.0,"(0, 0)"
4,3201,PROPERTY - LOST/ MISSING,E13,0,2021-03-22 00:00:00,3,Monday,0,42.284826,-71.091374,"(42.28482576580488, -71.09137368938802)"


In [61]:
# Rename the upper-case source columns to shorter Title_Case names.
# "Lat", "Long" and "Location" already carry their final names in the source
# data, so they need no entry here (rename() silently skips keys that do not
# match a column, which is why a stray "LOCATION" key would have no effect).
Crime_df = Crime_df.rename(columns={
    "OFFENSE_CODE": "Offense_Code",
    "OFFENSE_DESCRIPTION": "Offense_Info",
    "DISTRICT": "District",
    "SHOOTING": "Shooting",
    "OCCURRED_ON_DATE": "Date",
    "MONTH": "Month",
    "DAY_OF_WEEK": "Day",
    "HOUR": "Hour",
})

# Display the DataFrame
Crime_df.head()

Unnamed: 0,Offense_Code,Offense_Info,District,Shooting,Date,Month,Day,Hour,Lat,Long,Location
0,520,BURGLARY - RESIDENTIAL,B2,1,2021-10-13 00:00:00,10,Wednesday,0,42.333679,-71.091878,"(42.33367921810846, -71.09187754618458)"
1,3114,INVESTIGATE PROPERTY,B2,0,2021-05-12 00:00:00,5,Wednesday,0,42.333679,-71.091878,"(42.33367921810846, -71.09187754618458)"
2,3114,INVESTIGATE PROPERTY,B2,0,2021-05-12 00:00:00,5,Wednesday,0,42.333679,-71.091878,"(42.33367921810846, -71.09187754618458)"
3,3114,INVESTIGATE PROPERTY,B2,0,2021-03-29 00:00:00,3,Monday,0,0.0,0.0,"(0, 0)"
4,3201,PROPERTY - LOST/ MISSING,E13,0,2021-03-22 00:00:00,3,Monday,0,42.284826,-71.091374,"(42.28482576580488, -71.09137368938802)"


In [62]:
# District is the only remaining column with missing values. Fill the gaps with
# the placeholder label "0" — a string, not the integer 0, so the column keeps a
# single value type alongside codes such as "B2" or "E13".
Crime_df["District"] = Crime_df["District"].fillna("0")

# Confirm that no column has missing values left
Crime_df.isnull().sum()

Offense_Code    0
Offense_Info    0
District        0
Shooting        0
Date            0
Month           0
Day             0
Hour            0
Lat             0
Long            0
Location        0
dtype: int64

In [63]:
# Keep the descriptive fields (offense code/text, district, timestamp, weekday,
# coordinate string) in a separate Offense frame before Crime_df is reduced.
# filter(items=...) selects these columns by exact label.
Offense_df = Crime_df.filter(
    items=["Offense_Code", "Offense_Info", "District", "Date", "Day", "Location"]
)
Offense_df.head(n=20)

Unnamed: 0,Offense_Code,Offense_Info,District,Date,Day,Location
0,520,BURGLARY - RESIDENTIAL,B2,2021-10-13 00:00:00,Wednesday,"(42.33367921810846, -71.09187754618458)"
1,3114,INVESTIGATE PROPERTY,B2,2021-05-12 00:00:00,Wednesday,"(42.33367921810846, -71.09187754618458)"
2,3114,INVESTIGATE PROPERTY,B2,2021-05-12 00:00:00,Wednesday,"(42.33367921810846, -71.09187754618458)"
3,3114,INVESTIGATE PROPERTY,B2,2021-03-29 00:00:00,Monday,"(0, 0)"
4,3201,PROPERTY - LOST/ MISSING,E13,2021-03-22 00:00:00,Monday,"(42.28482576580488, -71.09137368938802)"
5,3301,VERBAL DISPUTE,C11,2021-07-28 20:45:00,Wednesday,"(42.30638322801941, -71.06021217719662)"
6,3801,M/V ACCIDENT - OTHER,A1,2021-11-02 13:23:00,Tuesday,"(42.361189964341946, -71.06287997127626)"
7,617,LARCENY THEFT FROM BUILDING,E13,2021-12-30 12:30:00,Thursday,"(42.31176765436927, -71.09971563708591)"
8,1106,FRAUD - CREDIT CARD / ATM FRAUD,B2,2021-12-29 22:40:00,Wednesday,"(42.32866283555608, -71.08563401183545)"
9,1102,FRAUD - FALSE PRETENSE / SCHEME,A1,2021-11-23 00:00:00,Tuesday,"(42.356733021573184, -71.05755938440176)"


In [64]:
# Reduce Crime_df to Offense_Info, District, Month, Day and Hour.
# NOTE: Crime_df is reassigned from itself, so running this cell a second time
# raises KeyError because the listed columns are already gone.
Crime_df = Crime_df.drop(
    labels=["Offense_Code", "Shooting", "Date", "Location", "Lat", "Long"],
    axis="columns",
)

In [65]:
# Preview the reduced frame (first five rows)
Crime_df.head()

Unnamed: 0,Offense_Info,District,Month,Day,Hour
0,BURGLARY - RESIDENTIAL,B2,10,Wednesday,0
1,INVESTIGATE PROPERTY,B2,5,Wednesday,0
2,INVESTIGATE PROPERTY,B2,5,Wednesday,0
3,INVESTIGATE PROPERTY,B2,3,Monday,0
4,PROPERTY - LOST/ MISSING,E13,3,Monday,0


In [66]:
# One-hot encode the three text columns with get_dummies(); the columns not
# listed (Month, Hour) are carried over unchanged.
X = pd.get_dummies(
    data=Crime_df,
    columns=["Offense_Info", "District", "Day"],
)
X.head(n=10)

Unnamed: 0,Month,Hour,Offense_Info_AFFRAY,Offense_Info_AIRCRAFT INCIDENTS,Offense_Info_ANIMAL ABUSE,"Offense_Info_ANIMAL INCIDENTS (DOG BITES, LOST DOG, ETC)",Offense_Info_ARSON,Offense_Info_ASSAULT - AGGRAVATED,Offense_Info_ASSAULT - SIMPLE,Offense_Info_AUTO THEFT,...,District_E18,District_E5,District_External,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
0,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,7,20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,11,13,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,12,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,12,22,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [76]:
# Persist the reduced Crime_df (the frame before one-hot encoding, not X).
# to_csv() writes the row index by default, so the file gets a leading unnamed
# column holding the index values.
Crime_df.to_csv(path_or_buf="ML_Boston_Crime.csv")