## Imports and Setup

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from scipy import stats
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet

## Inspecting Pearson Correlations with all types of crime

In [2]:
#CREATING DF FOR ALL CRIME

# Database credentials
username = 'crimeadmin'
password = '4DAnuuSBWVQ92w!F'
host = 'crimedbmysql.cspoouh9lugd.us-east-2.rds.amazonaws.com'
database = 'crimedb_mysql'
port = '3306'  # default MySQL port

# Create the database engine
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database}')

# SQL query or table name
query = "SELECT * FROM ML_CRIMETYPES_EDU_INCOME"  #Name of table 

# Create a DataFrame from the SQL query
df = pd.read_sql(query, engine)

df_types = df.drop(columns='Year')

In [3]:
df_types.head(1)

Unnamed: 0,Sector,Month,ARSON,ASSAULT,BATTERY,BURGLARY,CONCEALED CARRY LICENSE VIOLATION,CRIM SEXUAL ASSAULT,CRIMINAL DAMAGE,CRIMINAL SEXUAL ASSAULT,...,S_INCOME_LOW,S_INCOME_MEDIUM,S_INCOME_HIGH,TRUST,T_EDUCATION_LOW,T_EDUCATION_MEDIUM,T_EDUCATION_HIGH,T_INCOME_LOW,T_INCOME_MEDIUM,T_INCOME_HIGH
0,110,2017-11,0.0,25.0,41.0,3.0,0.0,2.0,15.0,0.0,...,,,,65.02,,,,,,


In [4]:
df_types.columns

Index(['Sector', 'Month', 'ARSON', 'ASSAULT', 'BATTERY', 'BURGLARY',
       'CONCEALED CARRY LICENSE VIOLATION', 'CRIM SEXUAL ASSAULT',
       'CRIMINAL DAMAGE', 'CRIMINAL SEXUAL ASSAULT', 'CRIMINAL TRESPASS',
       'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE', 'HUMAN TRAFFICKING',
       'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
       'LIQUOR LAW VIOLATION', 'MOTOR VEHICLE THEFT', 'NARCOTICS',
       'NON-CRIMINAL', 'NON-CRIMINAL (SUBJECT SPECIFIED)', 'OBSCENITY',
       'OFFENSE INVOLVING CHILDREN', 'OTHER NARCOTIC VIOLATION',
       'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC INDECENCY',
       'PUBLIC PEACE VIOLATION', 'RITUALISM', 'ROBBERY', 'SEX OFFENSE',
       'STALKING', 'THEFT', 'WEAPONS VIOLATION', 'SAFETY', 'S_EDUCATION_LOW',
       'S_EDUCATION_MEDIUM', 'S_EDUCATION_HIGH', 'S_INCOME_LOW',
       'S_INCOME_MEDIUM', 'S_INCOME_HIGH', 'TRUST', 'T_EDUCATION_LOW',
       'T_EDUCATION_MEDIUM', 'T_EDUCATION_HIGH', 'T_INCOME_LOW',
       'T_INCOME_MEDIUM', 'T

In [5]:
# List of specific crime type columns
crime_columns = [
    'ARSON', 'ASSAULT', 'BATTERY', 'BURGLARY',
    'CONCEALED CARRY LICENSE VIOLATION', 'CRIM SEXUAL ASSAULT',
    'CRIMINAL DAMAGE', 'CRIMINAL SEXUAL ASSAULT', 'CRIMINAL TRESPASS',
    'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE', 'HUMAN TRAFFICKING',
    'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
    'LIQUOR LAW VIOLATION', 'MOTOR VEHICLE THEFT', 'NARCOTICS',
    'NON-CRIMINAL', 'NON-CRIMINAL (SUBJECT SPECIFIED)', 'OBSCENITY',
    'OFFENSE INVOLVING CHILDREN', 'OTHER NARCOTIC VIOLATION',
    'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC INDECENCY',
    'PUBLIC PEACE VIOLATION', 'RITUALISM', 'ROBBERY', 'SEX OFFENSE',
    'STALKING', 'THEFT', 'WEAPONS VIOLATION'
]

# List of non-crime columns for correlation
non_crime_columns = ['SAFETY', 'S_EDUCATION_LOW', 'S_EDUCATION_MEDIUM', 'S_EDUCATION_HIGH', 
                     'S_INCOME_LOW', 'S_INCOME_MEDIUM', 'S_INCOME_HIGH', 'TRUST', 
                     'T_EDUCATION_LOW', 'T_EDUCATION_MEDIUM', 'T_EDUCATION_HIGH', 
                     'T_INCOME_LOW', 'T_INCOME_MEDIUM', 'T_INCOME_HIGH']

# Calculating the correlation matrix for all columns
correlation_matrix = df[crime_columns + non_crime_columns].corr()

# Filtering the correlation matrix to show only correlations of non-crime
# Filtering the correlation matrix
# Selecting rows for non-crime columns and columns for crime columns
filtered_correlation = correlation_matrix.loc[non_crime_columns, crime_columns]

# Sorting by magnitude of correlation for each non-crime column
# Using 'abs' for absolute value and 'sort_values' for sorting
# Note: Sorting separately for each non-crime column
sorted_correlations = {col: filtered_correlation.loc[col].abs().sort_values(ascending=False) for col in non_crime_columns}

# Displaying the sorted correlations
for col, corr in sorted_correlations.items():
    print(f"Correlations for {col}:\n{corr}\n")


Correlations for SAFETY:
WEAPONS VIOLATION                    0.573417
BATTERY                              0.471369
ASSAULT                              0.458571
HOMICIDE                             0.392043
CRIMINAL DAMAGE                      0.355934
OTHER OFFENSE                        0.313857
OFFENSE INVOLVING CHILDREN           0.291521
NARCOTICS                            0.282464
ROBBERY                              0.273576
THEFT                                0.259236
INTERFERENCE WITH PUBLIC OFFICER     0.237987
DECEPTIVE PRACTICE                   0.235253
ARSON                                0.226705
CRIMINAL SEXUAL ASSAULT              0.146753
MOTOR VEHICLE THEFT                  0.134999
GAMBLING                             0.127897
CRIMINAL TRESPASS                    0.096196
PROSTITUTION                         0.089523
CRIM SEXUAL ASSAULT                  0.084607
PUBLIC PEACE VIOLATION               0.080978
BURGLARY                             0.074514
KIDNAPPIN

### We don't see any particularly strong correlations. Battery, Assault and weapons violations seem to have the highest correlation with safety and trust scores, but usually ranges between .4 to .57 which is a moderate correlation

## Attempt at creating multi-output Regression Models for all crime types

In [6]:
# Converting the 'Month' column to datetime
df_types['Month'] = pd.to_datetime(df_types['Month'])

In [7]:
"""
#STANDARDIZING ALL TARGET VARIABLES AS WELL 

numerical_columns = df_types.select_dtypes(include=['float64', 'int64']).columns
numerical_columns = numerical_columns.drop(['Sector'])

# Applying standardization
scaler = StandardScaler()
df_types[numerical_columns] = scaler.fit_transform(df_types[numerical_columns])

df_types.head()
"""

"\n#STANDARDIZING ALL TARGET VARIABLES AS WELL \n\nnumerical_columns = df_types.select_dtypes(include=['float64', 'int64']).columns\nnumerical_columns = numerical_columns.drop(['Sector'])\n\n# Applying standardization\nscaler = StandardScaler()\ndf_types[numerical_columns] = scaler.fit_transform(df_types[numerical_columns])\n\ndf_types.head()\n"

In [8]:
#NOT STANDARDIZING THE TARGET VARIABLES 

exclude_columns = [
    'ARSON', 'ASSAULT', 'BATTERY', 'BURGLARY',
    'CONCEALED CARRY LICENSE VIOLATION', 'CRIM SEXUAL ASSAULT',
    'CRIMINAL DAMAGE', 'CRIMINAL SEXUAL ASSAULT', 'CRIMINAL TRESPASS',
    'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE', 'HUMAN TRAFFICKING',
    'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
    'LIQUOR LAW VIOLATION', 'MOTOR VEHICLE THEFT', 'NARCOTICS',
    'NON-CRIMINAL', 'NON-CRIMINAL (SUBJECT SPECIFIED)', 'OBSCENITY',
    'OFFENSE INVOLVING CHILDREN', 'OTHER NARCOTIC VIOLATION',
    'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC INDECENCY',
    'PUBLIC PEACE VIOLATION', 'RITUALISM', 'ROBBERY', 'SEX OFFENSE',
    'STALKING', 'THEFT', 'WEAPONS VIOLATION'
]

numerical_columns = df_types.select_dtypes(include=['float64', 'int64']).columns
numerical_columns = [col for col in numerical_columns if col != 'Sector' and col not in exclude_columns]

# Applying standardization
scaler = StandardScaler()
df_types[numerical_columns] = scaler.fit_transform(df_types[numerical_columns])

df_types.head()

Unnamed: 0,Sector,Month,ARSON,ASSAULT,BATTERY,BURGLARY,CONCEALED CARRY LICENSE VIOLATION,CRIM SEXUAL ASSAULT,CRIMINAL DAMAGE,CRIMINAL SEXUAL ASSAULT,...,S_INCOME_LOW,S_INCOME_MEDIUM,S_INCOME_HIGH,TRUST,T_EDUCATION_LOW,T_EDUCATION_MEDIUM,T_EDUCATION_HIGH,T_INCOME_LOW,T_INCOME_MEDIUM,T_INCOME_HIGH
0,110,2017-11-01,0.0,25.0,41.0,3.0,0.0,2.0,15.0,0.0,...,,,,0.550908,,,,,,
1,110,2017-12-01,0.0,22.0,30.0,3.0,0.0,1.0,18.0,1.0,...,,,,0.523214,,,,,,
2,110,2018-01-01,0.0,16.0,58.0,0.0,0.0,0.0,13.0,0.0,...,,,,0.877906,,,,,,
3,110,2018-02-01,0.0,30.0,43.0,2.0,0.0,2.0,26.0,1.0,...,,,,1.147387,,,,,,
4,110,2018-03-01,0.0,33.0,51.0,1.0,0.0,5.0,16.0,0.0,...,0.891993,2.184311,1.098875,1.167625,0.052758,1.128835,0.840233,-0.027277,1.368428,0.88794


In [9]:
#Dropping dates that don't have demographic specific Trust and Safety scores  

df_types.set_index('Month', inplace=True)

# Filtering the data to include only dates from March 2018 onwards
df_types_filtered = df_types[df_types.index >= '2018-03-01']

In [10]:
# List of columns to exclude from lag feature creation
exclude_columns = [
    'ARSON', 'ASSAULT', 'BATTERY', 'BURGLARY',
    'CONCEALED CARRY LICENSE VIOLATION', 'CRIM SEXUAL ASSAULT',
    'CRIMINAL DAMAGE', 'CRIMINAL SEXUAL ASSAULT', 'CRIMINAL TRESPASS',
    'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE', 'HUMAN TRAFFICKING',
    'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
    'LIQUOR LAW VIOLATION', 'MOTOR VEHICLE THEFT', 'NARCOTICS',
    'NON-CRIMINAL', 'NON-CRIMINAL (SUBJECT SPECIFIED)', 'OBSCENITY',
    'OFFENSE INVOLVING CHILDREN', 'OTHER NARCOTIC VIOLATION',
    'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC INDECENCY',
    'PUBLIC PEACE VIOLATION', 'RITUALISM', 'ROBBERY', 'SEX OFFENSE',
    'STALKING', 'THEFT', 'WEAPONS VIOLATION'
]

# Creating 1-month lagged features for numerical columns excluding target variables
numerical_columns = df_types_filtered.select_dtypes(include=['float64', 'int64']).columns
numerical_columns = [col for col in numerical_columns if col != 'Sector' and col not in exclude_columns]

# Creating separate dataframe from copy (to remove warning for using a slice of a copy)
df_types_filtered_copy = df_types_filtered.copy()

for column in numerical_columns:
    df_types_filtered_copy[f'{column}_lag1'] = df_types_filtered_copy[column].shift(1)

# Dropping the initial row with NaN values due to lagging
df_types_filtered = df_types_filtered_copy.dropna()

df_types_filtered.head()



Unnamed: 0_level_0,Sector,ARSON,ASSAULT,BATTERY,BURGLARY,CONCEALED CARRY LICENSE VIOLATION,CRIM SEXUAL ASSAULT,CRIMINAL DAMAGE,CRIMINAL SEXUAL ASSAULT,CRIMINAL TRESPASS,...,S_INCOME_LOW_lag1,S_INCOME_MEDIUM_lag1,S_INCOME_HIGH_lag1,TRUST_lag1,T_EDUCATION_LOW_lag1,T_EDUCATION_MEDIUM_lag1,T_EDUCATION_HIGH_lag1,T_INCOME_LOW_lag1,T_INCOME_MEDIUM_lag1,T_INCOME_HIGH_lag1
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-01,110,0.0,26.0,38.0,0.0,0.0,2.0,24.0,1.0,11.0,...,0.891993,2.184311,1.098875,1.167625,0.052758,1.128835,0.840233,-0.027277,1.368428,0.88794
2018-05-01,110,0.0,30.0,49.0,9.0,0.0,0.0,23.0,1.0,12.0,...,0.494996,2.1275,0.935931,0.897079,-0.171098,0.996959,0.642263,-1.026521,1.473232,0.771547
2018-06-01,110,0.0,19.0,48.0,1.0,0.0,2.0,32.0,0.0,13.0,...,0.651208,2.851046,1.36813,0.511497,-0.283915,0.714106,0.26457,-0.575989,1.089254,-0.010649
2018-07-01,110,1.0,26.0,65.0,2.0,0.0,3.0,22.0,0.0,15.0,...,1.116859,2.03853,1.656263,0.64464,0.079407,0.916923,-0.273689,0.495743,-0.209043,0.548626
2018-08-01,110,1.0,21.0,75.0,4.0,0.0,3.0,20.0,0.0,12.0,...,0.890003,1.409313,1.460531,1.01531,0.963282,1.006053,0.180638,1.180258,0.162286,0.900763


In [11]:
df_types_filtered.columns

Index(['Sector', 'ARSON', 'ASSAULT', 'BATTERY', 'BURGLARY',
       'CONCEALED CARRY LICENSE VIOLATION', 'CRIM SEXUAL ASSAULT',
       'CRIMINAL DAMAGE', 'CRIMINAL SEXUAL ASSAULT', 'CRIMINAL TRESPASS',
       'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE', 'HUMAN TRAFFICKING',
       'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
       'LIQUOR LAW VIOLATION', 'MOTOR VEHICLE THEFT', 'NARCOTICS',
       'NON-CRIMINAL', 'NON-CRIMINAL (SUBJECT SPECIFIED)', 'OBSCENITY',
       'OFFENSE INVOLVING CHILDREN', 'OTHER NARCOTIC VIOLATION',
       'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC INDECENCY',
       'PUBLIC PEACE VIOLATION', 'RITUALISM', 'ROBBERY', 'SEX OFFENSE',
       'STALKING', 'THEFT', 'WEAPONS VIOLATION', 'SAFETY', 'S_EDUCATION_LOW',
       'S_EDUCATION_MEDIUM', 'S_EDUCATION_HIGH', 'S_INCOME_LOW',
       'S_INCOME_MEDIUM', 'S_INCOME_HIGH', 'TRUST', 'T_EDUCATION_LOW',
       'T_EDUCATION_MEDIUM', 'T_EDUCATION_HIGH', 'T_INCOME_LOW',
       'T_INCOME_MEDIUM', 'T_INCOME_H

In [12]:
# List of columns to include as dependent variables (target variables)
dependent_columns = [
    'ARSON', 'ASSAULT', 'BATTERY', 'BURGLARY',
    'CONCEALED CARRY LICENSE VIOLATION', 'CRIM SEXUAL ASSAULT',
    'CRIMINAL DAMAGE', 'CRIMINAL SEXUAL ASSAULT', 'CRIMINAL TRESPASS',
    'DECEPTIVE PRACTICE', 'GAMBLING', 'HOMICIDE', 'HUMAN TRAFFICKING',
    'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION', 'KIDNAPPING',
    'LIQUOR LAW VIOLATION', 'MOTOR VEHICLE THEFT', 'NARCOTICS',
    'NON-CRIMINAL', 'NON-CRIMINAL (SUBJECT SPECIFIED)', 'OBSCENITY',
    'OFFENSE INVOLVING CHILDREN', 'OTHER NARCOTIC VIOLATION',
    'OTHER OFFENSE', 'PROSTITUTION', 'PUBLIC INDECENCY',
    'PUBLIC PEACE VIOLATION', 'RITUALISM', 'ROBBERY', 'SEX OFFENSE',
    'STALKING', 'THEFT', 'WEAPONS VIOLATION'
]

# Create a dataframe for dependent variables (y)
y = df_types_filtered[dependent_columns]

# List of columns to include as independent variables
independent_columns = [
    'SAFETY', 'S_EDUCATION_LOW', 'S_EDUCATION_MEDIUM', 'S_EDUCATION_HIGH',
    'S_INCOME_LOW', 'S_INCOME_MEDIUM', 'S_INCOME_HIGH', 'TRUST',
    'T_EDUCATION_LOW', 'T_EDUCATION_MEDIUM', 'T_EDUCATION_HIGH',
    'T_INCOME_LOW', 'T_INCOME_MEDIUM', 'T_INCOME_HIGH', 'SAFETY_lag1',
    'S_EDUCATION_LOW_lag1', 'S_EDUCATION_MEDIUM_lag1',
    'S_EDUCATION_HIGH_lag1', 'S_INCOME_LOW_lag1', 'S_INCOME_MEDIUM_lag1',
    'S_INCOME_HIGH_lag1', 'TRUST_lag1', 'T_EDUCATION_LOW_lag1',
    'T_EDUCATION_MEDIUM_lag1', 'T_EDUCATION_HIGH_lag1', 'T_INCOME_LOW_lag1',
    'T_INCOME_MEDIUM_lag1', 'T_INCOME_HIGH_lag1'
]

# Create a dataframe for independent variables (X)
X = df_types_filtered[independent_columns]

# Splitting the dataset into training and testing sets
split_idx = int(len(df_types_filtered) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]


In [13]:

from sklearn.ensemble import RandomForestRegressor

# Initialize the multi-output Random Forest Regressor
multi_output_rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
multi_output_rf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = multi_output_rf.predict(X_test)


In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate MAE for each target variable
mae = mean_absolute_error(y_test, y_pred, multioutput='raw_values')

# Calculate MSE for each target variable
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')

# Calculate RMSE for each target variable
rmse = np.sqrt(mse)

# Calculate R2 score for each target variable
r2 = r2_score(y_test, y_pred, multioutput='raw_values')

# Overall metrics
overall_mae = mean_absolute_error(y_test, y_pred)
overall_mse = mean_squared_error(y_test, y_pred)
overall_rmse = np.sqrt(overall_mse)
overall_r2 = r2_score(y_test, y_pred)

# Print metrics
print("MAE for each target variable:")
print(mae)
print("\nMSE for each target variable:")
print(mse)
print("\nRMSE for each target variable:")
print(rmse)
print("\nR2 score for each target variable:")
print(r2)

print("\nOverall MAE:", overall_mae)
print("Overall MSE:", overall_mse)
print("Overall RMSE:", overall_rmse)
print("Overall R2 score:", overall_r2)


MAE for each target variable:
[6.84183784e-01 8.15209730e+00 1.60807459e+01 4.89462703e+00
 3.54972973e-01 5.10162162e-01 1.09280757e+01 1.21099459e+00
 3.10903784e+00 8.66345946e+00 1.13481081e-01 8.22475676e-01
 2.42486486e-02 9.30389189e-01 3.64983784e-01 2.61005405e-01
 3.96875676e-01 1.14602486e+01 8.34103784e+00 1.05405405e-02
 7.67567568e-04 1.21448649e-01 1.61441081e+00 1.12864865e-02
 6.79339459e+00 6.81145946e-01 1.67351351e-02 1.18634595e+00
 2.37837838e-04 5.25392432e+00 1.12491892e+00 6.11502703e-01
 2.51216000e+01 5.42137297e+00]

MSE for each target variable:
[8.28679459e-01 1.03691252e+02 4.01574539e+02 4.61785448e+01
 3.71991676e-01 3.43438703e-01 1.84101601e+02 2.96602649e+00
 1.55803762e+01 1.35066418e+02 3.88595676e-02 1.37807708e+00
 9.88335135e-03 1.69873935e+00 2.56337189e-01 1.52719027e-01
 3.32502162e-01 3.04724863e+02 2.02131690e+02 1.26475676e-03
 2.30270270e-05 5.97509189e-02 4.22388595e+00 5.47827027e-03
 7.25970157e+01 4.34616735e+00 7.58616216e-03 4.53948