## Imports and Setup

In [1]:
#Imports

import mysql.connector
import csv 
import pandas as pd
import numpy as np
# Remove pandas' row-display limit so displayed frames are never truncated.
# NOTE(review): with no limit, displaying a large frame renders every one of its rows.
pd.set_option('display.max_rows', None)


In [2]:
# Superseded input, kept for reference only (it is never loaded):
#   '/Users/ericchestnut/Documents/Data Science Class /Project/Cleaned Data/Chicago_Crime_Main.csv'

# Current input: the updated crime export, which includes all of 2023.
file_path = '/Users/ericchestnut/Downloads/Crimes_-_2001_to_Present_20240111.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

## Cleaning Dataset

In [3]:
# Display the column labels of the raw crime frame.
df.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location'],
      dtype='object')

In [4]:
# Disabled column-pruning step: it is wrapped in a string literal, so nothing here executes
# (the cell merely echoes the string). Several labels it names (e.g. 'Zip Codes', 'Wards')
# do not appear among the columns displayed for the current export.
"""
crime_df = df.drop(columns=['Latitude', 'Longitude', 'Zip Codes', 'Wards', 'Police Beats', 'Police Districts', 'Boundaries - ZIP Codes', 'Updated On', 'X Coordinate',
       'Y Coordinate','District', 'Ward','FBI Code', 'Year','Historical Wards 2003-2015', 'Census Tracts','Community Areas','Arrest', 'Domestic',])

"""

"\ncrime_df = df.drop(columns=['Latitude', 'Longitude', 'Zip Codes', 'Wards', 'Police Beats', 'Police Districts', 'Boundaries - ZIP Codes', 'Updated On', 'X Coordinate',\n       'Y Coordinate','District', 'Ward','FBI Code', 'Year','Historical Wards 2003-2015', 'Census Tracts','Community Areas','Arrest', 'Domestic',])\n\n"

In [5]:
# Keep only the incidents that have a 'Community Area' value.
crime_df = df[df['Community Area'].notna()]

In [6]:
# Discard incidents recorded against community area 0.
crime_df = crime_df.loc[crime_df['Community Area'].ne(0)]


In [7]:
# Preview the first five rows of the filtered frame.
crime_df.head(5)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,11037294,JA371270,03/18/2015 12:00:00 PM,0000X W WACKER DR,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,BANK,False,False,...,42.0,32.0,11,,,2015,08/01/2017 03:52:26 PM,,,
1,11646293,JC213749,12/20/2018 03:00:00 PM,023XX N LOCKWOOD AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,APARTMENT,False,False,...,36.0,19.0,11,,,2018,04/06/2019 04:04:43 PM,,,
2,11645836,JC212333,05/01/2016 12:25:00 AM,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,15.0,63.0,11,,,2016,04/06/2019 04:04:43 PM,,,
3,11645959,JC211511,12/20/2018 04:00:00 PM,045XX N ALBANY AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,False,False,...,33.0,14.0,08A,,,2018,04/06/2019 04:04:43 PM,,,
4,11645601,JC212935,06/01/2014 12:01:00 AM,087XX S SANGAMON ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,21.0,71.0,11,,,2014,04/06/2019 04:04:43 PM,,,


## Creating a year-month column, then aggregating total crimes per community area and month

In [8]:
# Parse the timestamps of the filtered frame itself rather than the unfiltered `df`, so no
# work is spent on rows that were already dropped and nothing depends on index alignment.
# The explicit format matches the export's 'MM/DD/YYYY hh:mm:ss AM/PM' strings, which avoids
# per-value format inference and any day/month ambiguity. Re-running the cell is safe:
# an already-parsed datetime column is returned unchanged.
crime_df['Date'] = pd.to_datetime(crime_df['Date'], format='%m/%d/%Y %I:%M:%S %p')

# 'Month' combines year and month as a 'YYYY-MM' string, so it sorts chronologically.
crime_df['Month'] = crime_df['Date'].dt.strftime('%Y-%m')


In [9]:
# Count incidents per (community area, year, month); each group's size is its 'Total Crimes'.
grouped = crime_df.groupby(by=['Community Area', 'Year', 'Month'])
crime_counts = grouped.size().to_frame(name='Total Crimes').reset_index()
crime_counts.head(5)

Unnamed: 0,Community Area,Year,Month,Total Crimes
0,1.0,2001,2001-01,11
1,1.0,2001,2001-02,3
2,1.0,2001,2001-03,4
3,1.0,2001,2001-04,5
4,1.0,2001,2001-05,3


In [10]:
# 2001 and 2002 appear to hold incomplete data, so both years are excluded.
crime_df = crime_df[~crime_df['Year'].isin([2001, 2002])]

# Rebuild the monthly totals from the trimmed frame.
grouped = crime_df.groupby(by=['Community Area', 'Year', 'Month'])
crime_counts = grouped.size().to_frame(name='Total Crimes').reset_index()

crime_counts.tail(5)

Unnamed: 0,Community Area,Year,Month,Total Crimes
19475,77.0,2023,2023-09,315
19476,77.0,2023,2023-10,282
19477,77.0,2023,2023-11,270
19478,77.0,2023,2023-12,262
19479,77.0,2024,2024-01,22


## Creating dataframe for each type of crime 

In [11]:
# Location of the IUCR code lookup table (an absolute, machine-specific path).
# Note: this rebinds `file_path`, which previously pointed at the crime export.
file_path = '/Users/ericchestnut/Documents/Data Science Class /Project/Cleaned Data/Crime_IUCR_Codes.csv'

# Load the lookup table into its own frame.
iucr_codes = pd.read_csv(file_path)

In [12]:
# Display the IUCR lookup table.
# NOTE(review): the row-display limit is disabled, so the whole table is rendered.
iucr_codes

Unnamed: 0,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,INDEX CODE,ACTIVE
0,110,HOMICIDE,FIRST DEGREE MURDER,I,True
1,130,HOMICIDE,SECOND DEGREE MURDER,I,True
2,141,HOMICIDE,INVOLUNTARY MANSLAUGHTER,N,True
3,142,HOMICIDE,RECKLESS HOMICIDE,N,True
4,261,CRIMINAL SEXUAL ASSAULT,AGGRAVATED - HANDGUN,I,True
5,262,CRIMINAL SEXUAL ASSAULT,AGGRAVATED - OTHER FIREARM,I,True
6,263,CRIMINAL SEXUAL ASSAULT,AGGRAVATED - KNIFE / CUTTING INSTRUMENT,I,True
7,264,CRIMINAL SEXUAL ASSAULT,AGGRAVATED - OTHER DANGEROUS WEAPON,I,True
8,265,CRIMINAL SEXUAL ASSAULT,AGGRAVATED - OTHER,I,True
9,266,CRIMINAL SEXUAL ASSAULT,PREDATORY,I,True


### There are too many IUCR codes for per-code counts to be useful: most codes would have only single-digit counts in any given month. It is better to aggregate by primary description (the `Primary Type` column).

In [13]:
# Tally incidents per community area / year / month / primary type (long format).
group_keys = ['Community Area', 'Year', 'Month']
grouped = (
    crime_df
    .groupby(group_keys + ['Primary Type'])
    .size()
    .reset_index(name='Count')
)

# Spread the primary types into columns; combinations with no incidents are filled with 0.
crime_types = grouped.pivot_table(values='Count', index=group_keys, columns='Primary Type', fill_value=0)
crime_types.tail(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Primary Type,ARSON,ASSAULT,BATTERY,BURGLARY,CONCEALED CARRY LICENSE VIOLATION,CRIM SEXUAL ASSAULT,CRIMINAL DAMAGE,CRIMINAL SEXUAL ASSAULT,CRIMINAL TRESPASS,DECEPTIVE PRACTICE,...,OTHER OFFENSE,PROSTITUTION,PUBLIC INDECENCY,PUBLIC PEACE VIOLATION,RITUALISM,ROBBERY,SEX OFFENSE,STALKING,THEFT,WEAPONS VIOLATION
Community Area,Year,Month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
77.0,2023,2023-09,0.0,20.0,52.0,18.0,0.0,0.0,37.0,4.0,5.0,23.0,...,18.0,0.0,0.0,2.0,0.0,6.0,4.0,1.0,80.0,4.0
77.0,2023,2023-10,0.0,23.0,43.0,10.0,0.0,0.0,23.0,4.0,9.0,24.0,...,15.0,0.0,0.0,1.0,0.0,10.0,4.0,1.0,80.0,1.0
77.0,2023,2023-11,0.0,23.0,43.0,12.0,0.0,0.0,20.0,1.0,7.0,11.0,...,22.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,101.0,2.0
77.0,2023,2023-12,0.0,16.0,39.0,7.0,0.0,0.0,41.0,2.0,8.0,12.0,...,13.0,1.0,0.0,0.0,0.0,9.0,2.0,3.0,84.0,0.0
77.0,2024,2024-01,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0


In [14]:

# Long-format tally: one row per community area / year / month / primary type.
grouped = crime_df.groupby(['Community Area', 'Year', 'Month', 'Primary Type']).size().reset_index(name='Count')

# Wide format: one column per primary type, 0 where a type did not occur in that area and month.
# reset_index() turns the grouping keys back into ordinary columns so the frame can be uploaded as-is.
crime_types = grouped.pivot_table(index=['Community Area', 'Year', 'Month'], 
                                  columns='Primary Type', 
                                  values='Count', 
                                  fill_value=0).reset_index()

# Short preview only: the row-display limit is disabled in the setup cell, so asking for
# 500 rows would render all 500 into the notebook output.
crime_types.head(10)


Primary Type,Community Area,Year,Month,ARSON,ASSAULT,BATTERY,BURGLARY,CONCEALED CARRY LICENSE VIOLATION,CRIM SEXUAL ASSAULT,CRIMINAL DAMAGE,...,OTHER OFFENSE,PROSTITUTION,PUBLIC INDECENCY,PUBLIC PEACE VIOLATION,RITUALISM,ROBBERY,SEX OFFENSE,STALKING,THEFT,WEAPONS VIOLATION
0,1.0,2003,2003-01,2.0,28.0,91.0,42.0,0.0,2.0,73.0,...,57.0,16.0,0.0,2.0,0.0,22.0,3.0,0.0,85.0,1.0
1,1.0,2003,2003-02,0.0,32.0,86.0,30.0,0.0,4.0,50.0,...,48.0,6.0,0.0,0.0,0.0,18.0,1.0,0.0,93.0,5.0
2,1.0,2003,2003-03,1.0,27.0,122.0,54.0,0.0,4.0,90.0,...,46.0,8.0,0.0,4.0,0.0,23.0,6.0,1.0,123.0,7.0
3,1.0,2003,2003-04,0.0,29.0,137.0,36.0,0.0,5.0,87.0,...,58.0,7.0,0.0,2.0,0.0,27.0,5.0,1.0,120.0,3.0
4,1.0,2003,2003-05,0.0,35.0,137.0,52.0,0.0,1.0,102.0,...,36.0,15.0,0.0,5.0,0.0,28.0,4.0,0.0,164.0,6.0
5,1.0,2003,2003-06,2.0,28.0,116.0,52.0,0.0,3.0,86.0,...,61.0,15.0,0.0,2.0,0.0,35.0,3.0,0.0,120.0,4.0
6,1.0,2003,2003-07,0.0,30.0,114.0,51.0,0.0,1.0,85.0,...,29.0,11.0,0.0,2.0,0.0,31.0,3.0,0.0,147.0,12.0
7,1.0,2003,2003-08,0.0,52.0,150.0,49.0,0.0,1.0,101.0,...,42.0,2.0,0.0,4.0,0.0,47.0,1.0,0.0,152.0,4.0
8,1.0,2003,2003-09,2.0,36.0,118.0,37.0,0.0,1.0,78.0,...,54.0,8.0,0.0,1.0,0.0,33.0,2.0,0.0,122.0,5.0
9,1.0,2003,2003-10,1.0,32.0,111.0,27.0,0.0,1.0,85.0,...,45.0,11.0,0.0,3.0,0.0,38.0,4.0,0.0,106.0,10.0


In [15]:
# Drop crime types that have very low counts or are not of interest, plus the 'Year'
# column ('Month' already carries the year as 'YYYY-MM').
# Each label is listed exactly once ('PUBLIC PEACE VIOLATION' used to appear twice).
# NOTE(review): 'CRIMINAL SEXUAL ASSAULT' is dropped while 'CRIM SEXUAL ASSAULT' is kept.
# These look like two labels for the same category -- confirm whether they should be
# merged rather than one of them discarded.
columns_to_drop = [
    'CONCEALED CARRY LICENSE VIOLATION',
    'CRIMINAL SEXUAL ASSAULT',
    'PUBLIC INDECENCY',
    'OTHER OFFENSE',
    'PUBLIC PEACE VIOLATION',
    'RITUALISM',
    'STALKING',
    # 'NON-CRIMINAL',
    #'NON-CRIMINAL (SUBJECT SPECIFIED)',
    'OBSCENITY',
    'INTERFERENCE WITH PUBLIC OFFICER',
    'LIQUOR LAW VIOLATION',
    'INTIMIDATION',
    'OTHER NARCOTIC VIOLATION',
    #'NON - CRIMINAL',
    'HUMAN TRAFFICKING', # - Interesting but really low instances
    'Year',
]

# drop() raises KeyError if a label is missing, which surfaces typos in the list above.
crime_types = crime_types.drop(columns=columns_to_drop)


In [16]:
# Preview the per-type table after the column drop.
crime_types.head(5)

Primary Type,Community Area,Month,ARSON,ASSAULT,BATTERY,BURGLARY,CRIM SEXUAL ASSAULT,CRIMINAL DAMAGE,CRIMINAL TRESPASS,DECEPTIVE PRACTICE,...,NARCOTICS,NON - CRIMINAL,NON-CRIMINAL,NON-CRIMINAL (SUBJECT SPECIFIED),OFFENSE INVOLVING CHILDREN,PROSTITUTION,ROBBERY,SEX OFFENSE,THEFT,WEAPONS VIOLATION
0,1.0,2003-01,2.0,28.0,91.0,42.0,2.0,73.0,44.0,17.0,...,91.0,0.0,0.0,0.0,3.0,16.0,22.0,3.0,85.0,1.0
1,1.0,2003-02,0.0,32.0,86.0,30.0,4.0,50.0,28.0,14.0,...,75.0,0.0,0.0,0.0,7.0,6.0,18.0,1.0,93.0,5.0
2,1.0,2003-03,1.0,27.0,122.0,54.0,4.0,90.0,29.0,20.0,...,70.0,0.0,0.0,0.0,2.0,8.0,23.0,6.0,123.0,7.0
3,1.0,2003-04,0.0,29.0,137.0,36.0,5.0,87.0,26.0,18.0,...,83.0,0.0,0.0,0.0,5.0,7.0,27.0,5.0,120.0,3.0
4,1.0,2003-05,0.0,35.0,137.0,52.0,1.0,102.0,26.0,30.0,...,96.0,0.0,0.0,0.0,8.0,15.0,28.0,4.0,164.0,6.0


## Creating dataset for just violent crime counts

In [17]:
# Disabled earlier variant of the violent-crime aggregation (five primary types). It is
# wrapped in a string literal, so nothing here executes; the cell only echoes the string.
"""
# List of violent crime types
violent_crimes = ['ASSAULT', 'BATTERY', 'CRIMINAL SEXUAL ASSAULT', 'HOMICIDE', 'ROBBERY']

# Filter rows where 'Primary Type' is in the list of violent crimes
filtered_crime_df = crime_df[crime_df['Primary Type'].isin(violent_crimes)]

#Group crime_counts into each community area by month
grouped = filtered_crime_df.groupby(['Community Area', 'Year', 'Month'])
violent_crime = grouped.size().reset_index(name='Total Crimes')
violent_crime.rename(columns={'Total Crimes': 'Violent Crimes'}, inplace=True)
violent_crime.head(5)
"""

"\n# List of violent crime types\nviolent_crimes = ['ASSAULT', 'BATTERY', 'CRIMINAL SEXUAL ASSAULT', 'HOMICIDE', 'ROBBERY']\n\n# Filter rows where 'Primary Type' is in the list of violent crimes\nfiltered_crime_df = crime_df[crime_df['Primary Type'].isin(violent_crimes)]\n\n#Group crime_counts into each community area by month\ngrouped = filtered_crime_df.groupby(['Community Area', 'Year', 'Month'])\nviolent_crime = grouped.size().reset_index(name='Total Crimes')\nviolent_crime.rename(columns={'Total Crimes': 'Violent Crimes'}, inplace=True)\nviolent_crime.head(5)\n"

In [18]:
# Refined violent-crime dataset at community-area level

# Primary types that count as violent crime in the refined definition
violent_crimes = ['ASSAULT', 'BATTERY', 'HOMICIDE']

# Keep only the incidents whose 'Primary Type' is one of those categories
is_violent = crime_df['Primary Type'].isin(violent_crimes)
filtered_crime_df = crime_df.loc[is_violent]

# Count the violent incidents per community area, year and month
grouped = filtered_crime_df.groupby(by=['Community Area', 'Year', 'Month'])
violent_crime = grouped.size().reset_index(name='Violent Crimes')
violent_crime.head(5)

Unnamed: 0,Community Area,Year,Month,Violent Crimes
0,1.0,2003,2003-01,119
1,1.0,2003,2003-02,118
2,1.0,2003,2003-03,150
3,1.0,2003,2003-04,166
4,1.0,2003,2003-05,173


## Uploading three new datasets to database

In [19]:
# Remove the 'Year' column from the frames that are about to be uploaded
# ('Month' already carries the year as 'YYYY-MM').
#
# Each frame is rebound to a new object instead of being mutated inside a `for df in ...`
# loop: that loop variable overwrote the raw crime DataFrame `df`, so re-running any
# earlier cell that reads `df` would silently operate on the wrong data.
# errors='ignore' keeps the cell re-runnable once 'Year' has already been removed.
crime_counts = crime_counts.drop(columns='Year', errors='ignore')
violent_crime = violent_crime.drop(columns='Year', errors='ignore')

# Kept for compatibility with code that expects the list of processed frames.
dataframes = [crime_counts, violent_crime]


In [20]:
# Database connection settings.
#
# Secrets are no longer stored in the notebook: the password comes from the
# CRIME_DB_PASSWORD environment variable, or is prompted for when that is unset.
# NOTE(review): a real password was previously committed in this cell. Treat it as
# compromised and rotate it.
import os
from getpass import getpass

from sqlalchemy import create_engine

username = os.environ.get('CRIME_DB_USER', 'crimeadmin')
password = os.environ.get('CRIME_DB_PASSWORD') or getpass('Password for the crime database: ')
host = os.environ.get('CRIME_DB_HOST', 'crimedbmysql.cspoouh9lugd.us-east-2.rds.amazonaws.com')
database = os.environ.get('CRIME_DB_NAME', 'crimedb_mysql')
port = os.environ.get('CRIME_DB_PORT', '3306')  # default MySQL port



In [21]:

# Upload the community-area monthly totals (crime_counts).

from sqlalchemy import create_engine

# Build the engine from the connection settings defined in the credentials cell.
# The port is part of the URL so that a non-default `port` is honoured instead of being
# silently ignored.
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database}')

# Destination table; if_exists='replace' overwrites any existing table of that name.
new_table_name = 'Chicago_Crime_Totals'

# Write the frame without its index. The call's return value is the cell's output.
crime_counts.to_sql(new_table_name, con=engine, if_exists='replace', index=False)



19480

In [22]:

# Upload the community-area per-type table (crime_types).

from sqlalchemy import create_engine

# Build the engine from the connection settings defined in the credentials cell.
# The port is part of the URL so that a non-default `port` is honoured instead of being
# silently ignored.
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database}')

# Destination table; if_exists='replace' overwrites any existing table of that name.
new_table_name = 'Chicago_Crimes_of_each_Type'

# Write the frame without its index. The call's return value is the cell's output.
crime_types.to_sql(new_table_name, con=engine, if_exists='replace', index=False)


19480

In [23]:
# Disabled upload of `violent_crime` to 'Chicago_Violent_Crime_Totals'. It is wrapped in a
# string literal, so nothing here executes; the cell only echoes the string.
"""
#Uploading crime for violent crime 

from sqlalchemy import create_engine

# Create the database engine
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}/{database}')

# Specify the new table name here
new_table_name = 'Chicago_Violent_Crime_Totals'

# Upload the dataframe to the new table
violent_crime.to_sql(new_table_name, con=engine, if_exists='replace', index=False)
"""

"\n#Uploading crime for violent crime \n\nfrom sqlalchemy import create_engine\n\n# Create the database engine\nengine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}/{database}')\n\n# Specify the new table name here\nnew_table_name = 'Chicago_Violent_Crime_Totals'\n\n# Upload the dataframe to the new table\nviolent_crime.to_sql(new_table_name, con=engine, if_exists='replace', index=False)\n"

In [24]:
# Disabled upload of `violent_crime` to 'Chicago_Violent_REVISED'. It is wrapped in a
# string literal, so nothing here executes; the cell only echoes the string.
"""
#CREATING NEW TABLE FOR DATABASE WITH REFINED VIOLENT CRIME. USING VARIABLES THAT HAVE HIGHER CORRELATION WITH SAFETY AND TRUST SCORES


#Uploading crime for violent crime 

from sqlalchemy import create_engine

# Create the database engine
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}/{database}')

# Specify the new table name here
new_table_name = 'Chicago_Violent_REVISED'

# Upload the dataframe to the new table
violent_crime.to_sql(new_table_name, con=engine, if_exists='replace', index=False)
"""

"\n#CREATING NEW TABLE FOR DATABASE WITH REFINED VIOLENT CRIME. USING VARIABLES THAT HAVE HIGHER CORRELATION WITH SAFETY AND TRUST SCORES\n\n\n#Uploading crime for violent crime \n\nfrom sqlalchemy import create_engine\n\n# Create the database engine\nengine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}/{database}')\n\n# Specify the new table name here\nnew_table_name = 'Chicago_Violent_REVISED'\n\n# Upload the dataframe to the new table\nviolent_crime.to_sql(new_table_name, con=engine, if_exists='replace', index=False)\n"

## Creating Dataframe by Police Sector

In [25]:
# Derive 'Sector' from the police beat by zeroing its last digit (floor to a multiple of 10).
crime_df['Sector'] = crime_df['Beat'].floordiv(10).mul(10)

crime_df.head(5)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Month,Sector
0,11037294,JA371270,2015-03-18 12:00:00,0000X W WACKER DR,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,BANK,False,False,...,11,,,2015,08/01/2017 03:52:26 PM,,,,2015-03,110
1,11646293,JC213749,2018-12-20 15:00:00,023XX N LOCKWOOD AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,APARTMENT,False,False,...,11,,,2018,04/06/2019 04:04:43 PM,,,,2018-12,2510
2,11645836,JC212333,2016-05-01 00:25:00,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,11,,,2016,04/06/2019 04:04:43 PM,,,,2016-05,820
3,11645959,JC211511,2018-12-20 16:00:00,045XX N ALBANY AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,False,False,...,08A,,,2018,04/06/2019 04:04:43 PM,,,,2018-12,1720
4,11645601,JC212935,2014-06-01 00:01:00,087XX S SANGAMON ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,11,,,2014,04/06/2019 04:04:43 PM,,,,2014-06,2220


In [26]:
# Number of distinct sector codes in the crime data.
num_sectors = crime_df['Sector'].nunique()

num_sectors

76

In [27]:
# Distinct sector codes, in order of first appearance.
unique_sectors = crime_df['Sector'].unique()

# Ascending copy for display; unique_sectors itself is left untouched.
sorted_unique_sectors = unique_sectors.copy()
sorted_unique_sectors.sort()
print(sorted_unique_sectors)

[ 110  120  130  210  220  230  310  320  330  410  420  430  510  520
  530  610  620  630  710  720  730  810  820  830  910  920  930 1010
 1020 1030 1110 1120 1130 1210 1220 1230 1310 1320 1330 1410 1420 1430
 1510 1520 1530 1610 1620 1630 1650 1710 1720 1730 1810 1820 1830 1910
 1920 1930 2010 2020 2030 2110 2120 2130 2210 2220 2230 2310 2320 2330
 2410 2420 2430 2510 2520 2530]


In [28]:
# Number of distinct beat codes in the crime data.
num_beats = crime_df['Beat'].nunique()

num_beats

304

#### Both the number of beats and the number of sectors are higher than expected. The beat geometry data contains no beats or sectors for districts 13, 21, or 23 (sector codes 13xx, 21xx, and 23xx). It is unclear whether this is an error in the crime data.

In [29]:

# Sector codes (13xx, 21xx, 23xx) that are present in the crime data but absent from the
# beat geometry data.
values_to_count = [1310, 1320, 1330, 2110, 2120, 2130, 2310, 2320, 2330]

# Count rows per sector, then look up the sectors of interest. reindex (rather than .loc)
# reports 0 for a sector that has no rows instead of raising KeyError.
counts = crime_df['Sector'].value_counts().reindex(values_to_count, fill_value=0)

# Displaying the counts
print(counts)



Sector
1310    33617
1320    35103
1330    31864
2110    26289
2120    33843
2130    25759
2310    27443
2320    21657
2330    26705
Name: count, dtype: int64


In [30]:
# Build the 2017-onwards view without touching crime_df.
# The old `crime_df_2017 = crime_df` was an alias, not a copy, so assigning a column through
# it wrote straight back into crime_df.
# pd.to_datetime leaves an already-parsed column unchanged, so this also works if 'Date'
# is still text.
incident_dates = pd.to_datetime(crime_df['Date'])

# Filter the DataFrame to keep rows from 2017 onwards
crime_df_2017 = crime_df[incident_dates.dt.year >= 2017]


In [31]:
# Sector codes (13xx, 21xx, 23xx) that are absent from the beat geometry data.
values_to_count = [1310, 1320, 1330, 2110, 2120, 2130, 2310, 2320, 2330]

# Count rows per sector in the 2017+ frame. reindex yields 0 for any sector that never
# occurs, so no exception handling is needed and `counts_2017` is always freshly defined
# (a value left over from an earlier run can no longer be printed by mistake).
counts_2017 = crime_df_2017['Sector'].value_counts().reindex(values_to_count, fill_value=0)

# Report explicitly whether any of the sectors still occur.
if counts_2017.eq(0).all():
    print("None of these values in dataframe")
else:
    print(counts_2017)


None of these values in dataframe


In [32]:
# Number of distinct sector codes once only 2017+ incidents are kept.
# Note: this rebinds `num_sectors`, which earlier held the count for the full frame.
num_sectors = crime_df_2017['Sector'].nunique()

num_sectors



67

#### Removing data before 2017 resolved the issue. There must have been some beats/district changes at some point in time. Number of sectors is now correct

### Creating and uploading dataframe of total crime counts by sector

In [33]:
# Keep incidents from November 2017 (when the police sentiment data starts) through the
# end of 2023.

# pd.to_datetime leaves an already-parsed column unchanged.
incident_dates = pd.to_datetime(crime_df['Date'])

# Half-open window [2017-11-01, 2024-01-01). The previous upper bound, `<= '2023-12-31'`,
# compared timestamps against midnight and so dropped almost every incident on
# 31 December 2023.
in_window = (incident_dates >= '2017-11-01') & (incident_dates < '2024-01-01')

# 'date' duplicates 'Date'; it is still created in case anything else reads it.
# It is added after filtering so only the smaller frame is copied.
crime_df = crime_df[in_window].assign(date=incident_dates[in_window])



In [34]:
# Count incidents per police sector and month. 'Year' takes part in the grouping only and is
# removed afterwards.
grouped = crime_df.groupby(by=['Sector', 'Year', 'Month'])
crime_counts_sector = grouped.size().reset_index(name='Total Crimes').drop(columns='Year')
crime_counts_sector.head(5)

Unnamed: 0,Sector,Month,Total Crimes
0,110,2017-11,622
1,110,2017-12,620
2,110,2018-01,575
3,110,2018-02,524
4,110,2018-03,541


In [35]:
# Upload the sector-level monthly totals (crime_counts_sector).

from sqlalchemy import create_engine

# Build the engine from the connection settings defined in the credentials cell.
# The port is part of the URL so that a non-default `port` is honoured instead of being
# silently ignored.
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database}')

# Destination table; if_exists='replace' overwrites any existing table of that name.
new_table_name = 'MACHINE_LEARNING_TOTAL_CRIME'

# Write the frame without its index. The call's return value is the cell's output.
crime_counts_sector.to_sql(new_table_name, con=engine, if_exists='replace', index=False)

4958

### Creating and uploading data table for Sectors, November 2017 onward, for violent crime totals 

In [36]:
# Disabled earlier variant of the sector-level violent-crime aggregation (five primary
# types). It is wrapped in a string literal, so nothing here executes.
"""
# List of violent crime types
violent_crimes = ['ASSAULT', 'BATTERY', 'CRIMINAL SEXUAL ASSAULT', 'HOMICIDE', 'ROBBERY']

# Filter rows where 'Primary Type' is in the list of violent crimes
filtered_crime_df = crime_df[crime_df['Primary Type'].isin(violent_crimes)]

#Group crime_counts into each community area by month
grouped = filtered_crime_df.groupby(['Sector', 'Year', 'Month'])
violent_crime_sector = grouped.size().reset_index(name='Total Crimes')
violent_crime_sector.rename(columns={'Total Crimes': 'Violent Crimes'}, inplace=True)
"""


"\n# List of violent crime types\nviolent_crimes = ['ASSAULT', 'BATTERY', 'CRIMINAL SEXUAL ASSAULT', 'HOMICIDE', 'ROBBERY']\n\n# Filter rows where 'Primary Type' is in the list of violent crimes\nfiltered_crime_df = crime_df[crime_df['Primary Type'].isin(violent_crimes)]\n\n#Group crime_counts into each community area by month\ngrouped = filtered_crime_df.groupby(['Sector', 'Year', 'Month'])\nviolent_crime_sector = grouped.size().reset_index(name='Total Crimes')\nviolent_crime_sector.rename(columns={'Total Crimes': 'Violent Crimes'}, inplace=True)\n"

In [37]:
# Refined violent-crime dataset at sector level: only the primary types listed below are
# counted (the types removed were those without high correlation with trust and safety scores).

# Primary types that count as violent crime in the refined definition
violent_crimes = ['ASSAULT', 'BATTERY', 'HOMICIDE']

# Keep only the incidents whose 'Primary Type' is one of those categories
is_violent = crime_df['Primary Type'].isin(violent_crimes)
filtered_crime_df = crime_df.loc[is_violent]

# Count the violent incidents per sector, year and month
grouped = filtered_crime_df.groupby(by=['Sector', 'Year', 'Month'])
violent_crime_sector = grouped.size().reset_index(name='Violent Crimes')

In [38]:
# Remove the 'Year' column from the sector-level violent-crime table, then preview it.
violent_crime_sector = violent_crime_sector.drop(columns=['Year'])
violent_crime_sector.head(5)

Unnamed: 0,Sector,Month,Violent Crimes
0,110,2017-11,66
1,110,2017-12,52
2,110,2018-01,74
3,110,2018-02,73
4,110,2018-03,84


In [39]:
# Disabled upload of the unmodified violent-crime table to 'MACHINE_LEARNING_VIOLENT_CRIME'.
# It is wrapped in a string literal, so nothing here executes; the cell only echoes the string.
"""
from sqlalchemy import create_engine

# Create the database engine
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}/{database}')

# Specify the new table name here
new_table_name = 'MACHINE_LEARNING_VIOLENT_CRIME'

# Upload the dataframe to the new table
violent_crime_sector.to_sql(new_table_name, con=engine, if_exists='replace', index=False)
"""

"\nfrom sqlalchemy import create_engine\n\n# Create the database engine\nengine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}/{database}')\n\n# Specify the new table name here\nnew_table_name = 'MACHINE_LEARNING_VIOLENT_CRIME'\n\n# Upload the dataframe to the new table\nviolent_crime_sector.to_sql(new_table_name, con=engine, if_exists='replace', index=False)\n"

In [43]:
# Upload the refined sector-level violent-crime table (violent_crime_sector).

from sqlalchemy import create_engine

# Build the engine from the connection settings defined in the credentials cell.
# The port is part of the URL so that a non-default `port` is honoured instead of being
# silently ignored.
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database}')

# Destination table; if_exists='replace' overwrites any existing table of that name.
new_table_name = 'MACHINE_LEARNING_VIOLENT_REVISED'

# Write the frame without its index. The call's return value is the cell's output.
violent_crime_sector.to_sql(new_table_name, con=engine, if_exists='replace', index=False)

4958

### Creating and uploading data table for all crime types

In [41]:

# Long-format tally: one row per sector / year / month / primary type.
grouped = crime_df.groupby(['Sector', 'Year', 'Month', 'Primary Type']).size().reset_index(name='Count')

# Wide format: one column per primary type, 0 where a type did not occur in that sector and month.
# Note: this rebinds `crime_types`, which earlier held the community-area version of the table.
# NOTE(review): unlike the other sector tables, 'Year' is kept here and no crime types
# are dropped -- confirm that this is intended.
crime_types = grouped.pivot_table(index=['Sector', 'Year', 'Month'], 
                                  columns='Primary Type', 
                                  values='Count', 
                                  fill_value=0).reset_index()

# Short preview only: the row-display limit is disabled in the setup cell, so asking for
# 500 rows would render all 500 into the notebook output.
crime_types.head(10)


Primary Type,Sector,Year,Month,ARSON,ASSAULT,BATTERY,BURGLARY,CONCEALED CARRY LICENSE VIOLATION,CRIM SEXUAL ASSAULT,CRIMINAL DAMAGE,...,OTHER OFFENSE,PROSTITUTION,PUBLIC INDECENCY,PUBLIC PEACE VIOLATION,RITUALISM,ROBBERY,SEX OFFENSE,STALKING,THEFT,WEAPONS VIOLATION
0,110,2017,2017-11,0.0,25.0,41.0,3.0,0.0,2.0,15.0,...,12.0,0.0,0.0,3.0,0.0,28.0,2.0,0.0,378.0,0.0
1,110,2017,2017-12,0.0,22.0,30.0,3.0,0.0,1.0,18.0,...,11.0,0.0,0.0,4.0,0.0,19.0,3.0,0.0,382.0,0.0
2,110,2018,2018-01,0.0,16.0,58.0,0.0,0.0,0.0,13.0,...,12.0,0.0,0.0,0.0,0.0,24.0,2.0,0.0,328.0,2.0
3,110,2018,2018-02,0.0,30.0,43.0,2.0,0.0,2.0,26.0,...,14.0,0.0,0.0,4.0,0.0,21.0,0.0,0.0,272.0,1.0
4,110,2018,2018-03,0.0,33.0,51.0,1.0,0.0,5.0,16.0,...,13.0,0.0,0.0,2.0,0.0,17.0,3.0,0.0,270.0,2.0
5,110,2018,2018-04,0.0,26.0,38.0,0.0,0.0,2.0,24.0,...,17.0,0.0,0.0,3.0,0.0,22.0,1.0,1.0,331.0,0.0
6,110,2018,2018-05,0.0,30.0,49.0,9.0,0.0,0.0,23.0,...,14.0,0.0,1.0,2.0,0.0,16.0,2.0,1.0,383.0,0.0
7,110,2018,2018-06,0.0,19.0,48.0,1.0,0.0,2.0,32.0,...,10.0,0.0,0.0,7.0,0.0,22.0,2.0,0.0,341.0,1.0
8,110,2018,2018-07,1.0,26.0,65.0,2.0,0.0,3.0,22.0,...,12.0,0.0,0.0,3.0,0.0,18.0,5.0,0.0,332.0,2.0
9,110,2018,2018-08,1.0,21.0,75.0,4.0,0.0,3.0,20.0,...,23.0,0.0,0.0,5.0,0.0,17.0,2.0,0.0,488.0,0.0


In [42]:
# Upload the sector-level per-type table (crime_types).

from sqlalchemy import create_engine

# Build the engine from the connection settings defined in the credentials cell.
# The port is part of the URL so that a non-default `port` is honoured instead of being
# silently ignored.
engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database}')

# Destination table; if_exists='replace' overwrites any existing table of that name.
new_table_name = 'MACHINE_LEARNING_CRIME_TYPES'

# Write the frame without its index. The call's return value is the cell's output.
crime_types.to_sql(new_table_name, con=engine, if_exists='replace', index=False)


4958