In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

#### Police Sentiment Data
Get average safety and trust scores for each district per month and year.

In [2]:
police_df_cleaned = pd.read_csv('/Users/Emi/Documents/GitHub/Crime-Analysis/police_df_cleaned.csv')
crime_df = pd.read_csv('Chicago_Crime_Data_Cleaned.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Emi/Documents/GitHub/Crime-Analysis/police_df_cleaned.csv'

In [None]:
crime_df.head()

In [None]:
#Keeping only overall safety and trust scores to start, we can add more later if necessary
police_df_filtered = police_df_cleaned[['DISTRICT','SAFETY','TRUST','START_DATE','END_DATE']]

In [None]:
crime_df.District = crime_df.District.astype('str')
len(list(crime_df.District.unique()))

In [None]:
police_df_filtered.DISTRICT = police_df_filtered.DISTRICT.astype('str')
len(list(police_df_filtered.DISTRICT.unique()))

In [None]:
police_df_filtered['START_DATE'] = pd.to_datetime(police_df_filtered['START_DATE'])
police_df_filtered['END_DATE'] = pd.to_datetime(police_df_filtered['END_DATE'])

In [None]:
(police_df_filtered['END_DATE'] - police_df_filtered['START_DATE']).dt.days.value_counts()
#These all seem to show a time period of one month, so we can just create columns for the Month and Year

In [None]:
police_df_filtered['Year_Month'] = police_df_filtered['START_DATE'].dt.to_period('M')

In [None]:
police_df_filtered.drop(columns=['START_DATE','END_DATE'])

In [None]:
police_df_filtered.head()

There are multiple safety and trust scores for each district since we removed sectors, so we can take the average scores for each district

In [None]:
mean_sentiment_scores = police_df_filtered.groupby(['DISTRICT','Year_Month'])['SAFETY','TRUST'].mean().round(2).reset_index()

In [None]:
mean_sentiment_scores[mean_sentiment_scores.DISTRICT.eq('1.0')]

In [None]:
mean_sentiment_scores.head()

In [None]:
crime_df['Date'] = pd.to_datetime(crime_df['Date'])

In [None]:
crime_df['Year_Month'] = crime_df['Date'].dt.to_period('M')

In [None]:
mean_sentiment_scores.rename(columns={'SAFETY':'Police Safety Score',
                                      'TRUST':'Police Trust Score',
                                      'DISTRICT':'Police Districts'},inplace=True)

In [None]:
crime_df['Police Districts'] = crime_df['Police Districts'].astype('str')

In [None]:
crime_and_police = pd.merge(crime_df,mean_sentiment_scores,how='left',on=['Police Districts','Year_Month'])
crime_and_police.head(5)

#### Grocery Store Data
This data apparently only applies to 2013 so I'm not sure if it will be useful for our overall model. If we wanted to do a snapshot analysis of 2013, we still have time series data that could be frequent enough for analysis. We can discuss this more.

In [None]:
grocery_stores = pd.read_csv('Grocery_Stores_-_2013_20231109.csv')
#Information about dataset here: https://github.com/Chicago/food-deserts

We could identify which areas are food deserts and create a 'Y'/'N' column for this using distance between the crime location and a grocery store. Food deserts are defined as areas in Chicago which are more than 0.5 or 1 mile from a grocery store, depending on the grocery store size. This is represented by the 'A' or 'B' buffer size. We could also represent this as the count of grocery stores within 0.5 or 1 mile.

We could also simply provide the count of grocery stores for each community area, but we would probably need to control for population size using census data that can give us population numbers by Community Area. I think we could try this first, see if any correlation exists, and if one does, we can investigate further and focus in on food deserts.

In [None]:
grocery_stores.columns.values

In [None]:
len(grocery_stores['COMMUNITY AREA'].unique())

In [None]:
grocery_store_count = grocery_stores.groupby(['COMMUNITY AREA']).agg(Grocery_Store_Count=('STORE NAME','count')).reset_index()

In [None]:
merged_crime_df = pd.merge(crime_and_police, grocery_store_count, how='left',left_on='Community Area',right_on='COMMUNITY AREA')

In [None]:
merged_crime_df.drop(columns='COMMUNITY AREA',inplace=True)

In [None]:
merged_crime_df.rename(columns={'Grocery_Store_Count':'Grocery Stores per Community Area'},inplace=True)

#### Housing Data
This data is clean enough and we can provide the number of affordable housing units per community area, but since there is no time column, I'm unsure how we could use this to train our model. This data was updated in October 2023, but there is no easy way to tell when each apartment/unit was built, so these numbers would likely only be reliable for the past 5-10 years. 

The grocery store data presents the same problem. We may need to think about separate analyses for time series vs. regression since some of our independent variables are so time restricted.

In [None]:
housing_df = pd.read_csv('Affordable_Rental_Housing_Developments_20231109.csv')

In [None]:
housing_df.rename(columns={'Community Area Number':'Community Area'}, inplace=True)

In [None]:
housing_df.head()

In [None]:
affordable_housing_units = housing_df.groupby('Community Area')['Units'].sum().reset_index()

In [None]:
affordable_housing_units.rename(columns={'Units':'Affordable Housing Units per Community Area',
                                        'Police Safety Score':'Police Safety Score per Community Area',
                                        'Police Trust Score': 'Police Trust Score per Community Area'}
                                        ,inplace=True)

In [None]:
merged_crime_df = pd.merge(merged_crime_df,affordable_housing_units, how='left',on='Community Area')

In [None]:
merged_crime_df.head(0)

#### Census Data

In [None]:
census_df = pd.read_csv('Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012_20231109.csv')

In [None]:
census_df.shape

In [None]:
len(census_df['Community Area Number'].unique())

In [None]:
census_df.drop(columns='COMMUNITY AREA NAME',inplace=True)

In [None]:
census_df.columns = census_df.columns.str.lower() + " per Community Area"
census_df.columns  = census_df.columns.str.title()
census_df.rename(columns={'Community Area Number Per Community Area': 'Community Area'},inplace=True)

In [None]:
merged_crime_df = pd.merge(merged_crime_df, census_df, how='left', on='Community Area')

### Train Station Data

In [None]:
train_station_df = pd.read_csv('train_stops_converted.csv')

In [None]:
train_station_df.columns.values

In [None]:
train_station_df['Month'] = pd.to_datetime(train_station_df['Month'])

In [None]:
train_station_df['Year_Month'] = train_station_df['Month'].dt.to_period('M')

In [None]:
average_train_stats = train_station_df.groupby(['Comm_Num','Year_Month'])['Avg_Weekday_Rides','Avg_Saturday_Rides',
                        'Avg_Sunday/Holiday_Rides','Monthly_Total'].mean().reset_index()

In [None]:
average_train_stats.rename(columns={'Comm_Num':'Community Area'},inplace=True)

In [None]:
len(average_train_stats['Community Area'].unique())

In [None]:
merged_crime_df = pd.merge(merged_crime_df, average_train_stats, how='left', on=['Community Area','Year_Month'])

In [None]:
#merged_crime_df.to_csv('merged_crime_df.csv',index=False)

In [4]:
file_path = '/Users/ericchestnut/Documents/Data Science Class /Project/Cleaned Data/Police_Sentiment_Scores.csv'

police_df = pd.read_csv(file_path)


In [17]:
police_df = police_df.dropna(subset=['SECTOR'])
police_df

Unnamed: 0,AREA,DISTRICT,SECTOR,SAFETY,S_RACE_AFRICAN_AMERICAN,S_RACE_ASIAN_AMERICAN,S_RACE_HISPANIC,S_RACE_WHITE,S_RACE_OTHER,S_AGE_LOW,S_AGE_MEDIUM,S_AGE_HIGH,S_SEX_FEMALE,S_SEX_MALE,S_EDUCATION_LOW,S_EDUCATION_MEDIUM,S_EDUCATION_HIGH,S_INCOME_LOW,S_INCOME_MEDIUM,S_INCOME_HIGH,TRUST,T_RACE_AFRICAN_AMERICAN,T_RACE_ASIAN_AMERICAN,T_RACE_HISPANIC,T_RACE_WHITE,T_RACE_OTHER,T_AGE_LOW,T_AGE_MEDIUM,T_AGE_HIGH,T_SEX_FEMALE,T_SEX_MALE,T_EDUCATION_LOW,T_EDUCATION_MEDIUM,T_EDUCATION_HIGH,T_INCOME_LOW,T_INCOME_MEDIUM,T_INCOME_HIGH,T_LISTEN,T_LISTEN_RACE_AFRICAN_AMERICAN,T_LISTEN_RACE_ASIAN_AMERICAN,T_LISTEN_RACE_HISPANIC,T_LISTEN_RACE_WHITE,T_LISTEN_RACE_OTHER,T_LISTEN_AGE_LOW,T_LISTEN_AGE_MEDIUM,T_LISTEN_AGE_HIGH,T_LISTEN_SEX_FEMALE,T_LISTEN_SEX_MALE,T_LISTEN_EDUCATION_LOW,T_LISTEN_EDUCATION_MEDIUM,T_LISTEN_EDUCATION_HIGH,T_LISTEN_INCOME_LOW,T_LISTEN_INCOME_MEDIUM,T_LISTEN_INCOME_HIGH,T_RESPECT,T_RESPECT_RACE_AFRICAN_AMERICAN,T_RESPECT_RACE_ASIAN_AMERICAN,T_RESPECT_RACE_HISPANIC,T_RESPECT_RACE_WHITE,T_RESPECT_RACE_OTHER,T_RESPECT_AGE_LOW,T_RESPECT_AGE_MEDIUM,T_RESPECT_AGE_HIGH,T_RESPECT_SEX_FEMALE,T_RESPECT_SEX_MALE,T_RESPECT_EDUCATION_LOW,T_RESPECT_EDUCATION_MEDIUM,T_RESPECT_EDUCATION_HIGH,T_RESPECT_INCOME_LOW,T_RESPECT_INCOME_MEDIUM,T_RESPECT_INCOME_HIGH,START_DATE,END_DATE
0,area_5,14.0,1420.0,56.69,40.27,58.65,61.66,60.06,44.52,48.83,65.43,47.35,65.63,45.56,63.39,58.57,47.30,57.60,56.49,57.74,60.90,57.83,64.16,76.54,40.49,84.10,57.72,53.94,68.25,57.65,65.77,65.63,53.16,72.49,68.69,68.18,44.89,58.82,52.71,62.63,73.52,38.62,81.83,55.60,52.69,66.64,55.97,63.12,63.91,51.84,69.33,68.33,65.39,42.14,62.99,62.96,65.70,79.56,42.37,86.37,59.85,55.18,69.87,59.33,68.41,67.35,54.49,75.65,69.04,70.98,47.64,2021-04-01,2021-04-30
1,area_2,5.0,510.0,35.51,32.37,43.24,41.55,48.80,57.43,34.53,34.21,49.43,38.51,36.03,33.98,32.20,43.14,40.45,22.57,51.54,47.52,36.04,67.34,58.50,75.51,73.10,53.69,51.99,51.94,60.90,46.71,49.25,39.71,65.96,42.03,49.46,69.18,44.59,34.34,66.00,52.18,74.94,71.37,49.28,50.82,49.60,55.15,45.16,40.79,38.86,64.12,39.73,46.37,67.23,50.46,37.73,68.69,64.83,76.08,74.84,58.10,53.16,54.28,66.65,48.26,57.72,40.56,67.80,44.34,52.56,71.14,2021-04-01,2021-04-30
2,area_4,11.0,1110.0,42.63,41.12,41.60,39.17,52.04,61.11,47.86,54.41,27.39,41.09,44.90,42.56,41.88,46.41,38.33,41.77,43.38,46.38,35.31,58.47,54.72,40.01,58.34,50.37,53.67,36.67,46.45,38.51,61.26,36.87,41.59,42.65,49.07,42.79,41.94,30.57,56.98,50.75,38.25,56.08,45.70,51.14,32.10,44.76,30.49,53.88,35.13,38.13,40.89,42.59,37.01,50.82,40.05,59.95,58.69,41.77,60.60,55.04,56.20,41.24,48.13,46.52,68.64,38.61,45.05,44.41,55.56,48.57,2021-04-01,2021-04-30
3,area_1,9.0,930.0,46.02,37.58,36.90,49.17,50.06,55.76,45.34,38.00,51.73,42.72,44.47,49.92,42.36,37.39,47.72,47.91,33.79,63.57,51.23,63.81,63.01,86.34,54.79,58.43,66.15,68.76,68.63,64.91,57.84,72.11,71.57,60.55,69.56,73.26,59.67,48.34,64.39,57.33,84.45,50.21,53.61,61.62,66.46,64.71,61.77,53.66,69.19,67.65,56.53,66.57,70.05,67.48,54.12,63.24,68.69,88.24,59.37,63.24,70.69,71.07,72.54,68.04,62.02,75.03,75.50,64.57,72.54,76.47,2021-04-01,2021-04-30
4,area_5,16.0,1620.0,55.97,57.84,51.32,59.89,53.82,48.03,62.47,54.67,52.92,55.61,55.31,60.94,55.75,48.57,57.74,51.88,63.00,79.77,72.23,85.09,82.14,77.07,92.10,88.24,78.13,79.14,76.45,82.68,80.18,79.26,77.36,78.38,79.16,79.19,78.90,69.03,83.93,81.75,75.75,91.76,89.18,77.58,77.56,75.52,81.66,79.72,78.23,75.86,76.23,78.77,78.12,80.65,75.43,86.26,82.53,78.40,92.43,87.31,78.69,80.72,77.37,83.70,80.64,80.29,78.86,80.53,79.55,80.26,2021-04-01,2021-04-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6455,area_5,17.0,1710.0,64.68,59.89,72.58,56.43,70.55,60.76,66.31,64.86,62.45,62.18,68.06,64.92,61.17,69.33,48.90,65.74,75.10,66.93,61.56,71.72,67.96,67.82,60.62,65.71,66.43,69.55,67.26,66.40,65.63,63.10,78.80,54.95,66.56,76.05,65.50,60.99,68.22,70.95,64.45,61.45,66.74,60.86,67.60,65.35,66.03,66.67,61.33,74.45,58.64,64.56,71.07,68.35,62.12,75.23,64.97,71.20,59.79,64.69,72.00,71.50,69.16,66.77,64.60,64.87,83.15,51.26,68.56,81.03,2023-09-01,2023-09-30
6456,area_3,12.0,1210.0,66.53,69.47,65.76,55.82,68.04,74.48,65.44,69.22,59.09,62.56,72.23,56.51,67.05,73.15,73.68,61.01,70.06,63.30,57.57,64.51,55.13,65.83,50.95,56.86,72.90,69.77,55.49,73.27,54.19,61.57,76.07,64.39,59.35,64.11,62.91,56.77,60.63,56.58,65.13,46.76,56.38,73.76,69.87,54.75,73.84,54.97,61.45,73.42,65.60,60.38,60.73,63.68,58.38,68.39,53.68,66.52,55.13,57.34,72.04,69.68,56.23,72.69,53.40,61.70,78.71,63.19,58.31,67.48,2023-09-01,2023-09-30
6457,area_3,1.0,110.0,69.07,66.12,61.83,60.71,73.42,81.56,70.23,71.89,63.56,63.24,74.26,78.94,63.29,74.35,50.94,71.94,73.83,73.06,72.00,62.37,61.43,77.58,56.66,75.88,74.49,63.23,70.05,73.96,72.24,65.48,81.44,57.80,65.32,87.90,72.01,76.29,56.65,61.19,76.60,49.00,74.65,75.42,61.24,66.46,75.16,77.06,62.61,80.67,55.96,63.36,89.31,74.10,67.71,68.08,61.67,78.55,64.32,77.11,73.56,65.23,73.64,72.76,67.43,68.36,82.20,59.65,67.27,86.49,2023-09-01,2023-09-30
6458,area_5,16.0,1620.0,65.99,45.71,62.55,64.75,67.13,57.25,70.91,66.51,57.34,65.11,67.67,67.90,64.19,73.33,61.26,62.29,78.03,71.75,59.29,74.63,73.04,72.15,54.14,70.08,73.14,68.06,71.11,72.63,76.47,70.00,75.22,63.15,71.98,78.73,69.41,63.43,70.23,70.23,70.27,53.35,69.49,71.22,64.29,68.65,70.41,71.57,68.89,73.30,60.75,69.90,75.74,74.08,55.14,79.04,75.84,74.03,54.94,70.66,75.06,71.83,73.56,74.84,81.37,71.11,77.14,65.54,74.05,81.72,2023-09-01,2023-09-30


In [24]:
unique_sectors = police_df['SECTOR'].unique()


sorted_unique_sectors = np.sort(unique_sectors)

print(sorted_unique_sectors)


[ 110.  120.  130.  210.  220.  230.  310.  320.  330.  410.  420.  430.
  510.  520.  530.  610.  620.  630.  710.  720.  730.  810.  820.  830.
  910.  920.  930. 1010. 1020. 1030. 1110. 1120. 1130. 1210. 1220. 1230.
 1410. 1420. 1430. 1510. 1520. 1530. 1610. 1620. 1630. 1650. 1710. 1720.
 1730. 1810. 1820. 1830. 1910. 1920. 1930. 2010. 2020. 2030. 2210. 2220.
 2230. 2410. 2420. 2430. 2510. 2520. 2530. 2540. 2550.]


In [8]:
file_path = '/Users/ericchestnut/Downloads/PoliceBeatDec2012.csv'
beats_df = pd.read_csv(file_path)

In [9]:
beats_df

Unnamed: 0,the_geom,DISTRICT,SECTOR,BEAT,BEAT_NUM
0,MULTIPOLYGON (((-87.7047252651434 41.975774430...,17,1,1,1713
1,MULTIPOLYGON (((-87.83365455041093 41.97535481...,31,0,0,3100
2,MULTIPOLYGON (((-87.90684167275818 41.97656175...,16,5,5,1651
3,MULTIPOLYGON (((-87.64491798475646 41.96972709...,19,1,1,1914
4,MULTIPOLYGON (((-87.63724132684592 41.96598776...,19,1,1,1915
5,MULTIPOLYGON (((-87.65967036145184 41.96902531...,19,1,1,1913
6,MULTIPOLYGON (((-87.66389849134416 41.76855169...,7,3,3,735
7,MULTIPOLYGON (((-87.66749666157422 41.96890048...,19,1,1,1912
8,MULTIPOLYGON (((-87.71336307475804 41.96840083...,17,2,2,1723
9,MULTIPOLYGON (((-87.67918558275845 41.96875617...,19,1,1,1911


In [23]:
unique_sectors = beats_df['BEAT_NUM'].unique()

sorted_unique_sectors = np.sort(unique_sectors)

print(sorted_unique_sectors)

[ 111  112  113  114  121  122  123  124  131  132  133  211  212  213
  214  215  221  222  223  224  225  231  232  233  234  235  311  312
  313  314  321  322  323  324  331  332  333  334  411  412  413  414
  421  422  423  424  431  432  433  434  511  512  513  522  523  524
  531  532  533  611  612  613  614  621  622  623  624  631  632  633
  634  711  712  713  714  715  722  723  724  725  726  731  732  733
  734  735  811  812  813  814  815  821  822  823  824  825  831  832
  833  834  835  911  912  913  914  915  921  922  923  924  925  931
  932  933  934  935 1011 1012 1013 1014 1021 1022 1023 1024 1031 1032
 1033 1034 1111 1112 1113 1114 1115 1121 1122 1123 1124 1125 1131 1132
 1133 1134 1135 1211 1212 1213 1214 1215 1221 1222 1223 1224 1225 1231
 1232 1233 1234 1235 1411 1412 1413 1414 1421 1422 1423 1424 1431 1432
 1433 1434 1511 1512 1513 1522 1523 1524 1531 1532 1533 1611 1612 1613
 1614 1621 1622 1623 1624 1631 1632 1633 1634 1651 1652 1653 1654 1655
 1711 