In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
# Notebook display limits: up to 500 rows, and no cap on the number of columns.
pd.set_option('display.max_rows', 500, 'display.max_columns', None)

#### Police Sentiment Data
Get average safety and trust scores for each district per month and year.

In [25]:
"""
police_df_cleaned = pd.read_csv('/Users/Emi/Documents/GitHub/Crime-Analysis/police_df_cleaned.csv')
crime_df = pd.read_csv('Chicago_Crime_Data_Cleaned.csv')
"""

"\npolice_df_cleaned = pd.read_csv('/Users/Emi/Documents/GitHub/Crime-Analysis/police_df_cleaned.csv')\ncrime_df = pd.read_csv('Chicago_Crime_Data_Cleaned.csv')\n"

In [None]:
# Show the first five rows of the crime data.
crime_df.head(n=5)

In [None]:
# Keep only the overall safety and trust scores to start; more columns can be
# added later if necessary.
# `.copy()` makes this an independent frame, so the column assignments in the
# following cells modify it directly instead of writing to a slice of
# `police_df_cleaned` (which triggers SettingWithCopyWarning).
police_df_filtered = police_df_cleaned[['DISTRICT','SAFETY','TRUST','START_DATE','END_DATE']].copy()

In [None]:
# Cast the crime district identifiers to strings, then count the distinct values.
crime_df['District'] = crime_df['District'].astype('str')
len(crime_df['District'].unique())

In [None]:
# Cast the police district identifiers to strings, then count the distinct values.
police_df_filtered['DISTRICT'] = police_df_filtered['DISTRICT'].astype('str')
len(police_df_filtered['DISTRICT'].unique())

In [None]:
# Parse both survey-window boundary columns into datetimes.
for date_col in ('START_DATE', 'END_DATE'):
    police_df_filtered[date_col] = pd.to_datetime(police_df_filtered[date_col])

In [None]:
# Frequency of each survey-window length, in days (END_DATE minus START_DATE).
# These all seem to show a time period of one month, so we can just create
# columns for the Month and Year.
police_df_filtered['END_DATE'].sub(police_df_filtered['START_DATE']).dt.days.value_counts()

In [None]:
# Monthly period derived from the start of each survey window.
police_df_filtered['Year_Month'] = police_df_filtered['START_DATE'].dt.to_period(freq='M')

In [None]:
# Remove the raw date columns now that `Year_Month` carries the time key.
# `drop` returns a new frame, so the result has to be assigned back; previously
# it was discarded and the columns were never removed.
# `errors='ignore'` keeps the cell safe to re-run after the columns are gone.
police_df_filtered = police_df_filtered.drop(columns=['START_DATE','END_DATE'], errors='ignore')

In [None]:
# Show the first five rows of the filtered police sentiment data.
police_df_filtered.head(n=5)

There are multiple safety and trust scores for each district because we removed the sector-level detail, so we take the average score for each district and month.

In [None]:
# Average the safety and trust scores for every (district, month) pair,
# rounded to two decimals.
# The columns are selected with a list (`[[...]]`): selecting several columns
# from a groupby with a bare tuple was deprecated and raises in pandas >= 2.0.
mean_sentiment_scores = (
    police_df_filtered
    .groupby(['DISTRICT', 'Year_Month'])[['SAFETY', 'TRUST']]
    .mean()
    .round(2)
    .reset_index()
)

In [None]:
# Spot-check the monthly averages for the district labelled '1.0'.
mean_sentiment_scores.loc[mean_sentiment_scores['DISTRICT'] == '1.0']

In [None]:
# Show the first five rows of the averaged sentiment scores.
mean_sentiment_scores.head(n=5)

In [None]:
# Parse the crime timestamp column into datetimes.
crime_df['Date'] = crime_df['Date'].pipe(pd.to_datetime)

In [None]:
# Monthly period for each crime record, used as a join key with the monthly datasets.
crime_df['Year_Month'] = crime_df['Date'].dt.to_period(freq='M')

In [None]:
# Give the sentiment columns their final names; 'Police Districts' matches the
# join key used on the crime data.
mean_sentiment_scores = mean_sentiment_scores.rename(
    columns={
        'SAFETY': 'Police Safety Score',
        'TRUST': 'Police Trust Score',
        'DISTRICT': 'Police Districts',
    }
)

In [None]:
# Cast the crime-side join key to string so it has the same type as the sentiment-side key.
crime_df = crime_df.astype({'Police Districts': 'str'})

In [None]:
# Attach the monthly district sentiment scores to every crime record.
# A left join keeps crimes that have no matching score.
crime_and_police = crime_df.merge(
    mean_sentiment_scores,
    how='left',
    on=['Police Districts', 'Year_Month'],
)
crime_and_police.head()

#### Grocery Store Data
This data apparently only applies to 2013 so I'm not sure if it will be useful for our overall model. If we wanted to do a snapshot analysis of 2013, we still have time series data that could be frequent enough for analysis. We can discuss this more.

In [None]:
# Information about dataset here: https://github.com/Chicago/food-deserts
grocery_stores = pd.read_csv(filepath_or_buffer='Grocery_Stores_-_2013_20231109.csv')

We could identify which areas are food deserts and create a 'Y'/'N' column for this using distance between the crime location and a grocery store. Food deserts are defined as areas in Chicago which are more than 0.5 or 1 mile from a grocery store, depending on the grocery store size. This is represented by the 'A' or 'B' buffer size. We could also represent this as the count of grocery stores within 0.5 or 1 mile.

We could also simply provide the count of grocery stores for each community area, but we would probably need to control for population size using census data that can give us population numbers by Community Area. I think we could try this first, see if any correlation exists, and if one does, we can investigate further and focus in on food deserts.

In [None]:
# List the column names of the grocery store dataset.
grocery_stores.columns.values

In [None]:
# Number of distinct community-area values present in the grocery store data.
len(pd.unique(grocery_stores['COMMUNITY AREA']))

In [None]:
# Count the non-null store names in each community area.
grocery_store_count = (
    grocery_stores
    .groupby('COMMUNITY AREA')['STORE NAME']
    .count()
    .reset_index(name='Grocery_Store_Count')
)

In [None]:
# Add the per-community-area grocery store counts to the crime records (left join).
merged_crime_df = crime_and_police.merge(
    grocery_store_count,
    how='left',
    left_on='Community Area',
    right_on='COMMUNITY AREA',
)

In [None]:
# The right-hand join key duplicates 'Community Area'; remove it.
merged_crime_df = merged_crime_df.drop(columns='COMMUNITY AREA')

In [None]:
# Use a descriptive final name for the grocery store count column.
merged_crime_df = merged_crime_df.rename(
    columns={'Grocery_Store_Count': 'Grocery Stores per Community Area'}
)

#### Housing Data
This data is clean enough and we can provide the number of affordable housing units per community area, but since there is no time column, I'm unsure how we could use this to train our model. This data was updated in October 2023, but there is no easy way to tell when each apartment/unit was built, so these numbers would likely only be reliable for the past 5-10 years. 

The grocery store data presents the same problem. We may need to think about separate analyses for time series vs. regression since some of our independent variables are so time restricted.

In [None]:
# Load the affordable rental housing developments dataset.
housing_df = pd.read_csv(filepath_or_buffer='Affordable_Rental_Housing_Developments_20231109.csv')

In [None]:
# Align the join key name with the crime data.
housing_df = housing_df.rename(columns={'Community Area Number': 'Community Area'})

In [None]:
# Show the first five rows of the housing data.
housing_df.head(n=5)

In [None]:
# Total the 'Units' column within each community area.
affordable_housing_units = housing_df.groupby('Community Area', as_index=False)['Units'].sum()

In [None]:
# Give the unit total its final, descriptive name.
# The earlier mapping also listed 'Police Safety Score' and 'Police Trust Score',
# but this frame only holds 'Community Area' and 'Units', so those entries never
# matched anything and have been removed.
affordable_housing_units = affordable_housing_units.rename(
    columns={'Units': 'Affordable Housing Units per Community Area'}
)

In [None]:
# Add the affordable housing unit totals to the crime records (left join on community area).
merged_crime_df = merged_crime_df.merge(
    affordable_housing_units,
    how='left',
    on='Community Area',
)

In [None]:
# An empty slice displays just the column headers of the merged frame.
merged_crime_df.iloc[:0]

#### Census Data

In [None]:
# Load the 2008-2012 selected socioeconomic indicators census extract.
census_df = pd.read_csv(
    filepath_or_buffer='Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012_20231109.csv'
)

In [None]:
# (number of rows, number of columns) of the census data.
(len(census_df.index), len(census_df.columns))

In [None]:
# Number of distinct community-area numbers present in the census data.
len(pd.unique(census_df['Community Area Number']))

In [None]:
# Remove the community area name column; the join below uses the area number.
census_df = census_df.drop(columns='COMMUNITY AREA NAME')

In [None]:
# Normalise the indicator names: lower-case, append the geographic level, then title-case.
census_df.columns = (census_df.columns.str.lower() + " per Community Area").str.title()
# The join key picked up the suffix as well; restore its plain name.
census_df = census_df.rename(
    columns={'Community Area Number Per Community Area': 'Community Area'}
)

In [None]:
# Add the census indicators to the crime records (left join on community area).
merged_crime_df = merged_crime_df.merge(census_df, how='left', on='Community Area')

#### Train Station Data

In [None]:
# Load the train stop ridership data.
train_station_df = pd.read_csv(filepath_or_buffer='train_stops_converted.csv')

In [None]:
# List the column names of the train station dataset.
train_station_df.columns.values

In [None]:
# Parse the 'Month' column into datetimes.
train_station_df['Month'] = train_station_df['Month'].pipe(pd.to_datetime)

In [None]:
# Monthly period used as a join key with the crime data.
train_station_df['Year_Month'] = train_station_df['Month'].dt.to_period(freq='M')

In [None]:
# Average the ridership figures for every (community area, month) pair.
# The columns are selected with a list (`[[...]]`): selecting several columns
# from a groupby with a bare tuple was deprecated and raises in pandas >= 2.0.
average_train_stats = (
    train_station_df
    .groupby(['Comm_Num', 'Year_Month'])[
        ['Avg_Weekday_Rides', 'Avg_Saturday_Rides', 'Avg_Sunday/Holiday_Rides', 'Monthly_Total']
    ]
    .mean()
    .reset_index()
)

In [None]:
# Align the join key name with the crime data.
average_train_stats = average_train_stats.rename(columns={'Comm_Num': 'Community Area'})

In [None]:
# Number of distinct community areas that have train ridership figures.
len(pd.unique(average_train_stats['Community Area']))

In [None]:
# Add the monthly ridership averages to the crime records (left join on area and month).
merged_crime_df = merged_crime_df.merge(
    average_train_stats,
    how='left',
    on=['Community Area', 'Year_Month'],
)

In [None]:
# Export of the merged dataset is currently disabled; uncomment the line below to write it to CSV.
#merged_crime_df.to_csv('merged_crime_df.csv',index=False)