In [1]:
import pandas as pd
pd.options.display.max_columns = None

In [2]:
# reported_crime: each case represents a victim who reported a crime in their neighborhood

raw_crime_data = pd.read_csv('data/Reported_Crime.csv')
"""
Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location'],
      dtype='object')
      
7,153,912 cases
#Data collected from 2001 to April 2020
"""

#Create new dataframe with relevant columns
reported_crime = raw_crime_data[['ID', 'Year', 'Community Area', 'Primary Type']].reset_index()
reported_crime.dropna(inplace=True)
#6,540,874 rows × 4 columns (compare with incomplete data, which was 998,719 rows × 7 columns)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#selecting cases from 2005 to 2011
reported_crime = reported_crime[(reported_crime.Year >= 2005) & (reported_crime.Year <= 2011)].reset_index()
reported_crime
#2,879,927 rows × 5 columns
reported_crime['Primary Type'].value_counts()

THEFT                               578115
BATTERY                             514278
CRIMINAL DAMAGE                     343826
NARCOTICS                           338471
BURGLARY                            180618
OTHER OFFENSE                       176269
ASSAULT                             169494
MOTOR VEHICLE THEFT                 135649
ROBBERY                             108385
DECEPTIVE PRACTICE                   95288
CRIMINAL TRESPASS                    86045
PROSTITUTION                         33234
WEAPONS VIOLATION                    27091
PUBLIC PEACE VIOLATION               21894
OFFENSE INVOLVING CHILDREN           18731
CRIM SEXUAL ASSAULT                  10311
SEX OFFENSE                           9868
GAMBLING                              7708
LIQUOR LAW VIOLATION                  6323
INTERFERENCE WITH PUBLIC OFFICER      5048
ARSON                                 4414
HOMICIDE                              3228
KIDNAPPING                            2287
INTIMIDATIO

In [4]:
# neighborhood_health: each case represents a neighborhood measure
neighborhood_health = pd.read_csv('data/Neighborhood_Health.csv')
"""
There are 77 neighborhoods

Index(['Community Area', 'Community Area Name', 'Birth Rate',
       'General Fertility Rate', 'Low Birth Weight',
       'Prenatal Care Beginning in First Trimester', 'Preterm Births',
       'Teen Birth Rate', 'Assault (Homicide)', 'Breast cancer in females',
       'Cancer (All Sites)', 'Colorectal Cancer', 'Diabetes-related',
       'Firearm-related', 'Infant Mortality Rate', 'Lung Cancer',
       'Prostate Cancer in Males', 'Stroke (Cerebrovascular Disease)',
       'Childhood Blood Lead Level Screening', 'Childhood Lead Poisoning',
       'Gonorrhea in Females', 'Gonorrhea in Males', 'Tuberculosis',
       'Below Poverty Level', 'Crowded Housing', 'Dependency',
       'No High School Diploma', 'Per Capita Income', 'Unemployment'],
      dtype='object')
      
# 77 rows x 29 columns
"""

"\nThere are 77 neighborhoods\n\nIndex(['Community Area', 'Community Area Name', 'Birth Rate',\n       'General Fertility Rate', 'Low Birth Weight',\n       'Prenatal Care Beginning in First Trimester', 'Preterm Births',\n       'Teen Birth Rate', 'Assault (Homicide)', 'Breast cancer in females',\n       'Cancer (All Sites)', 'Colorectal Cancer', 'Diabetes-related',\n       'Firearm-related', 'Infant Mortality Rate', 'Lung Cancer',\n       'Prostate Cancer in Males', 'Stroke (Cerebrovascular Disease)',\n       'Childhood Blood Lead Level Screening', 'Childhood Lead Poisoning',\n       'Gonorrhea in Females', 'Gonorrhea in Males', 'Tuberculosis',\n       'Below Poverty Level', 'Crowded Housing', 'Dependency',\n       'No High School Diploma', 'Per Capita Income', 'Unemployment'],\n      dtype='object')\n      \n# 77 rows x 29 columns\n"

In [5]:
# raw: reported_crime (individual-level data) merged with reported_crime (neighborhood_level data)
raw = pd.merge(neighborhood_health, reported_crime, on='Community Area').reset_index(drop=True)
# 2,879,882 rows × 34 columns

In [6]:
# Created 32 variables-- one variable for each Primary Type category.

crime_type = raw['Primary Type'].unique()
list(crime_type)
for crime in crime_type:
    raw[crime] = raw['Primary Type'].apply(lambda x: 1 if x == crime else 0)

# Drop the Year variable
# 2005    453674
# 2006    448073
# 2007    436874
# 2008    426840
# 2009    392536
# 2010    370203
# 2011    351682

raw.drop(columns=['Year'], inplace=True)

# 2,879,882 rows × 65 columns

In [7]:
# Aggregate data by neighborhood

df1 = raw.set_index('Community Area Name')
df1 = raw.loc[:, :'Unemployment'].groupby('Community Area Name').mean().reset_index()
ca = raw['Community Area Name'].reset_index(drop=True)
df2 = raw.loc[:, 'ASSAULT':]
df2 = pd.concat([ca, df2], axis=1)
df2 = df2.groupby('Community Area Name').sum().reset_index()
df3 = pd.merge(df1, df2)
df3.sort_values('Community Area Name', inplace=True)

# 77 rows x 60 columns

In [8]:
#Classifying Crimes

#Violent Crimes
df3["Violence"] =\
\
df3['ASSAULT'] +\
df3['BATTERY'] +\
df3['ARSON'] +\
df3['HOMICIDE'] +\
df3['INTIMIDATION'] +\
df3['KIDNAPPING']

#Sexual Crimes
df3["Sexual Crimes"] =\
\
df3['CRIM SEXUAL ASSAULT'] +\
df3['SEX OFFENSE'] +\
df3['PROSTITUTION'] +\
df3['CRIMINAL SEXUAL ASSAULT'] +\
df3['HUMAN TRAFFICKING']

#Property Crimes
df3["Property Crimes"] =\
\
df3['MOTOR VEHICLE THEFT'] +\
df3['THEFT'] +\
df3['ROBBERY'] +\
df3['BURGLARY'] +\
df3['INTIMIDATION'] +\
df3['KIDNAPPING']

#Drugs
df3["Drugs"] =\
\
df3['NARCOTICS'] +\
df3['OTHER NARCOTIC VIOLATION']

df3.rename(columns={'Assault (Homicide)': 'Homicide_rate_per_100k', 'HOMICIDE': 'Homicide'}, inplace=True)

#Creating neighborhood crime types csv file
#77 rows x 64 columns

df3.to_csv('output_data/neighborhood_full_data.csv', index=None)