# Merges the five feature CSVs (complaints, crime, ISR, use of force, census demographics) into one file

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Read the five feature tables, in the same order as they are unpacked.
complaints, crime, isr, uof, census = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/features/complaints.csv',
        '../data/features/crime.csv',
        '../data/features/isr.csv',
        '../data/features/use_of_force.csv',
        '../data/features/census_demographics.csv',
    )
)

## Standardize Column Names 

### BEATS

In [3]:
# Rename the key columns so the tables share the BEAT / YEAR names used for merging.
key_column_renames = (
    (crime, {'Beat': 'BEAT', 'Year': 'YEAR'}),
    (complaints, {'COMPLAINT_YEAR': 'YEAR'}),
    (census, {'beat_num': 'BEAT'}),
)
for table, mapping in key_column_renames:
    table.rename(columns=mapping, inplace=True)

In [4]:
# Remove complaints rows whose BEAT is the literal string 'Unknown'.
# .copy() makes the result an independent frame: later cells assign columns
# into `complaints` in place, and doing that on a boolean-filtered slice
# triggers pandas' SettingWithCopyWarning.
complaints = complaints[complaints['BEAT'] != 'Unknown'].copy()

In [5]:
# Cast the BEAT column to int in every table.
for frame in (complaints, crime, isr, uof, census):
    frame['BEAT'] = frame['BEAT'].astype(int)

### COMPLAINTS

In [6]:
# Add a TOTAL column: the row-wise sum of the four COMPLAINANT_RACE_* columns,
# added left to right in the order listed.
race_columns = (
    'COMPLAINANT_RACE_BlackorAfricanAmerican',
    'COMPLAINANT_RACE_Hispanic,Latino,orSpanishOrigin',
    'COMPLAINANT_RACE_White',
    'COMPLAINANT_RACE_Other',
)
total = complaints[race_columns[0]]
for race_column in race_columns[1:]:
    total = total + complaints[race_column]
complaints['TOTAL'] = total

In [7]:
# Shorten three of the complainant-race column names to BLACK / HISPANIC / WHITE.
short_race_names = dict([
    ('COMPLAINANT_RACE_BlackorAfricanAmerican', 'BLACK'),
    ('COMPLAINANT_RACE_Hispanic,Latino,orSpanishOrigin', 'HISPANIC'),
    ('COMPLAINANT_RACE_White', 'WHITE'),
])
complaints.rename(columns=short_race_names, inplace=True)

In [8]:
# Remove the "Other" race column from the complaints table.
complaints.drop(columns=['COMPLAINANT_RACE_Other'], inplace=True)

In [9]:
# Display the complaints column names after the rename and drop above.
complaints.columns

Index(['BEAT', 'YEAR', 'POLICE_SHOOTING', 'BLACK', 'HISPANIC', 'WHITE',
       'TOTAL'],
      dtype='object')

## Crime

In [10]:
# Upper-case every crime column name.
crime.columns = list(map(str.upper, crime.columns))

In [11]:
# Display the crime column names after upper-casing.
crime.columns

Index(['BEAT', 'YEAR', 'TOTAL', 'ARREST', 'DOMESTIC'], dtype='object')

In [12]:
# Summary statistics for the numeric columns of the use-of-force table.
# NOTE(review): the recorded output shows a minimum BEAT of 0 - possibly a placeholder beat; TODO confirm.
uof.describe()

Unnamed: 0,BEAT,YEAR,TOTAL_COUNT,POLICE_W_WEAPON,POLICE_WO_WEAPON,HISPANIC,BLACK,WHITE
count,1083.0,1083.0,1083.0,1083.0,1083.0,1083.0,1083.0,1083.0
mean,1214.695291,2017.501385,19.067405,1.559557,5.854109,2.479224,14.457987,1.331487
std,708.812303,1.120613,16.051675,1.826815,7.140972,3.878025,15.629692,2.329985
min,0.0,2016.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,631.5,2016.0,8.0,0.0,1.0,0.0,3.0,0.0
50%,1123.0,2018.0,15.0,1.0,3.0,1.0,10.0,0.0
75%,1811.0,2019.0,26.0,2.0,8.0,3.0,22.0,2.0
max,3100.0,2019.0,107.0,13.0,63.0,30.0,103.0,20.0


## CENSUS

In [13]:
# Display the census column names (after beat_num was renamed to BEAT).
census.columns

Index(['beat', 'BEAT', 'district', 'sector', 'Total Pop', 'White', 'Black',
       'Hispanic', 'Median Income'],
      dtype='object')

In [14]:
# Remove the district, sector and lower-case beat columns; BEAT is kept as the join key.
census.drop(columns=['district', 'sector', 'beat'], inplace=True)

In [15]:
# Upper-case every census column name.
census.columns = list(map(str.upper, census.columns))

## MERGE ALL

In [16]:
# Print any rows that repeat a (BEAT, YEAR) pair in each beat-year table;
# an empty frame means the pair is unique in that table.
key_columns = ['BEAT', 'YEAR']
for frame in (complaints, crime, isr, uof):
    repeated_rows = frame[frame.duplicated(subset=key_columns)]
    print(repeated_rows.sort_values(by=key_columns))

Empty DataFrame
Columns: [BEAT, YEAR, POLICE_SHOOTING, BLACK, HISPANIC, WHITE, TOTAL]
Index: []
Empty DataFrame
Columns: [BEAT, YEAR, TOTAL, ARREST, DOMESTIC]
Index: []
Empty DataFrame
Columns: [BEAT, YEAR, TOTAL_STOPS, SEARCH, BLACK, WHITE, HISPANIC, ARREST]
Index: []
Empty DataFrame
Columns: [BEAT, YEAR, TOTAL_COUNT, POLICE_W_WEAPON, POLICE_WO_WEAPON, HISPANIC, BLACK, WHITE]
Index: []


In [17]:
# Prefix every non-key column with its dataset name so that same-named columns
# from different tables stay distinguishable after merging.
def _with_prefix(prefix):
    """Return a renamer that leaves BEAT/YEAR alone and prepends `prefix` to any other name."""
    def rename_column(name):
        if name in ['BEAT', 'YEAR']:
            return name
        return prefix + name
    return rename_column


for frame, prefix in (
    (complaints, "COMPLAINTS_"),
    (crime, "CRIME_"),
    (isr, "ISR_"),
    (uof, "UOF_"),
    (census, "CENSUS_"),
):
    frame.rename(columns=_with_prefix(prefix), inplace=True)


In [18]:
# Inner-join the four beat-year tables on (BEAT, YEAR), printing the shape after
# each step so that row loss is visible.
# validate='one_to_one' makes pandas raise MergeError if either side repeats a
# (BEAT, YEAR) key, instead of silently multiplying rows.
beat_year_keys = ['BEAT', 'YEAR']
merged_df = complaints
for yearly_table in (crime, isr, uof):
    merged_df = pd.merge(merged_df, yearly_table, how='inner', on=beat_year_keys,
                         validate='one_to_one')
    print(merged_df.shape)

# Census is joined on BEAT alone, so one census row is matched to every year of
# a beat. validate='many_to_one' asserts that BEAT is unique in census, which the
# (BEAT, YEAR) duplicate check earlier in the notebook does not cover.
merged_df = pd.merge(merged_df, census, how='inner', on=['BEAT'],
                     validate='many_to_one')
print(merged_df.shape)


(991, 10)
(991, 16)
(979, 22)
(979, 27)


In [19]:
# Display the column names of the fully merged table.
merged_df.columns

Index(['BEAT', 'YEAR', 'COMPLAINTS_POLICE_SHOOTING', 'COMPLAINTS_BLACK',
       'COMPLAINTS_HISPANIC', 'COMPLAINTS_WHITE', 'COMPLAINTS_TOTAL',
       'CRIME_TOTAL', 'CRIME_ARREST', 'CRIME_DOMESTIC', 'ISR_TOTAL_STOPS',
       'ISR_SEARCH', 'ISR_BLACK', 'ISR_WHITE', 'ISR_HISPANIC', 'ISR_ARREST',
       'UOF_TOTAL_COUNT', 'UOF_POLICE_W_WEAPON', 'UOF_POLICE_WO_WEAPON',
       'UOF_HISPANIC', 'UOF_BLACK', 'UOF_WHITE', 'CENSUS_TOTAL POP',
       'CENSUS_WHITE', 'CENSUS_BLACK', 'CENSUS_HISPANIC',
       'CENSUS_MEDIAN INCOME'],
      dtype='object')

In [20]:
# Write the merged table to disk.
# os.makedirs replaces the `!mkdir -p` shell escape so the cell does not depend
# on a POSIX shell; exist_ok=True makes it a no-op when the directory exists.
os.makedirs("../data/features", exist_ok=True)
merged_df.to_csv("../data/features/merged.csv", index=False)