# Cogs 108 Final Project 
Allison Reiss, Brendan Taing, Evan Barosay, Gael Van der Lee, Daniel Benamou, Adham Rafiq

This data analysis is centered on the following research question:

Can we distinguish the safety of different neighborhoods in San Diego using the number of calls dispatched by the SDPD and the degree (priority) of each call, and if so, is neighborhood safety related to income and population?

## Import and Clean Data

In [194]:
import pandas as pd

# Read the three source CSV files (beat -> neighborhood lookup, 2017 calls
# for service, census tract data) into DataFrames, in that order.
beat, call, census = (
    pd.read_csv(csv_path)
    for csv_path in (
        'pd_beat_neighborhoods_datasd.csv',
        'pd_calls_for_service_2017_datasd.csv',
        'sandiegocensustract.csv',
    )
)

In [195]:
# Keep only the call columns used later on. The explicit .copy() makes `call`
# an independent frame, so the column assignments in later cells modify it
# directly instead of writing into a selection of the original frame.
call = call[['date_time', 'street', 'streettype', 'beat', 'priority']].copy()


# Drop the unnecessary census columns in a single call; dropping them one at
# a time copies the whole frame once per column. A missing column still
# raises KeyError, exactly as the one-by-one version did.
census = census.drop(
    [
        'TRACTNUM',
        'TRACT',
        'SevCrwd',
        'TeenBirthAllWom',
        'MaltrtAllegRate',
        'ProxOffAlco',
        'ProxOnAlco',
        'TraffInjur',
        'SNAP_FdStmp',
        'n_FoodDesert',
        'pct_FoodDesert',
        'LiqCount',
        'VoterPartic',
        'HghSchOrHigh',
        'Uninsured',
        'PovertyPctl',
        'Longitude',
        'Latitude',
        'LowBirthWeight',
        'LowBirthWeightPctl',
        'PollutionBurdenPctl',
        'CES20Score',
        'SingMother',
        'TeenBirthProportion',
        'PollutionBurdenScore',
        'CES20PercentileRange',
        'PopCharScore',
        'PopCharPctl',
        'FosterCareEntry',
        'TotalPov',
        'ChildPov',
        'Education',
    ],
    axis=1,
)

# Rename the beat dataframe's columns so its key column is called 'beat',
# matching the column name used in the call dataframe for the merge.
beat.columns = ['beat', 'neighborhood']

In [196]:
# Truncate each date_time string to its first four characters (the year).
call['date_time'] = call['date_time'].str.slice(stop=4)

# Relabel the columns so the truncated date_time column is named 'year'.
call.columns = [
    'year',
    'street',
    'streettype',
    'beat',
    'priority',
]

In [197]:
# Remove calls whose beat is one of the placeholder codes 0 or 999.
# The previous version replaced these codes with the *string* 'NaN', which
# dropna() does not treat as missing, so the rows were never removed and the
# beat column ended up with mixed types. Filtering the rows out directly
# gives the intended result and leaves the column dtype unchanged.
call = call[~call['beat'].isin([0, 999])]

# Drop all rows with null values
call = call.dropna(how='any')

In [198]:
# Join each call to its neighborhood name through the shared 'beat' column.
call = call.merge(beat, on='beat')

# The numeric beat code is no longer needed once the name is attached.
call = call.drop('beat', axis=1)
call

Unnamed: 0,year,street,streettype,priority,neighborhood
0,2016,GRAND,AV,2.0,Pacific Beach
1,2016,OLIVER,AV,2.0,Pacific Beach
2,2016,INGRAHAM,ST,1.0,Pacific Beach
3,2016,HAINES,ST,1.0,Pacific Beach
4,2016,GARNET,AV,1.0,Pacific Beach
5,2016,INGRAHAM,ST,2.0,Pacific Beach
6,2016,PACIFIC VIEW,DR,1.0,Pacific Beach
7,2016,GARNET,AV,2.0,Pacific Beach
8,2016,INGRAHAM,ST,3.0,Pacific Beach
9,2016,GARNET,AV,1.0,Pacific Beach


## Analyze the Call dataframe and create a Heatmap to show dangerous areas

In [205]:
def get_threat_lvl(df):
    """Return the "threat level" of a neighborhood.

    The threat level is the sum, over priorities 1 to 4, of
    (priority of call * number of calls with that priority). This equals
    the sum of the priority values of every call whose priority is
    1, 2, 3 or 4. Calls with any other priority, or with a missing
    priority, contribute nothing.

    Parameters:
        df: DataFrame containing all of the calls for one neighborhood.
            Must have a 'priority' column.

    Returns:
        The threat level as a float. The value is always whole-valued
        because it is a sum of integer-valued priorities.

    Example usage:
        get_threat_lvl(PB_df) returns 12.0 if PB_df holds calls with
        priorities 1, 2, 2, 3 and 4.
    """
    priorities = df['priority']

    # Uses pandas only: the previous implementation called np.count_nonzero,
    # but numpy is not imported in this file, so it raised NameError.
    counted = priorities[priorities.isin([1.0, 2.0, 3.0, 4.0])]

    # float() keeps the return type a plain float, including for empty input.
    return float(counted.sum())

In [206]:
# Collect each distinct neighborhood name found in the call dataframe,
# in order of first appearance.
unique_neighborhoods = pd.unique(call['neighborhood'])

In [201]:
# Loop through list and run algorithm on each neighborhood to get the safety level of each neighborhood
# and add it to a dict -- (neighborhood, safety level) pair
#
# Make sure you pass a dataframe containing all calls for ONE NEIGHBORHOOD into the function

In [202]:
# upload Google API for HeatMaps

In [203]:
# Figure out how to make a heat map

## The section below will start to analyze the correlations between income and other demographic variables 

In [None]:
# We will fill in these steps after completing the first part of our analysis (the heat maps)

In [204]:
# Exploratory check (probably delete later): print how many distinct
# neighborhood names the census data contains, then display the distinct
# city values.
print(len(census['NeighborhoodName'].unique()))
census['City'].unique()

238


array(['San Diego', 'San Marcos', 'Oceanside', 'Escondido', 'Vista',
       'Carlsbad', 'El Cajon', 'Fallbrook', 'National City', 'La Mesa',
       'San Ysidro', 'Encinitas', 'Lakeside', 'Chula Vista',
       'Borrego Springs', 'Spring Valley', 'Poway', 'Santee',
       'Imperial Beach', 'Campo', 'Warner Springs', 'Solana Beach',
       'Lemon Grove', 'La Jolla', 'Valley Center', 'Ramona',
       'Pauma Valley', 'Bonita', 'Coronado', 'Alpine', 'Descanso', 'Jamul',
       'Cardiff By the Sea', 'Del Mar', 'Bonsall', 'Julian',
       'Rancho Santa Fe', 'Camp Pendleton', nan], dtype=object)