# NYPD Shootings

Classifying the victim's race (Asian or White) with latitude and longitude of the scene.

In [190]:
# Don't change this cell; just run it. 

import numpy as np
from datascience import * 

# These lines do some fancy plotting magic
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore')
from datetime import datetime

Read in data

In [191]:
shootings = Table.read_table('NYPD_Shooting_Incident_Data__Historic_.csv')
shootings.show(5)

INCIDENT_KEY,OCCUR_DATE,OCCUR_TIME,BORO,LOC_OF_OCCUR_DESC,PRECINCT,JURISDICTION_CODE,LOC_CLASSFCTN_DESC,LOCATION_DESC,STATISTICAL_MURDER_FLAG,PERP_AGE_GROUP,PERP_SEX,PERP_RACE,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
228798151,05/27/2021,21:30:00,QUEENS,,105,0,,,False,,,,18-24,M,BLACK,1058920.0,180924,40.663,-73.7308,POINT (-73.73083868899994 40.662964620000025)
137471050,06/27/2014,17:40:00,BRONX,,40,0,,,False,,,,18-24,M,BLACK,1005030.0,234516,40.8104,-73.9249,POINT (-73.92494232599995 40.81035186300006)
147998800,11/21/2015,03:56:00,QUEENS,,108,0,,,True,,,,25-44,M,WHITE,1007670.0,209837,40.7426,-73.9155,POINT (-73.91549174199997 40.74260663300004)
146837977,10/09/2015,18:30:00,BRONX,,44,0,,,False,,,,<18,M,WHITE HISPANIC,1006540.0,244511,40.8378,-73.9195,POINT (-73.91945661499994 40.83778200300003)
58921844,02/19/2009,22:58:00,BRONX,,47,0,,,True,25-44,M,BLACK,45-64,M,BLACK,1024920.0,262189,40.8862,-73.8529,POINT (-73.85290950899997 40.88623791800006)


In [192]:
shootings.group('VIC_RACE')

VIC_RACE,count
AMERICAN INDIAN/ALASKAN NATIVE,10
ASIAN / PACIFIC ISLANDER,404
BLACK,19439
BLACK HISPANIC,2646
UNKNOWN,66
WHITE,698
WHITE HISPANIC,4049


In [193]:
shootings = shootings.select('Latitude', 'Longitude', 'VIC_RACE')
shootings = shootings.where('VIC_RACE', are.contained_in('ASIAN / PACIFIC ISLANDERWHITE'))

In [194]:
shootings = shootings.where('Latitude', are.between_or_equal_to(40,41))
shootings = shootings.where('Longitude', are.between_or_equal_to(-75,41))

Map of shootings

In [207]:
# Just run this cell!
colors = {"ASIAN / PACIFIC ISLANDER":"red", "WHITE":"blue"}
t = Table().with_columns("lat", shootings.column(0), 
                                      "lon", shootings.column(1), 
                                      "color", shootings.apply(colors.get, 2)
                        )
Circle.map_table(t, radius=2, fill_opacity=1)

In [196]:
#shootings = shootings.sample(2000)
shootings

Latitude,Longitude,VIC_RACE
40.7426,-73.9155,WHITE
40.8845,-73.9056,WHITE
40.6574,-73.9584,WHITE
40.701,-73.9422,WHITE
40.7001,-73.9477,ASIAN / PACIFIC ISLANDER
40.7726,-73.9159,WHITE
40.7547,-73.9916,WHITE
40.6878,-73.9132,WHITE
40.7025,-73.8138,ASIAN / PACIFIC ISLANDER
40.8475,-73.8273,WHITE


In [197]:
def distance(arr1, arr2):
    return np.sqrt(sum((arr1 - arr2)**2))

In [198]:
shootings.num_rows

1102

In [200]:
shuffled_table = shootings.sample(with_replacement=False)
train = shuffled_table.take(np.arange(826))
test = shuffled_table.take(np.arange(826, 1102))

print("Training set:\t",   train.num_rows, "examples")
print("Test set:\t",       test.num_rows, "examples")
train.show(5), test.show(5);

Training set:	 826 examples
Test set:	 276 examples


Latitude,Longitude,VIC_RACE
40.6874,-73.977,ASIAN / PACIFIC ISLANDER
40.8722,-73.8661,ASIAN / PACIFIC ISLANDER
40.6288,-73.9366,WHITE
40.6963,-73.8546,WHITE
40.5603,-74.1103,WHITE


Latitude,Longitude,VIC_RACE
40.707,-73.9046,WHITE
40.8575,-73.8968,WHITE
40.6762,-73.9513,WHITE
40.7467,-73.7069,WHITE
40.8345,-73.9266,WHITE


In [201]:
features = make_array("Latitude", "Longitude")
features

array(['Latitude', 'Longitude'],
      dtype='<U9')

In [202]:
def row_to_array(row, features):
    """Converts a row to an array of its features."""
    arr = make_array()
    for feature in features:
        arr = np.append(arr, row.item(feature))
    return arr

def classify(test_row, k, train):
    test_row_features_array = row_to_array(test_row, features)
    distances = make_array()
    for train_row in train.rows:
        train_row_features_array = row_to_array(train_row, features)
        row_distance = distance(train_row_features_array, test_row_features_array)
        distances = np.append(distances, row_distance)
    train_with_distances = train.with_column("Distance", distances)
    nearest_neighbors = train_with_distances.sort("Distance").take(np.arange(k))
    most_common_label = nearest_neighbors.group("VIC_RACE").sort('count', descending = True).column(0).item(0)
    return most_common_label

In [203]:
first_test = classify(test.row(0), 5, train)
first_test

'WHITE'

In [204]:
def three_classify(row):
    return classify(row, 3, train)

test_with_prediction = test.apply(three_classify)
labels_correct = np.count_nonzero(test_with_prediction == test.column("VIC_RACE"))
accuracy = labels_correct / test.num_rows
accuracy

0.6884057971014492

In [205]:
for i in np.arange(1, 10):
    def k_classify(row):
        return classify(row, i, train)
    test_with_prediction = test.apply(three_classify)
    labels_correct = np.count_nonzero(test_with_prediction == test.column("VIC_RACE"))
    accuracy = labels_correct / test.num_rows
    print(accuracy)

0.7282608695652174
0.6847826086956522
0.6884057971014492
0.6594202898550725
0.6811594202898551
0.6630434782608695
0.6884057971014492
0.6811594202898551
0.6847826086956522


In [206]:
for i in np.arange(10, 21):
    def k_classify(row):
        return classify(row, i, train)
    test_with_prediction = test.apply(three_classify)
    labels_correct = np.count_nonzero(test_with_prediction == test.column("VIC_RACE"))
    accuracy = labels_correct / test.num_rows
    print(accuracy)

0.6884057971014492
0.6739130434782609
0.6811594202898551
0.6702898550724637
0.7028985507246377
0.6992753623188406
0.7101449275362319
0.6992753623188406
0.7137681159420289
0.7028985507246377
0.7028985507246377


Which k to use?