## Data Cleaning

In [1]:
import pandas as pd
import numpy as np
from functools import reduce

In [2]:
def clean_city_name(string):
    """Return `string` with its last whitespace-separated word removed.

    The remaining words are re-joined with single spaces, so runs of
    whitespace are collapsed. A string with zero or one word yields "".

    NOTE(review): presumably the dropped word is a place-type suffix
    (e.g. "city", "town", "CDP") in the census files -- confirm against the
    raw CSVs, since the last word is removed unconditionally.
    """
    words = string.split()
    return " ".join(words[:-1])

In [3]:
def killing_rename_city(string):
    """Strip a trailing "County" word (case-insensitive) from a place name.

    Parameters
    ----------
    string : str
        Place name as found in the killings data.

    Returns
    -------
    The name without its final "county" word, re-joined with single spaces,
    or the input unchanged when it does not end in "county". Non-string
    values (e.g. NaN/None for a missing city) and empty or whitespace-only
    strings are returned unchanged.
    """
    # Missing values reach this function as non-strings when it is used with
    # Series.apply; pass them through instead of failing on .split().
    if not isinstance(string, str):
        return string

    words = string.split()
    # An empty / whitespace-only name has no last word to inspect; indexing
    # words[-1] here would raise IndexError.
    if not words:
        return string

    if words[-1].lower() == "county":
        return " ".join(words[:-1])
    return string

In [4]:
# Load the median-income table; "(X)" and "-" cells are read as missing.
# The City column then has its trailing word removed by clean_city_name.
income = pd.read_csv(
    "MedianHouseholdIncome2015.csv",
    encoding="windows-1252",
    na_values=["(X)", "-"],
).assign(City=lambda frame: frame["City"].apply(clean_city_name))
income.head()

Unnamed: 0,Geographic Area,City,Median Income
0,AL,Abanda,11207
1,AL,Abbeville,25615
2,AL,Adamsville,42575
3,AL,Addison,37083
4,AL,Akron,21667


In [5]:
# Load the poverty-rate table with the same missing-value markers as the
# other census files, then normalise the City names.
poverty = pd.read_csv(
    "PercentagePeopleBelowPovertyLevel.csv",
    encoding="windows-1252",
    na_values=["(X)", "-"],
)
poverty = poverty.assign(City=poverty["City"].apply(clean_city_name))
poverty.shape

(29329, 3)

In [6]:
# Load the high-school-completion table; "(X)" and "-" become NaN.
education_csv_options = {"encoding": "windows-1252", "na_values": ["(X)", "-"]}
education = pd.read_csv("PercentOver25CompletedHighSchool.csv", **education_csv_options)
# Drop the trailing word of every city name so it can be matched across tables.
education["City"] = education["City"].apply(clean_city_name)
education.shape

(29329, 3)

In [7]:
# Load the race-share table. Its state column is spelled "Geographic area",
# so it is renamed to the "Geographic Area" key used by the other tables.
race = (
    pd.read_csv("ShareRaceByCity.csv", encoding="windows-1252", na_values=["(X)", "-"])
    .rename(columns={"Geographic area": "Geographic Area"})
)
race["City"] = race["City"].apply(clean_city_name)
race.shape

(29268, 7)

In [8]:
# Reduce the incident-level killings file to one row per (city, state) pair
# with a constant indicator police_killing = 1. The per-pair count is
# computed only to obtain the distinct pairs and is then overwritten.
killing = pd.read_csv("PoliceKillingsUS.csv", encoding="windows-1252", na_values=["(X)", "-"])
killing = (
    killing
    .assign(city=killing["city"].apply(killing_rename_city))
    .groupby(["city", "state"])["id"]
    .count()
    .to_frame(name="police_killing")
    .assign(police_killing=1)
    .reset_index()
)

In [9]:
# Inner-join the four census tables, one after another, on city + state.
# NOTE(review): the displayed result is indexed up to 30521 although the
# inputs have 29329 / 29268 rows, which suggests that some
# (City, Geographic Area) keys are duplicated after the city-name cleaning
# and multiply rows in the join -- confirm before relying on the row counts.
df_lst = [income, poverty, education, race]
merge_keys = ["City", "Geographic Area"]
census_data = df_lst[0]
for next_table in df_lst[1:]:
    census_data = census_data.merge(next_table, how="inner", on=merge_keys)
census_data

Unnamed: 0,Geographic Area,City,Median Income,poverty_rate,percent_completed_hs,share_white,share_black,share_native_american,share_asian,share_hispanic
0,AL,Abanda,11207,78.8,21.2,67.2,30.2,0.0,0.0,1.6
1,AL,Abbeville,25615,29.1,69.1,54.4,41.4,0.1,1.0,3.1
2,AL,Adamsville,42575,25.5,78.9,52.3,44.9,0.5,0.3,2.3
3,AL,Addison,37083,30.7,81.4,99.1,0.1,0.0,0.1,0.4
4,AL,Akron,21667,42.0,68.6,13.2,86.5,0.0,0.0,0.3
...,...,...,...,...,...,...,...,...,...,...
30518,WY,Woods Landing-Jelm,,18.6,100.0,95.9,0.0,0.0,2.1,0.0
30519,WY,Worland,41523,15.3,85.6,89.9,0.3,1.3,0.6,16.6
30520,WY,Wright,77114,5.9,89.2,94.5,0.1,1.4,0.2,6.2
30521,WY,Yoder,37500,5.4,79.4,97.4,0.0,0.0,0.0,4.0


In [10]:
# Attach the killing indicator to every census row (left join on city/state).
# Rows without a match get police_killing = 0; the key columns are then
# removed and any row that still contains a missing value is dropped.
data = (
    census_data
    .merge(killing, how="left", left_on=["City", "Geographic Area"], right_on=["city", "state"])
    .assign(police_killing=lambda merged: merged["police_killing"].fillna(0))
    .drop(columns=["Geographic Area", "City", "city", "state"])
    .dropna()
)
data

Unnamed: 0,Median Income,poverty_rate,percent_completed_hs,share_white,share_black,share_native_american,share_asian,share_hispanic,police_killing
0,11207,78.8,21.2,67.2,30.2,0.0,0.0,1.6,0.0
1,25615,29.1,69.1,54.4,41.4,0.1,1.0,3.1,1.0
2,42575,25.5,78.9,52.3,44.9,0.5,0.3,2.3,0.0
3,37083,30.7,81.4,99.1,0.1,0.0,0.1,0.4,0.0
4,21667,42.0,68.6,13.2,86.5,0.0,0.0,0.3,0.0
...,...,...,...,...,...,...,...,...,...
30515,34984,12.0,92.7,95.1,0.1,0.4,0.7,7.4,0.0
30517,94792,2.9,100.0,97.5,0.2,0.3,0.8,1.8,0.0
30519,41523,15.3,85.6,89.9,0.3,1.3,0.6,16.6,0.0
30520,77114,5.9,89.2,94.5,0.1,1.4,0.2,6.2,0.0


In [11]:
# Class balance of the target column: how many rows have police_killing
# equal to 0 versus 1.
data["police_killing"].value_counts()

0.0    27207
1.0     1342
Name: police_killing, dtype: int64

In [12]:
# Uncomment the next line to save the cleaned dataset to data.csv.
# data.to_csv("data.csv", index=False)

## Baseline scikit-learn Model

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [14]:
# Two non-numeric labels appear in "Median Income" (presumably the census'
# open-ended lowest/highest brackets -- confirm); map each to its numeric
# bound so the whole column can be cast to int.
income_label_to_value = {"2,500-": "2500", "250,000+": "250000"}
for income_label, income_value in income_label_to_value.items():
    data.loc[data["Median Income"] == income_label, "Median Income"] = income_value
data["Median Income"] = data["Median Income"].astype(int)

In [15]:
# Features are every column except the last; the target is the last column.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [16]:
# Hold out 4% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.04, random_state=42)

# Split the training rows by class: rows with label 1 are kept in full,
# all other rows are candidates for downsampling.
is_positive = y_train == 1
X_train_keep, y_train_keep = X_train[is_positive], y_train[is_positive]
X_train_resample, y_train_resample = X_train[~is_positive], y_train[~is_positive]

# Downsample the remaining rows: a second split with test_size=0.95 keeps a
# random 5% of them and discards the rest.
X_train_resample, _, y_train_resample, _ = train_test_split(
    X_train_resample,
    y_train_resample,
    test_size=0.95,
    random_state=42,
)

# Reassemble the training set; y_train becomes a NumPy array here.
X_train = pd.concat([X_train_keep, X_train_resample])
y_train = np.concatenate([y_train_keep, y_train_resample])

In [17]:
# Standardise the features, then fit a degree-2 polynomial-kernel SVC and
# predict the held-out rows.
svm = make_pipeline(
    StandardScaler(),
    SVC(kernel="poly", degree=2, C=0.7),
)
y_pred = svm.fit(X_train, y_train).predict(X_test)

In [18]:
# Confusion matrix of the test-set predictions. The metric cells that follow
# index it as conf_mat[actual, predicted], with 1 being the positive class.
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

array([[720, 376],
       [ 18,  28]])

In [19]:
# Precision: correctly predicted positives over everything predicted
# positive (column 1 of the confusion matrix).
conf_mat[1, 1] / conf_mat[:, 1].sum()

0.06930693069306931

In [20]:
# Recall: correctly predicted positives over all actual positives
# (row 1 of the confusion matrix).
conf_mat[1, 1] / conf_mat[1, :].sum()

0.6086956521739131

In [21]:
# Accuracy: correct predictions (the matrix diagonal) over all predictions.
np.diag(conf_mat).sum() / conf_mat.sum()

0.6549912434325744