In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', None)

In [85]:
df = pd.read_csv("Hate-2001-2019-0.csv")

In [86]:
#Fill empty values with None for Weapon Type and clean up typos in dataset
df['WeaponType'].fillna('None',inplace = True)
df['WeaponType'].replace('Other ( bottle, rocks, spitting)', 'Other (bottle, rocks, spitting)', inplace=True)
df['WeaponType'].replace('Firearm (unknown whether handgun, rifle or shotgun', 'Firearm (unknown whether handgun, rifle or shotgun)', inplace=True)

#Replace rest of NaN values with 'Unknown'
df_new = df.replace(np.nan, 'Unknown', regex=True)

#rearrange columns to be more easily readable
columns_titles = ['MostSeriousBias','MostSeriousBiasType','TotalNumberOfVictims','MostSeriousVictimType','TotalNumberOfSuspects','SuspectsRaceAsAGroup','MostSeriousUcr','MostSeriousUcrType','WeaponType','Offensive_Act','MostSeriousLocation','County','NCIC','ClosedYear','MonthOccurrence']
df_new=df_new.reindex(columns=columns_titles)

#One hot encode all necessary columns
df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousBiasType'], prefix='BiasType')],axis=1)
df_new.drop(['MostSeriousBiasType'],axis=1, inplace=True)
TotalNumberOfVictims = df_new['TotalNumberOfVictims']
df_new.drop(['TotalNumberOfVictims'],axis=1, inplace=True)
df_new = pd.concat([df_new,TotalNumberOfVictims],axis=1)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousVictimType'], prefix='VictimType')],axis=1)
df_new.drop(['MostSeriousVictimType'],axis=1, inplace=True)
TotalNumberOfSuspects = df_new['TotalNumberOfSuspects']
df_new.drop(['TotalNumberOfSuspects'],axis=1, inplace=True)
df_new = pd.concat([df_new,TotalNumberOfSuspects],axis=1)
df_new = pd.concat([df_new,pd.get_dummies(df_new['SuspectsRaceAsAGroup'], prefix='SuspectsRace')],axis=1)
df_new.drop(['SuspectsRaceAsAGroup'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousUcr'], prefix='Ucr')],axis=1)
df_new.drop(['MostSeriousUcr'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousUcrType'], prefix='UcrType')],axis=1)
df_new.drop(['MostSeriousUcrType'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['WeaponType'], prefix='Weapon')],axis=1)
df_new.drop(['WeaponType'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['Offensive_Act'], prefix='Offense')],axis=1)
df_new.drop(['Offensive_Act'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousLocation'], prefix='Location')],axis=1)
df_new.drop(['MostSeriousLocation'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['County'], prefix='County')],axis=1)
df_new.drop(['County'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['NCIC'], prefix='Agency')],axis=1)
df_new.drop(['NCIC'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['ClosedYear'], prefix='Year')],axis=1)
df_new.drop(['ClosedYear'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MonthOccurrence'], prefix='Month')],axis=1)
df_new.drop(['MonthOccurrence'],axis=1, inplace=True)

In [87]:
#separate data into labels and data points
y_labels = df_new['MostSeriousBias']
x_data = df_new.iloc[:, 1:778]

In [119]:
#split data into training and testing sets and shuffle before splitting
x_Train, x_Test, y_Train, y_Test = train_test_split(x_data, y_labels, shuffle = True, test_size = .2)

In [120]:
#scale the data/normalize the data so that the features do not broadly vary
scaler = StandardScaler()
scaler.fit(x_Train)

x_Train = scaler.transform(x_Train)
x_Test = scaler.transform(x_Test)

In [124]:
#build the Knn classifier
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=2)
classifier.fit(x_Train, y_Train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [125]:
y_pred = classifier.predict(x_Test)

In [126]:
#print out statistics to show how well our classifier did
from sklearn.metrics import classification_report
print(classification_report(y_Test, y_pred))

                                                        precision    recall  f1-score   support

                    Anti-American Indian/Alaska Native       0.18      0.20      0.19        10
                                             Anti-Arab       0.07      0.09      0.08        57
                                            Anti-Asian       0.13      0.13      0.13       172
                              Anti-Atheism/Agnosticism       0.00      0.00      0.00         3
                                         Anti-Bisexual       0.00      0.00      0.00         9
                        Anti-Black or African American       0.47      0.50      0.48      1467
                                         Anti-Catholic       0.37      0.30      0.33        37
                               Anti-Citizenship Status       0.00      0.00      0.00         5
                                           Anti-Female       0.86      0.75      0.80         8
                                       

  _warn_prf(average, modifier, msg_start, len(result))
