In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv(r'C:\Users\98all\Documents\GitHub\ECE196_SP21\Hate-2001-2019-0.csv')

In [4]:
#Fill empty values with None for Weapon Type and clean up typos in dataset
df['WeaponType'].fillna('None',inplace = True)
df['WeaponType'].replace('Other ( bottle, rocks, spitting)', 'Other (bottle, rocks, spitting)', inplace=True)
df['WeaponType'].replace('Firearm (unknown whether handgun, rifle or shotgun', 'Firearm (unknown whether handgun, rifle or shotgun)', inplace=True)

#Replace rest of NaN values with 'Unknown'
df_new = df.replace(np.nan, 'Unknown', regex=True)

#rearrange columns to be more easily readable
columns_titles = ['MostSeriousBiasType','TotalNumberOfVictims','MostSeriousVictimType','TotalNumberOfSuspects','SuspectsRaceAsAGroup','MostSeriousUcr','MostSeriousUcrType','WeaponType','Offensive_Act','MostSeriousLocation','County','NCIC','ClosedYear','MonthOccurrence']
df_new=df_new.reindex(columns=columns_titles)

#One hot encode all necessary columns
##df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousBias'], prefix='Bias')],axis=1)
##df_new.drop(['MostSeriousBias'],axis=1, inplace=True)
TotalNumberOfVictims = df_new['TotalNumberOfVictims']
df_new.drop(['TotalNumberOfVictims'],axis=1, inplace=True)
df_new = pd.concat([df_new,TotalNumberOfVictims],axis=1)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousVictimType'], prefix='VictimType')],axis=1)
df_new.drop(['MostSeriousVictimType'],axis=1, inplace=True)
TotalNumberOfSuspects = df_new['TotalNumberOfSuspects']
df_new.drop(['TotalNumberOfSuspects'],axis=1, inplace=True)
df_new = pd.concat([df_new,TotalNumberOfSuspects],axis=1)
df_new = pd.concat([df_new,pd.get_dummies(df_new['SuspectsRaceAsAGroup'], prefix='SuspectsRace')],axis=1)
df_new.drop(['SuspectsRaceAsAGroup'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousUcr'], prefix='Ucr')],axis=1)
df_new.drop(['MostSeriousUcr'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousUcrType'], prefix='UcrType')],axis=1)
df_new.drop(['MostSeriousUcrType'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['WeaponType'], prefix='Weapon')],axis=1)
df_new.drop(['WeaponType'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['Offensive_Act'], prefix='Offense')],axis=1)
df_new.drop(['Offensive_Act'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MostSeriousLocation'], prefix='Location')],axis=1)
df_new.drop(['MostSeriousLocation'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['County'], prefix='County')],axis=1)
df_new.drop(['County'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['NCIC'], prefix='Agency')],axis=1)
df_new.drop(['NCIC'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['ClosedYear'], prefix='Year')],axis=1)
df_new.drop(['ClosedYear'],axis=1, inplace=True)
df_new = pd.concat([df_new,pd.get_dummies(df_new['MonthOccurrence'], prefix='Month')],axis=1)
df_new.drop(['MonthOccurrence'],axis=1, inplace=True)

In [5]:
#separate data into labels and data points
y_labels = df_new['MostSeriousBiasType']
x_data = df_new.iloc[:, 1:778]

In [6]:
#split data into training and testing sets and shuffle before splitting
x_Train, x_Test, y_Train, y_Test = train_test_split(x_data, y_labels, shuffle = True, test_size = .2)

In [7]:
#scale the data/normalize the data so that the features do not broadly vary
scaler = StandardScaler()
scaler.fit(x_Train)

x_Train = scaler.transform(x_Train)
x_Test = scaler.transform(x_Test)

In [8]:
#build the Knn classifier
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=2)
classifier.fit(x_Train, y_Train)

KNeighborsClassifier(n_neighbors=2)

In [9]:
y_pred = classifier.predict(x_Test)

In [10]:
#print out statistics to show how well our classifier did
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_Test, y_pred))

                         precision    recall  f1-score   support

             Disability       0.00      0.00      0.00        19
                 Gender       0.03      0.09      0.05        11
   Gender Nonconforming       0.05      0.09      0.06        66
Race/Ethnicity/Ancestry       0.65      0.83      0.73      2794
               Religion       0.44      0.29      0.35       733
     Sexual Orientation       0.42      0.14      0.21       999

               accuracy                           0.58      4622
              macro avg       0.26      0.24      0.23      4622
           weighted avg       0.55      0.58      0.54      4622



In [11]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state = 0).fit(x_Train, y_Train)
print(clf.coef_)
y_prediction = clf.predict(x_Test)

[[-5.05659978e-01  1.21297903e-01  1.13667682e-02 ... -2.03962948e-02
   3.07275206e-02 -3.73182462e-02]
 [ 2.41285543e-02  8.40919036e-03  1.48962788e-02 ...  1.44427851e-01
   3.24707777e-03  1.01665507e-01]
 [-2.02004324e-01 -7.19198179e-02 -4.00440382e-04 ... -3.68761901e-02
   1.67439674e-02 -5.96940712e-02]
 [ 2.50385058e-01 -1.53999751e-02  4.85527751e-02 ... -3.36667307e-02
  -1.10348756e-02 -1.64784456e-02]
 [ 2.64014880e-01  6.37305585e-03 -4.31399837e-02 ... -5.15458355e-02
  -2.51633713e-02  1.28869590e-02]
 [ 1.69135809e-01 -4.87603557e-02 -3.12753981e-02 ... -1.94279967e-03
  -1.45203188e-02 -1.06170326e-03]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
clf.predict_proba(x_Test)

array([[7.87275807e-05, 1.62702053e-06, 5.19763185e-02, 8.36351088e-01,
        2.63130214e-02, 8.52792176e-02],
       [1.38610663e-06, 5.06560308e-06, 6.80963251e-07, 1.48889234e-01,
        8.45463831e-01, 5.63980200e-03],
       [1.40219123e-03, 7.65219059e-04, 9.25727418e-03, 3.93714342e-01,
        2.14209645e-01, 3.80651328e-01],
       ...,
       [3.32073935e-04, 6.07001958e-05, 5.01285171e-02, 3.77747505e-01,
        1.29261972e-02, 5.58805006e-01],
       [3.39354316e-07, 7.51900024e-06, 1.04379735e-01, 5.41458895e-01,
        3.20814996e-02, 3.22072013e-01],
       [8.39167703e-04, 4.04238916e-02, 2.23929876e-02, 4.71156057e-01,
        1.76224365e-01, 2.88963531e-01]])

In [13]:
clf.predict_log_proba(x_Test)

array([[ -9.44951701, -13.32876011,  -2.95696708,  -0.17870679,
         -3.63769135,  -2.46182449],
       [-13.48901173, -12.19303736, -14.1997575 ,  -1.90455265,
         -0.16786989,  -5.17790632],
       [ -6.5697191 ,  -7.17534841,  -4.68234564,  -0.93212965,
         -1.54080009,  -0.96587147],
       ...,
       [ -8.01015292,  -9.70956363,  -2.99316523,  -0.97352928,
         -4.34849924,  -0.58195469],
       [-14.8962211 , -11.79807738,  -2.25971973,  -0.61348813,
         -3.43947575,  -1.13298012],
       [ -7.08309999,  -3.20833429,  -3.79900742,  -0.75256591,
         -1.73599729,  -1.24145479]])

In [14]:
clf.score(x_data,y_labels)

0.5980264866268502

In [15]:
confusion_matrix(y_Test, y_prediction)

array([[   1,    0,    0,   14,    1,    3],
       [   0,    1,    0,    9,    0,    1],
       [   0,    0,    1,   43,    1,   21],
       [   1,    4,    5, 2483,  123,  178],
       [   1,    1,    0,  398,  308,   25],
       [   1,    0,    2,  751,   29,  216]], dtype=int64)

In [17]:
print(classification_report(y_Test, y_prediction))

                         precision    recall  f1-score   support

             Disability       0.25      0.05      0.09        19
                 Gender       0.17      0.09      0.12        11
   Gender Nonconforming       0.12      0.02      0.03        66
Race/Ethnicity/Ancestry       0.67      0.89      0.76      2794
               Religion       0.67      0.42      0.52       733
     Sexual Orientation       0.49      0.22      0.30       999

               accuracy                           0.65      4622
              macro avg       0.39      0.28      0.30      4622
           weighted avg       0.62      0.65      0.61      4622

