## GOAL: Predict violation type via violation type code using other features

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('./data/dob_violations_data.csv', low_memory=False)

In [4]:
df.head()

Unnamed: 0,bbl,isndobbisviol,boro,bin,block,lot,issuedate,violationtypecode,violationnumber,housenumber,street,dispositiondate,dispositioncomments,devicenumber,description,ecbnumber,number,violationcategory,violationtype
0,3014790000.0,1018012,3,3039615.0,1479,1,2005-01-31,LL6291,15504,880,QUINCY STREET,2005-07-21,IRICIA DISMISSED FULL PENALTY PAID $500 ...,00081537,,,V*013105LL629115504,V*-DOB VIOLATION - DISMISSED,LL6291-LOCAL LAW 62/91 - BOILERS
1,1014350000.0,798462,1,1044200.0,1435,16,2002-03-26,LL6291,04391,330,EAST 61 STREET,2006-06-23,CIAMGZ CANCELLED DEED SUB NEW OWNER AS OF ...,00111092,,,V*032602LL629104391,V*-DOB VIOLATION - DISMISSED,LL6291-LOCAL LAW 62/91 - BOILERS
2,1016710000.0,251606,1,1052648.0,1671,49,1993-03-18,LL6291,10195,1936,2 AVENUE,2000-08-08,"VTUCIA CANCELLED, BLDG HEATED BY 1932 ...",00900563,,,V*031893LL629110195,V*-DOB VIOLATION - DISMISSED,LL6291-LOCAL LAW 62/91 - BOILERS
3,2025230000.0,1071206,2,2097167.0,2523,61,2006-02-02,E,9444/153715,950,UNIVERSITY AVEN,2007-06-11,PPN203 AOC SUBMITTED ON 06/08/2007 BY DYNAMIC ...,002P1188,,,V*020206E9444/153715,V*-DOB VIOLATION - Resolved,E-ELEVATOR
4,2031440000.0,2027969,2,2013386.0,3144,66,2016-02-05,IMEGNCY,9246,2094,VALENTINE AVENUE,2016-02-11,BLDING IS NOT DEMOLISHED. FENCE ERECTED IN PLA...,,02/05/16 DOB : DEC 9246 HAS BEEN UPLOADED AND ...,,V*020516IMEGNCY9246,V*-DOB VIOLATION - DISMISSED,IMEGNCY-IMMEDIATE EMERGENCY


In [5]:
df.columns

Index(['bbl', 'isndobbisviol', 'boro', 'bin', 'block', 'lot', 'issuedate',
       'violationtypecode', 'violationnumber', 'housenumber', 'street',
       'dispositiondate', 'dispositioncomments', 'devicenumber', 'description',
       'ecbnumber', 'number', 'violationcategory', 'violationtype'],
      dtype='object')

In [6]:
for column in df.columns:
    print(column, ': ', df[column].nunique())

bbl :  215454
isndobbisviol :  2020288
boro :  12
bin :  218799
block :  13247
lot :  2414
issuedate :  12654
violationtypecode :  52
violationnumber :  799405
housenumber :  45077
street :  15766
dispositiondate :  10078
dispositioncomments :  466754
devicenumber :  353184
description :  100615
ecbnumber :  146632
number :  2013506
violationcategory :  14
violationtype :  46


y = violationcategory (as it reflects violationtype; so drop violationtype column altogether)

1. trying categorical approach first
    - problem: most features seem non-categorical
2. try neural network with concatenated features
    - remove irrelevant features like violation number first
    - perform concatenation
    - train NN

# Data Cleaning

In [7]:
cols_to_drop = ['bbl', 'isndobbisviol', 'bin', 'block', 'lot', 'violationnumber',\
                'devicenumber', 'dispositioncomments', 'ecbnumber', 'number',\
                'violationtype', 'violationcategory']
processed_df = df.drop(cols_to_drop, axis=1)

In [8]:
processed_df.columns

Index(['boro', 'issuedate', 'violationtypecode', 'housenumber', 'street',
       'dispositiondate', 'description'],
      dtype='object')

In [9]:
X = pd.Series(processed_df.loc[:, processed_df.columns != 'violationtypecode'].fillna('').values.tolist()).str.join('')
y = pd.Series(processed_df['violationtypecode'])

In [10]:
X.head()

0                32005-01-31880QUINCY STREET2005-07-21
1               12002-03-26330EAST 61 STREET2006-06-23
2                    11993-03-1819362 AVENUE2000-08-08
3              22006-02-02950UNIVERSITY AVEN2007-06-11
4    22016-02-052094VALENTINE AVENUE2016-02-1102/05...
dtype: object

In [11]:
y.head()

0     LL6291
1     LL6291
2     LL6291
3          E
4    IMEGNCY
Name: violationtypecode, dtype: object

In [12]:
# Constructing dict for looking up actual violationtypecode
y_keys = []
y_vals = []
for key, val in enumerate(y.unique()):
    y_keys.append(key)
    y_vals.append(val)
y_dict = dict(zip(y_keys, y_vals))
y_dict[0]

'LL6291'

In [13]:
for key in y_dict:
    y.replace(to_replace=y_dict[key], value=key, inplace=True)
y.head()

0    0
1    0
2    0
3    1
4    2
Name: violationtypecode, dtype: int64

# Data Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [16]:
y_test.unique() # A classification with 52 categories

array([ 1, 14,  0,  5,  6, 11,  3, 28,  8,  4, 24, 10, 18, 16,  9, 13, 51,
       34, 39, 12, 31, 30, 33, 23, 26,  7, 25,  2, 20, 35, 17, 19, 49, 22,
       40, 21, 27, 15, 29, 32, 37, 42, 36, 43, 38, 41, 46, 50, 44],
      dtype=int64)

# Modeling

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras import layers
from keras.models import Model
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [18]:
X_train_list = X_train.tolist()
X_test_list = X_test.tolist()

In [19]:
cvec = CountVectorizer()
cvec.fit(X_train_list)
X_train_vec = cvec.transform(X_train_list)
X_test_vec = cvec.transform(X_test_list)

In [20]:
X_train_vec

<1515216x764009 sparse matrix of type '<class 'numpy.int64'>'
	with 12543555 stored elements in Compressed Sparse Row format>

In [24]:
def simpleNN(input_dim, output_dim):
    model = Sequential()
    model.add(layers.Dense(16, input_dim=input_dim, activation='relu', use_bias=True))
    model.add(layers.Dense(8, activation='relu', use_bias=True))
    model.add(layers.Dropout(rate=0.6))
    model.add(layers.Dense(output_dim, activation='sigmoid'))
    model.compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])
    return model

In [25]:
y.shape

(2020288,)

In [None]:
input_dim = X_train_vec.shape[1]
output_dim = y.nunique()

model = simpleNN(input_dim, output_dim)
model.fit(X_train_vec, y_train,
          epochs=4,
          verbose=1,
          validation_data=None,
          batch_size=256)
print(model.summary())

loss, train_accuracy = model.evaluate(X_train_vec, Y_train, verbose=False)
print('Training Accuracy: {:.4f}'.format(train_accuracy))
# loss, val_accuracy = model.evaluate(X_val_vec, Y_val, verbose=False)
# print('Validation Accuracy: {:.4f}'.format(val_accuracy))
loss, test_accuracy = model.evaluate(X_test_vec, Y_test, verbose=False)
print('Testing Accuracy: {:.4f}'.format(test_accuracy))

Epoch 1/4
  16896/1515216 [..............................] - ETA: 3:25:52 - loss: 3.8389 - acc: 0.1883