# Getting and cleaning Housing Maintenance Code Violation (HMCV) data

The HMCV dataset can be viewed online at: https://data.cityofnewyork.us/Housing-Development/Housing-Maintenance-Code-Violations/wvxf-dwi5



First, data from the NYC HMCV dataset are read into a pandas data frame.

In [16]:
import pandas as pd
import numpy as np
import re
from pandas import parser

### 

def get_HMCV_data(source="Housing_Maintenance_Code_Violations.csv"):
    '''
    Read the Housing Maintenance Code Violations (HMCV) data into a data frame.

    source: anything pandas.read_csv accepts (path, URL or file-like object).
        Defaults to the local export "Housing_Maintenance_Code_Violations.csv".
        To read straight from NYC Open Data instead, pass
        "https://data.cityofnewyork.us/api/views/wvxf-dwi5/rows.csv?accessType=DOWNLOAD".

    Returns the parsed CSV as a pandas DataFrame, with no further processing.
    '''
    return pd.read_csv(source)
            
# Load the full dataset once; the cells below keep rebinding HMCV_data
# to progressively narrower versions of it.
HMCV_data = get_HMCV_data()
# Preview the first three rows.
HMCV_data.head(3)

Unnamed: 0,ViolationID,BuildingID,RegistrationID,BoroID,Boro,HouseNumber,LowHouseNumber,HighHouseNumber,StreetName,StreetCode,Zip,Apartment,Story,Block,Lot,Class,InspectionDate,ApprovedDate,OriginalCertifyByDate,OriginalCorrectByDate,Unnamed: 21
0,10304176,45567,202840,2,BRONX,1905,1905,1905,ANDREWS AVENUE SOUTH,8820,10453,1A,1,3221,90,C,07/10/2014,07/11/2014,07/27/2014,07/22/2014,...
1,10340355,41491,105339,1,MANHATTAN,111,111,115,WEST 141 STREET,36590,10030,3H,4,2010,21,B,08/08/2014,08/11/2014,09/30/2014,09/16/2014,...
2,10337179,27609,107359,1,MANHATTAN,272,272,274,SHERMAN AVENUE,30490,10034,3C,3,2228,42,B,08/05/2014,08/08/2014,09/29/2014,09/15/2014,...


In [17]:
# Sanity check: per the online HMCV dataset information, the raw table
# should have 1,131,841 rows and 30 columns.
print(HMCV_data.shape)

(1131841, 30)


Next, the dataset is cleaned as follows:
* Only the following features are included:
    ** BoroID, Boro, Block, Lot, Class, InspectionDate

In [18]:
# Keep only the features used downstream, printing the shape before and
# after so the column reduction is visible.
keep_columns = ['BoroID', 'Boro', 'Block', 'Lot', 'Class', 'InspectionDate']
print(HMCV_data.shape)
HMCV_data = HMCV_data[keep_columns]
print(HMCV_data.shape)

(1131841, 30)
(1131841, 6)


In [19]:
# Delete duplicate rows (if any)
# NOTE(review): drop_duplicates() returns a new data frame and its result is
# not assigned here, so HMCV_data is left unchanged by this cell (the shape
# printed below is the same as before the call). Before assigning the result,
# confirm that is really wanted: at this point only BoroID, Boro, Block, Lot,
# Class and InspectionDate remain, so any rows that agree on all six columns
# would be merged into one.
HMCV_data.drop_duplicates()
print HMCV_data.shape

(1131841, 6)


In [20]:
# Look for conflicts between Boro and BoroID by counting rows for every
# (BoroID, Boro) combination that occurs. Per 'HPD Violation Open Data.pdf',
# the borough codes are:
# 1 = Manhattan, 2 = Bronx, 3 = Brooklyn, 4 = Queens, 5 = Staten Island
boro_pairs = HMCV_data[['BoroID', 'Boro']]
pair_counts = boro_pairs.groupby(['BoroID', 'Boro']).count()
print(pair_counts.iloc[:, [0, 1]])

                      BoroID    Boro
BoroID Boro                         
1      MANHATTAN      239121  239121
2      BRONX          341915  341915
3      BROOKLYN       423966  423966
4      QUEENS         110757  110757
5      STATEN ISLAND   16082   16082

[5 rows x 2 columns]


In [21]:
# The borough name is redundant now that BoroID is kept, so remove it.
HMCV_data = HMCV_data.drop(['Boro'], axis=1)
print(HMCV_data.shape)

(1131841, 5)


Then rows are dropped if:
    * They are incomplete (i.e. only complete records are included in the analytic sample)

In [22]:
# Keep complete records only: discard every row that has a missing value
# in any column.
HMCV_data = HMCV_data.dropna()
print(HMCV_data.shape)

(1131841, 5)


In [23]:
# Show the first six rows of the reduced table.
print(HMCV_data.head(6))

   BoroID  Block  Lot Class InspectionDate
0       2   3221   90     C     07/10/2014
1       1   2010   21     B     08/08/2014
2       1   2228   42     B     08/05/2014
3       1   2153   36     A     03/09/2013
4       3   1419    6     B     08/26/2014
5       2   2488   31     A     07/31/2014

[6 rows x 5 columns]


Only entries with an InspectionDate between 4/1/2014 and 4/1/2015 are included.

In [24]:
# Convert InspectionDate from text to datetime64 so it can be compared
# against the date window below. The format is given explicitly: the rows
# shown above are written month/day/year (e.g. 08/26/2014), and stating it
# removes any month/day ambiguity and avoids per-value format inference.
# NOTE(review): assumes every InspectionDate uses MM/DD/YYYY -- confirm
# against the full file.
HMCV_data['InspectionDate'] = pd.to_datetime(HMCV_data['InspectionDate'], format='%m/%d/%Y')
print(HMCV_data['InspectionDate'].dtype)

datetime64[ns]


In [25]:
# Restrict the sample to the study window, printing the shape before the
# filter and a preview plus the shape after it.
# NOTE(review): both comparisons are strict, so rows dated exactly
# 2014-04-01 or 2015-04-01 are left out -- confirm that this is intended.
print(HMCV_data.shape)
startDate = '20140401'
endDate = '20150401'
after_start = HMCV_data['InspectionDate'] > startDate
before_end = HMCV_data['InspectionDate'] < endDate
HMCV_data = HMCV_data[after_start & before_end]
print(HMCV_data.head(5))
HMCV_data.shape

(1131841, 5)
   BoroID  Block  Lot Class InspectionDate
0       2   3221   90     C     2014-07-10
1       1   2010   21     B     2014-08-08
2       1   2228   42     B     2014-08-05
4       3   1419    6     B     2014-08-26
5       2   2488   31     A     2014-07-31

[5 rows x 5 columns]


(391519, 5)

In [26]:
def make_BBL(borough, block, lot):
    '''
    Build the Borough-Block-Lot (BBL) identifier and return it as an int.

    The pieces are joined as text, in this order:
      * the borough code, as given (one numeric digit);
      * the tax block, left-padded with zeros to five characters;
      * the tax lot, left-padded with zeros to four characters.

    >>> make_BBL(1,16,100)
    1000160100
    >>> make_BBL(3,15828,7501)
    3158287501
    '''
    pieces = (str(borough), str(block).zfill(5), str(lot).zfill(4))
    return int(''.join(pieces))
    
# Compute one BBL per row from that row's borough, block and lot values,
# then preview the result.
HMCV_data['BBL'] = list(map(
    make_BBL,
    HMCV_data['BoroID'],
    HMCV_data['Block'],
    HMCV_data['Lot'],
))
HMCV_data.head(5)

Unnamed: 0,BoroID,Block,Lot,Class,InspectionDate,BBL
0,2,3221,90,C,2014-07-10,2032210090
1,1,2010,21,B,2014-08-08,1020100021
2,1,2228,42,B,2014-08-05,1022280042
4,3,1419,6,B,2014-08-26,3014190006
5,2,2488,31,A,2014-07-31,2024880031


Drop BoroID, Block, Lot (since we have BBL now), and InspectionDate

In [27]:
# BBL now stands in for BoroID/Block/Lot, and InspectionDate is no longer
# needed once the date filter has been applied; only Class and BBL remain.
obsolete_columns = ['BoroID', 'Block', 'Lot', 'InspectionDate']
HMCV_data = HMCV_data.drop(obsolete_columns, axis=1)
print(HMCV_data.shape)

(391519, 2)


Drop rows with the class of I, since the only valid classes are A, B, and C (per 'HPD Violation Open Data.pdf')

In [28]:
# Remove the rows whose Class is 'I'; every other row is kept.
not_class_i = HMCV_data['Class'] != 'I'
HMCV_data = HMCV_data[not_class_i]
print(HMCV_data.shape)
# Optional export of the (Class, BBL) rows, left disabled:
#HMCV_data.to_csv('HMCV_BBL_Class.csv')

(341841, 2)


Reshape the data to one row per BBL, with a count of violations for each class (A, B, C)

In [29]:
# Count the rows for every (BBL, Class) pair ...
class_counts = HMCV_data.groupby(['BBL', 'Class']).size().reset_index()
class_counts.columns = ['BBL', 'Class', 'Total Class Count']

# ... then spread the classes into columns, giving one row per BBL.
HMCV_data = class_counts.pivot(index='BBL', columns='Class', values='Total Class Count')

# Move BBL from the index into an ordinary (last) column.
HMCV_data['BBL'] = HMCV_data.index
HMCV_data = HMCV_data.reset_index(drop=True)

# Positional rename: assumes the pivot produced exactly three class columns,
# in A, B, C order, followed by the BBL column added above.
HMCV_data.columns = ['Class A Violations', 'Class B Violations', 'Class C Violations', 'BBL']

# A BBL with no violations of a given class has no count for it; use 0.
HMCV_data = HMCV_data.fillna(0)
HMCV_data.head(5)

Unnamed: 0,Class A Violations,Class B Violations,Class C Violations,BBL
0,0,0,1,1000157501
1,1,14,1,1000160100
2,0,0,1,1000167508
3,1,4,0,1000167515
4,2,5,0,1000167516


In [30]:
# Final size of the per-BBL table: one row per BBL, with the three class
# count columns plus BBL.
HMCV_data.shape

(27725, 4)