In [1]:
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

# Display the value of every top-level expression in a cell, not just the last
# one; later cells list several expressions and show a table for each.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
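## Split the oversized CSV into chunks of at most 4,000,000 lines each (run once).
## Each `!` command runs in its own subshell, so a separate `! cd data` would not
## affect the lines that follow it; the paths are therefore written out in full.
## Only the first chunk (311a.csv) keeps the header row.
# ! split -l 4000000 data/311.csv data/x
# ! mv data/xaa data/311a.csv
# ! mv data/xab data/311b.csv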

In [3]:
%%time

with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)

    df311a = pd.read_csv('data/311a.csv', skiprows=None, nrows=None, index_col=0)
    df311b = pd.read_csv('data/311b.csv', skiprows=None, nrows=None, index_col=0, header=None)

df311b.columns = df311a.columns
df311 = pd.concat([df311a, df311b])

CPU times: user 36.2 s, sys: 2.32 s, total: 38.5 s
Wall time: 37.7 s


In [4]:
# Replace the verbose source headers with short names that are valid Python
# identifiers, so later cells can use attribute access (df311.Created, ...).
df311 = df311.rename(
    axis='columns',
    mapper={
        'Unique Key':             'Key',
        'Created Date':           'Created',
        'Closed Date':            'Closed',
        'Complaint Type':         'Complaint',
        'Location Type':          'Building_type',
        'Incident Zip':           'Zip',  # stays float: the column has missing values
        'Incident Address':       'Address',
        'Street Name':            'Street',
        'Address Type':           'Address_type',
        'Resolution Description': 'Description',
        'Borough':                'Boro',
    },
)

In [5]:
%%time
df311.Created = pd.to_datetime(df311.Created, format='%m/%d/%Y %I:%M:%S %p')
df311.Closed = pd.to_datetime(df311.Closed, format='%m/%d/%Y %I:%M:%S %p')

CPU times: user 34.2 s, sys: 500 ms, total: 34.7 s
Wall time: 34.1 s


In [6]:
# Compact one-row preview of the frame.
# Option keys are fully qualified: the short forms ('max_columns', 'max_rows')
# are regex patterns that can match more than one option (for example the
# styler.render.* options) and then make set_option raise OptionError.
pd.set_option('display.max_columns', 9)
pd.set_option('display.max_colwidth', 13)
pd.set_option('display.max_rows', 1)
df311

Unnamed: 0,Key,Created,Closed,Complaint,...,Description,Boro,Latitude,Longitude
0,45531130,2020-02-02...,NaT,HEAT/HOT ...,...,The follo...,MANHATTAN,40.765132,-73.988993


In [7]:
# Coordinate summary: one decimal place, default row/column display limits.
# Option keys are fully qualified because the short forms are regex patterns
# that can match several options (e.g. styler.render.max_rows as well as
# display.max_rows), which makes set_option fail and reset_option reset
# unrelated settings.
pd.set_option('display.float_format', '{:.1f}'.format)
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
df311[['Latitude','Longitude']].describe().T

# Value-count tables: default float formatting, at most 6 columns shown.
pd.reset_option('display.float_format')
pd.set_option('display.max_columns', 6)
print('\n=============== Value Counts ===============\n')

df311.Status.value_counts().to_frame().T
# Number of rows with no building type, taken before the relabelling below.
df311.Building_type.isna().sum()

# df311.Address_type.value_counts().to_frame().T
# df311.Address_type.isna().sum()
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df311 = df311.drop(columns=['Address_type'], errors='ignore')

# Raw location-type labels -> short labels.
Building_type = {
    'RESIDENTIAL BUILDING': 'Residence',
    'Residential Building': 'Residence',
    'Apartment':            'Apartment',
    'Building-Wide':        'Buildingwide',
    'Public Area':          'Public',
}
# map() on its own turns every label that is not a dict key into NaN, so a
# second run of this cell would blank out the already-relabelled values.
# fillna() puts the original value back wherever no mapping applies.
df311['Building_type'] = (
    df311['Building_type'].map(Building_type).fillna(df311['Building_type'])
)
df311.Building_type.value_counts().to_frame().T

# Raw complaint-type labels (several spellings/casings per category) -> short labels.
Complaint = {
    'HEAT/HOT WATER':         'Hotwater',
    'HEATING':                'Heat',
    'PLUMBING':               'Plumbing',
    'GENERAL CONSTRUCTION':   'Construction',
    'UNSANITARY CONDITION':   'Unsanitary',
    'PAINT - PLASTER':        'Paint',
    'PAINT/PLASTER':          'Paint',
    'ELECTRIC':               'Electric',
    'NONCONST':               'Nonconstruction',
    'DOOR/WINDOW':            'Door',
    'WATER LEAK':             'Leak',
    'GENERAL':                'General',
    'FLOORING/STAIRS':        'Stairs',
    'APPLIANCE':              'Appliance',
    'HPD Literature Request': 'Literature',
    'SAFETY':                 'Safety',
    'OUTSIDE BUILDING':       'Outside',
    'ELEVATOR':               'Elevator',
    'Unsanitary Condition':   'Unsanitary',
    'CONSTRUCTION':           'Construction',
    'General':                'General',
    'Safety':                 'Safety',
    'STRUCTURAL':             'Structural',
    'Plumbing':               'Plumbing',
    'AGENCY':                 'Agency',
    'VACANT APARTMENT':       'Vacancy',
    'Outside Building':       'Outside',
    'Appliance':              'Appliance',
    'Mold':                   'Mold',
    'Electric':               'Electric',
}
# map() on its own turns every label that is not a dict key into NaN, so a
# second run of this cell would wipe most of the already-relabelled values
# (e.g. 'Hotwater' is not a key). fillna() puts the original value back
# wherever no mapping applies.
df311['Complaint'] = df311['Complaint'].map(Complaint).fillna(df311['Complaint'])
df311.Complaint.value_counts().to_frame().T

# Treat the 'Unspecified' placeholder borough as missing.
df311['Boro'] = df311['Boro'].where(df311['Boro'] != 'Unspecified')
df311['Boro'].value_counts().to_frame().T

# Convert city names to title case.
df311['City'] = df311['City'].str.title()
df311['City'].value_counts().to_frame().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Latitude,5939172.0,40.8,0.1,40.5,40.7,40.8,40.8,40.9
Longitude,5939172.0,-73.9,0.1,-74.3,-74.0,-73.9,-73.9,-73.7






Unnamed: 0,Closed,Open,In Progress,Assigned,Pending
Status,5886253,133220,364,4,2


52824

Unnamed: 0,Residence,Apartment,Buildingwide,Public
Building_type,5967008,5,4,2


Unnamed: 0,Hotwater,Heat,Plumbing,...,Agency,Vacancy,Mold
Complaint,1261574,887850,711141,...,9,6,1


Unnamed: 0,BROOKLYN,BRONX,MANHATTAN,QUEENS,STATEN ISLAND
Boro,1739886,1617956,1055225,645971,87584


Unnamed: 0,Brooklyn,Bronx,New York,...,Floral Park,Breezy Point,New Hyde Park
City,2026580,1860522,1204417,...,287,269,112


In [8]:
# Write the cleaned frame to a pickle file; %time reports how long the write takes.
%time df311.to_pickle('data/311.pkl')

CPU times: user 6.59 s, sys: 2.41 s, total: 9.01 s
Wall time: 10.3 s


In [9]:
# Missing-value count per column.
df311.isna().sum().to_frame().rename(columns={0:'isna'})

# pd.reset_option('display.float_format')
# pd.set_option('display.max_columns', 9)
# pd.set_option('display.max_colwidth', 13)
# pd.set_option('display.max_rows', 1)
# df311

# Show every row and column of the summaries below, floats with two decimals.
# Option keys are fully qualified: the short forms ('max_columns', 'max_rows')
# are regex patterns that can match more than one option (for example the
# styler.render.* options) and then make set_option raise OptionError.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df311.describe(percentiles=[]).T

# Summary of the text (object) columns, with cell text cut at 20 characters.
pd.set_option('display.max_colwidth', 20)
df311.describe(include='object').T

Unnamed: 0,isna
Key,0
Created,0
Closed,126657
Complaint,0
Building_type,52824
Zip,80697
Address,52825
Street,52825
City,80274
Status,0


Unnamed: 0,count,mean,std,min,50%,max
Key,6019843.0,29562536.56,7909439.23,15629728.0,29310097.0,45532043.0
Zip,5939146.0,10746.98,513.02,10001.0,10469.0,12345.0
Latitude,5939172.0,40.75,0.09,40.5,40.76,40.91
Longitude,5939172.0,-73.92,0.06,-74.25,-73.92,-73.7


Unnamed: 0,count,unique,top,freq
Complaint,6019843,21,Hotwater,1261574
Building_type,5967019,4,Residence,5967008
Address,5967018,182600,34 ARDEN STREET,14298
Street,5967018,6825,GRAND CONCOURSE,92450
City,5939569,47,Brooklyn,2026580
Status,6019843,5,Closed,5886253
Description,6012017,340,The Department o...,1698990
Boro,5146622,5,BROOKLYN,1739886


### Next Steps
1. Investigate whether the difference between `Address_type == 'ADDRESS'` and a
   missing (`NaN`) `Address_type` carries any information.
   If it does, keep the `Address_type` column instead of dropping it.