In [1]:
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import seaborn as sns

# Echo the value of every top-level expression in a cell, not only the last
# one, so a single cell can show several value_counts()/describe() results.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
## SPLIT oversized csv (one-off shell step, kept commented out for reference)
# Cuts 311.csv every 4,000,000 lines and renames the two resulting pieces.
# Only the first piece (311a.csv) keeps the header row, which is why
# 311b.csv is later read with header=None.
# NOTE(review): each `!` line runs in its own subshell, so `! cd data` would
# not change the directory seen by the `split`/`mv` lines -- confirm these
# were run from inside data/ (or use `%cd data`) before re-enabling them.
# ! cd data
# ! split -l 4000000 311.csv
# ! mv xaa 311a.csv
# ! mv xab 311b.csv

In [3]:
%%time
df311a = pd.read_csv('data/311a.csv', skiprows=None, nrows=None, index_col=0)
df311b = pd.read_csv('data/311b.csv', skiprows=None, nrows=None, index_col=0, header=None)
df311b.columns = df311a.columns
df311 = pd.concat([df311a, df311b])

  mask |= (ar1 == a)


CPU times: user 37.1 s, sys: 2.59 s, total: 39.6 s
Wall time: 39.3 s


In [4]:
# Replace the verbose raw headers with the short names used by later cells.
df311 = df311.rename(
    {
        'Unique Key':             'Key',
        'Created Date':           'Created',
        'Closed Date':            'Closed',
        'Complaint Type':         'Complaint',
        'Location Type':          'Building_type',
        # Zip has to stay float: some rows have no zip code (NaN).
        'Incident Zip':           'Zip',
        'Incident Address':       'Address',
        'Street Name':            'Street',
        'Address Type':           'Address_type',
        'Resolution Description': 'Description',
        'Borough':                'Boro',
    },
    axis='columns',
)

In [5]:
%%time
df311.Created = pd.to_datetime(df311.Created, format='%m/%d/%Y %I:%M:%S %p')
df311.Closed = pd.to_datetime(df311.Closed, format='%m/%d/%Y %I:%M:%S %p')

CPU times: user 40.2 s, sys: 1.14 s, total: 41.4 s
Wall time: 43.1 s


In [6]:
# Cap frame/series reprs at two rows (first and last) to keep the preview short.
pd.set_option('display.max_rows', 2)
df311

Unnamed: 0,Key,Created,Closed,Complaint,Building_type,Zip,Address,Street,Address_type,City,Status,Description,Boro,Latitude,Longitude
0,45531130,2020-02-02 06:09:17,NaT,HEAT/HOT WATER,RESIDENTIAL BUILDING,10019.0,426 WEST 52 STREET,WEST 52 STREET,ADDRESS,NEW YORK,Open,The following complaint conditions are still o...,MANHATTAN,40.765132,-73.988993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6019842,44063737,2019-10-15 11:29:01,2019-10-15 21:19:45,PAINT/PLASTER,RESIDENTIAL BUILDING,10009.0,143 AVENUE D,AVENUE D,ADDRESS,NEW YORK,Closed,The Department of Housing Preservation and Dev...,MANHATTAN,40.724749,-73.975269


In [7]:
# How many tickets are in each status.
df311['Status'].value_counts()

# Look at the Address_type counts, then drop the column (the recorded output
# lists only 'ADDRESS').
df311['Address_type'].value_counts()
df311 = df311.drop(columns='Address_type')

# Show coordinates with four decimals while summarising their range.
pd.set_option('display.float_format', '{:.4f}'.format)
df311[['Latitude', 'Longitude']].describe()

# Raw "Location Type" spellings -> short labels. Series.map turns any value
# that is not a key of this dict into NaN.
Building_type = {
    'RESIDENTIAL BUILDING': 'Residence',
    'Residential Building': 'Residence',
    'Apartment': 'Apartment',
    'Building-Wide': 'Buildingwide',
    'Public Area': 'Public',
}
df311['Building_type'] = df311['Building_type'].map(Building_type)

# Lift the row cap so the full value_counts table is shown.
pd.set_option('display.max_rows', None)
df311['Building_type'].value_counts()

# Raw "Complaint Type" spellings -> short labels, grouped by theme. Upper-case
# and mixed-case variants of the same complaint share one label. Series.map
# turns any value that is not a key of this dict into NaN.
Complaint = {
    # heat and hot water
    'HEAT/HOT WATER': 'Hotwater',
    'HEATING': 'Heat',
    # plumbing and leaks
    'PLUMBING': 'Plumbing',
    'Plumbing': 'Plumbing',
    'WATER LEAK': 'Leak',
    # construction and structure
    'GENERAL CONSTRUCTION': 'Construction',
    'CONSTRUCTION': 'Construction',
    'NONCONST': 'Nonconstruction',
    'STRUCTURAL': 'Structural',
    # sanitation
    'UNSANITARY CONDITION': 'Unsanitary',
    'Unsanitary Condition': 'Unsanitary',
    'Mold': 'Mold',
    # paint and plaster
    'PAINT - PLASTER': 'Paint',
    'PAINT/PLASTER': 'Paint',
    # electric
    'ELECTRIC': 'Electric',
    'Electric': 'Electric',
    # doors, stairs, elevators
    'DOOR/WINDOW': 'Door',
    'FLOORING/STAIRS': 'Stairs',
    'ELEVATOR': 'Elevator',
    # general
    'GENERAL': 'General',
    'General': 'General',
    # appliances
    'APPLIANCE': 'Appliance',
    'Appliance': 'Appliance',
    # safety
    'SAFETY': 'Safety',
    'Safety': 'Safety',
    # building exterior
    'OUTSIDE BUILDING': 'Outside',
    'Outside Building': 'Outside',
    # administrative
    'HPD Literature Request': 'Literature',
    'AGENCY': 'Agency',
    'VACANT APARTMENT': 'Vacancy',
}
df311['Complaint'] = df311['Complaint'].map(Complaint)

# No row cap, so every complaint label appears in the counts.
pd.set_option('display.max_rows', None)
df311['Complaint'].value_counts()

Closed     5886253
            ...   
Pending          2
Name: Status, Length: 5, dtype: int64

ADDRESS    5935078
Name: Address_type, dtype: int64

Unnamed: 0,Latitude,Longitude
count,5939172.0000,5939172.0000
...,...,...
max,40.9129,-73.7008


Residence       5967008
Apartment             5
Buildingwide          4
Public                2
Name: Building_type, dtype: int64

Hotwater           1261574
Heat                887850
Plumbing            711141
Paint               707695
Construction        505941
Unsanitary          457142
Electric            307311
Nonconstruction     260890
Door                205278
Leak                193631
General             152471
Stairs              137402
Appliance           112835
Literature           52824
Safety               51953
Outside               7148
Elevator              6725
Structural              16
Agency                   9
Vacancy                  6
Mold                     1
Name: Complaint, dtype: int64

In [8]:
%%time

# df311.City.isna().sum()
# df311.index[df311.City.isna()]
# df311.loc[2300:2350,'City'].str.title() # Yes, .str.title handles NaN

%time df311.City = df311.City.str.title()
pd.options.display.max_rows = None
df311.City.value_counts()

CPU times: user 3.58 s, sys: 218 ms, total: 3.79 s
Wall time: 3.58 s
CPU times: user 4.46 s, sys: 231 ms, total: 4.69 s
Wall time: 4.49 s


Brooklyn               2026580
Bronx                  1860522
New York               1204417
Staten Island           101553
Jamaica                  97792
Astoria                  60604
Flushing                 59558
Far Rockaway             51637
Ridgewood                47811
Elmhurst                 44037
Corona                   34133
Woodside                 30300
Jackson Heights          29661
Forest Hills             20772
Rego Park                19230
Sunnyside                17274
Queens Village           16146
South Richmond Hill      15843
Ozone Park               15177
Richmond Hill            14984
East Elmhurst            13465
Saint Albans             13382
Woodhaven                12963
Kew Gardens              12863
Hollis                   12591
Arverne                  12498
South Ozone Park         12161
Springfield Gardens      11071
Long Island City         10699
Rockaway Park            10557
Rosedale                  7643
Maspeth                   6045
Bayside 

In [9]:
%%time
df311.loc[df311.Boro=='Unspecified', 'Boro'] = np.nan
pd.options.display.max_rows = None
df311.Boro.value_counts()

CPU times: user 1min 12s, sys: 3.29 s, total: 1min 15s
Wall time: 1min 18s


BROOKLYN         1739886
BRONX            1617956
MANHATTAN        1055225
QUEENS            645971
STATEN ISLAND      87584
Name: Boro, dtype: int64

In [10]:
# Restore default float rendering, preview the cleaned frame, then persist it
# for later notebooks.
pd.set_option('display.float_format', None)
df311.head(5)
df311.to_pickle('data/311.pkl')

Unnamed: 0,Key,Created,Closed,Complaint,Building_type,Zip,Address,Street,City,Status,Description,Boro,Latitude,Longitude
0,45531130,2020-02-02 06:09:17,NaT,Hotwater,Residence,10019.0,426 WEST 52 STREET,WEST 52 STREET,New York,Open,The following complaint conditions are still o...,MANHATTAN,40.765132,-73.988993
1,45529784,2020-02-02 14:15:24,NaT,Unsanitary,Residence,11204.0,1751 67 STREET,67 STREET,Brooklyn,Open,The following complaint conditions are still o...,BROOKLYN,40.618484,-73.992673
2,45527528,2020-02-02 02:27:41,NaT,Hotwater,Residence,11372.0,87-15 37 AVENUE,37 AVENUE,Jackson Heights,Open,The following complaint conditions are still o...,QUEENS,40.750269,-73.879432
3,45530329,2020-02-02 12:13:18,NaT,Hotwater,Residence,10458.0,2405 SOUTHERN BOULEVARD,SOUTHERN BOULEVARD,Bronx,Open,The following complaint conditions are still o...,BRONX,40.853773,-73.881558
4,45528814,2020-02-02 13:59:44,NaT,Appliance,Residence,11209.0,223 78 STREET,78 STREET,Brooklyn,Open,The following complaint conditions are still o...,BROOKLYN,40.629745,-74.030533
