In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw NYC safety dataset from parquet, then preview the first rows
# and print the column/dtype summary.
data = pd.read_parquet(path="../data/safety-Nyc.parquet")
display(data.head(n=5))
data.info()

Unnamed: 0,requestId,dataType,dataSubtype,dateTime,category,subcategory,status,address,latitude,longitude,source,extendedProperties
0,2,Safety,311_All,2020-07-05 23:40:11,UNSANITARY CONDITION,PESTS,Open,640 WEST 139 STREET,40.823048,-73.954211,,
1,5,Safety,311_All,2016-08-06 00:00:00,Standing Water,Puddle in Ground,Closed,2216 VICTORY BOULEVARD,40.612035,-74.137318,,
2,7,Safety,311_All,2018-04-18 08:49:35,UNSANITARY CONDITION,PESTS,Closed,383 SOUTH 3 STREET,40.708739,-73.952169,,
3,8,Safety,311_All,2018-05-30 17:20:11,ELECTRIC,WIRING,Closed,531 WEST 48 STREET,40.764133,-73.994354,,
4,9,Safety,311_All,2018-04-17 08:37:00,Missed Collection (All Materials),1 Missed Collection,Closed,41 RUMPLER COURT,40.625555,-74.142415,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11305633 entries, 0 to 11305632
Data columns (total 12 columns):
 #   Column              Dtype  
---  ------              -----  
 0   requestId           int64  
 1   dataType            object 
 2   dataSubtype         object 
 3   dateTime            object 
 4   category            object 
 5   subcategory         object 
 6   status              object 
 7   address             object 
 8   latitude            float64
 9   longitude           float64
 10  source              object 
 11  extendedProperties  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 1.0+ GB


In [3]:
# Remove the extendedProperties and source columns since they contain no data.
# Only the final expression of a cell is echoed automatically, so the
# value_counts() evidence has to be passed to display() to actually be shown.
display(data["extendedProperties"].value_counts())
data = data.drop(columns=["extendedProperties", "source"])

# All values in dataType are the same, so the column carries no information.
display(data["dataType"].value_counts())
data = data.drop(columns=["dataType"])

In [4]:
# Count the missing values in each remaining column.
data.isnull().sum(axis=0)

requestId            0
dataSubtype          0
dateTime             0
category             0
subcategory      58757
status               0
address        1598289
latitude        759223
longitude       759223
dtype: int64

In [5]:
# data quality check: percentage of rows with a missing value, for each
# column that has any nulls

na_subcategory_percentage = 100 * data["subcategory"].isna().sum() / len(data)
na_address_percentage = 100 * data["address"].isna().sum() / len(data)
na_lat_percentage = 100 * data["latitude"].isna().sum() / len(data)
na_long_percentage = 100 * data["longitude"].isna().sum() / len(data)

# Every line ends with a literal "%" sign (the latitude line previously ended
# with a stray "5", which made the printed value look like 6.7155).
print(f"Percentage of null values for \nsubcategory: {na_subcategory_percentage:.3f}%" +
      f"\naddress: {na_address_percentage:.3f}%" +
      f"\nlatitude: {na_lat_percentage:.3f}%" +
      f"\nlongitude: {na_long_percentage:.3f}%")

Percentage of null values for 
subcategory: 0.520%
address: 14.137%
latitude: 6.7155
longitude: 6.715%


In [6]:
# Keep only rows without any missing value, then discard exact duplicate rows
# (the first occurrence of each duplicate is retained).
data2 = data.dropna(how="any").drop_duplicates(keep="first")

In [7]:
# Print the coordinate extremes (longitude min/max, then latitude min/max) as a
# rough bounding box -- make sure this is around New York City.
for coord in ("longitude", "latitude"):
    print(data2[coord].min())
    print(data2[coord].max())

-74.25495171973925
-73.70038354802529
40.498778539470784
40.91345653278351


In [8]:
# Change data types to optimized types.
# The timestamps are parsed from data2's own column: parsing the unfiltered
# `data` column would also convert every row that was already dropped and would
# only line up with data2 through index alignment. Using .assign() builds a new
# frame instead of writing a column into a filtered frame, which may otherwise
# raise a chained-assignment warning.
data2 = data2.assign(dateTime=pd.to_datetime(data2["dateTime"])).astype(
    {"status": "category", "dataSubtype": "category"}
)

In [9]:
# Renumber the rows 0..n-1; the old index labels are discarded, not kept as a column.
data2 = data2.reset_index(drop=True)
# info() prints its report itself and returns None, so wrapping it in print()
# would only append a stray "None" line to the output.
data2.info()
data2.head()

# dataset went from ~1GB to ~500 MB!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9456777 entries, 0 to 9456776
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   requestId    int64         
 1   dataSubtype  category      
 2   dateTime     datetime64[ns]
 3   category     object        
 4   subcategory  object        
 5   status       category      
 6   address      object        
 7   latitude     float64       
 8   longitude    float64       
dtypes: category(2), datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 523.1+ MB
None


Unnamed: 0,requestId,dataSubtype,dateTime,category,subcategory,status,address,latitude,longitude
0,2,311_All,2020-07-05 23:40:11,UNSANITARY CONDITION,PESTS,Open,640 WEST 139 STREET,40.823048,-73.954211
1,5,311_All,2016-08-06 00:00:00,Standing Water,Puddle in Ground,Closed,2216 VICTORY BOULEVARD,40.612035,-74.137318
2,7,311_All,2018-04-18 08:49:35,UNSANITARY CONDITION,PESTS,Closed,383 SOUTH 3 STREET,40.708739,-73.952169
3,8,311_All,2018-05-30 17:20:11,ELECTRIC,WIRING,Closed,531 WEST 48 STREET,40.764133,-73.994354
4,9,311_All,2018-04-17 08:37:00,Missed Collection (All Materials),1 Missed Collection,Closed,41 RUMPLER COURT,40.625555,-74.142415


In [10]:
# Write the cleaned frame to disk in parquet format.
data2.to_parquet(path="../data/cleaned_NewYorkCity_Data.parquet")