In [3]:
# import pandas and read in the csv file
import pandas as pd
# path to the 311 CSV extract (relative, so it is resolved against the working directory)
DATA_FILE = "nyc311_011523-012123_by022023.csv"
df = pd.read_csv(DATA_FILE)

Is there a statistically significant difference in total duration time between agencies?

are the types of complaints statistically different between boroughs? between agencies?

overall question: can we predict the borough that the 311 request came from?

In [4]:
# look at the first few rows (head() shows 5 by default) to get a feel for the columns
df.head()

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,...,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
0,56532399,01/15/2023 12:00:00 AM,01/17/2023 12:00:01 AM,DOHMH,Department of Health and Mental Hygiene,Food Poisoning,1 or 2,Restaurant/Bar/Deli/Bakery,11101.0,28-07 JACKSON AVENUE,...,,,,,,,,40.748433,-73.938768,"(40.74843329250923, -73.93876844606294)"
1,56533302,01/15/2023 12:00:00 AM,01/23/2023 12:00:01 AM,DOHMH,Department of Health and Mental Hygiene,Food Poisoning,1 or 2,Other (Explain Below),11101.0,48-18 NORTHERN BOULEVARD,...,,,,,,,,40.753619,-73.914245,"(40.75361894380833, -73.91424532268624)"
2,56533451,01/15/2023 12:00:00 AM,01/17/2023 09:34:08 AM,DOHMH,Department of Health and Mental Hygiene,Food Poisoning,1 or 2,Food Cart Vendor,10305.0,1941 HYLAN BOULEVARD,...,,,,,,,,40.582218,-74.097217,"(40.58221767593629, -74.09721689751461)"
3,56536021,01/15/2023 12:00:00 AM,01/17/2023 09:48:19 AM,DOHMH,Department of Health and Mental Hygiene,Food Poisoning,1 or 2,Restaurant/Bar/Deli/Bakery,11103.0,24-11 STEINWAY STREET,...,,,,,,,,40.768674,-73.911074,"(40.76867420855789, -73.91107425623979)"
4,56538004,01/15/2023 12:00:00 AM,01/23/2023 03:08:02 PM,DOHMH,Department of Health and Mental Hygiene,Food Poisoning,3 or More,Other (Explain Below),10036.0,700 8 AVENUE,...,,,,,,,,40.758662,-73.988738,"(40.75866221848322, -73.98873795996684)"


Takeaways from first look:
1. Do the Agency and Agency Name columns contain the same data?
2. Do the Longitude, Latitude, and Location columns contain the same data?
3. Does Unique Key column provide us with any insight?
4. Why are there so many null values?

In [5]:
# put 'Agency' and 'Agency Name' side by side to see whether one of them is redundant
agency_cols = ['Agency', 'Agency Name']
check1 = df[agency_cols]
check1.head(20)

Unnamed: 0,Agency,Agency Name
0,DOHMH,Department of Health and Mental Hygiene
1,DOHMH,Department of Health and Mental Hygiene
2,DOHMH,Department of Health and Mental Hygiene
3,DOHMH,Department of Health and Mental Hygiene
4,DOHMH,Department of Health and Mental Hygiene
5,NYPD,New York City Police Department
6,NYPD,New York City Police Department
7,NYPD,New York City Police Department
8,NYPD,New York City Police Department
9,NYPD,New York City Police Department


In [6]:
# put the coordinate columns next to 'Location' to see whether 'Location' only repeats them
coordinate_cols = ['Longitude', 'Latitude', 'Location']
check2 = df[coordinate_cols]
check2.head(20)

Unnamed: 0,Longitude,Latitude,Location
0,-73.938768,40.748433,"(40.74843329250923, -73.93876844606294)"
1,-73.914245,40.753619,"(40.75361894380833, -73.91424532268624)"
2,-74.097217,40.582218,"(40.58221767593629, -74.09721689751461)"
3,-73.911074,40.768674,"(40.76867420855789, -73.91107425623979)"
4,-73.988738,40.758662,"(40.75866221848322, -73.98873795996684)"
5,-73.870409,40.863611,"(40.8636112787934, -73.87040864262848)"
6,-73.83758,40.702772,"(40.70277200816068, -73.83758019372564)"
7,-73.981921,40.687196,"(40.68719608843765, -73.98192072171082)"
8,-73.858906,40.832358,"(40.83235751279932, -73.85890589915455)"
9,-73.947833,40.773667,"(40.773667438961155, -73.94783272698673)"


In [7]:
# Drop columns that add nothing to the analysis:
#   - 'Unique Key'  : not used in the analysis
#   - 'Location'    : repeats Latitude/Longitude as a single string
#   - 'Agency Name' : long form of the 'Agency' abbreviation
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
redundant_cols = ['Unique Key', 'Location', 'Agency Name']
df = df.drop(columns=redundant_cols, errors='ignore')

One of the biggest issues when cleaning data is missing values:

In [8]:
# percentage of missing values per column
null_counts = df.isnull().sum()
print(null_counts * 100 / len(df))

Created Date                       0.000000
Closed Date                        9.502653
Agency                             0.000000
Complaint Type                     0.000000
Descriptor                         1.630285
Location Type                     13.563678
Incident Zip                       1.375094
Incident Address                   4.743983
Street Name                        4.743983
Cross Street 1                    35.756118
Cross Street 2                    35.756118
Intersection Street 1             39.984211
Intersection Street 2             39.910775
Address Type                       0.756394
City                               4.360278
Landmark                          47.135068
Facility Type                     99.478603
Status                             0.000000
Due Date                          99.809066
Resolution Description             3.214673
Resolution Action Updated Date     3.130221
Community Board                    0.000000
BBL                             

In [9]:
# All of these columns have over 99% of their data missing; given the time
# constraint it is not reasonable to try to fill this data in, so drop them.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
mostly_missing_cols = [
    'Bridge Highway Segment',
    'Road Ramp',
    'Bridge Highway Direction',
    'Bridge Highway Name',
    'Taxi Pick Up Location',
    'Taxi Company Borough',
    'Vehicle Type',
    'Due Date',
    'Facility Type',
]
df = df.drop(columns=mostly_missing_cols, errors='ignore')

Another good practice is to check the descriptive statistics of our numeric variables:

In [10]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Incident Zip,BBL,X Coordinate (State Plane),Y Coordinate (State Plane),Latitude,Longitude
count,53720.0,48244.0,53495.0,53526.0,53491.0,53491.0
mean,10807.120514,2695698000.0,1005072.0,207697.450342,40.736701,-73.924827
std,534.867544,1148508000.0,20941.08,32076.816492,0.088046,0.075525
min,10000.0,0.0,914050.0,121152.0,40.498949,-74.252452
25%,10451.0,2027820000.0,993469.5,183297.0,40.669735,-73.966757
50%,11201.0,3016035000.0,1004174.0,205608.5,40.730943,-73.928097
75%,11232.0,4000588000.0,1017478.0,237620.0,40.818878,-73.880081
max,12345.0,5200430000.0,1067177.0,271730.0,40.912468,-73.700743


In [11]:
# count the rows whose zip code is exactly 10000 (the minimum reported by
# describe()); these rows are removed afterwards
is_zip_10000 = df['Incident Zip'] == 10000.0
count = is_zip_10000.sum()
print(count)

23


In [12]:
# remove the rows whose zip code is 10000 (rows with a missing zip are kept)
zip_10000_rows = df.index[df['Incident Zip'] == 10000.0]
df = df.drop(index=zip_10000_rows)

What about more tedious problems?

In [13]:
# frequency of every distinct 'Descriptor' value, with missing values counted too
descriptor_counts = df["Descriptor"].value_counts(dropna=False)
print(descriptor_counts)

ENTIRE BUILDING                5031
Loud Music/Party               3839
APARTMENT ONLY                 2651
Blocked Hydrant                2647
No Access                      2115
                               ... 
Equipment Complaint               1
Announcements                     1
Bracket Arm Loose                 1
Domestic Employee                 1
Air: Odor, Nail Salon (AD8)       1
Name: Descriptor, Length: 613, dtype: int64


In [14]:
OTHER = 'Other'

# 'Location Type': nulls and the 'Other (Explain Below)' variant both become plain 'Other'
df['Location Type'] = (
    df['Location Type']
    .fillna(OTHER)
    .replace('Other (Explain Below)', OTHER)
)

# 'Descriptor': same treatment, collapsing every spelling of "other" into one label
descriptor_other_variants = [
    'Other (Explain Below)',
    'Other (complaint details)',
    'Other/Unknown',
]
df['Descriptor'] = (
    df['Descriptor']
    .fillna(OTHER)
    .replace(descriptor_other_variants, OTHER)
)

In [15]:
# distribution of 'Address Type' values; dropna=False also counts the missing ones
df["Address Type"].value_counts(dropna=False)

ADDRESS         49341
INTERSECTION     3792
BLOCKFACE         639
NaN               412
UNRECOGNIZED      201
PLACENAME          61
Name: Address Type, dtype: int64

In [16]:
# treat a missing address type as 'UNRECOGNIZED', then re-check the distribution
address_type = df['Address Type'].fillna('UNRECOGNIZED')
df['Address Type'] = address_type

address_type.value_counts(dropna=False)

ADDRESS         49341
INTERSECTION     3792
BLOCKFACE         639
UNRECOGNIZED      613
PLACENAME          61
Name: Address Type, dtype: int64

In [17]:
# label rows without a park facility name as 'Unspecified' instead of leaving nulls
park_facility = df['Park Facility Name']
df['Park Facility Name'] = park_facility.fillna('Unspecified')

In [18]:
# distribution of 'City' values; dropna=False also counts the missing ones
df["City"].value_counts(dropna=False)

BROOKLYN               16389
BRONX                  11567
NEW YORK                9967
NaN                     2375
STATEN ISLAND           2173
JAMAICA                 1079
ASTORIA                  921
FLUSHING                 783
QUEENS                   754
RIDGEWOOD                673
MANHATTAN                467
FAR ROCKAWAY             454
LONG ISLAND CITY         433
WOODSIDE                 420
FRESH MEADOWS            400
CORONA                   379
JACKSON HEIGHTS          349
ELMHURST                 322
SOUTH RICHMOND HILL      318
OZONE PARK               317
EAST ELMHURST            282
MASPETH                  278
HOWARD BEACH             277
FOREST HILLS             232
SOUTH OZONE PARK         220
RICHMOND HILL            211
MIDDLE VILLAGE           201
WOODHAVEN                198
REGO PARK                195
BAYSIDE                  195
QUEENS VILLAGE           193
WHITESTONE               168
COLLEGE POINT            160
HOLLIS                   133
SPRINGFIELD GA

In [19]:
# convert the 'City' column to lowercase (missing values stay missing)
df['City'] = df['City'].str.lower()

In [20]:
# percentage of missing values per column, re-checked after the fills above
missing_per_column = df.isnull().sum()
print(missing_per_column * 100 / len(df))

Created Date                       0.000000
Closed Date                        9.484627
Agency                             0.000000
Complaint Type                     0.000000
Descriptor                         0.000000
Location Type                      0.000000
Incident Zip                       1.375675
Incident Address                   4.745987
Street Name                        4.745987
Cross Street 1                    35.769386
Cross Street 2                    35.765713
Intersection Street 1             39.995592
Intersection Street 2             39.918451
Address Type                       0.000000
City                               4.362120
Landmark                          47.149469
Status                             0.000000
Resolution Description             3.195827
Resolution Action Updated Date     3.111340
Community Board                    0.000000
BBL                               11.403960
Borough                            0.000000
X Coordinate (State Plane)      

Now we get into the more interesting steps:

In [21]:
# inspect the raw 'Resolution Action Updated Date' values (missing ones included) before parsing them
df['Resolution Action Updated Date'].value_counts(dropna=False)

01/18/2023 12:00:00 AM    2576
01/17/2023 12:00:00 AM    2032
01/19/2023 12:00:00 AM    1942
01/20/2023 12:00:00 AM    1761
NaN                       1694
                          ... 
01/17/2023 04:10:15 PM       1
01/17/2023 06:04:55 PM       1
01/17/2023 05:12:51 PM       1
01/17/2023 04:54:54 PM       1
01/23/2023 10:02:46 AM       1
Name: Resolution Action Updated Date, Length: 29935, dtype: int64

In [22]:
# Parse 'Resolution Action Updated Date' into datetimes.
# The raw values look like '01/18/2023 12:00:00 AM' (month/day/year, 12-hour
# clock). Passing the format explicitly avoids per-element format guessing,
# which is slow and leaves the day/month order up to the parser.
# Missing values become NaT.
RESOLUTION_DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
df['Resolution Action Updated Date'] = pd.to_datetime(
    df['Resolution Action Updated Date'],
    format=RESOLUTION_DATE_FORMAT,
)

In [23]:
# peek at the creation and closing timestamps side by side
date_cols = ['Created Date', 'Closed Date']
two_columns_df = df[date_cols]

# last expression of the cell, so it is rendered as the cell output
two_columns_df

Unnamed: 0,Created Date,Closed Date
0,01/15/2023 12:00:00 AM,01/17/2023 12:00:01 AM
1,01/15/2023 12:00:00 AM,01/23/2023 12:00:01 AM
2,01/15/2023 12:00:00 AM,01/17/2023 09:34:08 AM
3,01/15/2023 12:00:00 AM,01/17/2023 09:48:19 AM
4,01/15/2023 12:00:00 AM,01/23/2023 03:08:02 PM
...,...,...
54464,01/22/2023 12:00:00 AM,01/22/2023 04:29:17 AM
54465,01/22/2023 12:00:00 AM,01/23/2023 10:03:08 AM
54466,01/22/2023 12:00:00 AM,01/23/2023 10:05:39 AM
54467,01/22/2023 12:00:00 AM,02/01/2023 12:00:01 AM


In [24]:
# Convert the "Created Date" and "Closed Date" columns to datetime objects.
# The raw values look like '01/15/2023 12:00:00 AM' (month/day/year, 12-hour
# clock). Passing the format explicitly avoids per-element format guessing,
# which is slow and leaves the day/month order up to the parser.
DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
df['Created Date'] = pd.to_datetime(df['Created Date'], format=DATE_FORMAT)
# 'Closed Date' has missing values; errors='coerce' maps anything unparseable to NaT
df['Closed Date'] = pd.to_datetime(df['Closed Date'], format=DATE_FORMAT, errors='coerce')

In [23]:
# look at the creation and closing timestamps side by side again
timestamp_cols = ['Created Date', 'Closed Date']
two_columns_df = df[timestamp_cols]

# last expression of the cell, so it is rendered as the cell output
two_columns_df

Unnamed: 0,Created Date,Closed Date
0,2023-01-15,2023-01-17 00:00:01
1,2023-01-15,2023-01-23 00:00:01
2,2023-01-15,2023-01-17 09:34:08
3,2023-01-15,2023-01-17 09:48:19
4,2023-01-15,2023-01-23 15:08:02
...,...,...
54464,2023-01-22,2023-01-22 04:29:17
54465,2023-01-22,2023-01-23 10:03:08
54466,2023-01-22,2023-01-23 10:05:39
54467,2023-01-22,2023-02-01 00:00:01


In [24]:
# A request should not be closed before it was created; collect the rows that
# violate this and report whether any exist.
closed_before_created = df['Closed Date'] < df['Created Date']
closed_earlier = df[closed_before_created]

if len(closed_earlier) > 0:
    print("There are instances where 'Closed Date' is earlier than 'Created Date'")
else:
    print("No instances where 'Closed Date' is earlier than 'Created Date'")

There are instances where 'Closed Date' is earlier than 'Created Date'


In [25]:
# how many rows have the problem, followed by the rows themselves for inspection
n_closed_earlier = len(closed_earlier)
print(n_closed_earlier)
closed_earlier

134


Unnamed: 0,Created Date,Closed Date,Agency,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,...,Community Board,BBL,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Park Facility Name,Park Borough,Latitude,Longitude
15759,2023-01-17 09:13:00,2023-01-16 09:12:00,DOT,Street Light Condition,Fixture/Luminaire Door Open,Other,10470.0,,,,...,12 BRONX,,BRONX,1026374.0,267996.0,UNKNOWN,Unspecified,BRONX,40.902160,-73.847620
16346,2023-01-17 10:14:00,2023-01-16 09:37:00,DOT,Street Light Condition,Street Light Out,Other,10470.0,,,,...,12 BRONX,,BRONX,1025621.0,268476.0,UNKNOWN,Unspecified,BRONX,40.903481,-73.850341
16374,2023-01-17 10:16:00,2023-01-16 10:15:00,DOT,Street Light Condition,Street Light Out,Other,10470.0,,,,...,12 BRONX,,BRONX,1024368.0,267516.0,UNKNOWN,Unspecified,BRONX,40.900852,-73.854879
16392,2023-01-17 10:18:00,2023-01-16 10:17:00,DOT,Street Light Condition,Fixture/Luminaire Out Of Position,Other,10459.0,,,,...,03 BRONX,,BRONX,1014492.0,242563.0,UNKNOWN,Unspecified,BRONX,40.832403,-73.890717
16544,2023-01-17 10:28:00,2023-01-16 10:27:00,DOT,Street Light Condition,Fixture/Luminaire Hanging,Other,10457.0,,,,...,06 BRONX,,BRONX,1015598.0,247912.0,UNKNOWN,Unspecified,BRONX,40.847081,-73.886695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42922,2023-01-20 11:59:00,2023-01-19 11:58:00,DOT,Street Light Condition,Street Light Out,Other,10461.0,,,,...,11 BRONX,,BRONX,1028773.0,246156.0,UNKNOWN,Unspecified,BRONX,40.842204,-73.839087
43071,2023-01-20 12:02:00,2023-01-19 11:59:00,DOT,Street Light Condition,Street Light Out,Other,10461.0,,,,...,10 BRONX,,BRONX,1031720.0,245301.0,UNKNOWN,Unspecified,BRONX,40.839842,-73.828442
43073,2023-01-20 12:03:00,2023-01-19 12:02:00,DOT,Street Light Condition,Street Light Out,Other,10475.0,,,,...,10 BRONX,,BRONX,1032818.0,257638.0,UNKNOWN,Unspecified,BRONX,40.873698,-73.824384
43414,2023-01-20 12:47:00,2023-01-19 12:42:00,DOT,Street Light Condition,Street Light Dayburning,Other,11224.0,CONEY ISL BEACH BOARDWALK,CONEY ISL BEACH BOARDWALK,,...,13 BROOKLYN,,BROOKLYN,983593.0,147787.0,UNKNOWN,Unspecified,BROOKLYN,40.572317,-74.002365


In [26]:
# collect the index labels of the rows whose closed date precedes the created
# date, then drop those rows
closed_before_created = df['Closed Date'] < df['Created Date']
index_to_drop = df.index[closed_before_created].tolist()
df = df.drop(index=index_to_drop)

Now it's time for geocoding:

In [27]:
# percentage of missing values per column, as the baseline before geocoding
pct_missing = df.isnull().sum() * 100 / len(df)
print(pct_missing)

Created Date                       0.000000
Closed Date                        9.508028
Agency                             0.000000
Complaint Type                     0.000000
Descriptor                         0.000000
Location Type                      0.000000
Incident Zip                       1.347768
Incident Address                   4.625129
Street Name                        4.625129
Cross Street 1                    35.667992
Cross Street 2                    35.642215
Intersection Street 1             40.037193
Intersection Street 2             39.959862
Address Type                       0.000000
City                               4.341582
Landmark                          47.019075
Status                             0.000000
Resolution Description             3.203712
Resolution Action Updated Date     3.119016
Community Board                    0.000000
BBL                               11.216674
Borough                            0.000000
X Coordinate (State Plane)      

In [28]:
# NOTE(review): every line of this cell is commented out, so running it does
# nothing. The timeout log and the missing-value table shown as its output
# presumably come from an earlier run, before the code was disabled.
# If this is re-enabled: both helpers use a bare `except:` that returns None
# for any exception, which hides the reason a lookup failed.
#
# import geopy
# import geocoder
# import pandas as pd

# # create a function to perform geocoding
# def geocode(address):
#     try:
#         location = geocoder.osm(address)
#         return location.lat, location.lng
#     except:
#         return None, None

# # create a function to perform reverse geocoding
# def reverse_geocode(latitude, longitude):
#     try:
#         location = geopy.Point(latitude, longitude)
#         address = geopy.geocoders.Nominatim(user_agent="my-application").reverse(location)
#         return address.address
#     except:
#         return None

# # iterate over the DataFrame and fill in the missing values
# for index, row in df.iterrows():
#     if pd.isnull(row['Latitude']) or pd.isnull(row['Longitude']):
#         lat, lng = geocode(row['Incident Address'])
#         df.at[index, 'Latitude'] = lat
#         df.at[index, 'Longitude'] = lng
#     if pd.isnull(row['Incident Address']):
#         address = reverse_geocode(row['Latitude'], row['Longitude'])
#         df.at[index, 'Incident Address'] = address

# # percent missing for each variable
# print((df.isnull().sum() * 100)/ len(df))

Status code Unknown from https://nominatim.openstreetmap.org/search: ERROR - HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://nominatim.openstreetmap.org/search: ERROR - HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://nominatim.openstreetmap.org/search: ERROR - HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://nominatim.openstreetmap.org/search: ERROR - HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://nominatim.openstreetmap.org/search: ERROR - HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://nominatim.openstreetmap.org/search: ERROR - HTTPSConnectionPool(host='no

Created Date                       0.000000
Closed Date                        9.508028
Agency                             0.000000
Complaint Type                     0.000000
Descriptor                         0.000000
Location Type                      0.000000
Incident Zip                       1.347768
Incident Address                   4.625129
Street Name                        4.625129
Cross Street 1                    35.667992
Cross Street 2                    35.642215
Intersection Street 1             40.037193
Intersection Street 2             39.959862
Address Type                       0.000000
City                               4.341582
Landmark                          47.019075
Status                             0.000000
Resolution Description             3.203712
Resolution Action Updated Date     3.119016
Community Board                    0.000000
BBL                               11.216674
Borough                            0.000000
X Coordinate (State Plane)      