In [1]:
import pandas as pd
import requests as rq

In [2]:
#nyc complains data: 
complains_url="https://data.cityofnewyork.us/api/views/erm2-nwe9/rows.csv"
#complains_url="data/2017_small.csv"
# population by zip: 
pop_byzip_url = "https://data.cityofnewyork.us/resource/rreq-n6zk.csv"
#pop_byzip_url = "data/pop_by_zip.csv"

pop_byborough_url = "https://data.cityofnewyork.us/api/views/h2bk-zmw6/rows.csv?accessType=DOWNLOAD"
chunksize = 100000
year=2017

#### for some reason pandas ignores nrows() when reading from URL so we use bash magic to expolore schema

In [3]:
%%bash -s "$complains_url" "$pop_byzip_url" "$pop_byborough_url"
echo "++++++++++++++++++++++++++++++ COMPLAINS ++++++++++++++++++++++++++++++++"
wget -qO- $1 | head -2

echo "++++++++++++++++++++++++ POPULATION BY ZIP CODES ++++++++++++++++++++++++"
wget -qO- $2 | head -2

echo "+++++++++++++++++++++++++ POPULATION BY BOROUGH +++++++++++++++++++++++++"
wget -qO- $3 | head -2

++++++++++++++++++++++++++++++ COMPLAINS ++++++++++++++++++++++++++++++++
Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Description,Resolution Action Updated Date,Community Board,BBL,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Park Facility Name,Park Borough,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
31317560,08/15/2015 11:42:16 PM,08/16/2015 01:12:47 AM,NYPD,New York City Police Department,Noise - Commercial,Loud Music/Party,Club/Bar/Restaurant,10013,134 READE STREET,READE STREET,HUDSON STREET,GREENWICH STREET,,,ADDRESS,NEW YORK,,Precinct,Closed,08/16/2015 07:42:16 AM,The Police Department responded

### Data loading, transforming & exploration

In [None]:
# Filter by year of interest: 2017 and remove 'Borough' == "Unspecified"
# We read in chunks to keep memory consumption controlled iterator=True, 
iter_df=pd.read_csv(complains_url, skipinitialspace=True, usecols=['Unique Key','Created Date','Borough','Zip','City','Complaint Type'], chunksize=chunksize)
#df = pd.concat([chunk[(pd.to_datetime(chunk['Created Date']).dt.year == year) & (chunk['Borough'] != "Unspecified")] for chunk in iter_df])
chunks = []
for chunk in iter_df:
    chunks.append(chunk[(pd.to_datetime(chunk['Created Date']).dt.year == year) & (chunk['Borough'] != "Unspecified")])

df = pd.concat(chunks, axis=0, ignore_index=True)

In [10]:
df.head(n=5)

Unnamed: 0,Unique Key,Created Date,Borough,Zip,City,Complaint Type
1,35139948,2017-01-01 00:00:59,BRONX,10467.0,BRONX,HEAT/HOT WATER
2,35140478,2017-01-01 00:01:43,QUEENS,11368.0,CORONA,Blocked Driveway
3,35138317,2017-01-01 00:02:54,BROOKLYN,11209.0,BROOKLYN,Noise - Residential
4,35139300,2017-01-01 00:03:41,MANHATTAN,10040.0,NEW YORK,Noise - Residential
5,35143952,2017-01-01 00:03:42,BRONX,10470.0,BRONX,HEAT/HOT WATER


In [None]:
df.to_csv("data/nyc_311_2017.csv", index=False)

In [17]:
df.dtypes

Unique Key          int64
Created Date       object
Borough            object
Zip               float64
City               object
Complaint Type     object
dtype: object

In [15]:
df.describe(include = 'all')

Unnamed: 0,Unique Key,Created Date,Borough,Zip,City,Complaint Type
count,999.0,999,999,995.0,995,999
unique,,942,6,,40,44
top,,2017-01-01 00:56:16,BROOKLYN,,BROOKLYN,Noise - Residential
freq,,8,316,,316,561
mean,35142850.0,,,10797.300503,,
std,3365.97,,,552.737019,,
min,35136920.0,,,10001.0,,
25%,35140060.0,,,10306.0,,
50%,35143020.0,,,11203.0,,
75%,35145780.0,,,11233.0,,


In [24]:
df.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 1 to 999
Data columns (total 6 columns):
Unique Key        999 non-null int64
Created Date      999 non-null object
Borough           999 non-null object
Zip               995 non-null float64
City              995 non-null object
Complaint Type    999 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 54.6+ KB


In [20]:
df_borough = df[['Borough']].drop_duplicates()
df_borough

Unnamed: 0,Borough
1,BRONX
2,QUEENS
3,BROOKLYN
4,MANHATTAN
9,STATEN ISLAND
132,Unspecified


In [28]:
df.groupby(['Borough'])['Borough'].count().reset_index(name='cnt').sort_values(by='cnt', ascending=False)

Unnamed: 0,Borough,cnt
1,BROOKLYN,316
2,MANHATTAN,239
3,QUEENS,218
0,BRONX,197
4,STATEN ISLAND,26
5,Unspecified,3


In [34]:
df_compl_type = df[['Complaint Type']].drop_duplicates()
df_compl_type

Unnamed: 0,Complaint Type
1,HEAT/HOT WATER
2,Blocked Driveway
3,Noise - Residential
6,Traffic Signal Condition
12,Noise - Commercial
22,Noise - Street/Sidewalk
39,Illegal Parking
40,Drug Activity
51,Homeless Person Assistance
55,Street Light Condition


In [43]:
# Borough == Unspecified is less then 0.5% of all records so we are going to drop those records from the DataFrame
# This also can be done at a loading time (see above)
df = df[df['Borough'].map(lambda x: str(x) != "Unspecified")]
df.count()

Unique Key        996
Created Date      996
Borough           996
Zip               995
City              995
Complaint Type    996
dtype: int64

In [33]:
df.groupby(['Borough'])['Borough'].count().reset_index(name='cnt').sort_values(by='cnt', ascending=False)

Unnamed: 0,Borough,cnt
1,BROOKLYN,316
2,MANHATTAN,239
3,QUEENS,218
0,BRONX,197
4,STATEN ISLAND,26


In [49]:
# There are few complains where City or/and Zip is Nan:
city_null = df['City'].isnull()
zip_null = df['Zip'].isnull()

# Select these cases:
df[zip_null | city_null]

Unnamed: 0,Unique Key,Created Date,Borough,Zip,City,Complaint Type
110,35143285,2017-01-01 00:34:00,MANHATTAN,,,Street Light Condition


In [55]:
df = df[df['Zip'].notnull() | df['City'].notnull()]
df.count()

Unique Key        995
Created Date      995
Borough           995
Zip               995
City              995
Complaint Type    995
dtype: int64

### Assessment part

#### 1. Consider only the 10 most common overall complaint types. For each borough, how many of each of those 10 types were there in 2017?

In [58]:
# First top 10 complains types:
top10_complains = df.groupby(['Complaint Type'])['Complaint Type'].count().reset_index(name='cnt').sort_values(by='cnt', ascending=False).head(n=10)
top10_complains

Unnamed: 0,Complaint Type,cnt
27,Noise - Residential,559
24,Noise - Commercial,103
2,Blocked Driveway,65
18,HEAT/HOT WATER,59
28,Noise - Street/Sidewalk,54
21,Illegal Parking,40
35,Street Condition,19
29,Noise - Vehicle,10
38,Taxi Complaint,8
41,UNSANITARY CONDITION,6


In [62]:
all_merged_with_top10 = pd.merge(top10_complains, df, on='Complaint Type')

In [80]:
# For each borough, how many of each of those 10 types were there in 2017?
top10_borough_complains = all_merged_with_top10.groupby(['Borough','Complaint Type'])['Complaint Type'].count().reset_index(name='cnt').sort_values(by=['Borough','cnt'], ascending=[True, False])
top10_borough_complains

Unnamed: 0,Borough,Complaint Type,cnt
4,BRONX,Noise - Residential,129
1,BRONX,HEAT/HOT WATER,21
0,BRONX,Blocked Driveway,17
2,BRONX,Illegal Parking,7
3,BRONX,Noise - Commercial,2
5,BRONX,Noise - Street/Sidewalk,2
6,BRONX,Noise - Vehicle,2
7,BRONX,UNSANITARY CONDITION,1
12,BROOKLYN,Noise - Residential,152
11,BROOKLYN,Noise - Commercial,53


In [81]:
# top 10 complains by each borough (this contains exaclt 10 rows per borough):
top10_complains_by_borough = df.groupby(['Borough','Complaint Type'])['Complaint Type'].count().groupby(level='Borough').nlargest(10).reset_index(level=0, drop=True)
top10_complains_by_borough

Borough        Complaint Type            
BRONX          Noise - Residential           129
               HEAT/HOT WATER                 21
               Blocked Driveway               17
               Illegal Parking                 7
               Illegal Animal Kept as Pet      4
               Drug Activity                   2
               GENERAL                         2
               Noise - Commercial              2
               Noise - Street/Sidewalk         2
               Noise - Vehicle                 2
BROOKLYN       Noise - Residential           152
               Noise - Commercial             53
               HEAT/HOT WATER                 23
               Noise - Street/Sidewalk        22
               Blocked Driveway               17
               Illegal Parking                14
               Street Condition               10
               UNSANITARY CONDITION            4
               Noise - Vehicle                 3
               Traffic Sign

#### 2. Consider only the 10 most common overall complaint types. For the 10 most populous zip codes, how many of each of those 10 types were there in 2017?


In [85]:
#1. Find top 10 mst populous zip codes
top10_zip_pop = pd.read_csv(pop_byzip_url, skipinitialspace=True, usecols=['jurisdiction_name','count_participants']).sort_values(by=['count_participants'], ascending=[False]).head(10)
top10_zip_pop

Unnamed: 0,count_participants,jurisdiction_name
232,272,12789
218,252,12734
130,248,11230
228,242,12779
120,214,11219
229,201,12783
222,134,12754
210,124,12428
119,111,11218
124,109,11223


In [120]:
t_df = pd.merge(top10_zip_pop, df, left_on="jurisdiction_name", right_on='Zip')

In [127]:
top10_complains_by_zip = t_df.groupby(['Zip','Complaint Type'])['Complaint Type'].count().groupby(level='Zip').nlargest().reset_index(level=0, drop=True)
top10_complains_by_zip

Zip      Complaint Type         
11218.0  Noise - Residential        2
         Taxi Complaint             1
11219.0  Noise - Residential        3
         Noise - Commercial         1
11223.0  Noise - Residential        5
         Noise - Commercial         1
         Noise - Street/Sidewalk    1
         Smoking                    1
11230.0  Noise - Residential        2
Name: Complaint Type, dtype: int64

#### 3. Considering all complaint types. Which boroughs are the biggest "complainers" relative to the size of the population in 2017 Meaning, calculate a complaint-index that adjusts for population of the borough

In [184]:
borough_popul = pd.read_csv(pop_byborough_url, skipinitialspace=True).apply(lambda x: x.astype(str).str.upper())
borough_popul

Unnamed: 0,Borough,Population
0,BROOKLYN,4970026
1,MANHATTAN,3123068
2,BRONX,2717758
3,QUEENS,4460101
4,STATEN ISLAND,912458


In [185]:
borough_compl = df.groupby(['Borough'])['Borough'].count().reset_index(name='complains')
borough_compl

Unnamed: 0,Borough,complains
0,BRONX,197
1,BROOKLYN,316
2,MANHATTAN,238
3,QUEENS,218
4,STATEN ISLAND,26


In [181]:
borough_compl_per_popul_unit = pd.merge(borough_compl, borough_popul, on="Borough")
borough_compl_per_popul_unit['compl_per_1m_popul'] = 1000000*borough_compl_per_popul_unit['complains']/borough_compl_per_popul_unit['Population']
borough_compl_per_popul_unit = borough_compl_per_popul_unit.sort_values(by=['compl_per_1m_popul'], ascending=[False])
borough_compl_per_popul_unit

Unnamed: 0,Borough,complains,Population,compl_per_1m_popul
2,MANHATTAN,238,3123068,76.207114
0,BRONX,197,2717758,72.486218
1,BROOKLYN,316,4970026,63.581156
3,QUEENS,218,4460101,48.877817
4,STATEN ISLAND,26,912458,28.494462
