In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline
# Apply seaborn's default theme with font elements scaled by 1.25.
sns.set(font_scale=1.25)
# Seed NumPy's global random number generator.
np.random.seed(5)

# Raise pandas' display limits: show up to 10000 rows and 100 columns before truncating output.
pd.set_option("display.max_rows",10000)
pd.set_option("display.max_columns",100)

In [2]:
# Load the complaints extract into `df`. The path is relative, so the CSV must be in the
# notebook's working directory for this cell to run.
df = pd.read_csv("nyconemil.csv")

In [3]:
df.head()

Unnamed: 0,created_date,unique_key,complaint_type,incident_zip,incident_address,street_name,address_type,city,resolution_description,borough,latitude,longitude,closed_date,location_type,status
0,2020-02-07T13:36:39.000,45567011,HEAT/HOT WATER,10474.0,1202 SPOFFORD AVENUE,SPOFFORD AVENUE,ADDRESS,BRONX,The following complaint conditions are still o...,BRONX,40.813843,-73.890801,,RESIDENTIAL BUILDING,Open
1,2020-02-07T22:28:04.000,45565104,HEAT/HOT WATER,11235.0,50 SHORE BOULEVARD,SHORE BOULEVARD,ADDRESS,BROOKLYN,The following complaint conditions are still o...,BROOKLYN,40.582266,-73.954005,,RESIDENTIAL BUILDING,Open
2,2020-02-07T18:09:26.000,45568046,HEAT/HOT WATER,10037.0,621 LENOX AVENUE,LENOX AVENUE,ADDRESS,NEW YORK,The complaint you filed is a duplicate of a co...,MANHATTAN,40.817995,-73.938032,,RESIDENTIAL BUILDING,Open
3,2020-02-07T19:32:17.000,45569863,APPLIANCE,10034.0,77 SEAMAN AVENUE,SEAMAN AVENUE,ADDRESS,NEW YORK,The following complaint conditions are still o...,MANHATTAN,40.868043,-73.925151,,RESIDENTIAL BUILDING,Open
4,2020-02-07T15:54:48.000,45570034,SAFETY,11207.0,184 SCHAEFER STREET,SCHAEFER STREET,ADDRESS,BROOKLYN,The following complaint conditions are still o...,BROOKLYN,40.689579,-73.907906,,RESIDENTIAL BUILDING,Open


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 15 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   created_date            1000000 non-null  object 
 1   unique_key              1000000 non-null  int64  
 2   complaint_type          1000000 non-null  object 
 3   incident_zip            919319 non-null   float64
 4   incident_address        947169 non-null   object 
 5   street_name             947169 non-null   object 
 6   address_type            922295 non-null   object 
 7   city                    919719 non-null   object 
 8   resolution_description  999577 non-null   object 
 9   borough                 1000000 non-null  object 
 10  latitude                919322 non-null   float64
 11  longitude               919322 non-null   float64
 12  closed_date             984457 non-null   object 
 13  location_type           947170 non-null   object 
 14  sta

In [5]:
df.shape

(1000000, 15)

In [6]:
# Count records per complaint type (value_counts lists the most frequent first).
# NOTE(review): the labels are not normalized. The counts include both 'UNSANITARY CONDITION'
# and 'Unsanitary Condition', 'GENERAL' and 'General', 'SAFETY' and 'Safety', 'ELECTRIC' and
# 'Electric'; 'PAINT - PLASTER' and 'PAINT/PLASTER' also appear to be the same category
# (confirm). Any model using this column as a target will treat each spelling as its own class.
df['complaint_type'].value_counts()

HEATING                   263619
GENERAL CONSTRUCTION      139951
PLUMBING                  132606
HEAT/HOT WATER            115444
PAINT - PLASTER           102381
NONCONST                   75343
HPD Literature Request     52830
ELECTRIC                   47286
APPLIANCE                  19876
UNSANITARY CONDITION       17778
WATER LEAK                  7386
PAINT/PLASTER               7081
DOOR/WINDOW                 6484
GENERAL                     5398
FLOORING/STAIRS             2723
SAFETY                      1838
CONSTRUCTION                1448
ELEVATOR                     341
OUTSIDE BUILDING             143
Unsanitary Condition          30
General                       10
Safety                         2
AGENCY                         1
Electric                       1
Name: complaint_type, dtype: int64

In [7]:
df['borough'].value_counts()

Unspecified      797833
BRONX             65828
BROOKLYN          60858
MANHATTAN         44815
QUEENS            27937
STATEN ISLAND      2729
Name: borough, dtype: int64

In [8]:
df['city'].value_counts()

BROOKLYN               320817
BRONX                  284835
NEW YORK               179880
STATEN ISLAND           15526
JAMAICA                 14119
FLUSHING                 7306
ASTORIA                  7152
RIDGEWOOD                6441
FAR ROCKAWAY             5780
ELMHURST                 4415
CORONA                   3964
WOODSIDE                 3859
JACKSON HEIGHTS          3606
FOREST HILLS             2780
Jamaica                  2677
SOUTH RICHMOND HILL      2412
Elmhurst                 2291
SUNNYSIDE                2231
Flushing                 2228
REGO PARK                2121
QUEENS VILLAGE           2113
RICHMOND HILL            2082
EAST ELMHURST            2075
HOLLIS                   2059
Astoria                  1982
WOODHAVEN                1882
OZONE PARK               1823
SAINT ALBANS             1763
SPRINGFIELD GARDENS      1739
ARVERNE                  1723
SOUTH OZONE PARK         1675
Ridgewood                1437
Woodside                 1436
Far Rockaw

### There are two approaches using a classification ML model

Approach one: label Heat/Hot Water as '1' and all other complaint types as '0' (binary classification).<br>
Approach two: use a multiclass classification model to predict the complaint type.

### Use Bronx as main target area

In [9]:
# Restrict the analysis to rows whose `borough` value is exactly 'BRONX'.
# NOTE(review): `borough` is 'Unspecified' for most rows, while `city` reports many more
# 'BRONX' records than this filter keeps, so Bronx complaints may be missed here.
# Confirm that `borough` is the intended key.
df2 = df.loc[df['borough'].eq('BRONX')]

In [10]:
df2

Unnamed: 0,created_date,unique_key,complaint_type,incident_zip,incident_address,street_name,address_type,city,resolution_description,borough,latitude,longitude,closed_date,location_type,status
0,2020-02-07T13:36:39.000,45567011,HEAT/HOT WATER,10474.0,1202 SPOFFORD AVENUE,SPOFFORD AVENUE,ADDRESS,BRONX,The following complaint conditions are still o...,BRONX,40.813843,-73.890801,,RESIDENTIAL BUILDING,Open
6,2020-02-07T06:01:02.000,45569919,GENERAL,10461.0,3555 BRUCKNER BOULEVARD,BRUCKNER BOULEVARD,ADDRESS,BRONX,The following complaint conditions are still o...,BRONX,40.847809,-73.827481,,RESIDENTIAL BUILDING,Open
8,2020-02-07T13:06:38.000,45567090,HEAT/HOT WATER,10460.0,968 BRONX PARK SOUTH,BRONX PARK SOUTH,ADDRESS,BRONX,The following complaint conditions are still o...,BRONX,40.845066,-73.878848,,RESIDENTIAL BUILDING,Open
10,2020-02-07T09:37:49.000,45567052,HEAT/HOT WATER,10452.0,960 ANDERSON AVENUE,ANDERSON AVENUE,ADDRESS,BRONX,The complaint you filed is a duplicate of a co...,BRONX,40.831403,-73.927819,,RESIDENTIAL BUILDING,Open
19,2020-02-07T19:47:58.000,45566105,HEAT/HOT WATER,10458.0,2746 DECATUR AVENUE,DECATUR AVENUE,ADDRESS,BRONX,The following complaint conditions are still o...,BRONX,40.865685,-73.887842,,RESIDENTIAL BUILDING,Open
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
925582,2010-12-31T00:00:00.000,19505705,GENERAL CONSTRUCTION,10463.0,170 WEST KINGSBRIDGE ROAD,WEST KINGSBRIDGE ROAD,ADDRESS,BRONX,The following complaint conditions are still o...,BRONX,40.870973,-73.904487,,RESIDENTIAL BUILDING,Open
925588,2010-12-31T00:00:00.000,19505711,PLUMBING,10459.0,916 KELLY STREET,KELLY STREET,ADDRESS,BRONX,The following complaint conditions are still o...,BRONX,40.820134,-73.896066,,RESIDENTIAL BUILDING,Open
939851,2011-01-07T00:00:00.000,19551788,HEATING,10458.0,2804 BAINBRIDGE AVENUE,BAINBRIDGE AVENUE,ADDRESS,BRONX,The following complaint conditions are still o...,BRONX,40.868233,-73.888680,,RESIDENTIAL BUILDING,Open
954140,2011-01-13T00:00:00.000,19587224,HEATING,10453.0,2075 MORRIS AVENUE,MORRIS AVENUE,ADDRESS,BRONX,More than one complaint was received for this ...,BRONX,40.853797,-73.905029,,RESIDENTIAL BUILDING,Open


In [11]:
df2.shape

(65828, 15)

In [12]:
df2['complaint_type'].value_counts()

HEAT/HOT WATER          38126
UNSANITARY CONDITION     5498
HEATING                  4328
PLUMBING                 3583
PAINT/PLASTER            2535
WATER LEAK               2471
DOOR/WINDOW              2017
GENERAL                  1600
ELECTRIC                 1571
APPLIANCE                1349
FLOORING/STAIRS           919
GENERAL CONSTRUCTION      559
SAFETY                    491
PAINT - PLASTER           430
NONCONST                  191
ELEVATOR                  104
OUTSIDE BUILDING           33
CONSTRUCTION               14
Unsanitary Condition        6
General                     3
Name: complaint_type, dtype: int64

In [13]:
df2.isnull().sum()

created_date                 0
unique_key                   0
complaint_type               0
incident_zip              8119
incident_address             1
street_name                  1
address_type              7703
city                      8117
resolution_description     112
borough                      0
latitude                  8119
longitude                 8119
closed_date               4172
location_type                0
status                       0
dtype: int64

In [14]:
# Keep only the three columns carried forward: complaint_type, borough and status.
# Selecting the columns to keep (instead of dropping the other twelve by name) yields the same
# frame on the first run and makes the cell safe to re-run: calling `drop` again on labels
# that are already gone raises KeyError.
df2 = df2[['complaint_type', 'borough', 'status']]

In [15]:
df2.head()

Unnamed: 0,complaint_type,borough,status
0,HEAT/HOT WATER,BRONX,Open
6,GENERAL,BRONX,Open
8,HEAT/HOT WATER,BRONX,Open
10,HEAT/HOT WATER,BRONX,Open
19,HEAT/HOT WATER,BRONX,Open


In [16]:
df2.isnull().sum()

complaint_type    0
borough           0
status            0
dtype: int64

In [17]:
df2.shape

(65828, 3)

### Using Approach 1

In [None]:
# Approach 1 (currently disabled): map HEAT/HOT WATER to 1 and the listed complaint types to 0.
# NOTE(review): before re-enabling, the mapping does not cover every label present in the
# Bronx subset. HEATING, GENERAL CONSTRUCTION, PAINT - PLASTER, NONCONST, CONSTRUCTION,
# 'Unsanitary Condition' and 'General' are not listed and would remain as strings.
# `df2.replace` is also applied to the whole frame, not only to `complaint_type`.
# df3 = df2.replace({'HEAT/HOT WATER':1,'UNSANITARY CONDITION':0,'PLUMBING':0,'PAINT/PLASTER':0,
#              'WATER LEAK':0,'DOOR/WINDOW':0,'GENERAL':0,'ELECTRIC':0,'APPLIANCE':0,
#              'FLOORING/STAIRS':0,'SAFETY':0,'ELEVATOR':0,'OUTSIDE BUILDING':0,})

In [None]:
# df3['complaint_type'].value_counts()

In [None]:
# df3

In [None]:
# df3.isnull().sum()

In [None]:
#Save to csv
#df3.to_csv("nycbronx1.csv",index=False)

### Using Approach 2

In [19]:
df2.isnull().sum()

complaint_type    0
borough           0
status            0
dtype: int64

In [20]:
#Save to csv
#df2.to_csv("nycbronxonemil.csv",index=False)