### Data Dictionary

The goal of this exercise is to carry out model development and validation in order to answer Question 4 of the problem statement:

Can a predictive model be built for future prediction of the possibility of complaints of the specific type that you identified in response to Question 1?

Using the best model, you need to predict the number of future complaints (of the Complaint Type that you decided to focus on in Question 1).

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

%matplotlib inline
# Configure the seaborn theme in a single call: darkgrid style, fonts scaled 1.5x.
sns.set(style='darkgrid', font_scale=1.5)


import feature_engine.missing_data_imputers as mdi
from feature_engine.outlier_removers import Winsorizer

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas warnings (for example about
# in-place edits on filtered frames) are hidden as well.
warnings.filterwarnings('ignore')


# Display all columns when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns= None
# Row truncation stays at the pandas default; uncomment to render every row.
#pd.options.display.max_rows = None

### Data Exploration

In [2]:
# Load the complaint records from heat.csv (path is relative to the working directory).
df = pd.read_csv("heat.csv")

In [3]:
df

Unnamed: 0,ComplaintType,Zipcode,Street,Borough
0,HEAT/HOT WATER,10019.0,WEST 52 STREET,MANHATTAN
1,HEAT/HOT WATER,11372.0,37 AVENUE,QUEENS
2,HEAT/HOT WATER,10458.0,SOUTHERN BOULEVARD,BRONX
3,HEAT/HOT WATER,10456.0,MORRIS AVENUE,BRONX
4,HEAT/HOT WATER,11372.0,81 STREET,QUEENS
...,...,...,...,...
1847750,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847751,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847752,HEAT/HOT WATER,10461.0,BRUCKNER BOULEVARD,BRONX
1847753,HEAT/HOT WATER,10034.0,SHERMAN AVENUE,MANHATTAN


In [4]:
# Discard ComplaintType and Zipcode so that only Street and Borough remain.
# The frame is rebound instead of mutated in place, and errors='ignore' makes the
# cell idempotent: re-running it after the columns are gone is a no-op rather
# than a KeyError.
df = df.drop(columns=['ComplaintType', 'Zipcode'], errors='ignore')

In [5]:
df.columns

Index(['Street', 'Borough'], dtype='object')

### Data Preprocessing

### Segment Heat/Hot Water Cases for STATEN ISLAND

In [6]:
# Number of complaint rows per borough, most frequent first.
df['Borough'].value_counts()

BRONX            600147
BROOKLYN         569310
MANHATTAN        418432
QUEENS           241660
STATEN ISLAND     18206
Name: Borough, dtype: int64

In [7]:
# Select the STATEN ISLAND complaint rows; the original row labels are kept.
staten = df.loc[df['Borough'].eq('STATEN ISLAND')]

In [8]:
staten

Unnamed: 0,Street,Borough
230,BLOOMINGDALE ROAD,STATEN ISLAND
283,BOND STREET,STATEN ISLAND
288,VAN DUZER STREET,STATEN ISLAND
389,SHERADEN AVENUE,STATEN ISLAND
470,JERSEY STREET,STATEN ISLAND
...,...,...
1847544,PROSPECT AVENUE,STATEN ISLAND
1847563,ELVIN STREET,STATEN ISLAND
1847590,AMBOY ROAD,STATEN ISLAND
1847673,GADSEN PLACE,STATEN ISLAND


In [9]:
# Renumber the rows 0..n-1 and discard the old labels.
# Rebind rather than reset in place: `staten` is a filtered selection of `df`,
# and in-place edits on such selections can raise SettingWithCopyWarning on
# some pandas versions (silenced here by the global warnings filter).
staten = staten.reset_index(drop=True)

In [10]:
staten

Unnamed: 0,Street,Borough
0,BLOOMINGDALE ROAD,STATEN ISLAND
1,BOND STREET,STATEN ISLAND
2,VAN DUZER STREET,STATEN ISLAND
3,SHERADEN AVENUE,STATEN ISLAND
4,JERSEY STREET,STATEN ISLAND
...,...,...
18201,PROSPECT AVENUE,STATEN ISLAND
18202,ELVIN STREET,STATEN ISLAND
18203,AMBOY ROAD,STATEN ISLAND
18204,GADSEN PLACE,STATEN ISLAND


In [11]:
# Load the building-feature table; it carries a numeric ComplaintType column
# that the next step filters on.
df2 = pd.read_csv("statenclasstrain.csv")

In [12]:
df2

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,1504.0,34.0,0.42,1.0,1.0,3600.0,90.00,1.0,2.0,1915.0,1504.0,0.5,1681.0,80.0,95.0,1
1,1200.0,32.0,0.43,1.0,1.0,2821.0,90.00,1.0,2.0,1915.0,1200.0,0.6,1681.0,40.0,55.0,4
2,4688.0,61.0,0.34,1.0,1.0,12660.0,107.67,1.0,1.0,1915.0,1500.0,0.6,1681.0,8.0,33.0,1
3,1360.0,40.0,0.54,1.0,1.0,2500.0,100.00,1.0,2.0,1915.0,1360.0,0.6,1681.0,1.0,16.0,0
4,2248.0,48.0,0.98,1.0,1.0,2287.0,94.44,1.0,2.0,1915.0,2248.0,0.6,1681.0,22.0,37.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87182,1768.0,61.0,0.61,1.0,1.0,2875.0,125.00,1.0,2.0,1915.0,1768.0,0.6,1681.0,25.0,40.0,0
87183,1600.0,34.0,0.49,1.0,1.0,700.0,35.00,1.0,1.0,1915.0,1500.0,0.6,1681.0,75.0,90.0,0
87184,1200.0,24.0,1.39,1.0,1.0,620.0,29.83,1.0,2.0,1915.0,1200.0,0.6,1681.0,17.0,32.0,0
87185,4342.0,38.0,0.90,1.0,1.0,4800.0,60.00,1.0,2.0,1915.0,4228.0,0.5,1681.0,2.0,17.0,0


In [13]:
# Keep the rows whose ComplaintType label is 0; original row labels are kept.
# NOTE(review): presumably 0 is the encoded HEAT/HOT WATER class - confirm
# against the label encoding used when statenclasstrain.csv was built.
heat = df2.loc[df2['ComplaintType'].eq(0)]

In [14]:
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
3,1360.0,40.0,0.54,1.0,1.0,2500.0,100.00,1.0,2.0,1915.0,1360.0,0.6,1681.0,1.0,16.0,0
4,2248.0,48.0,0.98,1.0,1.0,2287.0,94.44,1.0,2.0,1915.0,2248.0,0.6,1681.0,22.0,37.0,0
5,1350.0,30.0,0.60,1.0,1.0,2262.0,87.00,1.0,2.0,1915.0,1350.0,0.6,1681.0,20.0,35.0,0
7,1684.0,28.0,0.42,1.0,1.0,4000.0,100.00,1.0,2.0,1915.0,1684.0,0.6,1681.0,70.0,85.0,0
8,1440.0,44.5,0.56,1.0,1.0,2575.0,103.00,1.0,2.0,1915.0,1440.0,0.6,1681.0,30.0,45.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87182,1768.0,61.0,0.61,1.0,1.0,2875.0,125.00,1.0,2.0,1915.0,1768.0,0.6,1681.0,25.0,40.0,0
87183,1600.0,34.0,0.49,1.0,1.0,700.0,35.00,1.0,1.0,1915.0,1500.0,0.6,1681.0,75.0,90.0,0
87184,1200.0,24.0,1.39,1.0,1.0,620.0,29.83,1.0,2.0,1915.0,1200.0,0.6,1681.0,17.0,32.0,0
87185,4342.0,38.0,0.90,1.0,1.0,4800.0,60.00,1.0,2.0,1915.0,4228.0,0.5,1681.0,2.0,17.0,0


In [15]:
# Renumber the rows 0..n-1 and discard the old labels.
# Rebind rather than reset in place: `heat` is a filtered selection of `df2`,
# and in-place edits on such selections can raise SettingWithCopyWarning on
# some pandas versions (silenced here by the global warnings filter).
heat = heat.reset_index(drop=True)

In [16]:
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,1360.0,40.0,0.54,1.0,1.0,2500.0,100.00,1.0,2.0,1915.0,1360.0,0.6,1681.0,1.0,16.0,0
1,2248.0,48.0,0.98,1.0,1.0,2287.0,94.44,1.0,2.0,1915.0,2248.0,0.6,1681.0,22.0,37.0,0
2,1350.0,30.0,0.60,1.0,1.0,2262.0,87.00,1.0,2.0,1915.0,1350.0,0.6,1681.0,20.0,35.0,0
3,1684.0,28.0,0.42,1.0,1.0,4000.0,100.00,1.0,2.0,1915.0,1684.0,0.6,1681.0,70.0,85.0,0
4,1440.0,44.5,0.56,1.0,1.0,2575.0,103.00,1.0,2.0,1915.0,1440.0,0.6,1681.0,30.0,45.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18201,1768.0,61.0,0.61,1.0,1.0,2875.0,125.00,1.0,2.0,1915.0,1768.0,0.6,1681.0,25.0,40.0,0
18202,1600.0,34.0,0.49,1.0,1.0,700.0,35.00,1.0,1.0,1915.0,1500.0,0.6,1681.0,75.0,90.0,0
18203,1200.0,24.0,1.39,1.0,1.0,620.0,29.83,1.0,2.0,1915.0,1200.0,0.6,1681.0,17.0,32.0,0
18204,4342.0,38.0,0.90,1.0,1.0,4800.0,60.00,1.0,2.0,1915.0,4228.0,0.5,1681.0,2.0,17.0,0


In [17]:
# (rows, columns) of the class-0 subset; the row count is what the later
# column-wise concat with `staten` relies on.
heat.shape

(18206, 16)

In [18]:
# Place the building features (heat) and the Street/Borough columns (staten)
# side by side.
# pd.concat(axis=1) aligns rows on index labels, so two frames with different
# indexes would silently yield a NaN-padded union. Fail loudly instead.
assert heat.index.equals(staten.index), (
    "heat and staten must share the same index before a column-wise concat; "
    f"got {len(heat)} and {len(staten)} rows"
)
# NOTE(review): rows are paired purely by position - there is no shared key
# showing that row i of `heat` describes the building behind row i of
# `staten`. Confirm this pairing is intended.
df3 = pd.concat([heat, staten], axis=1)

In [19]:
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType,Street,Borough
0,1360.0,40.0,0.54,1.0,1.0,2500.0,100.00,1.0,2.0,1915.0,1360.0,0.6,1681.0,1.0,16.0,0,BLOOMINGDALE ROAD,STATEN ISLAND
1,2248.0,48.0,0.98,1.0,1.0,2287.0,94.44,1.0,2.0,1915.0,2248.0,0.6,1681.0,22.0,37.0,0,BOND STREET,STATEN ISLAND
2,1350.0,30.0,0.60,1.0,1.0,2262.0,87.00,1.0,2.0,1915.0,1350.0,0.6,1681.0,20.0,35.0,0,VAN DUZER STREET,STATEN ISLAND
3,1684.0,28.0,0.42,1.0,1.0,4000.0,100.00,1.0,2.0,1915.0,1684.0,0.6,1681.0,70.0,85.0,0,SHERADEN AVENUE,STATEN ISLAND
4,1440.0,44.5,0.56,1.0,1.0,2575.0,103.00,1.0,2.0,1915.0,1440.0,0.6,1681.0,30.0,45.0,0,JERSEY STREET,STATEN ISLAND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18201,1768.0,61.0,0.61,1.0,1.0,2875.0,125.00,1.0,2.0,1915.0,1768.0,0.6,1681.0,25.0,40.0,0,PROSPECT AVENUE,STATEN ISLAND
18202,1600.0,34.0,0.49,1.0,1.0,700.0,35.00,1.0,1.0,1915.0,1500.0,0.6,1681.0,75.0,90.0,0,ELVIN STREET,STATEN ISLAND
18203,1200.0,24.0,1.39,1.0,1.0,620.0,29.83,1.0,2.0,1915.0,1200.0,0.6,1681.0,17.0,32.0,0,AMBOY ROAD,STATEN ISLAND
18204,4342.0,38.0,0.90,1.0,1.0,4800.0,60.00,1.0,2.0,1915.0,4228.0,0.5,1681.0,2.0,17.0,0,GADSEN PLACE,STATEN ISLAND


In [20]:
df3.columns

Index(['BldgArea', 'BldgDepth', 'BuiltFAR', 'CommFAR', 'FacilFAR', 'LotArea',
       'LotDepth', 'NumBldgs', 'NumFloors', 'OfficeArea', 'ResArea',
       'ResidFAR', 'RetailArea', 'Age', 'Period', 'ComplaintType', 'Street',
       'Borough'],
      dtype='object')

In [21]:
# Remove the label column and the street name, leaving the numeric building
# features plus Borough.
# The frame is rebound instead of mutated in place, and errors='ignore' makes the
# cell idempotent: re-running it after the columns are gone is a no-op rather
# than a KeyError.
df3 = df3.drop(columns=['ComplaintType', 'Street'], errors='ignore')

In [22]:
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,1360.0,40.0,0.54,1.0,1.0,2500.0,100.00,1.0,2.0,1915.0,1360.0,0.6,1681.0,1.0,16.0,STATEN ISLAND
1,2248.0,48.0,0.98,1.0,1.0,2287.0,94.44,1.0,2.0,1915.0,2248.0,0.6,1681.0,22.0,37.0,STATEN ISLAND
2,1350.0,30.0,0.60,1.0,1.0,2262.0,87.00,1.0,2.0,1915.0,1350.0,0.6,1681.0,20.0,35.0,STATEN ISLAND
3,1684.0,28.0,0.42,1.0,1.0,4000.0,100.00,1.0,2.0,1915.0,1684.0,0.6,1681.0,70.0,85.0,STATEN ISLAND
4,1440.0,44.5,0.56,1.0,1.0,2575.0,103.00,1.0,2.0,1915.0,1440.0,0.6,1681.0,30.0,45.0,STATEN ISLAND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18201,1768.0,61.0,0.61,1.0,1.0,2875.0,125.00,1.0,2.0,1915.0,1768.0,0.6,1681.0,25.0,40.0,STATEN ISLAND
18202,1600.0,34.0,0.49,1.0,1.0,700.0,35.00,1.0,1.0,1915.0,1500.0,0.6,1681.0,75.0,90.0,STATEN ISLAND
18203,1200.0,24.0,1.39,1.0,1.0,620.0,29.83,1.0,2.0,1915.0,1200.0,0.6,1681.0,17.0,32.0,STATEN ISLAND
18204,4342.0,38.0,0.90,1.0,1.0,4800.0,60.00,1.0,2.0,1915.0,4228.0,0.5,1681.0,2.0,17.0,STATEN ISLAND


In [23]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df3.duplicated(keep='first').sum()

2200

In [24]:
# Keep the first occurrence of each distinct row and renumber the result 0..n-1.
# NOTE(review): each dropped row was a separate complaint record; if the goal
# is to predict complaint counts, confirm that collapsing them is intended.
df3 = df3.drop_duplicates(ignore_index=True)

In [25]:
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,1360.0,40.0,0.54,1.0,1.0,2500.0,100.00,1.0,2.0,1915.0,1360.0,0.6,1681.0,1.0,16.0,STATEN ISLAND
1,2248.0,48.0,0.98,1.0,1.0,2287.0,94.44,1.0,2.0,1915.0,2248.0,0.6,1681.0,22.0,37.0,STATEN ISLAND
2,1350.0,30.0,0.60,1.0,1.0,2262.0,87.00,1.0,2.0,1915.0,1350.0,0.6,1681.0,20.0,35.0,STATEN ISLAND
3,1684.0,28.0,0.42,1.0,1.0,4000.0,100.00,1.0,2.0,1915.0,1684.0,0.6,1681.0,70.0,85.0,STATEN ISLAND
4,1440.0,44.5,0.56,1.0,1.0,2575.0,103.00,1.0,2.0,1915.0,1440.0,0.6,1681.0,30.0,45.0,STATEN ISLAND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16001,2392.0,46.0,0.61,1.0,1.0,3935.0,95.00,1.0,2.0,1915.0,2392.0,0.5,1681.0,35.0,60.0,STATEN ISLAND
16002,1768.0,61.0,0.61,1.0,1.0,2875.0,125.00,1.0,2.0,1915.0,1768.0,0.6,1681.0,25.0,40.0,STATEN ISLAND
16003,1600.0,34.0,0.49,1.0,1.0,700.0,35.00,1.0,1.0,1915.0,1500.0,0.6,1681.0,75.0,90.0,STATEN ISLAND
16004,4342.0,38.0,0.90,1.0,1.0,4800.0,60.00,1.0,2.0,1915.0,4228.0,0.5,1681.0,2.0,17.0,STATEN ISLAND


In [26]:
# Save the de-duplicated frame to CSV without the index column.
# The write is disabled; uncomment the line below to produce statenheat.csv.
#df3.to_csv("statenheat.csv",index=False)