### Data Dictionary

The goal of this exercise is to develop and validate a model that answers Question 4 of the problem statement:

Can a predictive model be built for future prediction of the possibility of complaints of the specific type that you identified in response to Question 1?

Using the best model, you need to predict the number of future complaints (of the Complaint Type that you decided to focus on in Question 1).

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seaborn styling for any plots drawn in this notebook: dark grid background
# and fonts scaled to 1.5x.
sns.set_style('darkgrid')
sns.set(font_scale=1.5)


import feature_engine.missing_data_imputers as mdi
from feature_engine.outlier_removers import Winsorizer

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. about
# in-place edits on filtered frames) — consider narrowing it.
warnings.filterwarnings('ignore')


# Show every column when a wide DataFrame is displayed; the row limit is left
# at the pandas default.
pd.options.display.max_columns= None
#pd.options.display.max_rows = None

### Data Exploration

In [2]:
# Load the complaint records; the path is relative to the notebook's working directory.
df = pd.read_csv("heat.csv")

In [3]:
# Preview the complaints loaded from heat.csv (the notebook shows a truncated head/tail view).
df

Unnamed: 0,ComplaintType,Zipcode,Street,Borough
0,HEAT/HOT WATER,10019.0,WEST 52 STREET,MANHATTAN
1,HEAT/HOT WATER,11372.0,37 AVENUE,QUEENS
2,HEAT/HOT WATER,10458.0,SOUTHERN BOULEVARD,BRONX
3,HEAT/HOT WATER,10456.0,MORRIS AVENUE,BRONX
4,HEAT/HOT WATER,11372.0,81 STREET,QUEENS
...,...,...,...,...
1847750,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847751,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847752,HEAT/HOT WATER,10461.0,BRUCKNER BOULEVARD,BRONX
1847753,HEAT/HOT WATER,10034.0,SHERMAN AVENUE,MANHATTAN


In [4]:
# Discard the ComplaintType and Zipcode columns, leaving Street and Borough.
# Reassigning (rather than mutating in place) keeps the step explicit.
df = df.drop(columns=['ComplaintType', 'Zipcode'])

In [5]:
# Confirm which columns remain after the drop.
df.columns

Index(['Street', 'Borough'], dtype='object')

### Data Preprocessing

### Segment Heat/Hot Water Cases for QUEENS

In [6]:
# Count the complaint rows per borough to see how large the Queens subset will be.
df['Borough'].value_counts()

BRONX            600147
BROOKLYN         569310
MANHATTAN        418432
QUEENS           241660
STATEN ISLAND     18206
Name: Borough, dtype: int64

In [7]:
# Keep only the complaint rows whose Borough is QUEENS.
queen = df.loc[df['Borough'].eq('QUEENS')]

In [8]:
# Inspect the Queens subset; it still carries the row labels of the full frame.
queen

Unnamed: 0,Street,Borough
1,37 AVENUE,QUEENS
4,81 STREET,QUEENS
27,SENECA AVENUE,QUEENS
29,ELMHURST AVENUE,QUEENS
33,49 STREET,QUEENS
...,...,...
1847635,48 STREET,QUEENS
1847681,PERSHING CRESCENT,QUEENS
1847691,48 STREET,QUEENS
1847733,205 PLACE,QUEENS


In [9]:
# Renumber the Queens rows from 0 and discard the old row labels.
queen = queen.reset_index(drop=True)

In [10]:
# Re-check the Queens subset now that its index has been renumbered.
queen

Unnamed: 0,Street,Borough
0,37 AVENUE,QUEENS
1,81 STREET,QUEENS
2,SENECA AVENUE,QUEENS
3,ELMHURST AVENUE,QUEENS
4,49 STREET,QUEENS
...,...,...
241655,48 STREET,QUEENS
241656,PERSHING CRESCENT,QUEENS
241657,48 STREET,QUEENS
241658,205 PLACE,QUEENS


In [11]:
# Load the second table (building attributes plus an integer ComplaintType code);
# the path is relative to the notebook's working directory.
df2 = pd.read_csv("queensclasstrain.csv")

In [12]:
# Preview the table loaded from queensclasstrain.csv.
df2

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,0
1,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,4
2,5569.0,94.0,2.16,2.0,5.0,9580.0,100.0,1.0,4.0,1680.0,1600.0,1.8,1500.0,0.0,9.0,3
3,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,1
4,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324578,1760.0,39.0,0.61,2.0,2.0,5700.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0
324579,1760.0,39.0,0.61,2.0,2.0,7650.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,1
324580,1760.0,39.0,0.61,2.0,2.0,6200.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,4
324581,5569.0,40.0,0.13,2.0,2.0,9580.0,100.0,5.0,1.0,1680.0,4584.0,0.9,1500.0,68.0,82.0,2


In [13]:
# Keep only the rows whose ComplaintType code is 0.
# NOTE(review): presumably 0 is the encoded label for HEAT/HOT WATER — confirm
# against the encoding used when queensclasstrain.csv was produced.
heat = df2[df2['ComplaintType'] == 0]

In [14]:
# Inspect the ComplaintType == 0 subset; the original row labels are retained.
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,0
9,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,0
12,5569.0,70.0,2.16,2.0,5.0,9580.0,100.0,2.0,4.0,1680.0,4584.0,1.8,1500.0,-7.0,7.0,0
13,5569.0,94.0,2.16,2.0,5.0,9580.0,100.0,1.0,4.0,1680.0,4584.0,1.8,1500.0,0.0,7.0,0
17,5569.0,75.0,0.34,2.0,5.0,9580.0,100.0,5.0,2.0,1680.0,1600.0,0.6,1500.0,48.0,62.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324574,1760.0,39.0,0.61,2.0,2.0,1350.0,50.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0
324575,1760.0,39.0,0.61,2.0,2.0,7000.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0
324576,1760.0,39.0,0.61,2.0,2.0,6000.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0
324577,1760.0,39.0,0.61,2.0,2.0,5100.0,90.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0


In [15]:
# Renumber the filtered rows from 0 and discard the old row labels.
heat = heat.reset_index(drop=True)

In [16]:
# Re-check the subset now that its index has been renumbered.
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,0
1,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,0
2,5569.0,70.0,2.16,2.0,5.0,9580.0,100.0,2.0,4.0,1680.0,4584.0,1.8,1500.0,-7.0,7.0,0
3,5569.0,94.0,2.16,2.0,5.0,9580.0,100.0,1.0,4.0,1680.0,4584.0,1.8,1500.0,0.0,7.0,0
4,5569.0,75.0,0.34,2.0,5.0,9580.0,100.0,5.0,2.0,1680.0,1600.0,0.6,1500.0,48.0,62.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122511,1760.0,39.0,0.61,2.0,2.0,1350.0,50.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0
122512,1760.0,39.0,0.61,2.0,2.0,7000.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0
122513,1760.0,39.0,0.61,2.0,2.0,6000.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0
122514,1760.0,39.0,0.61,2.0,2.0,5100.0,90.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0


In [17]:
# (rows, columns) of `heat`; the row count determines how many Queens complaint rows are sampled next.
heat.shape

(122516, 16)

In [18]:
# Draw one Queens complaint row per `heat` row so the two frames have the same
# length when they are placed side by side. The sample size is derived from
# `heat` instead of a hard-coded count, so it stays correct if the filter above
# ever yields a different number of rows. The fixed random_state keeps the draw
# reproducible.
quheat = queen.sample(n=len(heat), random_state=0)

In [19]:
# Inspect the sampled Queens rows; their labels are the shuffled positions from `queen`.
quheat

Unnamed: 0,Street,Borough
50333,BEACH 100 STREET,QUEENS
37191,LINDEN BOULEVARD,QUEENS
159237,43 STREET,QUEENS
77415,YELLOWSTONE BOULEVARD,QUEENS
123114,ELMHURST AVENUE,QUEENS
...,...,...
98987,65 STREET,QUEENS
222789,169 STREET,QUEENS
229528,SHORE FRONT PARKWAY,QUEENS
211787,163 STREET,QUEENS


In [20]:
# Renumber the sampled rows from 0 and discard the shuffled labels.
quheat = quheat.reset_index(drop=True)

In [21]:
# Re-check the sample now that its index has been renumbered.
quheat

Unnamed: 0,Street,Borough
0,BEACH 100 STREET,QUEENS
1,LINDEN BOULEVARD,QUEENS
2,43 STREET,QUEENS
3,YELLOWSTONE BOULEVARD,QUEENS
4,ELMHURST AVENUE,QUEENS
...,...,...
122511,65 STREET,QUEENS
122512,169 STREET,QUEENS
122513,SHORE FRONT PARKWAY,QUEENS
122514,163 STREET,QUEENS


In [22]:
# Column-wise concat aligns rows by index label, so both frames must share the
# same index; otherwise pandas pads the unmatched rows with NaN.
assert heat.index.equals(quheat.index), "heat and quheat must share an identical index before concatenating"
# NOTE(review): quheat is a random sample of Queens complaints, so the Street
# attached to each row of `heat` is an arbitrary pairing, not a matched record.
df3 = pd.concat([heat, quheat], axis=1)

In [23]:
# Check the combined frame: the `heat` columns followed by Street and Borough.
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType,Street,Borough
0,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,0,BEACH 100 STREET,QUEENS
1,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,0,LINDEN BOULEVARD,QUEENS
2,5569.0,70.0,2.16,2.0,5.0,9580.0,100.0,2.0,4.0,1680.0,4584.0,1.8,1500.0,-7.0,7.0,0,43 STREET,QUEENS
3,5569.0,94.0,2.16,2.0,5.0,9580.0,100.0,1.0,4.0,1680.0,4584.0,1.8,1500.0,0.0,7.0,0,YELLOWSTONE BOULEVARD,QUEENS
4,5569.0,75.0,0.34,2.0,5.0,9580.0,100.0,5.0,2.0,1680.0,1600.0,0.6,1500.0,48.0,62.0,0,ELMHURST AVENUE,QUEENS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122511,1760.0,39.0,0.61,2.0,2.0,1350.0,50.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0,65 STREET,QUEENS
122512,1760.0,39.0,0.61,2.0,2.0,7000.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0,169 STREET,QUEENS
122513,1760.0,39.0,0.61,2.0,2.0,6000.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0,SHORE FRONT PARKWAY,QUEENS
122514,1760.0,39.0,0.61,2.0,2.0,5100.0,90.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,0,163 STREET,QUEENS


In [24]:
# List the combined frame's columns before choosing which ones to drop.
df3.columns

Index(['BldgArea', 'BldgDepth', 'BuiltFAR', 'CommFAR', 'FacilFAR', 'LotArea',
       'LotDepth', 'NumBldgs', 'NumFloors', 'OfficeArea', 'ResArea',
       'ResidFAR', 'RetailArea', 'Age', 'Period', 'ComplaintType', 'Street',
       'Borough'],
      dtype='object')

In [25]:
# Remove the ComplaintType code and the Street column from the combined frame.
# Reassigning (rather than mutating in place) keeps the step explicit.
df3 = df3.drop(columns=['ComplaintType', 'Street'])

In [26]:
# Verify the remaining columns after the drop.
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,QUEENS
1,1760.0,39.0,0.61,2.0,5.0,9580.0,100.0,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,QUEENS
2,5569.0,70.0,2.16,2.0,5.0,9580.0,100.0,2.0,4.0,1680.0,4584.0,1.8,1500.0,-7.0,7.0,QUEENS
3,5569.0,94.0,2.16,2.0,5.0,9580.0,100.0,1.0,4.0,1680.0,4584.0,1.8,1500.0,0.0,7.0,QUEENS
4,5569.0,75.0,0.34,2.0,5.0,9580.0,100.0,5.0,2.0,1680.0,1600.0,0.6,1500.0,48.0,62.0,QUEENS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122511,1760.0,39.0,0.61,2.0,2.0,1350.0,50.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,QUEENS
122512,1760.0,39.0,0.61,2.0,2.0,7000.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,QUEENS
122513,1760.0,39.0,0.61,2.0,2.0,6000.0,100.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,QUEENS
122514,1760.0,39.0,0.61,2.0,2.0,5100.0,90.0,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,QUEENS


In [27]:
# Count the rows that exactly repeat an earlier row (the first occurrence is not counted).
df3.duplicated().sum()

24703

In [28]:
# Keep the first occurrence of each distinct row and renumber the index from 0.
df3 = df3.drop_duplicates(keep='first', ignore_index=True)

In [29]:
# Inspect the de-duplicated frame.
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,1760.0,39.0,0.61,2.0,5.0,9580.0,100.00,1.0,2.0,1680.0,1600.0,1.8,1500.0,71.0,85.0,QUEENS
1,5569.0,70.0,2.16,2.0,5.0,9580.0,100.00,2.0,4.0,1680.0,4584.0,1.8,1500.0,-7.0,7.0,QUEENS
2,5569.0,94.0,2.16,2.0,5.0,9580.0,100.00,1.0,4.0,1680.0,4584.0,1.8,1500.0,0.0,7.0,QUEENS
3,5569.0,75.0,0.34,2.0,5.0,9580.0,100.00,5.0,2.0,1680.0,1600.0,0.6,1500.0,48.0,62.0,QUEENS
4,1760.0,39.0,0.61,2.0,5.0,6310.0,51.18,1.0,2.0,1680.0,1600.0,0.6,1500.0,71.0,85.0,QUEENS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97808,1760.0,39.0,0.61,2.0,2.0,4375.0,60.00,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,QUEENS
97809,1760.0,39.0,0.61,2.0,2.0,5642.0,100.00,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,QUEENS
97810,1760.0,39.0,0.61,2.0,2.0,1350.0,50.00,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,QUEENS
97811,1760.0,39.0,0.61,2.0,2.0,5100.0,90.00,1.0,2.0,1680.0,1600.0,0.9,1500.0,71.0,85.0,QUEENS


In [30]:
# Optional export, currently disabled: uncomment the next line to write df3 to
# queensheat.csv without the index column.
#df3.to_csv("queensheat.csv",index=False)