### Data Dictionary

The goal of this exercise is to carry out model development and validation in order to answer Question 4 of the problem statement:

Can a predictive model be built for future prediction of the possibility of complaints of the specific type that you identified in response to Question 1?

Using the best model, you need to predict the number of future complaints (of the Complaint Type that you decided to focus on in Question 1).

### Import Libraries

In [1]:
# Core numerical, data-frame and plotting libraries used by the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seaborn appearance: dark-grid style, fonts scaled by 1.5.
sns.set_style('darkgrid')
sns.set(font_scale=1.5)


# NOTE(review): these module paths look like the pre-1.0 feature_engine layout
# (later releases expose feature_engine.imputation / feature_engine.outliers) —
# confirm against the installed version before upgrading.
import feature_engine.missing_data_imputers as mdi
from feature_engine.outlier_removers import Winsorizer

import warnings
# Suppresses every warning for the rest of the session, including pandas
# warnings raised by the in-place operations further down.
warnings.filterwarnings('ignore')


# Do not truncate the column list when displaying wide frames.
pd.options.display.max_columns= None
#pd.options.display.max_rows = None

### Data Exploration

In [2]:
# Load heat.csv into `df`.
df = pd.read_csv(filepath_or_buffer="heat.csv")

In [3]:
# Display the frame just loaded from heat.csv.
df

Unnamed: 0,ComplaintType,Zipcode,Street,Borough
0,HEAT/HOT WATER,10019.0,WEST 52 STREET,MANHATTAN
1,HEAT/HOT WATER,11372.0,37 AVENUE,QUEENS
2,HEAT/HOT WATER,10458.0,SOUTHERN BOULEVARD,BRONX
3,HEAT/HOT WATER,10456.0,MORRIS AVENUE,BRONX
4,HEAT/HOT WATER,11372.0,81 STREET,QUEENS
...,...,...,...,...
1847750,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847751,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847752,HEAT/HOT WATER,10461.0,BRUCKNER BOULEVARD,BRONX
1847753,HEAT/HOT WATER,10034.0,SHERMAN AVENUE,MANHATTAN


In [4]:
# Remove the ComplaintType and Zipcode columns from `df` in place.
df.drop(columns=['ComplaintType', 'Zipcode'], inplace=True)

In [5]:
# List the columns that remain after the drop.
df.columns

Index(['Street', 'Borough'], dtype='object')

### Data Preprocessing

### Segment Heat/Hot Water Cases for Brooklyn

In [6]:
# Count the rows of `df` per Borough value.
df['Borough'].value_counts()

BRONX            600147
BROOKLYN         569310
MANHATTAN        418432
QUEENS           241660
STATEN ISLAND     18206
Name: Borough, dtype: int64

In [7]:
# Keep only the rows whose Borough equals BROOKLYN.
is_brooklyn = df['Borough'].eq('BROOKLYN')
brook = df[is_brooklyn]

In [8]:
# Display the Brooklyn subset; it still carries the index labels of `df`.
brook

Unnamed: 0,Street,Borough
5,WYONA STREET,BROOKLYN
8,LENOX ROAD,BROOKLYN
13,64 STREET,BROOKLYN
14,FLATBUSH AVENUE,BROOKLYN
16,ST JOHNS PLACE,BROOKLYN
...,...,...
1847728,EASTERN PARKWAY,BROOKLYN
1847729,BEVERLY ROAD,BROOKLYN
1847732,BUFFALO AVENUE,BROOKLYN
1847740,65 STREET,BROOKLYN


In [9]:
# Renumber the Brooklyn rows from 0, discarding the old index labels.
brook = brook.reset_index(drop=True)

In [10]:
# Display `brook` again to confirm the index now starts at 0.
brook

Unnamed: 0,Street,Borough
0,WYONA STREET,BROOKLYN
1,LENOX ROAD,BROOKLYN
2,64 STREET,BROOKLYN
3,FLATBUSH AVENUE,BROOKLYN
4,ST JOHNS PLACE,BROOKLYN
...,...,...
569305,EASTERN PARKWAY,BROOKLYN
569306,BEVERLY ROAD,BROOKLYN
569307,BUFFALO AVENUE,BROOKLYN
569308,65 STREET,BROOKLYN


In [11]:
# Load brooklynclasstrain.csv into `df2`.
df2 = pd.read_csv(filepath_or_buffer="brooklynclasstrain.csv")

In [12]:
# Display the frame just loaded from brooklynclasstrain.csv.
df2

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,2430.0,48.0,1.06,2.0,2.0,5603.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,4
1,7983.0,108.0,3.84,2.0,6.0,5603.0,100.0,1.0,6.00,1802.0,2060.0,5.40,1375.0,74.0,100.0,1
2,7983.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,6.00,1802.0,6189.0,1.35,1375.0,80.0,95.0,0
3,2430.0,48.0,1.06,2.0,2.0,5603.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,2
4,2430.0,48.0,1.06,2.0,2.0,5603.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277311,1992.0,24.0,1.12,0.5,1.0,1785.0,52.5,1.0,2.00,1802.0,1368.0,0.60,1375.0,80.0,95.0,0
277312,2430.0,48.0,1.06,0.5,1.0,1785.0,52.5,1.0,2.00,1802.0,2060.0,0.60,1375.0,80.0,95.0,4
277313,1071.0,24.0,0.60,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1071.0,0.60,1375.0,80.0,95.0,0
277314,2430.0,48.0,1.06,0.5,1.0,3570.0,52.5,1.0,2.00,1802.0,2060.0,0.60,1375.0,80.0,95.0,0


In [13]:
# Select the rows of `df2` whose ComplaintType label is 0.
# NOTE(review): presumably 0 is the encoded HEAT/HOT WATER class — confirm
# against the encoding used when brooklynclasstrain.csv was built.
is_class_zero = df2['ComplaintType'].eq(0)
heat = df2[is_class_zero]

In [14]:
# Display the ComplaintType == 0 subset; it still carries the index labels of `df2`.
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
2,7983.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,6.00,1802.0,6189.0,1.35,1375.0,80.0,95.0,0
5,2430.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0
6,2430.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0
8,2430.0,48.0,1.06,2.0,2.0,5603.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0
11,2430.0,48.0,1.06,2.0,2.0,517.0,46.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277307,1312.0,24.0,0.37,0.5,1.0,3570.0,52.5,1.0,2.00,1802.0,1312.0,0.60,1375.0,80.0,95.0,0
277308,1039.0,24.0,0.58,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1039.0,0.60,1375.0,80.0,95.0,0
277311,1992.0,24.0,1.12,0.5,1.0,1785.0,52.5,1.0,2.00,1802.0,1368.0,0.60,1375.0,80.0,95.0,0
277313,1071.0,24.0,0.60,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1071.0,0.60,1375.0,80.0,95.0,0


In [15]:
# Renumber the `heat` rows from 0, discarding the old index labels.
heat = heat.reset_index(drop=True)

In [16]:
# Display `heat` again to confirm the index now starts at 0.
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,7983.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,6.00,1802.0,6189.0,1.35,1375.0,80.0,95.0,0
1,2430.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0
2,2430.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0
3,2430.0,48.0,1.06,2.0,2.0,5603.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0
4,2430.0,48.0,1.06,2.0,2.0,517.0,46.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91249,1312.0,24.0,0.37,0.5,1.0,3570.0,52.5,1.0,2.00,1802.0,1312.0,0.60,1375.0,80.0,95.0,0
91250,1039.0,24.0,0.58,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1039.0,0.60,1375.0,80.0,95.0,0
91251,1992.0,24.0,1.12,0.5,1.0,1785.0,52.5,1.0,2.00,1802.0,1368.0,0.60,1375.0,80.0,95.0,0
91252,1071.0,24.0,0.60,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1071.0,0.60,1375.0,80.0,95.0,0


In [17]:
# (rows, columns) of `heat`; the row count is the sample size used for the Brooklyn draw below.
heat.shape

(91254, 16)

In [18]:
# Draw a reproducible random sample of Brooklyn complaint rows with exactly as
# many rows as `heat`. The size is derived from `heat` instead of being
# hard-coded (it was the literal 91254, i.e. heat.shape[0]), so the two frames
# keep the same length if either input file changes.
# random_state=0 fixes the draw so re-runs select the same rows.
brookheat = brook.sample(n=len(heat), random_state=0)

In [19]:
# Display the sampled rows; the index shows the positions drawn from `brook`.
brookheat

Unnamed: 0,Street,Borough
16814,EAST 46 STREET,BROOKLYN
139549,LINCOLN ROAD,BROOKLYN
65807,MARCUS GARVEY BOULEVARD,BROOKLYN
26118,PULASKI STREET,BROOKLYN
106097,HART STREET,BROOKLYN
...,...,...
507392,65 STREET,BROOKLYN
201208,OCEAN PARKWAY,BROOKLYN
384111,PARKSIDE AVENUE,BROOKLYN
369326,HOWARD AVENUE,BROOKLYN


In [20]:
# Renumber the sampled rows from 0, discarding the sampled index labels.
brookheat = brookheat.reset_index(drop=True)

In [21]:
# Display `brookheat` again to confirm the index now starts at 0.
brookheat

Unnamed: 0,Street,Borough
0,EAST 46 STREET,BROOKLYN
1,LINCOLN ROAD,BROOKLYN
2,MARCUS GARVEY BOULEVARD,BROOKLYN
3,PULASKI STREET,BROOKLYN
4,HART STREET,BROOKLYN
...,...,...
91249,65 STREET,BROOKLYN
91250,OCEAN PARKWAY,BROOKLYN
91251,PARKSIDE AVENUE,BROOKLYN
91252,HOWARD AVENUE,BROOKLYN


In [22]:
# A column-wise concat aligns rows on index labels and pads any non-matching
# label with NaN instead of failing, so verify both frames share the same
# index before gluing them together.
assert heat.index.equals(brookheat.index), (
    "heat and brookheat must have identical indexes before a column-wise concat"
)
# NOTE(review): rows are paired purely by position — `heat` comes from
# brooklynclasstrain.csv and `brookheat` is a random sample of heat.csv, with
# no shared key visible here. Confirm this pairing is intended.
df3 = pd.concat([heat, brookheat], axis=1)

In [23]:
# Display the combined frame: the `heat` columns followed by Street and Borough.
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType,Street,Borough
0,7983.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,6.00,1802.0,6189.0,1.35,1375.0,80.0,95.0,0,EAST 46 STREET,BROOKLYN
1,2430.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0,LINCOLN ROAD,BROOKLYN
2,2430.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0,MARCUS GARVEY BOULEVARD,BROOKLYN
3,2430.0,48.0,1.06,2.0,2.0,5603.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0,PULASKI STREET,BROOKLYN
4,2430.0,48.0,1.06,2.0,2.0,517.0,46.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,0,HART STREET,BROOKLYN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91249,1312.0,24.0,0.37,0.5,1.0,3570.0,52.5,1.0,2.00,1802.0,1312.0,0.60,1375.0,80.0,95.0,0,65 STREET,BROOKLYN
91250,1039.0,24.0,0.58,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1039.0,0.60,1375.0,80.0,95.0,0,OCEAN PARKWAY,BROOKLYN
91251,1992.0,24.0,1.12,0.5,1.0,1785.0,52.5,1.0,2.00,1802.0,1368.0,0.60,1375.0,80.0,95.0,0,PARKSIDE AVENUE,BROOKLYN
91252,1071.0,24.0,0.60,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1071.0,0.60,1375.0,80.0,95.0,0,HOWARD AVENUE,BROOKLYN


In [24]:
# List the columns of the combined frame.
df3.columns

Index(['BldgArea', 'BldgDepth', 'BuiltFAR', 'CommFAR', 'FacilFAR', 'LotArea',
       'LotDepth', 'NumBldgs', 'NumFloors', 'OfficeArea', 'ResArea',
       'ResidFAR', 'RetailArea', 'Age', 'Period', 'ComplaintType', 'Street',
       'Borough'],
      dtype='object')

In [25]:
# Remove ComplaintType (always 0 after the earlier filter on `df2`) and Street
# from `df3` in place.
df3.drop(columns=['ComplaintType', 'Street'], inplace=True)

In [26]:
# Display `df3` after the ComplaintType and Street columns were removed.
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,7983.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,6.00,1802.0,6189.0,1.35,1375.0,80.0,95.0,BROOKLYN
1,2430.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,BROOKLYN
2,2430.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,BROOKLYN
3,2430.0,48.0,1.06,2.0,2.0,5603.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,BROOKLYN
4,2430.0,48.0,1.06,2.0,2.0,517.0,46.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,BROOKLYN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91249,1312.0,24.0,0.37,0.5,1.0,3570.0,52.5,1.0,2.00,1802.0,1312.0,0.60,1375.0,80.0,95.0,BROOKLYN
91250,1039.0,24.0,0.58,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1039.0,0.60,1375.0,80.0,95.0,BROOKLYN
91251,1992.0,24.0,1.12,0.5,1.0,1785.0,52.5,1.0,2.00,1802.0,1368.0,0.60,1375.0,80.0,95.0,BROOKLYN
91252,1071.0,24.0,0.60,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1071.0,0.60,1375.0,80.0,95.0,BROOKLYN


In [27]:
# Number of rows that repeat an earlier row across all columns
# (the first occurrence of each row is not counted).
df3.duplicated().sum()

21892

In [28]:
# Keep the first occurrence of every distinct row and renumber the index from 0.
# NOTE(review): identical rows may correspond to separate complaints; confirm
# that collapsing them is intended before using row counts downstream.
df3 = df3.drop_duplicates(ignore_index=True)

In [29]:
# Display the de-duplicated frame.
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,7983.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,6.00,1802.0,6189.0,1.35,1375.0,80.0,95.0,BROOKLYN
1,2430.0,48.0,1.06,2.0,2.0,2142.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,BROOKLYN
2,2430.0,48.0,1.06,2.0,2.0,5603.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,BROOKLYN
3,2430.0,48.0,1.06,2.0,2.0,517.0,46.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,BROOKLYN
4,2430.0,48.0,1.06,2.0,2.0,4293.0,100.0,1.0,2.00,1802.0,2060.0,1.35,1375.0,80.0,95.0,BROOKLYN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69357,2031.0,24.0,0.57,0.5,1.0,3570.0,52.5,1.0,1.75,1802.0,1251.0,0.60,1375.0,80.0,95.0,BROOKLYN
69358,1312.0,24.0,0.37,0.5,1.0,3570.0,52.5,1.0,2.00,1802.0,1312.0,0.60,1375.0,80.0,95.0,BROOKLYN
69359,1992.0,24.0,1.12,0.5,1.0,1785.0,52.5,1.0,2.00,1802.0,1368.0,0.60,1375.0,80.0,95.0,BROOKLYN
69360,1071.0,24.0,0.60,0.5,1.0,1785.0,52.5,1.0,1.75,1802.0,1071.0,0.60,1375.0,80.0,95.0,BROOKLYN


In [30]:
# Save to CSV (disabled): uncomment the line below to write `df3` to
# brooklynheat.csv without the index column.
#df3.to_csv("brooklynheat.csv",index=False)