### Data Dictionary

The goal of this exercise is to develop and validate a model that answers Question 4 of the problem statement:

Can a predictive model be built for future prediction of the possibility of complaints of the specific type that you identified in response to Question 1?

Using the best model, you need to predict the number of future complaints (of the Complaint Type that you decided to focus on in Question 1).

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

%matplotlib inline
sns.set_style('darkgrid')
sns.set(font_scale=1.5)


import feature_engine.missing_data_imputers as mdi
from feature_engine.outlier_removers import Winsorizer

import warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest of
# the kernel session, pandas warnings included. Consider narrowing this to the
# specific categories that are known to be noise.
warnings.filterwarnings('ignore')


pd.options.display.max_columns= None
#pd.options.display.max_rows = None

### Data Exploration

In [2]:
df = pd.read_csv("heat.csv")

In [3]:
df

Unnamed: 0,ComplaintType,Zipcode,Street,Borough
0,HEAT/HOT WATER,10019.0,WEST 52 STREET,MANHATTAN
1,HEAT/HOT WATER,11372.0,37 AVENUE,QUEENS
2,HEAT/HOT WATER,10458.0,SOUTHERN BOULEVARD,BRONX
3,HEAT/HOT WATER,10456.0,MORRIS AVENUE,BRONX
4,HEAT/HOT WATER,11372.0,81 STREET,QUEENS
...,...,...,...,...
1847750,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847751,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847752,HEAT/HOT WATER,10461.0,BRUCKNER BOULEVARD,BRONX
1847753,HEAT/HOT WATER,10034.0,SHERMAN AVENUE,MANHATTAN


In [4]:
# Keep only the Street and Borough columns; rebind instead of mutating in place.
df = df.drop(columns=['ComplaintType', 'Zipcode'])

In [5]:
df.columns

Index(['Street', 'Borough'], dtype='object')

### Data Preprocessing

### Segment Heat/Hot Water Cases for Manhattan

In [6]:
df['Borough'].value_counts()

BRONX            600147
BROOKLYN         569310
MANHATTAN        418432
QUEENS           241660
STATEN ISLAND     18206
Name: Borough, dtype: int64

In [7]:
# Select the Manhattan complaints; the original row labels are preserved here.
man = df.loc[df['Borough'].eq('MANHATTAN')]

In [8]:
man

Unnamed: 0,Street,Borough
0,WEST 52 STREET,MANHATTAN
7,FREDERICK DOUGLASS BOULEVARD,MANHATTAN
9,WEST 22 STREET,MANHATTAN
10,CENTRAL PARK NORTH,MANHATTAN
11,WEST 151 STREET,MANHATTAN
...,...,...
1847741,BROADWAY,MANHATTAN
1847743,WEST 45 STREET,MANHATTAN
1847750,EAST 108 STREET,MANHATTAN
1847751,EAST 108 STREET,MANHATTAN


In [9]:
# Renumber the Manhattan rows 0..n-1, discarding the labels inherited from df.
man = man.reset_index(drop=True)

In [10]:
man

Unnamed: 0,Street,Borough
0,WEST 52 STREET,MANHATTAN
1,FREDERICK DOUGLASS BOULEVARD,MANHATTAN
2,WEST 22 STREET,MANHATTAN
3,CENTRAL PARK NORTH,MANHATTAN
4,WEST 151 STREET,MANHATTAN
...,...,...
418427,BROADWAY,MANHATTAN
418428,WEST 45 STREET,MANHATTAN
418429,EAST 108 STREET,MANHATTAN
418430,EAST 108 STREET,MANHATTAN


In [11]:
df2 = pd.read_csv("manhattanclasstrain.csv")

In [12]:
df2

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,76036.25,77.0,0.35,6.0,1.0,16942.0,100.00,1.0,5.0,6072.0,7650.0,0.60,2320.0,88.0,120.0,3
1,76036.25,77.0,1.00,6.0,1.0,16942.0,126.93,1.0,5.0,6072.0,7650.0,0.60,2320.0,88.0,120.0,3
2,76036.25,77.0,0.22,6.0,1.0,16942.0,100.00,1.0,5.0,6072.0,7650.0,0.60,2320.0,88.0,120.0,0
3,9401.00,77.0,3.60,6.0,6.5,2523.0,100.00,1.0,5.0,6072.0,7650.0,4.00,2320.0,78.0,110.0,0
4,9401.00,77.0,3.60,6.0,6.5,2523.0,100.00,1.0,5.0,6072.0,7650.0,4.00,2320.0,78.0,110.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42961,20535.00,50.0,4.11,6.0,6.5,5000.0,100.00,1.0,5.0,6072.0,20535.0,3.44,2320.0,63.0,94.0,1
42962,9401.00,77.0,3.60,6.0,6.5,6450.0,126.93,1.0,5.0,6072.0,7650.0,3.44,2320.0,78.0,110.0,4
42963,9401.00,15.0,3.60,6.0,6.5,16942.0,126.93,1.0,1.0,6072.0,7650.0,4.00,2320.0,57.0,89.0,0
42964,9401.00,77.0,3.60,6.0,6.5,16942.0,126.93,1.0,5.0,6072.0,7650.0,3.44,2320.0,78.0,110.0,0


In [13]:
# Keep only the rows whose ComplaintType code equals 0.
# NOTE(review): presumably code 0 is the HEAT/HOT WATER class — confirm against
# the label encoding that was used to produce manhattanclasstrain.csv.
heat = df2[df2['ComplaintType'] == 0]

In [14]:
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
2,76036.25,77.0,0.22,6.0,1.0,16942.0,100.00,1.0,5.0,6072.0,7650.0,0.60,2320.0,88.0,120.0,0
3,9401.00,77.0,3.60,6.0,6.5,2523.0,100.00,1.0,5.0,6072.0,7650.0,4.00,2320.0,78.0,110.0,0
6,76036.25,183.0,0.83,3.4,10.0,16942.0,126.93,1.0,5.0,6072.0,7650.0,10.00,2320.0,101.0,120.0,0
7,7500.00,75.0,0.19,3.4,10.0,16942.0,126.93,1.0,1.0,6072.0,7650.0,10.00,2320.0,88.0,120.0,0
16,76036.25,77.0,10.35,6.0,14.0,16942.0,126.93,1.0,12.0,6072.0,7650.0,10.00,2320.0,18.0,50.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42957,37482.00,117.0,4.19,6.0,6.5,8950.0,126.93,1.0,6.0,6072.0,31531.0,3.44,2320.0,59.0,90.0,0
42960,76036.25,138.0,7.81,6.0,6.5,10000.0,126.93,1.0,10.0,6072.0,31531.0,3.44,2320.0,22.0,54.0,0
42963,9401.00,15.0,3.60,6.0,6.5,16942.0,126.93,1.0,1.0,6072.0,7650.0,4.00,2320.0,57.0,89.0,0
42964,9401.00,77.0,3.60,6.0,6.5,16942.0,126.93,1.0,5.0,6072.0,7650.0,3.44,2320.0,78.0,110.0,0


In [15]:
# Renumber the filtered rows 0..n-1, discarding the labels inherited from df2.
heat = heat.reset_index(drop=True)

In [16]:
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,76036.25,77.0,0.22,6.0,1.0,16942.0,100.00,1.0,5.0,6072.0,7650.0,0.60,2320.0,88.0,120.0,0
1,9401.00,77.0,3.60,6.0,6.5,2523.0,100.00,1.0,5.0,6072.0,7650.0,4.00,2320.0,78.0,110.0,0
2,76036.25,183.0,0.83,3.4,10.0,16942.0,126.93,1.0,5.0,6072.0,7650.0,10.00,2320.0,101.0,120.0,0
3,7500.00,75.0,0.19,3.4,10.0,16942.0,126.93,1.0,1.0,6072.0,7650.0,10.00,2320.0,88.0,120.0,0
4,76036.25,77.0,10.35,6.0,14.0,16942.0,126.93,1.0,12.0,6072.0,7650.0,10.00,2320.0,18.0,50.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17041,37482.00,117.0,4.19,6.0,6.5,8950.0,126.93,1.0,6.0,6072.0,31531.0,3.44,2320.0,59.0,90.0,0
17042,76036.25,138.0,7.81,6.0,6.5,10000.0,126.93,1.0,10.0,6072.0,31531.0,3.44,2320.0,22.0,54.0,0
17043,9401.00,15.0,3.60,6.0,6.5,16942.0,126.93,1.0,1.0,6072.0,7650.0,4.00,2320.0,57.0,89.0,0
17044,9401.00,77.0,3.60,6.0,6.5,16942.0,126.93,1.0,5.0,6072.0,7650.0,3.44,2320.0,78.0,110.0,0


In [17]:
heat.shape

(17046, 16)

In [18]:
# Draw exactly as many Manhattan complaint rows as there are rows in `heat`, so
# the two frames can later be placed side by side. The size is derived from
# `heat` rather than hard-coded so it stays correct if the training file changes.
# A fixed random_state keeps the sample reproducible.
manheat = man.sample(n=len(heat), random_state=0)

In [19]:
manheat

Unnamed: 0,Street,Borough
36482,FT WASHINGTON AVENUE,MANHATTAN
261281,3 AVENUE,MANHATTAN
371821,MADISON AVENUE,MANHATTAN
215945,ST NICHOLAS AVENUE,MANHATTAN
151958,1 AVENUE,MANHATTAN
...,...,...
416342,BROADWAY,MANHATTAN
40905,ST NICHOLAS AVENUE,MANHATTAN
285946,WEST 14 STREET,MANHATTAN
57908,LENOX AVENUE,MANHATTAN


In [20]:
# Renumber the sampled rows 0..n-1 so the index no longer reflects positions in `man`.
manheat = manheat.reset_index(drop=True)

In [21]:
manheat

Unnamed: 0,Street,Borough
0,FT WASHINGTON AVENUE,MANHATTAN
1,3 AVENUE,MANHATTAN
2,MADISON AVENUE,MANHATTAN
3,ST NICHOLAS AVENUE,MANHATTAN
4,1 AVENUE,MANHATTAN
...,...,...
17041,BROADWAY,MANHATTAN
17042,ST NICHOLAS AVENUE,MANHATTAN
17043,WEST 14 STREET,MANHATTAN
17044,LENOX AVENUE,MANHATTAN


In [22]:
# A column-wise concat aligns rows on the index. If the two indexes differ,
# pandas silently pads the mismatches with NaN, so fail loudly instead.
assert heat.index.equals(manheat.index), (
    "heat and manheat must share an identical index before a column-wise concat"
)
# NOTE(review): the two frames share no join key; rows are paired purely by
# position and `manheat` is a random sample, so the Street/Borough values
# attached to each building row are arbitrary. Confirm this is intended.
df3 = pd.concat([heat, manheat], axis=1)

In [23]:
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType,Street,Borough
0,76036.25,77.0,0.22,6.0,1.0,16942.0,100.00,1.0,5.0,6072.0,7650.0,0.60,2320.0,88.0,120.0,0,FT WASHINGTON AVENUE,MANHATTAN
1,9401.00,77.0,3.60,6.0,6.5,2523.0,100.00,1.0,5.0,6072.0,7650.0,4.00,2320.0,78.0,110.0,0,3 AVENUE,MANHATTAN
2,76036.25,183.0,0.83,3.4,10.0,16942.0,126.93,1.0,5.0,6072.0,7650.0,10.00,2320.0,101.0,120.0,0,MADISON AVENUE,MANHATTAN
3,7500.00,75.0,0.19,3.4,10.0,16942.0,126.93,1.0,1.0,6072.0,7650.0,10.00,2320.0,88.0,120.0,0,ST NICHOLAS AVENUE,MANHATTAN
4,76036.25,77.0,10.35,6.0,14.0,16942.0,126.93,1.0,12.0,6072.0,7650.0,10.00,2320.0,18.0,50.0,0,1 AVENUE,MANHATTAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17041,37482.00,117.0,4.19,6.0,6.5,8950.0,126.93,1.0,6.0,6072.0,31531.0,3.44,2320.0,59.0,90.0,0,BROADWAY,MANHATTAN
17042,76036.25,138.0,7.81,6.0,6.5,10000.0,126.93,1.0,10.0,6072.0,31531.0,3.44,2320.0,22.0,54.0,0,ST NICHOLAS AVENUE,MANHATTAN
17043,9401.00,15.0,3.60,6.0,6.5,16942.0,126.93,1.0,1.0,6072.0,7650.0,4.00,2320.0,57.0,89.0,0,WEST 14 STREET,MANHATTAN
17044,9401.00,77.0,3.60,6.0,6.5,16942.0,126.93,1.0,5.0,6072.0,7650.0,3.44,2320.0,78.0,110.0,0,LENOX AVENUE,MANHATTAN


In [24]:
df3.columns

Index(['BldgArea', 'BldgDepth', 'BuiltFAR', 'CommFAR', 'FacilFAR', 'LotArea',
       'LotDepth', 'NumBldgs', 'NumFloors', 'OfficeArea', 'ResArea',
       'ResidFAR', 'RetailArea', 'Age', 'Period', 'ComplaintType', 'Street',
       'Borough'],
      dtype='object')

In [25]:
# Drop the ComplaintType and Street columns; rebind instead of mutating in place.
df3 = df3.drop(columns=['ComplaintType', 'Street'])

In [26]:
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,76036.25,77.0,0.22,6.0,1.0,16942.0,100.00,1.0,5.0,6072.0,7650.0,0.60,2320.0,88.0,120.0,MANHATTAN
1,9401.00,77.0,3.60,6.0,6.5,2523.0,100.00,1.0,5.0,6072.0,7650.0,4.00,2320.0,78.0,110.0,MANHATTAN
2,76036.25,183.0,0.83,3.4,10.0,16942.0,126.93,1.0,5.0,6072.0,7650.0,10.00,2320.0,101.0,120.0,MANHATTAN
3,7500.00,75.0,0.19,3.4,10.0,16942.0,126.93,1.0,1.0,6072.0,7650.0,10.00,2320.0,88.0,120.0,MANHATTAN
4,76036.25,77.0,10.35,6.0,14.0,16942.0,126.93,1.0,12.0,6072.0,7650.0,10.00,2320.0,18.0,50.0,MANHATTAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17041,37482.00,117.0,4.19,6.0,6.5,8950.0,126.93,1.0,6.0,6072.0,31531.0,3.44,2320.0,59.0,90.0,MANHATTAN
17042,76036.25,138.0,7.81,6.0,6.5,10000.0,126.93,1.0,10.0,6072.0,31531.0,3.44,2320.0,22.0,54.0,MANHATTAN
17043,9401.00,15.0,3.60,6.0,6.5,16942.0,126.93,1.0,1.0,6072.0,7650.0,4.00,2320.0,57.0,89.0,MANHATTAN
17044,9401.00,77.0,3.60,6.0,6.5,16942.0,126.93,1.0,5.0,6072.0,7650.0,3.44,2320.0,78.0,110.0,MANHATTAN


In [27]:
df3.duplicated(keep='first').sum()

485

In [28]:
# Remove exact duplicate rows (keeping the first occurrence) and renumber 0..n-1.
df3 = df3.drop_duplicates(keep='first', ignore_index=True)

In [29]:
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,76036.25,77.0,0.22,6.0,1.0,16942.0,100.00,1.0,5.0,6072.0,7650.0,0.60,2320.0,88.0,120.0,MANHATTAN
1,9401.00,77.0,3.60,6.0,6.5,2523.0,100.00,1.0,5.0,6072.0,7650.0,4.00,2320.0,78.0,110.0,MANHATTAN
2,76036.25,183.0,0.83,3.4,10.0,16942.0,126.93,1.0,5.0,6072.0,7650.0,10.00,2320.0,101.0,120.0,MANHATTAN
3,7500.00,75.0,0.19,3.4,10.0,16942.0,126.93,1.0,1.0,6072.0,7650.0,10.00,2320.0,88.0,120.0,MANHATTAN
4,76036.25,77.0,10.35,6.0,14.0,16942.0,126.93,1.0,12.0,6072.0,7650.0,10.00,2320.0,18.0,50.0,MANHATTAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16556,9401.00,77.0,3.60,6.0,6.5,5000.0,102.50,1.0,5.0,6072.0,7650.0,3.44,2320.0,78.0,110.0,MANHATTAN
16557,37482.00,117.0,4.19,6.0,6.5,8950.0,126.93,1.0,6.0,6072.0,31531.0,3.44,2320.0,59.0,90.0,MANHATTAN
16558,76036.25,138.0,7.81,6.0,6.5,10000.0,126.93,1.0,10.0,6072.0,31531.0,3.44,2320.0,22.0,54.0,MANHATTAN
16559,9401.00,15.0,3.60,6.0,6.5,16942.0,126.93,1.0,1.0,6072.0,7650.0,4.00,2320.0,57.0,89.0,MANHATTAN


In [30]:
# Save to CSV (currently disabled; uncomment the line below to write df3 to
# manhattanheat.csv without the index column)
#df3.to_csv("manhattanheat.csv",index=False)