### Data Dictionary

The goal of this exercise is to develop and validate a model that answers Question 4 of the problem statement:

Can a predictive model be built for future prediction of the possibility of complaints of the specific type that you identified in response to Question 1?

Using the best model, you need to predict the number of future complaints (of the Complaint Type that you decided to focus on in Question 1).

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

%matplotlib inline
sns.set_style('darkgrid')
sns.set(font_scale=1.5)


import feature_engine.missing_data_imputers as mdi
from feature_engine.outlier_removers import Winsorizer

import warnings
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this blanket filter also hides deprecation and copy-related
# warnings raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


# Show every column when a wide DataFrame is rendered (None removes the cap).
pd.set_option('display.max_columns', None)
# The row cap is left at the pandas default; uncomment to remove it as well.
#pd.options.display.max_rows = None

### Data Exploration

In [2]:
# Load the complaint records from heat.csv; the path is relative, so the file
# must be in the notebook's working directory.
df = pd.read_csv("heat.csv")

In [3]:
# Preview the loaded frame (pandas renders only the first and last rows).
df

Unnamed: 0,ComplaintType,Zipcode,Street,Borough
0,HEAT/HOT WATER,10019.0,WEST 52 STREET,MANHATTAN
1,HEAT/HOT WATER,11372.0,37 AVENUE,QUEENS
2,HEAT/HOT WATER,10458.0,SOUTHERN BOULEVARD,BRONX
3,HEAT/HOT WATER,10456.0,MORRIS AVENUE,BRONX
4,HEAT/HOT WATER,11372.0,81 STREET,QUEENS
...,...,...,...,...
1847750,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847751,HEAT/HOT WATER,10029.0,EAST 108 STREET,MANHATTAN
1847752,HEAT/HOT WATER,10461.0,BRUCKNER BOULEVARD,BRONX
1847753,HEAT/HOT WATER,10034.0,SHERMAN AVENUE,MANHATTAN


In [4]:
# Keep only the location columns (Street, Borough). The frame is rebound to the
# result instead of being mutated in place.
df = df.drop(columns=['ComplaintType', 'Zipcode'])

In [5]:
# Confirm which columns remain after the drop.
df.columns

Index(['Street', 'Borough'], dtype='object')

### Data Preprocessing

### Segment Heat/Hot Water Cases for Bronx

In [6]:
# Count the records per borough to see how they are distributed.
df['Borough'].value_counts()

BRONX            600147
BROOKLYN         569310
MANHATTAN        418432
QUEENS           241660
STATEN ISLAND     18206
Name: Borough, dtype: int64

In [7]:
# Select the rows whose Borough is exactly 'BRONX'.
bronx = df.loc[df['Borough'].eq('BRONX')]

In [8]:
# Preview the Bronx subset; the index still carries the row labels from `df`.
bronx

Unnamed: 0,Street,Borough
2,SOUTHERN BOULEVARD,BRONX
3,MORRIS AVENUE,BRONX
6,ALDUS STREET,BRONX
12,BOYNTON AVENUE,BRONX
18,KINGSBRIDGE TERRACE,BRONX
...,...,...
1847747,WOODYCREST AVENUE,BRONX
1847748,EAST 242 STREET,BRONX
1847749,BAINBRIDGE AVENUE,BRONX
1847752,BRUCKNER BOULEVARD,BRONX


In [9]:
# Renumber the rows 0..n-1 and discard the old labels inherited from `df`.
bronx = bronx.reset_index(drop=True)

In [10]:
# Check that the index now starts at 0 after the reset.
bronx

Unnamed: 0,Street,Borough
0,SOUTHERN BOULEVARD,BRONX
1,MORRIS AVENUE,BRONX
2,ALDUS STREET,BRONX
3,BOYNTON AVENUE,BRONX
4,KINGSBRIDGE TERRACE,BRONX
...,...,...
600142,WOODYCREST AVENUE,BRONX
600143,EAST 242 STREET,BRONX
600144,BAINBRIDGE AVENUE,BRONX
600145,BRUCKNER BOULEVARD,BRONX


In [11]:
# Load the building-feature table from bronxclasstrain.csv (relative path, so
# the file must be in the working directory).
df2 = pd.read_csv("bronxclasstrain.csv")

In [12]:
# Preview the building features together with the numeric ComplaintType column.
df2

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,2340.0,46.0,0.90,2.0,6.5,10400.0,125.0,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,0
1,752.0,16.0,0.05,2.0,6.5,10400.0,100.0,1.0,1.0,272.0,2000.0,6.02,2458.0,63.0,89.0,1
2,7858.0,112.0,1.13,2.0,6.5,10400.0,125.0,1.0,2.0,2628.0,2000.0,6.02,2458.0,69.0,89.0,0
3,7858.0,85.0,3.08,2.0,6.5,2500.0,100.0,1.0,5.0,2628.0,6034.0,6.02,2458.0,70.0,89.0,0
4,7858.0,70.0,3.08,2.0,6.5,1875.0,75.0,1.0,5.0,2628.0,6034.0,6.02,1719.0,89.0,100.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89705,2340.0,46.0,0.90,2.0,2.0,323.0,125.0,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,0
89706,2340.0,46.0,0.90,2.0,2.0,2513.0,125.0,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,3
89707,2340.0,46.0,0.90,2.0,2.0,2513.0,100.0,1.0,2.0,2628.0,2000.0,1.25,2458.0,65.0,85.0,0
89708,2340.0,46.0,0.90,2.0,6.5,2513.0,100.0,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,1


In [13]:
# Keep only the rows labelled with ComplaintType 0.
# NOTE(review): 0 is presumably the encoded HEAT/HOT WATER class — confirm
# against the encoding used when bronxclasstrain.csv was produced.
heat = df2.loc[df2['ComplaintType'].eq(0)]

In [14]:
# Preview the class-0 subset; the index still carries the row labels from `df2`.
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,2340.0,46.0,0.90,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,0
2,7858.0,112.0,1.13,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,69.0,89.0,0
3,7858.0,85.0,3.08,2.0,6.5,2500.0,100.00,1.0,5.0,2628.0,6034.0,6.02,2458.0,70.0,89.0,0
4,7858.0,70.0,3.08,2.0,6.5,1875.0,75.00,1.0,5.0,2628.0,6034.0,6.02,1719.0,89.0,100.0,0
5,2340.0,46.0,0.90,2.0,6.5,10400.0,100.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89698,1512.0,28.0,0.39,2.0,1.0,3891.0,109.08,1.0,2.0,2628.0,1512.0,0.50,2458.0,60.0,80.0,0
89703,2340.0,46.0,0.90,2.0,2.0,10400.0,125.00,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,0
89705,2340.0,46.0,0.90,2.0,2.0,323.0,125.00,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,0
89707,2340.0,46.0,0.90,2.0,2.0,2513.0,100.00,1.0,2.0,2628.0,2000.0,1.25,2458.0,65.0,85.0,0


In [15]:
# Renumber the rows 0..n-1 and discard the old labels inherited from `df2`.
heat = heat.reset_index(drop=True)

In [16]:
# Check that the index now starts at 0 after the reset.
heat

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType
0,2340.0,46.0,0.90,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,0
1,7858.0,112.0,1.13,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,69.0,89.0,0
2,7858.0,85.0,3.08,2.0,6.5,2500.0,100.00,1.0,5.0,2628.0,6034.0,6.02,2458.0,70.0,89.0,0
3,7858.0,70.0,3.08,2.0,6.5,1875.0,75.00,1.0,5.0,2628.0,6034.0,6.02,1719.0,89.0,100.0,0
4,2340.0,46.0,0.90,2.0,6.5,10400.0,100.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33659,1512.0,28.0,0.39,2.0,1.0,3891.0,109.08,1.0,2.0,2628.0,1512.0,0.50,2458.0,60.0,80.0,0
33660,2340.0,46.0,0.90,2.0,2.0,10400.0,125.00,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,0
33661,2340.0,46.0,0.90,2.0,2.0,323.0,125.00,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,0
33662,2340.0,46.0,0.90,2.0,2.0,2513.0,100.00,1.0,2.0,2628.0,2000.0,1.25,2458.0,65.0,85.0,0


In [17]:
# Row and column count of the class-0 subset; the row count is the sample size
# used for the Bronx draw in the next step.
heat.shape

(33664, 16)

In [18]:
# Draw as many Bronx rows as there are rows in `heat`, so the two frames can be
# placed side by side afterwards. The size is derived from `heat` instead of
# the hard-coded 33664, which keeps the sample in step if the training file
# changes. random_state=0 makes the draw reproducible.
brheat = bronx.sample(n=len(heat), random_state=0)

In [19]:
# Preview the sampled rows; the index holds the sampled positions from `bronx`.
brheat

Unnamed: 0,Street,Borough
274458,CARPENTER AVENUE,BRONX
302785,MT HOPE PLACE,BRONX
177600,EAST 169 STREET,BRONX
509664,SOUTHERN BOULEVARD,BRONX
173531,CROTONA AVENUE,BRONX
...,...,...
238741,MARION AVENUE,BRONX
501873,WALLACE AVENUE,BRONX
560034,GILLESPIE AVENUE,BRONX
282084,EAST 241 STREET,BRONX


In [20]:
# Renumber the sampled rows 0..n-1 so they line up with `heat` by index label.
brheat = brheat.reset_index(drop=True)

In [21]:
# Check that the index now starts at 0 after the reset.
brheat

Unnamed: 0,Street,Borough
0,CARPENTER AVENUE,BRONX
1,MT HOPE PLACE,BRONX
2,EAST 169 STREET,BRONX
3,SOUTHERN BOULEVARD,BRONX
4,CROTONA AVENUE,BRONX
...,...,...
33659,MARION AVENUE,BRONX
33660,WALLACE AVENUE,BRONX
33661,GILLESPIE AVENUE,BRONX
33662,EAST 241 STREET,BRONX


In [22]:
# Place the building features (`heat`) and the sampled Bronx rows (`brheat`)
# side by side. axis=1 aligns on index labels, so a mismatch would silently pad
# rows with NaN; fail loudly instead.
# NOTE(review): rows are paired purely by position — the sample is random and
# no key links a building to a street — and Street is dropped again a few cells
# later, leaving only the constant Borough column. Confirm this is intended.
if not heat.index.equals(brheat.index):
    raise ValueError(
        "heat and brheat must share the same index before a column-wise concat; "
        "reset both indexes and make sure they have the same number of rows"
    )
df3 = pd.concat([heat, brheat], axis=1)

In [23]:
# Preview the combined frame: building features plus Street and Borough.
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,ComplaintType,Street,Borough
0,2340.0,46.0,0.90,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,0,CARPENTER AVENUE,BRONX
1,7858.0,112.0,1.13,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,69.0,89.0,0,MT HOPE PLACE,BRONX
2,7858.0,85.0,3.08,2.0,6.5,2500.0,100.00,1.0,5.0,2628.0,6034.0,6.02,2458.0,70.0,89.0,0,EAST 169 STREET,BRONX
3,7858.0,70.0,3.08,2.0,6.5,1875.0,75.00,1.0,5.0,2628.0,6034.0,6.02,1719.0,89.0,100.0,0,SOUTHERN BOULEVARD,BRONX
4,2340.0,46.0,0.90,2.0,6.5,10400.0,100.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,0,CROTONA AVENUE,BRONX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33659,1512.0,28.0,0.39,2.0,1.0,3891.0,109.08,1.0,2.0,2628.0,1512.0,0.50,2458.0,60.0,80.0,0,MARION AVENUE,BRONX
33660,2340.0,46.0,0.90,2.0,2.0,10400.0,125.00,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,0,WALLACE AVENUE,BRONX
33661,2340.0,46.0,0.90,2.0,2.0,323.0,125.00,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,0,GILLESPIE AVENUE,BRONX
33662,2340.0,46.0,0.90,2.0,2.0,2513.0,100.00,1.0,2.0,2628.0,2000.0,1.25,2458.0,65.0,85.0,0,EAST 241 STREET,BRONX


In [24]:
# List the combined columns before deciding which ones to drop.
df3.columns

Index(['BldgArea', 'BldgDepth', 'BuiltFAR', 'CommFAR', 'FacilFAR', 'LotArea',
       'LotDepth', 'NumBldgs', 'NumFloors', 'OfficeArea', 'ResArea',
       'ResidFAR', 'RetailArea', 'Age', 'Period', 'ComplaintType', 'Street',
       'Borough'],
      dtype='object')

In [25]:
# Remove the ComplaintType label and the Street text column. The frame is
# rebound to the result instead of being mutated in place.
df3 = df3.drop(columns=['ComplaintType', 'Street'])

In [26]:
# Preview the frame after the drop: numeric features plus Borough.
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,2340.0,46.0,0.90,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,BRONX
1,7858.0,112.0,1.13,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,69.0,89.0,BRONX
2,7858.0,85.0,3.08,2.0,6.5,2500.0,100.00,1.0,5.0,2628.0,6034.0,6.02,2458.0,70.0,89.0,BRONX
3,7858.0,70.0,3.08,2.0,6.5,1875.0,75.00,1.0,5.0,2628.0,6034.0,6.02,1719.0,89.0,100.0,BRONX
4,2340.0,46.0,0.90,2.0,6.5,10400.0,100.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,BRONX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33659,1512.0,28.0,0.39,2.0,1.0,3891.0,109.08,1.0,2.0,2628.0,1512.0,0.50,2458.0,60.0,80.0,BRONX
33660,2340.0,46.0,0.90,2.0,2.0,10400.0,125.00,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,BRONX
33661,2340.0,46.0,0.90,2.0,2.0,323.0,125.00,1.0,2.0,2628.0,2000.0,0.90,2458.0,65.0,85.0,BRONX
33662,2340.0,46.0,0.90,2.0,2.0,2513.0,100.00,1.0,2.0,2628.0,2000.0,1.25,2458.0,65.0,85.0,BRONX


In [27]:
# Count fully duplicated rows; the first occurrence of each is not flagged
# (keep='first' is the pandas default).
df3.duplicated().sum()

4677

In [28]:
# Keep the first occurrence of each duplicated row and renumber the index
# 0..n-1. The frame is rebound to the result instead of being mutated in place.
df3 = df3.drop_duplicates(ignore_index=True)

In [29]:
# Preview the de-duplicated frame.
df3

Unnamed: 0,BldgArea,BldgDepth,BuiltFAR,CommFAR,FacilFAR,LotArea,LotDepth,NumBldgs,NumFloors,OfficeArea,ResArea,ResidFAR,RetailArea,Age,Period,Borough
0,2340.0,46.0,0.90,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,BRONX
1,7858.0,112.0,1.13,2.0,6.5,10400.0,125.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,69.0,89.0,BRONX
2,7858.0,85.0,3.08,2.0,6.5,2500.0,100.00,1.0,5.0,2628.0,6034.0,6.02,2458.0,70.0,89.0,BRONX
3,7858.0,70.0,3.08,2.0,6.5,1875.0,75.00,1.0,5.0,2628.0,6034.0,6.02,1719.0,89.0,100.0,BRONX
4,2340.0,46.0,0.90,2.0,6.5,10400.0,100.00,1.0,2.0,2628.0,2000.0,6.02,2458.0,65.0,85.0,BRONX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28982,2340.0,46.0,0.90,2.0,1.0,2701.0,106.25,1.0,2.0,2628.0,2000.0,0.50,2458.0,65.0,85.0,BRONX
28983,1666.0,30.0,0.38,2.0,1.0,4360.0,122.25,1.0,2.0,2628.0,1666.0,0.50,2458.0,60.0,80.0,BRONX
28984,1720.0,30.0,0.20,2.0,1.0,8400.0,112.00,1.0,2.0,2628.0,1720.0,0.50,2458.0,60.0,80.0,BRONX
28985,1512.0,28.0,0.39,2.0,1.0,3891.0,109.08,1.0,2.0,2628.0,1512.0,0.50,2458.0,60.0,80.0,BRONX


In [31]:
# Save to CSV — the write below is disabled; uncomment it to export df3 to
# bronxheat.csv without the index column.
#df3.to_csv("bronxheat.csv",index=False)