In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(font_scale=1.25)
np.random.seed(5)

pd.set_option("display.max_rows",30000)
pd.set_option("display.max_columns",100)

import pandas_profiling as pp

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, plot_roc_curve


In [2]:
df = pd.read_csv("final3.csv")

In [3]:
df.head()

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,borough,status,year,complaint_type
0,2500.0,2288.0,0.0,2288.0,0.0,0.0,1.0,2.0,100.0,52.0,0.92,1.25,0.0,2.0,BRONX,Open,60,HEAT/HOT WATER
1,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,0.97,0.75,0.0,2.0,BRONX,Open,70,GENERAL
2,1970.0,1485.0,0.0,1080.0,0.0,0.0,1.0,2.0,109.42,32.0,0.75,0.75,0.0,2.0,BRONX,Open,65,HEAT/HOT WATER
3,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1.57,3.0,0.0,3.0,BRONX,Open,82,HEAT/HOT WATER
4,1800.0,1701.0,0.0,1188.0,0.0,0.0,1.0,2.0,100.0,33.0,0.95,0.75,0.0,2.0,BRONX,Open,63,HEAT/HOT WATER


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50631 entries, 0 to 50630
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   lotarea         50631 non-null  float64
 1   bldgarea        50631 non-null  float64
 2   comarea         50631 non-null  float64
 3   resarea         50631 non-null  float64
 4   officearea      50631 non-null  float64
 5   retailarea      50631 non-null  float64
 6   numbldgs        50631 non-null  float64
 7   numfloors       50631 non-null  float64
 8   lotdepth        50631 non-null  float64
 9   bldgdepth       50631 non-null  float64
 10  builtfar        50631 non-null  float64
 11  residfar        50631 non-null  float64
 12  commfar         50631 non-null  float64
 13  facilfar        50631 non-null  float64
 14  borough         50631 non-null  object 
 15  status          50631 non-null  object 
 16  year            50631 non-null  int64  
 17  complaint_type  50631 non-null 

In [5]:
df['complaint_type'].value_counts()

HEAT/HOT WATER          29124
UNSANITARY CONDITION     4453
HEATING                  2953
PLUMBING                 2751
PAINT/PLASTER            2062
WATER LEAK               1984
DOOR/WINDOW              1629
GENERAL                  1255
ELECTRIC                 1232
APPLIANCE                1092
FLOORING/STAIRS           760
SAFETY                    416
GENERAL CONSTRUCTION      384
PAINT - PLASTER           294
NONCONST                  123
ELEVATOR                   77
OUTSIDE BUILDING           25
CONSTRUCTION               10
Unsanitary Condition        5
General                     2
Name: complaint_type, dtype: int64

In [6]:
df.replace({'HEATING':'HEAT/HOT WATER'},inplace=True) #Replace heating with heat/hotwater

In [7]:
df['complaint_type'].value_counts()

HEAT/HOT WATER          32077
UNSANITARY CONDITION     4453
PLUMBING                 2751
PAINT/PLASTER            2062
WATER LEAK               1984
DOOR/WINDOW              1629
GENERAL                  1255
ELECTRIC                 1232
APPLIANCE                1092
FLOORING/STAIRS           760
SAFETY                    416
GENERAL CONSTRUCTION      384
PAINT - PLASTER           294
NONCONST                  123
ELEVATOR                   77
OUTSIDE BUILDING           25
CONSTRUCTION               10
Unsanitary Condition        5
General                     2
Name: complaint_type, dtype: int64

### Choose top 5 complaints for prediction

In [8]:
df2 = df.replace({'DOOR/WINDOW':np.nan,'GENERAL':np.nan,'ELECTRIC':np.nan,'APPLIANCE':np.nan,
                  'FLOORING/STAIRS':np.nan,'SAFETY':np.nan,'GENERAL CONSTRUCTION':np.nan,'PAINT - PLASTER':np.nan,
                  'NONCONST':np.nan, 'ELEVATOR':np.nan, 'OUTSIDE BUILDING':np.nan, 'CONSTRUCTION':np.nan, 
                  'Unsanitary Condition':np.nan,'General':np.nan})

In [9]:
df2.head()

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,borough,status,year,complaint_type
0,2500.0,2288.0,0.0,2288.0,0.0,0.0,1.0,2.0,100.0,52.0,0.92,1.25,0.0,2.0,BRONX,Open,60,HEAT/HOT WATER
1,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,0.97,0.75,0.0,2.0,BRONX,Open,70,
2,1970.0,1485.0,0.0,1080.0,0.0,0.0,1.0,2.0,109.42,32.0,0.75,0.75,0.0,2.0,BRONX,Open,65,HEAT/HOT WATER
3,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1.57,3.0,0.0,3.0,BRONX,Open,82,HEAT/HOT WATER
4,1800.0,1701.0,0.0,1188.0,0.0,0.0,1.0,2.0,100.0,33.0,0.95,0.75,0.0,2.0,BRONX,Open,63,HEAT/HOT WATER


In [10]:
df2.isnull().sum()

lotarea              0
bldgarea             0
comarea              0
resarea              0
officearea           0
retailarea           0
numbldgs             0
numfloors            0
lotdepth             0
bldgdepth            0
builtfar             0
residfar             0
commfar              0
facilfar             0
borough              0
status               0
year                 0
complaint_type    7304
dtype: int64

In [11]:
df2['complaint_type'].value_counts()

HEAT/HOT WATER          32077
UNSANITARY CONDITION     4453
PLUMBING                 2751
PAINT/PLASTER            2062
WATER LEAK               1984
Name: complaint_type, dtype: int64

In [12]:
df2.shape

(50631, 18)

In [13]:
df2.dropna(inplace=True)

In [14]:
df2.head()

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,borough,status,year,complaint_type
0,2500.0,2288.0,0.0,2288.0,0.0,0.0,1.0,2.0,100.0,52.0,0.92,1.25,0.0,2.0,BRONX,Open,60,HEAT/HOT WATER
2,1970.0,1485.0,0.0,1080.0,0.0,0.0,1.0,2.0,109.42,32.0,0.75,0.75,0.0,2.0,BRONX,Open,65,HEAT/HOT WATER
3,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1.57,3.0,0.0,3.0,BRONX,Open,82,HEAT/HOT WATER
4,1800.0,1701.0,0.0,1188.0,0.0,0.0,1.0,2.0,100.0,33.0,0.95,0.75,0.0,2.0,BRONX,Open,63,HEAT/HOT WATER
5,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,0.48,0.75,0.0,2.0,BRONX,Open,90,HEAT/HOT WATER


In [15]:
df2.shape

(43327, 18)

In [16]:
df2.reset_index(drop=True,inplace=True)

In [17]:
df2.head()

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,borough,status,year,complaint_type
0,2500.0,2288.0,0.0,2288.0,0.0,0.0,1.0,2.0,100.0,52.0,0.92,1.25,0.0,2.0,BRONX,Open,60,HEAT/HOT WATER
1,1970.0,1485.0,0.0,1080.0,0.0,0.0,1.0,2.0,109.42,32.0,0.75,0.75,0.0,2.0,BRONX,Open,65,HEAT/HOT WATER
2,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1.57,3.0,0.0,3.0,BRONX,Open,82,HEAT/HOT WATER
3,1800.0,1701.0,0.0,1188.0,0.0,0.0,1.0,2.0,100.0,33.0,0.95,0.75,0.0,2.0,BRONX,Open,63,HEAT/HOT WATER
4,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,0.48,0.75,0.0,2.0,BRONX,Open,90,HEAT/HOT WATER


In [18]:
df2.shape

(43327, 18)

### Drop comarea, officearea, retailarea, commfar, facilfar, borough

In [19]:
df3 = df2.drop(['comarea', 'officearea', 'retailarea', 'commfar', 'facilfar', 'borough'],axis=1)

In [20]:
df3.head()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,status,year,complaint_type
0,2500.0,2288.0,2288.0,1.0,2.0,100.0,52.0,0.92,1.25,Open,60,HEAT/HOT WATER
1,1970.0,1485.0,1080.0,1.0,2.0,109.42,32.0,0.75,0.75,Open,65,HEAT/HOT WATER
2,2500.0,3933.0,2596.0,1.0,3.0,100.0,60.0,1.57,3.0,Open,82,HEAT/HOT WATER
3,1800.0,1701.0,1188.0,1.0,2.0,100.0,33.0,0.95,0.75,Open,63,HEAT/HOT WATER
4,3034.0,1462.0,1462.0,1.0,2.5,93.58,43.0,0.48,0.75,Open,90,HEAT/HOT WATER


In [21]:
df3.shape

(43327, 12)

In [22]:
#Encode the complaint_type feature
le = LabelEncoder()

In [23]:
le.fit_transform(df3['complaint_type'])

array([0, 0, 0, ..., 0, 2, 0])

In [24]:
enc = pd.DataFrame(le.fit_transform(df3['complaint_type']),columns=['complaint'])

In [25]:
enc.head()

Unnamed: 0,complaint
0,0
1,0
2,0
3,0
4,0


In [26]:
df4 = pd.concat([df3,enc],axis=1)

In [27]:
df4.head()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,status,year,complaint_type,complaint
0,2500.0,2288.0,2288.0,1.0,2.0,100.0,52.0,0.92,1.25,Open,60,HEAT/HOT WATER,0
1,1970.0,1485.0,1080.0,1.0,2.0,109.42,32.0,0.75,0.75,Open,65,HEAT/HOT WATER,0
2,2500.0,3933.0,2596.0,1.0,3.0,100.0,60.0,1.57,3.0,Open,82,HEAT/HOT WATER,0
3,1800.0,1701.0,1188.0,1.0,2.0,100.0,33.0,0.95,0.75,Open,63,HEAT/HOT WATER,0
4,3034.0,1462.0,1462.0,1.0,2.5,93.58,43.0,0.48,0.75,Open,90,HEAT/HOT WATER,0


In [28]:
df4['complaint_type'].value_counts()

HEAT/HOT WATER          32077
UNSANITARY CONDITION     4453
PLUMBING                 2751
PAINT/PLASTER            2062
WATER LEAK               1984
Name: complaint_type, dtype: int64

In [29]:
df4['complaint'].value_counts()

0    32077
3     4453
2     2751
1     2062
4     1984
Name: complaint, dtype: int64

### Encoding Mappings

- HEAT/HOT WATER = 0
- WATER LEAK = 4
- PAINT/PLASTER = 1
- PLUMBING = 2
- UNSANITARY CONDITION = 3


In [30]:
df4 = df4.drop(['complaint_type'],axis=1)

In [31]:
df4.head()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,status,year,complaint
0,2500.0,2288.0,2288.0,1.0,2.0,100.0,52.0,0.92,1.25,Open,60,0
1,1970.0,1485.0,1080.0,1.0,2.0,109.42,32.0,0.75,0.75,Open,65,0
2,2500.0,3933.0,2596.0,1.0,3.0,100.0,60.0,1.57,3.0,Open,82,0
3,1800.0,1701.0,1188.0,1.0,2.0,100.0,33.0,0.95,0.75,Open,63,0
4,3034.0,1462.0,1462.0,1.0,2.5,93.58,43.0,0.48,0.75,Open,90,0


In [32]:
df5 = pd.get_dummies(df4,drop_first=True)
df5.head()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,year,complaint,status_Open
0,2500.0,2288.0,2288.0,1.0,2.0,100.0,52.0,0.92,1.25,60,0,1
1,1970.0,1485.0,1080.0,1.0,2.0,109.42,32.0,0.75,0.75,65,0,1
2,2500.0,3933.0,2596.0,1.0,3.0,100.0,60.0,1.57,3.0,82,0,1
3,1800.0,1701.0,1188.0,1.0,2.0,100.0,33.0,0.95,0.75,63,0,1
4,3034.0,1462.0,1462.0,1.0,2.5,93.58,43.0,0.48,0.75,90,0,1


In [33]:
df5.rename({'status_Open':'status'},axis=1,inplace=True)

In [34]:
df5.head()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,year,complaint,status
0,2500.0,2288.0,2288.0,1.0,2.0,100.0,52.0,0.92,1.25,60,0,1
1,1970.0,1485.0,1080.0,1.0,2.0,109.42,32.0,0.75,0.75,65,0,1
2,2500.0,3933.0,2596.0,1.0,3.0,100.0,60.0,1.57,3.0,82,0,1
3,1800.0,1701.0,1188.0,1.0,2.0,100.0,33.0,0.95,0.75,63,0,1
4,3034.0,1462.0,1462.0,1.0,2.5,93.58,43.0,0.48,0.75,90,0,1


In [35]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43327 entries, 0 to 43326
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   lotarea    43327 non-null  float64
 1   bldgarea   43327 non-null  float64
 2   resarea    43327 non-null  float64
 3   numbldgs   43327 non-null  float64
 4   numfloors  43327 non-null  float64
 5   lotdepth   43327 non-null  float64
 6   bldgdepth  43327 non-null  float64
 7   builtfar   43327 non-null  float64
 8   residfar   43327 non-null  float64
 9   year       43327 non-null  int64  
 10  complaint  43327 non-null  int32  
 11  status     43327 non-null  uint8  
dtypes: float64(9), int32(1), int64(1), uint8(1)
memory usage: 3.5 MB


In [36]:
#Rearrange columns
df6 = df5[['lotarea','bldgarea','resarea','numbldgs','numfloors','lotdepth',
          'bldgdepth','builtfar','residfar','year','status','complaint']]

In [37]:
df6.head()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,year,status,complaint
0,2500.0,2288.0,2288.0,1.0,2.0,100.0,52.0,0.92,1.25,60,1,0
1,1970.0,1485.0,1080.0,1.0,2.0,109.42,32.0,0.75,0.75,65,1,0
2,2500.0,3933.0,2596.0,1.0,3.0,100.0,60.0,1.57,3.0,82,1,0
3,1800.0,1701.0,1188.0,1.0,2.0,100.0,33.0,0.95,0.75,63,1,0
4,3034.0,1462.0,1462.0,1.0,2.5,93.58,43.0,0.48,0.75,90,1,0


In [38]:
df6.shape

(43327, 12)

In [39]:
#pp.ProfileReport(df2)

In [40]:
#Remove duplicated values
df6.duplicated().sum()

16

In [41]:
df7 = df6.drop_duplicates()

In [42]:
df7.shape

(43311, 12)

In [43]:
#Save as csv
#df7.to_csv("train3.csv",index=False)

In [44]:
minmax = MinMaxScaler()

In [45]:
df7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43311 entries, 0 to 43326
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   lotarea    43311 non-null  float64
 1   bldgarea   43311 non-null  float64
 2   resarea    43311 non-null  float64
 3   numbldgs   43311 non-null  float64
 4   numfloors  43311 non-null  float64
 5   lotdepth   43311 non-null  float64
 6   bldgdepth  43311 non-null  float64
 7   builtfar   43311 non-null  float64
 8   residfar   43311 non-null  float64
 9   year       43311 non-null  int64  
 10  status     43311 non-null  uint8  
 11  complaint  43311 non-null  int32  
dtypes: float64(9), int32(1), int64(1), uint8(1)
memory usage: 3.8 MB


In [46]:
numeric = df7[['lotarea','bldgarea','resarea','numbldgs','numfloors','lotdepth',
          'bldgdepth','builtfar','residfar','year']]

In [47]:
numeric.head()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,year
0,2500.0,2288.0,2288.0,1.0,2.0,100.0,52.0,0.92,1.25,60
1,1970.0,1485.0,1080.0,1.0,2.0,109.42,32.0,0.75,0.75,65
2,2500.0,3933.0,2596.0,1.0,3.0,100.0,60.0,1.57,3.0,82
3,1800.0,1701.0,1188.0,1.0,2.0,100.0,33.0,0.95,0.75,63
4,3034.0,1462.0,1462.0,1.0,2.5,93.58,43.0,0.48,0.75,90


In [48]:
scalednum = minmax.fit_transform(numeric)

In [49]:
scalednum

array([[3.19192478e-05, 1.68314808e-04, 1.73183792e-04, ...,
        2.75531596e-02, 1.25000000e-01, 2.29007634e-01],
       [2.47811882e-05, 1.09009502e-04, 8.17475940e-05, ...,
        2.24618149e-02, 7.50000000e-02, 2.48091603e-01],
       [3.19192478e-05, 2.89805750e-04, 1.96496994e-04, ...,
        4.70200659e-02, 3.00000000e-01, 3.12977099e-01],
       ...,
       [1.83569345e-05, 1.20013849e-04, 1.23681082e-04, ...,
        3.26445043e-02, 3.44000000e-01, 3.81679389e-01],
       [2.90505559e-05, 2.71859064e-04, 2.79304280e-04, ...,
        4.82180294e-02, 1.10000000e-01, 4.19847328e-02],
       [2.05926286e-05, 1.36409587e-04, 7.80386754e-05, ...,
        3.35429769e-02, 7.50000000e-02, 2.67175573e-01]])

In [50]:
scalednumX = pd.DataFrame(data=scalednum, columns=numeric.columns)

In [51]:
scalednumX.head()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,year
0,3.2e-05,0.000168,0.000173,0.003984,0.032258,0.025,0.04,0.027553,0.125,0.229008
1,2.5e-05,0.000109,8.2e-05,0.003984,0.032258,0.027355,0.024615,0.022462,0.075,0.248092
2,3.2e-05,0.00029,0.000196,0.003984,0.048387,0.025,0.046154,0.04702,0.3,0.312977
3,2.2e-05,0.000125,9e-05,0.003984,0.032258,0.025,0.025385,0.028452,0.075,0.240458
4,3.9e-05,0.000107,0.000111,0.003984,0.040323,0.023395,0.033077,0.014376,0.075,0.343511


In [52]:
scalednumX.describe()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,year
count,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0
mean,0.000173,0.000817,0.000575,0.005181,0.041201,0.026609,0.041995,0.037886,0.168872,0.296691
std,0.005628,0.006378,0.005766,0.009707,0.02598,0.015793,0.025599,0.035648,0.139597,0.11642
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3e-05,0.000131,9.5e-05,0.003984,0.032258,0.02375,0.027692,0.018269,0.075,0.229008
50%,3.6e-05,0.000184,0.000145,0.003984,0.032258,0.025,0.036615,0.027254,0.125,0.339695
75%,6.6e-05,0.000282,0.000215,0.003984,0.048387,0.025939,0.046154,0.040731,0.243,0.381679
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [53]:
category = df7[['status','complaint']]

In [54]:
category.head()

Unnamed: 0,status,complaint
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [55]:
df8 = pd.concat([scalednumX,category],axis=1)

In [56]:
df8.head()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,year,status,complaint
0,3.2e-05,0.000168,0.000173,0.003984,0.032258,0.025,0.04,0.027553,0.125,0.229008,1.0,0.0
1,2.5e-05,0.000109,8.2e-05,0.003984,0.032258,0.027355,0.024615,0.022462,0.075,0.248092,1.0,0.0
2,3.2e-05,0.00029,0.000196,0.003984,0.048387,0.025,0.046154,0.04702,0.3,0.312977,1.0,0.0
3,2.2e-05,0.000125,9e-05,0.003984,0.032258,0.025,0.025385,0.028452,0.075,0.240458,1.0,0.0
4,3.9e-05,0.000107,0.000111,0.003984,0.040323,0.023395,0.033077,0.014376,0.075,0.343511,1.0,0.0


In [57]:
df8.describe()

Unnamed: 0,lotarea,bldgarea,resarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,year,status,complaint
count,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0,43311.0
mean,0.000173,0.000817,0.000575,0.005181,0.041201,0.026609,0.041995,0.037886,0.168872,0.296691,0.047655,0.66632
std,0.005628,0.006378,0.005766,0.009707,0.02598,0.015793,0.025599,0.035648,0.139597,0.11642,0.213038,1.231257
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3e-05,0.000131,9.5e-05,0.003984,0.032258,0.02375,0.027692,0.018269,0.075,0.229008,0.0,0.0
50%,3.6e-05,0.000184,0.000145,0.003984,0.032258,0.025,0.036615,0.027254,0.125,0.339695,0.0,0.0
75%,6.6e-05,0.000282,0.000215,0.003984,0.048387,0.025939,0.046154,0.040731,0.243,0.381679,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0


In [58]:
df8.shape

(43327, 12)

In [59]:
#Save as csv
#df8.to_csv("train3processed.csv",index=False)

In [60]:
#pp.ProfileReport(df4)

In [61]:
df8['complaint'].value_counts()

0.0    32061
3.0     4453
2.0     2751
1.0     2062
4.0     1984
Name: complaint, dtype: int64