In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(font_scale=1.25)
np.random.seed(5)

pd.set_option("display.max_rows",10000)
pd.set_option("display.max_columns",100)

import pandas_profiling as pp

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, plot_roc_curve


In [2]:
df = pd.read_csv("final1.csv")

In [3]:
df.head()

Unnamed: 0,complaint_type,incident_zip,street_name,status,lot,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,yearbuilt,builtfar,residfar,commfar,facilfar
0,1,10461.0,NEILL AVENUE,Closed,72.0,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,1950.0,0.97,0.75,0.0,2.0
1,1,10463.0,BAILEY AVENUE,Closed,33.0,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1938.0,1.57,3.0,0.0,3.0
2,1,10455.0,TINTON AVENUE,Open,3.0,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,1930.0,0.48,0.75,0.0,2.0
3,0,10458.0,EAST 188 STREET,Open,14.0,2500.0,2133.0,0.0,1512.0,0.0,0.0,1.0,2.0,100.0,37.0,1953.0,0.85,0.75,0.0,2.0
4,1,10460.0,MAPES AVENUE,Open,92.0,2025.0,2764.0,0.0,2764.0,0.0,0.0,1.0,3.0,100.0,52.0,1974.0,1.36,0.5,0.0,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13354 entries, 0 to 13353
Data columns (total 20 columns):
complaint_type    13354 non-null int64
incident_zip      13354 non-null float64
street_name       13354 non-null object
status            13354 non-null object
lot               13354 non-null float64
lotarea           13354 non-null float64
bldgarea          13354 non-null float64
comarea           13354 non-null float64
resarea           13354 non-null float64
officearea        13354 non-null float64
retailarea        13354 non-null float64
numbldgs          13354 non-null float64
numfloors         13354 non-null float64
lotdepth          13354 non-null float64
bldgdepth         13354 non-null float64
yearbuilt         13354 non-null float64
builtfar          13354 non-null float64
residfar          13354 non-null float64
commfar           13354 non-null float64
facilfar          13354 non-null float64
dtypes: float64(17), int64(1), object(2)
memory usage: 2.0+ MB


In [5]:
df.drop(['incident_zip','street_name','lot'],axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,complaint_type,status,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,yearbuilt,builtfar,residfar,commfar,facilfar
0,1,Closed,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,1950.0,0.97,0.75,0.0,2.0
1,1,Closed,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1938.0,1.57,3.0,0.0,3.0
2,1,Open,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,1930.0,0.48,0.75,0.0,2.0
3,0,Open,2500.0,2133.0,0.0,1512.0,0.0,0.0,1.0,2.0,100.0,37.0,1953.0,0.85,0.75,0.0,2.0
4,1,Open,2025.0,2764.0,0.0,2764.0,0.0,0.0,1.0,3.0,100.0,52.0,1974.0,1.36,0.5,0.0,1.0


In [7]:
#Create years column
df['years'] = 2020 - df['yearbuilt'].astype('int')

In [8]:
df.head()

Unnamed: 0,complaint_type,status,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,yearbuilt,builtfar,residfar,commfar,facilfar,years
0,1,Closed,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,1950.0,0.97,0.75,0.0,2.0,70
1,1,Closed,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1938.0,1.57,3.0,0.0,3.0,82
2,1,Open,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,1930.0,0.48,0.75,0.0,2.0,90
3,0,Open,2500.0,2133.0,0.0,1512.0,0.0,0.0,1.0,2.0,100.0,37.0,1953.0,0.85,0.75,0.0,2.0,67
4,1,Open,2025.0,2764.0,0.0,2764.0,0.0,0.0,1.0,3.0,100.0,52.0,1974.0,1.36,0.5,0.0,1.0,46


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13354 entries, 0 to 13353
Data columns (total 18 columns):
complaint_type    13354 non-null int64
status            13354 non-null object
lotarea           13354 non-null float64
bldgarea          13354 non-null float64
comarea           13354 non-null float64
resarea           13354 non-null float64
officearea        13354 non-null float64
retailarea        13354 non-null float64
numbldgs          13354 non-null float64
numfloors         13354 non-null float64
lotdepth          13354 non-null float64
bldgdepth         13354 non-null float64
yearbuilt         13354 non-null float64
builtfar          13354 non-null float64
residfar          13354 non-null float64
commfar           13354 non-null float64
facilfar          13354 non-null float64
years             13354 non-null int32
dtypes: float64(15), int32(1), int64(1), object(1)
memory usage: 1.8+ MB


In [10]:
df['years'].value_counts()

100     1234
95       958
90       908
70       760
60       715
80       591
110      567
119      563
65       557
89       514
85       387
55       385
75       265
105      251
50       248
121      224
92       168
93       166
14       136
94       133
15       128
18       113
28       111
16       104
19       100
17        99
13        96
96        92
115       91
20        91
27        88
12        84
91        82
29        82
21        75
23        73
32        71
25        70
33        70
45        68
31        60
30        54
22        47
61        43
26        43
97        40
24        38
35        38
59        38
107       37
88        35
120       33
64        32
11        32
56        32
98        32
67        31
6         31
63        31
104       29
72        29
109       28
82        28
108       28
83        27
9         27
57        27
58        26
53        25
114       25
66        24
34        24
62        24
69        23
113       23
7         23
10        23

In [11]:
df.drop(['yearbuilt'],axis=1,inplace=True)

In [12]:
df.head()

Unnamed: 0,complaint_type,status,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years
0,1,Closed,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,0.97,0.75,0.0,2.0,70
1,1,Closed,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1.57,3.0,0.0,3.0,82
2,1,Open,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,0.48,0.75,0.0,2.0,90
3,0,Open,2500.0,2133.0,0.0,1512.0,0.0,0.0,1.0,2.0,100.0,37.0,0.85,0.75,0.0,2.0,67
4,1,Open,2025.0,2764.0,0.0,2764.0,0.0,0.0,1.0,3.0,100.0,52.0,1.36,0.5,0.0,1.0,46


In [14]:
df = pd.get_dummies(df,drop_first=True)
df.head()

Unnamed: 0,complaint_type,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years,status_Open
0,1,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,0.97,0.75,0.0,2.0,70,0
1,1,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1.57,3.0,0.0,3.0,82,0
2,1,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,0.48,0.75,0.0,2.0,90,1
3,0,2500.0,2133.0,0.0,1512.0,0.0,0.0,1.0,2.0,100.0,37.0,0.85,0.75,0.0,2.0,67,1
4,1,2025.0,2764.0,0.0,2764.0,0.0,0.0,1.0,3.0,100.0,52.0,1.36,0.5,0.0,1.0,46,1


In [15]:
df['complaint_type'].value_counts()

1    6813
0    6541
Name: complaint_type, dtype: int64

In [16]:
df.rename({'status_Open':'status'},axis=1,inplace=True)

In [18]:
df.head()

Unnamed: 0,complaint_type,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years,status
0,1,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,0.97,0.75,0.0,2.0,70,0
1,1,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1.57,3.0,0.0,3.0,82,0
2,1,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,0.48,0.75,0.0,2.0,90,1
3,0,2500.0,2133.0,0.0,1512.0,0.0,0.0,1.0,2.0,100.0,37.0,0.85,0.75,0.0,2.0,67,1
4,1,2025.0,2764.0,0.0,2764.0,0.0,0.0,1.0,3.0,100.0,52.0,1.36,0.5,0.0,1.0,46,1


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13354 entries, 0 to 13353
Data columns (total 17 columns):
complaint_type    13354 non-null int64
lotarea           13354 non-null float64
bldgarea          13354 non-null float64
comarea           13354 non-null float64
resarea           13354 non-null float64
officearea        13354 non-null float64
retailarea        13354 non-null float64
numbldgs          13354 non-null float64
numfloors         13354 non-null float64
lotdepth          13354 non-null float64
bldgdepth         13354 non-null float64
builtfar          13354 non-null float64
residfar          13354 non-null float64
commfar           13354 non-null float64
facilfar          13354 non-null float64
years             13354 non-null int32
status            13354 non-null uint8
dtypes: float64(14), int32(1), int64(1), uint8(1)
memory usage: 1.6 MB


In [20]:
#Rearrange columns
df2 = df[['lotarea','bldgarea','comarea','resarea','officearea','retailarea','numbldgs','numfloors','lotdepth',
          'bldgdepth','builtfar','residfar','commfar','facilfar','years','status','complaint_type']]

In [21]:
df2.head()

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years,status,complaint_type
0,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,0.97,0.75,0.0,2.0,70,0,1
1,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1.57,3.0,0.0,3.0,82,0,1
2,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,0.48,0.75,0.0,2.0,90,1,1
3,2500.0,2133.0,0.0,1512.0,0.0,0.0,1.0,2.0,100.0,37.0,0.85,0.75,0.0,2.0,67,1,0
4,2025.0,2764.0,0.0,2764.0,0.0,0.0,1.0,3.0,100.0,52.0,1.36,0.5,0.0,1.0,46,1,1


In [22]:
df2.shape

(13354, 17)

In [23]:
#pp.ProfileReport(df2)

In [24]:
#Remove duplicated values
df2.duplicated().sum()

591

In [25]:
df3 = df2.drop_duplicates()

In [26]:
df3.duplicated().sum()

0

In [27]:
#Save as csv
#df3.to_csv("train1.csv",index=False)

In [28]:
df3 = pd.read_csv("train1.csv")

In [29]:
df3.head()

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years,status,complaint_type
0,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.5,32.0,0.97,0.75,0.0,2.0,70,0,1
1,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.0,60.0,1.57,3.0,0.0,3.0,82,0,1
2,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.0,0.48,0.75,0.0,2.0,90,1,1
3,2500.0,2133.0,0.0,1512.0,0.0,0.0,1.0,2.0,100.0,37.0,0.85,0.75,0.0,2.0,67,1,0
4,2025.0,2764.0,0.0,2764.0,0.0,0.0,1.0,3.0,100.0,52.0,1.36,0.5,0.0,1.0,46,1,1


In [30]:
df3.describe()

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years,status,complaint_type
count,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0
mean,10688.83,9013.473,2445.998,6372.704,543.001254,441.132179,1.249471,2.481425,104.598569,52.696136,1.212561,1.669253,0.114534,2.908337,76.643266,0.110711,0.509128
std,253294.7,41648.7,22787.7,33503.72,8949.576521,6936.097488,0.930414,1.372565,50.784696,29.438775,1.052447,1.353845,0.54684,1.593568,45.930095,0.313786,0.499936
min,368.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,2222.0,1760.0,0.0,1280.0,0.0,0.0,1.0,2.0,95.0,36.0,0.63,0.75,0.0,2.0,60.0,0.0,0.0
50%,2538.0,2392.0,0.0,1920.0,0.0,0.0,1.0,2.0,100.0,46.0,0.92,1.25,0.0,2.0,85.0,0.0,1.0
75%,4188.0,3440.0,0.0,2760.0,0.0,0.0,1.0,3.0,102.995,58.0,1.32,2.43,0.0,4.8,100.0,0.0,1.0
max,22251600.0,2199075.0,1185895.0,1965090.0,480769.0,464489.0,55.0,25.0,2276.0,992.0,16.18,9.0,9.0,9.0,2020.0,1.0,1.0


In [31]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12763 entries, 0 to 12762
Data columns (total 17 columns):
lotarea           12763 non-null float64
bldgarea          12763 non-null float64
comarea           12763 non-null float64
resarea           12763 non-null float64
officearea        12763 non-null float64
retailarea        12763 non-null float64
numbldgs          12763 non-null float64
numfloors         12763 non-null float64
lotdepth          12763 non-null float64
bldgdepth         12763 non-null float64
builtfar          12763 non-null float64
residfar          12763 non-null float64
commfar           12763 non-null float64
facilfar          12763 non-null float64
years             12763 non-null int64
status            12763 non-null int64
complaint_type    12763 non-null int64
dtypes: float64(14), int64(3)
memory usage: 1.7 MB


In [32]:
scaler = StandardScaler()

In [33]:
numeric = df3[['lotarea','bldgarea','comarea','resarea','officearea','retailarea','numbldgs','numfloors','lotdepth',
          'bldgdepth','builtfar','residfar','commfar','facilfar','years']]

In [34]:
numeric

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years
0,1629.0,1584.0,0.0,1152.0,0.0,0.0,1.0,2.0,90.50,32.00,0.97,0.75,0.0,2.0,70
1,2500.0,3933.0,1337.0,2596.0,0.0,1337.0,1.0,3.0,100.00,60.00,1.57,3.00,0.0,3.0,82
2,3034.0,1462.0,0.0,1462.0,0.0,0.0,1.0,2.5,93.58,43.00,0.48,0.75,0.0,2.0,90
3,2500.0,2133.0,0.0,1512.0,0.0,0.0,1.0,2.0,100.00,37.00,0.85,0.75,0.0,2.0,67
4,2025.0,2764.0,0.0,2764.0,0.0,0.0,1.0,3.0,100.00,52.00,1.36,0.50,0.0,1.0,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12758,2500.0,3060.0,0.0,3060.0,0.0,0.0,1.0,3.0,100.00,60.00,1.22,0.75,0.0,2.0,14
12759,1638.0,1690.0,0.0,1690.0,0.0,0.0,1.0,3.0,91.00,35.00,1.03,1.25,0.0,2.0,70
12760,6730.0,28279.0,0.0,28279.0,0.0,0.0,1.0,6.0,67.25,55.00,4.20,3.44,0.0,4.8,97
12761,1741.0,2340.0,0.0,2340.0,0.0,0.0,1.0,3.0,92.72,40.00,1.34,3.44,0.0,4.8,28


In [35]:
scalednum = scaler.fit_transform(numeric)

In [36]:
scalednum

array([[-0.03576934, -0.17839125, -0.10734276, ..., -0.20945552,
        -0.57002413, -0.14464429],
       [-0.03233052, -0.12198872, -0.04866845, ..., -0.20945552,
         0.05752312,  0.11663255],
       [-0.03022222, -0.18132063, -0.10734276, ..., -0.20945552,
        -0.57002413,  0.29081711],
       ...,
       [-0.01562996,  0.46259021, -0.10734276, ..., -0.20945552,
         1.18710816,  0.4432286 ],
       [-0.03532715, -0.16023871, -0.10734276, ..., -0.20945552,
         1.18710816, -1.05911323],
       [-0.03523634, -0.16801837, -0.10734276, ..., -0.20945552,
        -0.57002413, -0.36237499]])

In [37]:
scalednumX = pd.DataFrame(data=scalednum, columns=numeric.columns)

In [38]:
scalednumX

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years
0,-0.035769,-0.178391,-0.107343,-0.155831,-0.060676,-0.063602,-0.26814,-0.350762,-0.277625,-0.703051,-0.230482,-0.679021,-0.209456,-0.570024,-0.144644
1,-0.032331,-0.121989,-0.048668,-0.112729,-0.060676,0.129165,-0.26814,0.377829,-0.090554,0.248113,0.339640,0.982978,-0.209456,0.057523,0.116633
2,-0.030222,-0.181321,-0.107343,-0.146578,-0.060676,-0.063602,-0.26814,0.013533,-0.216975,-0.329379,-0.696082,-0.679021,-0.209456,-0.570024,0.290817
3,-0.032331,-0.165209,-0.107343,-0.145085,-0.060676,-0.063602,-0.26814,-0.350762,-0.090554,-0.533200,-0.344506,-0.679021,-0.209456,-0.570024,-0.209963
4,-0.034206,-0.150058,-0.107343,-0.107715,-0.060676,-0.063602,-0.26814,0.377829,-0.090554,-0.023648,0.140098,-0.863687,-0.209456,-1.197571,-0.667198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12758,-0.032331,-0.142951,-0.107343,-0.098880,-0.060676,-0.063602,-0.26814,0.377829,-0.090554,0.248113,0.007069,-0.679021,-0.209456,-0.570024,-1.363936
12759,-0.035734,-0.175846,-0.107343,-0.139772,-0.060676,-0.063602,-0.26814,0.377829,-0.267780,-0.601140,-0.173470,-0.309688,-0.209456,-0.570024,-0.144644
12760,-0.015630,0.462590,-0.107343,0.653872,-0.060676,-0.063602,-0.26814,2.563603,-0.735458,0.078263,2.838676,1.307991,-0.209456,1.187108,0.443229
12761,-0.035327,-0.160239,-0.107343,-0.120371,-0.060676,-0.063602,-0.26814,0.377829,-0.233910,-0.431289,0.121093,1.307991,-0.209456,1.187108,-1.059113


In [39]:
category = df3[['status','complaint_type']]

In [40]:
category

Unnamed: 0,status,complaint_type
0,0,1
1,0,1
2,1,1
3,1,0
4,1,1
...,...,...
12758,0,1
12759,0,1
12760,0,1
12761,0,1


In [41]:
df4 = pd.concat([scalednumX,category],axis=1)

In [43]:
df4.head()

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years,status,complaint_type
0,-0.035769,-0.178391,-0.107343,-0.155831,-0.060676,-0.063602,-0.26814,-0.350762,-0.277625,-0.703051,-0.230482,-0.679021,-0.209456,-0.570024,-0.144644,0,1
1,-0.032331,-0.121989,-0.048668,-0.112729,-0.060676,0.129165,-0.26814,0.377829,-0.090554,0.248113,0.33964,0.982978,-0.209456,0.057523,0.116633,0,1
2,-0.030222,-0.181321,-0.107343,-0.146578,-0.060676,-0.063602,-0.26814,0.013533,-0.216975,-0.329379,-0.696082,-0.679021,-0.209456,-0.570024,0.290817,1,1
3,-0.032331,-0.165209,-0.107343,-0.145085,-0.060676,-0.063602,-0.26814,-0.350762,-0.090554,-0.5332,-0.344506,-0.679021,-0.209456,-0.570024,-0.209963,1,0
4,-0.034206,-0.150058,-0.107343,-0.107715,-0.060676,-0.063602,-0.26814,0.377829,-0.090554,-0.023648,0.140098,-0.863687,-0.209456,-1.197571,-0.667198,1,1


In [44]:
df4.describe()

Unnamed: 0,lotarea,bldgarea,comarea,resarea,officearea,retailarea,numbldgs,numfloors,lotdepth,bldgdepth,builtfar,residfar,commfar,facilfar,years,status,complaint_type
count,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0,12763.0
mean,-4.4531140000000004e-17,5.3171180000000005e-18,3.11405e-15,2.767076e-16,1.17832e-15,2.752082e-15,-2.928351e-16,2.845661e-15,1.108786e-15,-9.684419e-16,2.557751e-16,-4.461604e-15,1.188834e-15,4.343118e-16,-8.710071000000001e-17,0.110711,0.509128
std,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,0.313786,0.499936
min,-0.04074792,-0.2159449,-0.1073428,-0.1902163,-0.06067578,-0.06360197,-1.342972,-1.807945,-2.059728,-1.790095,-1.15218,-1.23302,-0.2094555,-1.825119,-1.646986,0.0,0.0
25%,-0.0334281,-0.1741653,-0.1073428,-0.1520101,-0.06067578,-0.06360197,-0.2681397,-0.3507622,-0.1890126,-0.56717,-0.5535512,-0.6790206,-0.2094555,-0.5700241,-0.362375,0.0,0.0
50%,-0.03218049,-0.1589901,-0.1073428,-0.132907,-0.06067578,-0.06360197,-0.2681397,-0.3507622,-0.09055384,-0.2274687,-0.2779921,-0.3096876,-0.2094555,-0.5700241,0.1819518,0.0,1.0
75%,-0.02566609,-0.1338263,-0.1073428,-0.1078342,-0.06067578,-0.06360197,-0.2681397,0.3778291,-0.03157707,0.180173,0.1020894,0.5619383,-0.2094555,1.187108,0.5085478,0.0,1.0
max,87.80989,52.58621,51.93572,58.46498,53.66118,66.90593,57.77281,16.40684,42.75868,31.90828,14.22212,5.414974,16.24938,3.822807,42.31284,1.0,1.0


In [45]:
#Save as csv
#df4.to_csv("train1processed.csv",index=False)

In [46]:
#pp.ProfileReport(df4)