In [1]:
import pandas as pd
import numpy as np

<br>
<br>
<br>

### Data Collection

In [2]:
# read the prepared training set; the path is relative to the notebook's working directory
prepared_csv = "../pipeline_data/3_train_prepared.csv"
train_data = pd.read_csv(prepared_csv)
train_data.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,N1,N2,...,N17,N18,N19,N21,N22,N23,N24,N33,N35,Dependent_Variable
0,1,1,999,1,1,False,4,True,35.0,4.442651,...,8.754792,0.95,5.370638,0.75,0.0,62.0,3083.333333,40.0,16.0,0
1,2,3,999,999,1,False,999,True,20.39,3.806662,...,0.0,0.0,6.908755,1.0,0.0,32.0,1666.666667,200.0,5.0,1
2,1,0,0,999,0,False,0,True,28.0,3.970292,...,8.704834,0.56,8.117462,0.92,1.0,26.0,1500.0,42.0,15.0,1
3,1,1,999,12,2,False,2,True,18.0,4.905275,...,8.679312,0.15,10.401896,0.83,2.0,30.0,4583.333333,120.0,19.0,0
4,1,7,999,1,4,False,2,False,31.77,4.882802,...,7.932003,0.56,7.657755,0.83,0.0,40.0,4166.666667,80.0,13.0,1


In [3]:
# per-column flag: does the column contain at least one missing value?
columns_with_nulls = train_data.isna().any()
# collapse to a single answer for the whole frame
columns_with_nulls.any()

False

<br>
<br>
<br>

### Data Preprocessing

In [4]:
from sklearn.preprocessing import RobustScaler

#### Boolean data

In [5]:
# the two True/False columns are handled separately from the other categoricals
bool_columns = ['C6', 'C8']
bool_data = train_data.loc[:, bool_columns].copy()
bool_data.head()

Unnamed: 0,C6,C8
0,False,True
1,False,True
2,False,True
3,False,True
4,False,False


In [6]:
# encode the boolean flags as integers:
# True -> 1
# False -> 0
# astype replaces the element-wise DataFrame.applymap call, which is deprecated
# since pandas 2.1; int64 matches the dtype the applymap lambda produced
bool_data_encoded = bool_data.astype('int64')
bool_data_encoded.head()

Unnamed: 0,C6,C8
0,0,1
1,0,1
2,0,1
3,0,1
4,0,0


In [7]:
# (rows, columns) of the encoded boolean frame
bool_data_encoded.shape

(28050, 2)

<br>
<br>

#### Categorical data

In [8]:
# remaining C* columns hold category codes rather than True/False flags
cat_columns = ['C1', 'C2', 'C3', 'C4', 'C5', 'C7']
cat_data = train_data.loc[:, cat_columns].copy()
cat_data.head()

Unnamed: 0,C1,C2,C3,C4,C5,C7
0,1,1,999,1,1,4
1,2,3,999,999,1,999
2,1,0,0,999,0,0
3,1,1,999,12,2,2
4,1,7,999,1,4,2


In [9]:
# cast the category codes to pandas' string dtype so they are treated as labels
cat_data = cat_data.astype(pd.StringDtype())
cat_data.head()

Unnamed: 0,C1,C2,C3,C4,C5,C7
0,1,1,999,1,1,4
1,2,3,999,999,1,999
2,1,0,0,999,0,0
3,1,1,999,12,2,2
4,1,7,999,1,4,2


<br>

Here, we use `pd.get_dummies` instead of `OneHotEncoder` (OHE) for two reasons:
- OHE returns a sparse matrix by default
- That matrix does not carry column names

We need column names because, when we later perform Recursive Feature Elimination,<br>
they let us see at a glance which columns have low feature importance.<br>

This makes removing those features easy and also improves interpretability.

In [10]:
# one-hot encode every categorical column; new columns are named <column>_<value>
# dtype is pinned because pandas >= 2.0 defaults to bool dummies, which would be
# written to the CSV as True/False instead of 0/1 indicators (uint8 was the
# default before pandas 2.0, so older versions behave exactly as before)
cat_data_encoded = pd.get_dummies(cat_data, dtype=np.uint8)
cat_data_encoded.head()

Unnamed: 0,C1_1,C1_2,C1_3,C2_0,C2_1,C2_2,C2_3,C2_4,C2_6,C2_7,...,C5_1,C5_2,C5_4,C5_999,C7_0,C7_1,C7_2,C7_4,C7_6,C7_999
0,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,1,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0


In [11]:
# (rows, columns) after one-hot encoding the categorical columns
cat_data_encoded.shape

(28050, 36)

<br>
<br>

#### Numerical data

Because the data is skewed, using a robust scaler provides<br>
better accuracy than a standard or min-max scaler.

In [12]:
# RobustScaler built with its default settings (no arguments passed)
scaler = RobustScaler()

In [13]:
# numerical features: everything after the first eight (C1..C8) columns,
# excluding the final target column
numeric_columns = train_data.columns[8:-1]
num_data = train_data[numeric_columns].copy()
num_data.head()

Unnamed: 0,N1,N2,N3,N4,N7,N8,N9,N10,N10.1,N11,...,N16,N17,N18,N19,N21,N22,N23,N24,N33,N35
0,35.0,4.442651,3.0,11.0,44.0,9.0,2.772589,0.0,0.0,4.0,...,0.0,8.754792,0.95,5.370638,0.75,0.0,62.0,3083.333333,40.0,16.0
1,20.39,3.806662,3.6,1.0,4.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,6.908755,1.0,0.0,32.0,1666.666667,200.0,5.0
2,28.0,3.970292,2.5,9.0,25.0,1.0,0.0,9.0,2.0,8.0,...,0.0,8.704834,0.56,8.117462,0.92,1.0,26.0,1500.0,42.0,15.0
3,18.0,4.905275,3.3,11.0,21.0,8.0,2.772589,0.0,6.0,22.0,...,0.0,8.679312,0.15,10.401896,0.83,2.0,30.0,4583.333333,120.0,19.0
4,31.77,4.882802,3.4,5.0,6.0,2.0,2.772589,0.0,4.0,5.0,...,0.0,7.932003,0.56,7.657755,0.83,0.0,40.0,4166.666667,80.0,13.0


In [14]:
# fit the scaler on the numerical columns and scale them in the same call
scaled_values = scaler.fit_transform(num_data)
# the scaler hands back a bare ndarray; rebuild a frame with the original labels
num_data_encoded = pd.DataFrame(
    scaled_values,
    index=num_data.index,
    columns=num_data.columns,
)
num_data_encoded.head()

Unnamed: 0,N1,N2,N3,N4,N7,N8,N9,N10,N10.1,N11,...,N16,N17,N18,N19,N21,N22,N23,N24,N33,N35
0,1.18066,0.353292,-1.0,0.333333,1.166667,0.8,0.0,0.0,-0.333333,-0.142857,...,0.0,0.026969,0.735849,-1.075934,-0.85,-1.0,0.6875,-0.317073,-0.471698,-0.333333
1,0.085457,-0.122383,0.5,-1.333333,-1.055556,-0.8,-2.772589,0.0,-0.333333,-0.571429,...,0.0,-4.699231,-1.056604,-0.473452,0.4,-1.0,-0.25,-0.731707,1.037736,-1.555556
2,0.655922,0.0,-2.25,0.0,0.111111,-0.8,-2.772589,9.0,0.333333,0.428571,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.4375,-0.780488,-0.45283,-0.444444
3,-0.093703,0.699302,-0.25,0.333333,-0.111111,0.6,0.0,0.0,1.666667,2.428571,...,0.0,-0.013778,-0.773585,0.894816,-0.45,1.0,-0.3125,0.121951,0.283019,0.0
4,0.938531,0.682494,0.0,-0.666667,-0.944444,-0.6,0.0,0.0,1.0,0.0,...,0.0,-0.417206,0.0,-0.180067,-0.45,-1.0,0.0,0.0,-0.09434,-0.666667


In [15]:
# (rows, columns) of the scaled numerical frame
num_data_encoded.shape

(28050, 23)

<br>
<br>

In [16]:
# stitch the encoded feature groups and the target back together column-wise;
# all pieces share train_data's row index, so rows line up
feature_blocks = [
    bool_data_encoded,
    cat_data_encoded,
    num_data_encoded,
    train_data[['Dependent_Variable']],
]
total_data = pd.concat(feature_blocks, axis='columns')
total_data.head()

Unnamed: 0,C6,C8,C1_1,C1_2,C1_3,C2_0,C2_1,C2_2,C2_3,C2_4,...,N17,N18,N19,N21,N22,N23,N24,N33,N35,Dependent_Variable
0,0,1,1,0,0,0,1,0,0,0,...,0.026969,0.735849,-1.075934,-0.85,-1.0,0.6875,-0.317073,-0.471698,-0.333333,0
1,0,1,0,1,0,0,0,0,1,0,...,-4.699231,-1.056604,-0.473452,0.4,-1.0,-0.25,-0.731707,1.037736,-1.555556,1
2,0,1,1,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,-0.4375,-0.780488,-0.45283,-0.444444,1
3,0,1,1,0,0,0,1,0,0,0,...,-0.013778,-0.773585,0.894816,-0.45,1.0,-0.3125,0.121951,0.283019,0.0,0
4,0,0,1,0,0,0,0,0,0,0,...,-0.417206,0.0,-0.180067,-0.45,-1.0,0.0,0.0,-0.09434,-0.666667,1


In [17]:
# list every column of the combined frame to confirm the final feature layout
total_data.columns

Index(['C6', 'C8', 'C1_1', 'C1_2', 'C1_3', 'C2_0', 'C2_1', 'C2_2', 'C2_3',
       'C2_4', 'C2_6', 'C2_7', 'C2_999', 'C3_0', 'C3_1', 'C3_19', 'C3_2',
       'C3_3', 'C3_5', 'C3_7', 'C3_999', 'C4_0', 'C4_1', 'C4_12', 'C4_31',
       'C4_41', 'C4_999', 'C5_0', 'C5_1', 'C5_2', 'C5_4', 'C5_999', 'C7_0',
       'C7_1', 'C7_2', 'C7_4', 'C7_6', 'C7_999', 'N1', 'N2', 'N3', 'N4', 'N7',
       'N8', 'N9', 'N10', 'N10.1', 'N11', 'N12', 'N14', 'N15', 'N16', 'N17',
       'N18', 'N19', 'N21', 'N22', 'N23', 'N24', 'N33', 'N35',
       'Dependent_Variable'],
      dtype='object')

In [18]:
# sanity check: the concatenation should not have introduced missing values
# (misaligned indexes would show up as NaN)
columns_with_nulls_after = total_data.isna().any()
columns_with_nulls_after.any()

False

In [19]:
# saving preprocessed data
# index=False is the documented way to omit the row index (the parameter is a
# bool); the earlier index=None relied on None being treated as falsy
total_data.to_csv("../pipeline_data/5_train_preprocessed.csv", index=False)