# Test Data Processing

Written by - Chandrima Chakrabarty (28-5-2021)  chakrabartychandrima91@gmail.com 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
#!pip install catboost
from catboost import CatBoostClassifier
from sklearn.utils import class_weight

from sklearn import preprocessing

### Data Processing & Feature Engineering

In [2]:
# Load the test-set CSV and report its (rows, columns) shape.
# NOTE: the frame is named `train` even though it holds the test file; every
# later cell in this notebook refers to it by this name.
train = pd.read_csv('test_mSzZ8RL.csv')
print(train.shape)
#train.head()

(105312, 10)


##### Bucket Age into seven age classes

In [3]:
# Bucket Age into seven classes using half-open bins [lower, upper):
#   [0, 20) -> 2, [20, 30) -> 4, [30, 40) -> 6, [40, 50) -> 5,
#   [50, 60) -> 3, [60, 70) -> 1, [70, inf) -> 0
# NOTE(review): the class codes are not in age order; presumably they were chosen
# during training-set analysis and must match the training notebook -- confirm.
age_ = train.Age.values

# A missing or negative age belongs to no bin, so fail with a clear message
# instead of silently producing a column of the wrong length.
if pd.isna(age_).any() or (age_ < 0).any():
    raise ValueError('Age must be non-null and non-negative to assign an age_class')

age_bin_edges = [20, 30, 40, 50, 60, 70]
age_bin_codes = np.array([2, 4, 6, 5, 3, 1, 0], dtype=np.int64)

# np.digitize returns i such that edges[i-1] <= age < edges[i], i.e. the bin
# index 0..6, which is then translated to the class code for that bin.
age_class = age_bin_codes[np.digitize(age_, age_bin_edges)]

train['age_class'] = age_class

In [4]:
# New feature: (Age * 12) multiplied by Vintage.
# NOTE(review): the factor 12 presumably converts Age from years to months -- confirm.
train['new_var1'] = train['Age'].mul(12).mul(train['Vintage'])

**Filling missing values**

In [5]:
# Count the missing values in each column
train.isna().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         12522
Avg_Account_Balance        0
Is_Active                  0
age_class                  0
new_var1                   0
dtype: int64

In [6]:
# Names of the columns that contain at least one missing value
cols_with_missing = train.columns[train.isnull().any()].tolist()
print('Columns with Missing values : ')
cols_with_missing

Columns with Missing values : 


['Credit_Product']

In [7]:
# For every column that has missing values, add a '<column>_was_missing' indicator:
# 1 where the original value was missing, 0 otherwise. This has to run before the
# missing values are filled, otherwise the information is lost.
for col in cols_with_missing:
    # Cast the boolean mask straight to integers instead of mapping each row
    # through a Python lambda.
    train[col + '_was_missing'] = train[col].isnull().astype('int64')

In [8]:
# Display the frame to inspect the newly added *_was_missing indicator column(s)
train

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,age_class,new_var1,Credit_Product_was_missing
0,VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No,4,8700,0
1,CCMEWNKY,Male,43,RG268,Other,X2,49,,925537,No,5,25284,1
2,VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No,6,5208,0
3,TT8RPZVC,Male,29,RG272,Other,X1,33,No,868070,No,4,11484,0
4,SHQZEYTZ,Female,29,RG270,Other,X1,19,No,657087,No,4,6612,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105307,DBENJOYI,Male,52,RG268,Salaried,X2,86,Yes,4242558,Yes,3,53664,0
105308,CWQ72DWS,Male,55,RG277,Other,X2,86,Yes,1159153,No,3,56760,0
105309,HDESC8GU,Male,35,RG254,Salaried,X4,15,No,1703727,No,6,6300,0
105310,2PW4SFCA,Male,53,RG254,Other,X3,93,No,737178,Yes,3,59148,0


In [9]:
# Fill missing Credit_Product values with a new category 'X'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on train['Credit_Product']: that form is a chained
# in-place update, which pandas 2.x warns about and which leaves `train`
# unchanged once Copy-on-Write is enabled.
train['Credit_Product'] = train['Credit_Product'].fillna('X')

In [10]:
# Total number of missing values left in the whole frame (0 means none remain)
train.isna().sum().sum()

0

In [11]:
# Keep an independent copy of the data as it stands before any encoding
train_org = train.copy(deep=True)

In [12]:
# Encode Credit_Product as an integer: 'No' -> 2, every other value -> 1.
# Both 'Yes' and the missing-value placeholder 'X' therefore share code 1; the
# Credit_Product_was_missing indicator is the column that tells them apart.
# NOTE(review): this mapping must match the one used when the model was
# trained -- confirm against the training notebook before changing it.
is_no = train['Credit_Product'].astype(str).eq('No')
train['Credit_Product'] = is_no.map({True: 2, False: 1})

# Keep the IDs in their own single-column frame, then remove them from the features
id_catb = train[['ID']]
train = train.drop(columns=['ID'])

In [13]:
# Label-encode Region_Code: each distinct region string is replaced by an integer
# label (0 .. number_of_regions - 1). Scaling to the 0-1 range happens in a later cell.
# NOTE(review): the encoder is fitted on this file alone, so the integer assigned to
# a region depends on the set of regions present here -- confirm it matches training.
pe = preprocessing.LabelEncoder()
train['Region_Code'] = pe.fit_transform(train['Region_Code'])

In [14]:
# One-hot encode the remaining categorical (string) columns, dropping the first
# level of each. The dtype is pinned to uint8 -- the default in pandas < 2.0 --
# because pandas >= 2.0 returns boolean dummies, and the later cells multiply
# and divide dummy columns, which needs numeric 0/1 values.
train = pd.get_dummies(train, drop_first=True, dtype=np.uint8)

# Natural-log transform of the account balance.
# NOTE: this overwrites the column, so re-running the cell applies the log twice.
train['Avg_Account_Balance'] = np.log(train['Avg_Account_Balance'])

##### Introduce a new interaction variable

In [15]:
# Interaction feature: product of the Gender_Male and Occupation_Salaried dummies
# (non-zero only for rows where both are 1)
train['new_var2'] = train['Gender_Male'].mul(train['Occupation_Salaried'])

In [16]:
# Max-abs scaling: divide each listed column by its own largest absolute value,
# which keeps non-negative columns within the range 0 to 1.
# NOTE(review): the divisor is taken from this file, not from the training data.
columns_to_scale = [
    'Avg_Account_Balance',
    'Vintage',
    'Age',
    'new_var1',
    'Region_Code',
    'Credit_Product',
    'age_class',
    'new_var2',
]

for column in columns_to_scale:
    train[column] = train[column] / train[column].abs().max()

In [17]:
# Display the frame to inspect the encoded and scaled feature columns
train

Unnamed: 0,Age,Region_Code,Vintage,Credit_Product,Avg_Account_Balance,age_class,new_var1,Credit_Product_was_missing,Gender_Male,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Active_Yes,new_var2
0,0.341176,0.117647,0.185185,0.5,0.839136,0.666667,0.066119,0,1,1,0,0,0,0,0,0,0.0
1,0.505882,0.529412,0.362963,0.5,0.852826,0.833333,0.192157,1,1,1,0,0,1,0,0,0,0.0
2,0.364706,0.588235,0.103704,1.0,0.762483,1.000000,0.039580,0,1,0,1,0,0,0,0,0,1.0
3,0.341176,0.647059,0.244444,1.0,0.848847,0.666667,0.087278,0,1,1,0,0,0,0,0,0,0.0
4,0.341176,0.588235,0.140741,1.0,0.831561,0.666667,0.050251,0,0,1,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105307,0.611765,0.529412,0.637037,0.5,0.947342,0.500000,0.407843,0,1,0,1,0,1,0,0,1,1.0
105308,0.647059,0.794118,0.637037,0.5,0.866798,0.500000,0.431373,0,1,1,0,0,1,0,0,0,0.0
105309,0.411765,0.117647,0.111111,1.0,0.890706,1.000000,0.047880,0,1,0,1,0,0,0,1,0,1.0
105310,0.623529,0.117647,0.688889,1.0,0.838701,0.500000,0.449521,0,1,1,0,0,0,1,0,1,0.0


In [18]:
from sklearn.decomposition import PCA

train_fe = train

# Principal Component Analysis: project four of the scaled features onto 3 components
pca = PCA(n_components=3)
ugriz = pca.fit_transform(train_fe[['Age', 'Region_Code', 'Vintage', 'Avg_Account_Balance']])

# Append the components as PCA_1..PCA_3. The component frame is built on
# train_fe's own index so that pd.concat(axis=1) pairs rows one-to-one even if
# the index is not the default 0..n-1 range. The four source columns are kept.
pca_components = pd.DataFrame(
    ugriz,
    index=train_fe.index,
    columns=['PCA_1', 'PCA_2', 'PCA_3'],
)
train_fe = pd.concat((train_fe, pca_components), axis=1)
train_fe.head()

Unnamed: 0,Age,Region_Code,Vintage,Credit_Product,Avg_Account_Balance,age_class,new_var1,Credit_Product_was_missing,Gender_Male,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Active_Yes,new_var2,PCA_1,PCA_2,PCA_3
0,0.341176,0.117647,0.185185,0.5,0.839136,0.666667,0.066119,0,1,1,0,0,0,0,0,0,0.0,0.515229,-0.068217,-0.060074
1,0.505882,0.529412,0.362963,0.5,0.852826,0.833333,0.192157,1,1,1,0,0,1,0,0,0,0.0,0.048495,0.025151,-0.016484
2,0.364706,0.588235,0.103704,1.0,0.762483,1.0,0.03958,0,1,0,1,0,0,0,0,0,1.0,0.088415,-0.275226,-0.002663
3,0.341176,0.647059,0.244444,1.0,0.848847,0.666667,0.087278,0,1,1,0,0,0,0,0,0,0.0,-0.002312,-0.189373,-0.096668
4,0.341176,0.588235,0.140741,1.0,0.831561,0.666667,0.050251,0,0,1,0,0,0,0,0,0,0.0,0.081456,-0.25502,-0.042038


In [19]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Rescale the three PCA component columns with a MinMaxScaler (default settings)
# and write the rescaled values back over the same columns.
pca_cols = ['PCA_1', 'PCA_2', 'PCA_3']

scaler = MinMaxScaler()
sdss = scaler.fit_transform(train_fe[pca_cols])
train_fe[pca_cols] = sdss

In [20]:
# Final processed test features, copied under the name that is persisted with %store below
test_catb = train_fe.copy()

In [21]:
# Persist the processed features and the matching ID column with IPython's %store
# magic so they can be restored in another session via `%store -r`.
%store test_catb
%store id_catb

Stored 'test_catb' (DataFrame)
Stored 'id_catb' (DataFrame)
