#### Model testing - Tharaka
Plan to do the following in this notebook
- Load the data
    - Separate labels and features
    - Drop the unwanted columns like date and week
    - Null fill the remaining columns with a suitable method
- Do PCA on the dataset
- Split the data into Test and Train
- Deal with the class imbalance in the training set
- Apply the following models
    - Random Forest
    - XGBoost

In [11]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split 

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

Loading the data

> Todo: Use Isuru's post-processed data here. Until then, just use numerics

In [3]:
dataset = pd.read_csv("../data/raw/cleaned_joined_static_x.csv.gz", compression='gzip')
dataset.head(2)

  dataset = pd.read_csv("../data/raw/cleaned_joined_static_x.csv.gz", compression='gzip')


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
0,0,2019-01-03,201901,0,0,,,1917.6,0.0,0.0,...,,,,,,,,,,
1,1,2019-01-03,201901,0,0,,,3134.0,0.0,0.0,...,,,,,,,,,,


In [4]:
dataset_num = dataset.select_dtypes(include='number').fillna(0)
dataset_num.head(2)

Unnamed: 0,case_id,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,...,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
0,0,201901,0,0,0.0,0.0,1917.6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,201901,0,0,0.0,0.0,3134.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Label separation

In [5]:
labels = dataset_num['target'].values
samples = dataset_num.drop(columns=['target', 'case_id', 'MONTH', 'WEEK_NUM'])

Apply PCA

In [6]:
# standardizing for PCA
st_scaler = StandardScaler()
X = st_scaler.fit_transform(samples.values)
X[0:2]

array([[-6.02461529e-03, -5.61584324e-01, -7.05648435e-01,
        -5.11493499e-01, -2.65756420e-03, -2.82714276e-01,
        -1.35708149e-01, -1.10498411e-01, -1.09046784e-01,
        -6.62689471e-01, -7.31555489e-02, -4.93321085e-02,
        -5.72868138e-02, -1.19424987e-01, -5.61811387e-01,
        -2.36743065e-01, -1.09831481e-01, -4.24847336e-01,
        -3.44463344e-01, -4.70643880e-02, -3.37861010e-02,
        -3.49405411e-02, -1.14100235e-01, -8.76171050e-02,
        -1.77621676e-01, -1.24431979e-01, -2.59565920e-03,
        -1.02258895e-01, -4.91697755e-02, -6.10140207e-02,
        -5.04054214e-02, -1.21969740e-02, -2.67601372e-01,
        -6.03071829e-02, -1.07565695e-01, -7.35748250e-01,
        -9.04954305e-01,  0.00000000e+00, -4.49717182e-01,
        -3.87179880e-01, -2.98892458e-01, -1.66936173e-01,
         0.00000000e+00, -3.78181154e-01, -1.34700035e-01,
         9.78540379e-01, -7.48968839e-01, -2.56799969e-01,
         9.35649629e-01,  0.00000000e+00, -7.40465761e-0

In [7]:
REQUIRED_VARIANCE = 0.85

In [8]:
pca = PCA(n_components=REQUIRED_VARIANCE, svd_solver='full')
X_pca = pca.fit_transform(X)

In [9]:
X_pca.shape, labels.shape

((1526659, 55), (1526659,))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, labels, test_size=0.2)

Fixing class imbalance

In [12]:
Counter(y_train)

Counter({0: 1182921, 1: 38406})

In [14]:
smote = SMOTE(sampling_strategy=1.0)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [15]:
X_train_res.shape, y_train_res.shape

((2365842, 55), (2365842,))

In [16]:
Counter(y_train_res)

Counter({0: 1182921, 1: 1182921})