#### Model testing - Tharaka
Plan to do the following in this notebook
- Load the data
    - Separate labels and features
    - Drop the unwanted columns like date and week
    - Null fill the remaining columns with a suitable method
- Do PCA on the dataset
- Split the data into Test and Train
- Deal with the class imbalance in the training set
- Apply the following models
    - Random Forest
    - XGBoost

In [25]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split 

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

Loading the data

In [26]:
dataset = pd.read_csv("../data/processed/postprocessed_after_outlier_handled.csv.gz", compression='gzip')
dataset.head(2)

Unnamed: 0,case_id,MONTH,WEEK_NUM,target,cardtype_51L_INSTANT,cardtype_51L_NOCARD,cardtype_51L_PERSONALIZED,inittransactioncode_186L_CASH,inittransactioncode_186L_NDF,inittransactioncode_186L_POS,...,scaled_numinstregularpaid_973L.1,scaled_numinsttopaygr_769L.1,scaled_numinstunpaidmax_3546851L.1,scaled_pmtnum_254L.1,scaled_posfpd30lastmonth_3976960P.1,scaled_price_1097A.1,scaled_sumoutstandtotal_3546847A.1,scaled_totaldebt_9A.1,scaled_totalsettled_863A.1,scaled_time_delta
0,0,201901,0,0,False,True,False,True,False,False,...,-0.804146,-0.525393,-0.535457,0.869697,0.0,-1.037614,-0.488194,-0.48857,-0.756542,-0.069199
1,1,201901,0,0,False,True,False,True,False,False,...,-0.804146,-0.525393,-0.535457,0.184843,0.0,-1.037614,-0.488194,-0.48857,-0.756542,-0.069199


Label separation

In [27]:
labels = dataset['target'].values
samples = dataset.drop(columns=['target', 'case_id', 'MONTH', 'WEEK_NUM'])

Apply PCA

In [28]:
# standardizing for PCA
st_scaler = StandardScaler()
X = st_scaler.fit_transform(samples.values)
X[0:2]

array([[-0.37630757,  0.37893562, -0.03902087,  1.84731273, -0.16705846,
        -1.71478191,  1.84731273, -1.35505333, -0.37962594, -0.1662551 ,
         1.84752332, -1.71611153,  1.02721712,  0.87484563,  0.12368229,
         0.63811526,  0.50514282,  0.09199565,  0.76810336,  0.71036989,
         1.45500453, -0.79493756, -0.90799294,  0.        ,  0.97854038,
         0.93564963, -0.84560218, -0.67644218, -1.03460793, -0.62843932,
        -0.58327869, -0.80412098, -0.70139286, -0.80414554, -0.52539263,
        -0.53545702,  0.86969728,  0.        , -1.03761366, -0.48819445,
        -0.48857021, -0.75654201,  0.        , -0.64656927, -0.80930931,
        -0.5663379 ,  0.        , -0.34207333, -0.23520724, -0.29375326,
        -0.23165015, -0.84292236,  0.5974626 ,  0.46419556,  0.50084011,
        -0.36963459, -0.87731154, -0.32138996, -0.54126841, -0.56855253,
         0.        , -0.31005453,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [29]:
REQUIRED_VARIANCE = 0.85

In [30]:
pca = PCA(n_components=REQUIRED_VARIANCE, svd_solver='full')
X_pca = pca.fit_transform(X)

In [31]:
X_pca.shape, labels.shape

((1526659, 25), (1526659,))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, labels, test_size=0.2)

Fixing class imbalance

In [33]:
Counter(y_train)

Counter({0: 1182770, 1: 38557})

In [34]:
smote = SMOTE(sampling_strategy='minority')
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [35]:
X_train_res.shape, y_train_res.shape

((2365540, 25), (2365540,))

In [36]:
Counter(y_train_res)

Counter({0: 1182770, 1: 1182770})

### Models

In [37]:
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score

Random Forest

In [18]:
rforest = RandomForestClassifier(n_estimators=50)

rforest.fit(X_train_res, y_train_res)

In [19]:
predictions = rforest.predict(X_test)

In [29]:
confusion_matrix(y_test, predictions)

array([[286923,   8821],
       [  8316,   1272]], dtype=int64)

In [30]:
confusion_matrix(y_test, predictions).ravel()

array([286923,   8821,   8316,   1272], dtype=int64)

In [27]:
tp, fp, fn, tn = confusion_matrix(y_test, predictions).ravel()

In [31]:
precision = tn / (tn + fp)
precision

0.12602794015654414

Gradient Boosting

In [42]:
hgbc = HistGradientBoostingClassifier()
hgbc.fit(X_train_res, y_train_res)

In [43]:
predicted_hgbc = hgbc.predict(X_test)

In [44]:
confusion_matrix(y_test, predicted_hgbc)

array([[212233,  83662],
       [  3387,   6050]], dtype=int64)

In [45]:
precision_score(y_test, predicted_hgbc)

0.06743802389869806

XGBoost

In [38]:
xgb = XGBClassifier()
xgb.fit(X_train_res, y_train_res)

In [39]:
xgb_predictions = xgb.predict(X_test)

In [40]:
confusion_matrix(y_test, xgb_predictions)

array([[216289,  79606],
       [  3739,   5698]], dtype=int64)

In [41]:
precision_score(y_test, xgb_predictions)

0.06679639876207447

### Models without PCA

In [46]:
samples_train, samples_test, labels_train, labels_test = train_test_split(X, labels, test_size=0.2)

In [51]:
samples_train.shape, samples_test.shape

((1221327, 123), (305332, 123))

In [48]:
smote = SMOTE(sampling_strategy='minority')
samples_train_res, labels_train_res = smote.fit_resample(samples_train, labels_train)

In [49]:
xgb_raw = XGBClassifier()
xgb_raw.fit(samples_train_res, labels_train_res)

In [52]:
xgb_predictions_raw = xgb_raw.predict(samples_test)

In [53]:
precision_score(labels_test, xgb_predictions_raw)

0.07709397911575205