In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from google.colab import drive

In [None]:
# Mount Google Drive so the CSV paths under /content/gdrive used below resolve.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Load Data

In [None]:
# Training split; later cells read its `label` and `visit_id` columns.
train = pd.read_csv("/content/gdrive/MyDrive/BPJS Hackhaton/Classification/fraud_detection_train.csv")

In [None]:
# Validation split; predictions for it are written to CSV at the end of the notebook.
valid = pd.read_csv("/content/gdrive/MyDrive/BPJS Hackhaton/Classification/fraud_detection_val.csv")

In [None]:
# Work on a copy: the feature-engineering cells below modify X_train in place
# (drop/rename with inplace=True, column assignment). A plain `X_train = train`
# would make both names refer to the same DataFrame, so the raw `train` frame
# would be altered as well.
X_train = train.copy()

In [None]:
# Work on a copy so the in-place feature engineering on X_valid does not also
# rewrite `valid`, which is read again when the prediction frame is built.
X_valid = valid.copy()

# Feature Engineering

## Remove Unused Columns

In [None]:
# Columns excluded from modelling; the same list is reused for the validation frame.
col_to_remove = ['jkpst', 'jnspelsep']
X_train.drop(columns=col_to_remove, inplace=True)

In [None]:
# Remove the same unused columns from the validation frame.
X_valid.drop(columns=col_to_remove, inplace=True)

## Rename Columns

In [None]:
def rename_col(df):
  """Correct three column names on `df` in place and return the same frame.

  Renames 'dx2_koo_k93' -> 'dx2_k00_k93', 'dx2_u00_u99' -> 'dx2_u00_u85'
  and 'proc_32_38' -> 'proc32_38'.
  """
  corrected_names = {
      'dx2_koo_k93': 'dx2_k00_k93',
      'dx2_u00_u99': 'dx2_u00_u85',
      'proc_32_38': 'proc32_38',
  }
  # inplace=True: the caller's frame is modified, and it is also returned.
  df.rename(columns=corrected_names, inplace=True)
  return df

In [None]:
# Apply the same column-name corrections to both frames.
X_train, X_valid = rename_col(X_train), rename_col(X_valid)

## Total Secondary Diagnoses and Procedures

In [None]:
# Names of the columns whose name contains 'dx' / 'proc'.
# This must run before 'total_proc' and 'procedure' are added to X_train,
# because those names contain 'proc' too and would be picked up.
diag_col = list(X_train.columns[X_train.columns.str.contains('dx')])
proc_col = list(X_train.columns[X_train.columns.str.contains('proc')])

In [None]:
def totals(x, cols):
  """Return the sum of `x[c]` over every `c` in `cols`.

  `x` is anything indexable by the entries of `cols` (a dict, a row, a
  DataFrame). An empty `cols` gives 0.

  The previous version accumulated the sum but never returned it, so every
  call produced None; it also named its accumulator `sum`, hiding the builtin.
  """
  total = 0
  for col in cols:
    total = total + x[col]
  return total

In [None]:
# Row-wise totals over the diagnosis and procedure columns, for both frames.
for frame in (X_train, X_valid):
  frame['total_diagsec'] = frame[diag_col].sum(axis=1)
  frame['total_proc'] = frame[proc_col].sum(axis=1)

## Collapsing Secondary Diagnoses and Procedures into One Column Each

In [None]:
def string_manipulation(row, val_to_rep):
  """Return `row` with every occurrence of `val_to_rep` removed."""
  return row.replace(val_to_rep, "")

In [None]:
def create_mapping_to_value(lst, val_to_rep):
  """Map each name in `lst` to that name with `val_to_rep` stripped out.

  Returns a dict keyed by the original names, in the order of `lst`.
  """
  return {name: string_manipulation(name, val_to_rep) for name in lst}

In [None]:
# Column name -> short code (column name without its 'dx2_' / 'proc' part).
diag_mapped = create_mapping_to_value(lst=diag_col, val_to_rep='dx2_')
proc_mapped = create_mapping_to_value(lst=proc_col, val_to_rep='proc')

In [None]:
def ordinal_to_categorical(df, dictionaries):
  """Build one " | "-joined label string per row of `df`.

  Args:
    df: frame containing every column named by a key of `dictionaries`.
    dictionaries: dict mapping column name -> label to emit for that column.

  Returns:
    A list with one string per row, in row order. Each string joins, in the
    key order of `dictionaries`, the labels of the columns whose value in
    that row is >= 1; a row with no such column yields 'No'.

  Rows are read positionally, so the result does not depend on the frame's
  index labels. The previous `df.loc[i, key]` over `range(len(df))` raised
  KeyError (or read the wrong rows) on any frame whose index was not exactly
  0..n-1, and performed one scalar lookup per cell.
  """
  keys = list(dictionaries)
  labels = [dictionaries[key] for key in keys]
  # Boolean matrix (rows x keys): True where the column value is at least 1.
  flags = df[keys].to_numpy() >= 1

  to_append = []
  for row_flags in flags:
    picked = [label for label, hit in zip(labels, row_flags) if hit]
    to_append.append(" | ".join(picked) if picked else 'No')

  return to_append

In [None]:
# Collapse the per-code columns of the training frame into one " | "-joined
# string column each ('No' when no code has a value >= 1 in that row).
diag_append = ordinal_to_categorical(X_train, diag_mapped)
proc_append = ordinal_to_categorical(X_train, proc_mapped)

X_train['diagsec'] = diag_append
X_train['procedure'] = proc_append

In [None]:
# Same collapse for the validation frame, using the column -> code maps
# derived from the training columns.
diag_append_valid = ordinal_to_categorical(X_valid, diag_mapped)
proc_append_valid = ordinal_to_categorical(X_valid, proc_mapped)

X_valid['diagsec'] = diag_append_valid
X_valid['procedure'] = proc_append_valid

In [None]:
# The per-code columns are now summarised by 'diagsec' / 'procedure' and the totals.
for frame in (X_train, X_valid):
  frame.drop(columns=diag_col, inplace=True)
  frame.drop(columns=proc_col, inplace=True)

## Mean Encoding

In [None]:
def get_unique_list(df, col_to_get):
  """Return the distinct values of column `col_to_get` as a list, in order of first appearance."""
  distinct_values = df[col_to_get].unique()
  return distinct_values.tolist()

In [None]:
def mean_encoding(df, col, lst):
  """Compute the mean of `label` for each category of `col`.

  Args:
    df: frame with a `label` column and the column `col`.
    col: name of the categorical column.
    lst: category values to encode.

  Returns:
    dict mapping each value in `lst` to the mean `label` of the rows where
    `col` equals it, plus the key 'all' holding the mean `label` of the
    whole frame (used as the fallback for unseen categories).
  """
  encoding = {
      category: df.loc[df[col] == category, 'label'].mean()
      for category in lst
  }
  encoding['all'] = df['label'].mean()
  return encoding

In [None]:
# Distinct training-set values of each categorical column to be mean-encoded.
(kdkc_lst, dati2_lst, typeppk_lst, cmg_lst, sevlvl_lst, diagprimer_lst) = (
    get_unique_list(X_train, column_name)
    for column_name in ('kdkc', 'dati2', 'typeppk', 'cmg', 'severitylevel', 'diagprimer')
)

In [None]:
# Category -> mean label, learned from the training frame only.
(kdkc_enc, dati2_enc, typeppk_enc, cmg_enc, severitylevel_enc, diagprimer_enc) = (
    mean_encoding(X_train, column_name, categories)
    for column_name, categories in (
        ('kdkc', kdkc_lst),
        ('dati2', dati2_lst),
        ('typeppk', typeppk_lst),
        ('cmg', cmg_lst),
        ('severitylevel', sevlvl_lst),
        ('diagprimer', diagprimer_lst),
    )
)

In [None]:
def change_val(value, dicts):
  """Return `value` if it is a key of `dicts`, otherwise the fallback key 'all'."""
  return value if value in dicts.keys() else 'all'

In [None]:
def change_mapping(df, col_name, dicts):
  """Replace column `col_name` of `df` by its encoded values and return `df`.

  Each value is looked up in `dicts`; values that are not keys of `dicts`
  receive `dicts['all']` (see change_val). The frame is modified in place.
  """
  def encode(value):
    return dicts[change_val(value, dicts)]

  df[col_name] = df[col_name].apply(encode)
  return df

In [None]:
# Replace each categorical training column by its mean encoding.
# Running this cell a second time is not a no-op: the already-encoded values are
# no longer keys of the dicts, so every row would fall back to the 'all' mean.
for column_name, encoding in (
    ('kdkc', kdkc_enc),
    ('dati2', dati2_enc),
    ('typeppk', typeppk_enc),
    ('cmg', cmg_enc),
    ('severitylevel', severitylevel_enc),
    ('diagprimer', diagprimer_enc),
):
  X_train = change_mapping(X_train, column_name, encoding)

In [None]:
# Encode the validation columns with the training-derived dicts; categories not
# seen in training receive the overall training mean ('all').
for column_name, encoding in (
    ('kdkc', kdkc_enc),
    ('dati2', dati2_enc),
    ('typeppk', typeppk_enc),
    ('cmg', cmg_enc),
    ('severitylevel', severitylevel_enc),
    ('diagprimer', diagprimer_enc),
):
  X_valid = change_mapping(X_valid, column_name, encoding)

## Scaler

In [None]:
from sklearn import preprocessing

# One independent MinMaxScaler per numeric feature ('umur', 'los' and the two totals).
scaler_age, scaler_los, scaler_diagsec, scaler_procedure = (
    preprocessing.MinMaxScaler() for _ in range(4)
)

In [None]:
# Fit each scaler on the training frame only; the `minmax_*` names hold whatever
# `fit` returns and are the objects used for the transforms in the next cells.
minmax_age = scaler_age.fit(X_train[['umur']])
minmax_los = scaler_los.fit(X_train[['los']])
minmax_diagsec = scaler_diagsec.fit(X_train[['total_diagsec']])
minmax_proc = scaler_procedure.fit(X_train[['total_proc']])

In [None]:
# Rescale the training columns with the scalers fitted above.
for feature, fitted_scaler in (
    ('umur', minmax_age),
    ('los', minmax_los),
    ('total_diagsec', minmax_diagsec),
    ('total_proc', minmax_proc),
):
  X_train[feature] = fitted_scaler.transform(X_train[[feature]])

In [None]:
# Rescale the validation columns with the scalers fitted on the training frame.
for feature, fitted_scaler in (
    ('umur', minmax_age),
    ('los', minmax_los),
    ('total_diagsec', minmax_diagsec),
    ('total_proc', minmax_proc),
):
  X_valid[feature] = fitted_scaler.transform(X_valid[[feature]])

## Mean Encoding For Diagsec and Procedure

In [None]:
# All short codes that can appear inside the 'diagsec' / 'procedure' strings.
diagsec_lst = list(diag_mapped.values())

procmapped_lst = list(proc_mapped.values())

In [None]:
# 'No' is the placeholder ordinal_to_categorical writes for rows without any
# flagged code, so it needs an encoding as well. Re-running this cell appends a
# second 'No'; create_mapping keys its result by value, so duplicates collapse.
diagsec_lst.append('No')
procmapped_lst.append('No')

In [None]:
def create_mapping(df, lst, colname):
  """Compute an encoding value for every code in `lst`.

  Args:
    df: frame with a `label` column and a string column `colname` whose
      values are codes joined by " | ".
    lst: codes to encode.
    colname: name of the joined-code column.

  Returns:
    dict mapping each code to (sum of `label` over the rows whose `colname`
    contains that code) / (number of rows in `df`).

  NOTE(review): the denominator is the total row count, not the number of
  rows containing the code, so this is a joint rate rather than a per-code
  mean - confirm that is intended before changing it.

  Rows are read positionally and each row string is split once. The previous
  version used `df.loc[j, ...]` over `range(len(df))`, which only works on a
  0..n-1 index, and re-split every row once per code.
  """
  n_rows = len(df)
  row_codes = [set(text.split(" | ")) for text in df[colname].tolist()]
  row_labels = df['label'].tolist()

  lst_mapping = {}
  for code in lst:
    # Labels are summed in row order, starting from 0, exactly as before.
    label_sum = sum(
        label for codes, label in zip(row_codes, row_labels) if code in codes
    )
    lst_mapping[code] = label_sum / n_rows

  return lst_mapping

In [None]:
# Per-code encodings, computed on the training frame only and reused for validation.
diagsec_mapping = create_mapping(X_train, diagsec_lst, 'diagsec')
proc_mapping = create_mapping(X_train, procmapped_lst, 'procedure')

### Apply Mean Encoding Result

In [None]:
def count_means(x, dicts):
  """Average the encodings of the codes in a " | "-joined string.

  `x` is split on " | ", each part is looked up in `dicts` (KeyError if a
  part is missing), and the arithmetic mean of the looked-up values is returned.
  """
  parts = x.split(" | ")
  return sum(dicts[part] for part in parts) / len(parts)

In [None]:
# Replace the joined-code strings of the training frame by their averaged encodings.
X_train['diagsec'] = X_train['diagsec'].apply(count_means, args=(diagsec_mapping,))
X_train['procedure'] = X_train['procedure'].apply(count_means, args=(proc_mapping,))

In [None]:
# Same replacement for the validation frame, using the training-derived encodings.
X_valid['diagsec'] = X_valid['diagsec'].apply(count_means, args=(diagsec_mapping,))
X_valid['procedure'] = X_valid['procedure'].apply(count_means, args=(proc_mapping,))

# Training

## Using Stratified K-Fold

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
# 10-fold stratified splitter shared by every grid search below.
# `random_state` only takes effect together with shuffle=True. Without it the
# seed was ignored (the printed repr shows shuffle=False) and current
# scikit-learn releases reject the combination with a ValueError.
# NOTE: with shuffling enabled the fold membership differs from the unshuffled
# run that produced the recorded outputs, so scores will shift slightly.
cv = StratifiedKFold(
    n_splits=10, shuffle=True, random_state=0
)



### Comparison Between XGB and RF Without Parameter Tuning

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Untuned baseline models with fixed seeds.
# NOTE(review): XGBClassifier is seeded through `seed` while the forest uses
# `random_state`; the estimator repr printed further down lists both `seed=17`
# and `random_state=0` for XGBoost - confirm `seed` is the intended parameter
# for the installed xgboost version.
xgb = XGBClassifier(seed=17)
rf = RandomForestClassifier(random_state=17)

In [None]:
# Empty grids: each search evaluates a single candidate, i.e. the estimator as constructed.
param_xgb = {}
param_rf = {}

In [None]:
# Cross-validate the XGBoost baseline on the shared `cv` folds, scored by precision.
xgb_gs = GridSearchCV(estimator = xgb,
                      param_grid = param_xgb,
                      scoring = 'precision',
                      cv=cv,
                      verbose = 1
                      )

In [None]:
# Cross-validate the Random Forest baseline with the same folds and metric as XGBoost.
rf_gs = GridSearchCV(estimator = rf,
                      param_grid = param_rf,
                      scoring = 'precision',
                      cv=cv,
                      verbose = 1
                      )

In [None]:
# Feature matrix: everything except the row identifier and the target.
X_trains = X_train.drop(['visit_id', 'label'], axis=1)
# Target as a 1-D Series. Selecting with [['label']] produced a one-column
# DataFrame, which is what triggered the repeated column-vector warnings
# (`column_or_1d(y, warn=True)` / `estimator.fit(...)`) in the fit outputs.
y_trains = X_train['label']

In [None]:
# Run the 10-fold search for the XGBoost baseline.
xgb_gs.fit(X_trains, y_trains)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.9min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
             error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=17, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None, param_grid={},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scor

In [None]:
# Per-fold and mean cross-validated precision for XGBoost.
xgb_gs.cv_results_

{'mean_fit_time': array([11.5110702]),
 'mean_score_time': array([0.08494754]),
 'mean_test_score': array([0.65395161]),
 'params': [{}],
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([0.65888562]),
 'split1_test_score': array([0.6520825]),
 'split2_test_score': array([0.64999503]),
 'split3_test_score': array([0.65614919]),
 'split4_test_score': array([0.65382312]),
 'split5_test_score': array([0.65141022]),
 'split6_test_score': array([0.65338882]),
 'split7_test_score': array([0.65588822]),
 'split8_test_score': array([0.6519496]),
 'split9_test_score': array([0.65594378]),
 'std_fit_time': array([0.18757485]),
 'std_score_time': array([0.00250491]),
 'std_test_score': array([0.00258536])}

In [None]:
# Run the 10-fold search for the Random Forest baseline.
rf_gs.fit(X_trains, y_trains)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  6.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                           

In [None]:
# Per-fold and mean cross-validated precision for Random Forest.
rf_gs.cv_results_

{'mean_fit_time': array([34.36326437]),
 'mean_score_time': array([1.25826023]),
 'mean_test_score': array([0.72590064]),
 'params': [{}],
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([0.72336945]),
 'split1_test_score': array([0.72835576]),
 'split2_test_score': array([0.72730015]),
 'split3_test_score': array([0.72750802]),
 'split4_test_score': array([0.72892613]),
 'split5_test_score': array([0.7237366]),
 'split6_test_score': array([0.72745902]),
 'split7_test_score': array([0.72360689]),
 'split8_test_score': array([0.72608696]),
 'split9_test_score': array([0.72265744]),
 'std_fit_time': array([0.96876218]),
 'std_score_time': array([0.05854891]),
 'std_test_score': array([0.00221487])}

Random Forest achieves the higher mean cross-validated precision (0.726 vs. 0.654 for XGBoost), so we continue with Random Forest.

### Evaluation Result

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

In [None]:
def predict_model(model, X_test, y_test):
  """Predict with `model` on `X_test`, print evaluation metrics, return the predictions.

  Prints, in this order and each followed by a " " line: the confusion
  matrix, accuracy, precision and recall of the predictions against `y_test`.

  Returns:
    Whatever `model.predict(X_test)` returned.
  """
  y_pred = model.predict(X_test)

  print("Confusion Matrix : ")
  print(confusion_matrix(y_test, y_pred))
  print(" ")

  # The three scalar metrics share one print layout.
  scalar_metrics = (
      ("Accuracy Score : ", accuracy_score),
      ("Precision Score : ", precision_score),
      ("Recall Score : ", recall_score),
  )
  for title, metric in scalar_metrics:
    print(title, metric(y_test, y_pred))
    print(" ")

  return y_pred

In [None]:
# NOTE: these metrics are computed on the same rows the model was fitted on
# (X_trains / y_trains), so they describe training fit, not held-out performance.
rf_predicts = predict_model(rf_gs, X_trains, y_trains)

Confusion Matrix : 
[[97844  2118]
 [ 1440 98815]]
 
Accuracy Score :  0.9822292812298656
 
Precision Score :  0.9790157827469708
 
Recall Score :  0.9856366266021644
 


## Hyperparameter Tuning

In [None]:
# Single-candidate grid: one fixed Random Forest configuration to cross-validate.
rf_params = dict(
    criterion=['entropy'],
    max_depth=[50],
    min_samples_leaf=[3],
    min_samples_split=[7],
    n_estimators=[250],
)

In [None]:
# Seeded like the `rf` baseline (random_state=17) so repeated runs build the same
# forest. The recorded outputs below came from an unseeded run, so re-running
# will not reproduce them digit for digit.
rf2 = RandomForestClassifier(random_state=17)

In [None]:
# Cross-validate the `rf_params` configuration on the shared folds; note the
# metric is F1 here, whereas the baseline comparison above used precision.
rf2_gs = GridSearchCV(estimator = rf2,
                      param_grid = rf_params,
                      scoring = 'f1',
                      cv=cv,
                      verbose = 1
                      )

In [None]:
# Run the 10-fold search for the first tuned configuration.
rf2_gs.fit(X_trains, y_trains)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 15.3min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                           

In [None]:
# Per-fold and mean cross-validated F1 for the first tuned configuration.
rf2_gs.cv_results_

{'mean_fit_time': array([88.91781979]),
 'mean_score_time': array([2.45369487]),
 'mean_test_score': array([0.73219245]),
 'param_criterion': masked_array(data=['entropy'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[50],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[3],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[7],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[250],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'criterion': 'entropy',
   'max_depth': 50,
   'min_samples_leaf': 3,
   'min_samples_split': 7,
   'n_estimators': 250}],
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([0.72840249]),
 'split1_t

In [None]:
# NOTE: evaluated on the training rows (X_trains / y_trains), not on held-out data.
rf2_predicts = predict_model(rf2_gs, X_trains, y_trains)

Confusion Matrix : 
[[89015 10947]
 [10801 89454]]
 
Accuracy Score :  0.8913778550272954
 
Precision Score :  0.8909672214420176
 
Recall Score :  0.892264724951374
 


In [None]:
# Second single-candidate grid; differs from `rf_params` only in
# min_samples_leaf (2) and min_samples_split (8).
rf_params_3 = dict(
    criterion=['entropy'],
    max_depth=[50],
    min_samples_leaf=[2],
    min_samples_split=[8],
    n_estimators=[250],
)

In [None]:
# Seeded like the `rf` baseline (random_state=17) so the model used for the final
# predictions can be rebuilt exactly. The recorded outputs below came from an
# unseeded run, so re-running will not reproduce them digit for digit.
rf3 = RandomForestClassifier(random_state=17)

In [None]:
# Cross-validate the `rf_params_3` configuration on the shared folds, scored by F1.
rf3_gs = GridSearchCV(estimator = rf3,
                      param_grid = rf_params_3,
                      scoring = 'f1',
                      cv=cv,
                      verbose = 1
                      )

In [None]:
# Run the 10-fold search for the second tuned configuration.
rf3_gs.fit(X_trains, y_trains)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 16.1min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                           

In [None]:
# NOTE: evaluated on the training rows (X_trains / y_trains), not on held-out data.
rf3_predicts = predict_model(rf3_gs, X_trains, y_trains)

Confusion Matrix : 
[[90689  9273]
 [ 9235 91020]]
 
Accuracy Score :  0.9075602970776707
 
Precision Score :  0.9075409051479166
 
Recall Score :  0.9078848935215201
 


In [None]:
# Per-fold and mean cross-validated F1 for the second tuned configuration.
rf3_gs.cv_results_

{'mean_fit_time': array([93.77175503]),
 'mean_score_time': array([2.29000268]),
 'mean_test_score': array([0.73210481]),
 'param_criterion': masked_array(data=['entropy'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[50],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[2],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[8],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[250],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'criterion': 'entropy',
   'max_depth': 50,
   'min_samples_leaf': 2,
   'min_samples_split': 8,
   'n_estimators': 250}],
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([0.72857215]),
 'split1_t

# Prediction

In [None]:
# Predict the validation rows with the refitted rf3 search; the row identifier is not a feature.
valid_pred = rf3_gs.predict(X_valid.drop(columns=['visit_id']))

In [None]:
# Build the submission frame on an explicit copy. Assigning a new column to the
# bare `valid[['visit_id']]` selection is what raised the SettingWithCopyWarning
# recorded under this cell.
valid_result = valid[['visit_id']].copy()
valid_result['predict_label'] = valid_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Display the submission frame (visit_id and predict_label).
valid_result

Unnamed: 0,visit_id,predict_label
0,1,1
1,2,0
2,3,1
3,4,1
4,5,0
...,...,...
49757,49758,1
49758,49759,1
49759,49760,1
49760,49761,1


In [None]:
# Write the predictions to Drive without the index column.
valid_result.to_csv('/content/gdrive/MyDrive/BPJS Hackhaton/Classification/Tahap 2/Trial 3/rf_result.csv', index=False)