In [None]:
# --- Artifact locations -------------------------------------------------------
# Directory that holds the input CSVs and every pickled artifact.
experiments_path = '/content/drive/MyDrive/humana_experiments/'
# File names below are joined onto experiments_path where they are used.
mdl_name = 'mdl_rf_best_features_cv_8000_no_norm.pickle'
scaler_name = 'scaler_rf_best_features_cv_8000_no_norm.pickle'
selector_name = 'selector_rf_best_features_cv_8000_no_norm.pickle'
# Full path of the prediction CSV (same directory as experiments_path).
csv_name = experiments_path + '2021CaseCompetition_David_Downing_20211009_rf_best_features_cv_8000.csv'

# --- Run mode -----------------------------------------------------------------
# True runs the processing/training cells; False runs the prediction cell.
TRAINING = True
# Passed to CatBoostClassifier(iterations=...).
# NOTE(review): this name shadows the builtin `iter` for the rest of the notebook.
iter = 4000

# Standardise features with StandardScaler before fitting / predicting.
NORMALIZE = True

# Train one model per StratifiedKFold split instead of a single train/test split.
CROSS_VALIDATION = True
NSPLITS = 4

# NOTE(review): the imports cell later runs `from imblearn.over_sampling import SMOTE`,
# which rebinds this name to a class (always truthy). Only commented-out code reads it.
SMOTE = False
# When True, 20% of the data is set aside as X_val/y_val and models are not saved.
TEST_DATA_LEAK = False

# Passed to CatBoostClassifier(task_type=...) in the cross-validation branch.
TASK_TYPE = "GPU"

# SELECT_BEST applies a pickled feature selector; NEW_SELECTOR fits and saves a new one.
SELECT_BEST = True
NEW_SELECTOR = False

# Blake's functions (additional features)

In [None]:
def make_cost_feature(df):
  """Add a 'money_spent' column: the row-wise total of a fixed set of cost columns.

  Each listed cost column is first mean-imputed in place (NaN replaced by that
  column's mean), so `df` is modified and also returned.
  """
  cost_columns = (
      'total_outpatient_allowed_pmpm_cost_6to9m_b4',
      'total_physician_office_net_paid_pmpm_cost_9to12m_b4',
      'total_physician_office_copay_pmpm_cost',
      'total_outpatient_mbr_resp_pmpm_cost_6to9m_b4',
      'total_med_allowed_pmpm_cost_9to12m_b4',
      'rx_gpi2_72_pmpm_cost_6to9m_b4',
      'rx_nonmaint_mbr_resp_pmpm_cost_9to12m_b4',
      'rx_overall_mbr_resp_pmpm_cost',
      'rx_mail_mbr_resp_pmpm_cost_0to3m_b4',
      'rx_nonmaint_mbr_resp_pmpm_cost',
      'rx_generic_mbr_resp_pmpm_cost_0to3m_b4',
      'rx_gpi2_02_pmpm_cost',
      'rx_nonbh_mbr_resp_pmpm_cost_6to9m_b4',
      'rx_nonbh_mbr_resp_pmpm_cost',
      'rx_gpi2_01_pmpm_cost_0to3m_b4',
      'rx_branded_mbr_resp_pmpm_cost',
      'rx_hum_28_pmpm_cost',
      'rx_generic_mbr_resp_pmpm_cost',
      'rx_overall_mbr_resp_pmpm_cost_0to3m_b4',
      'rx_nonbh_net_paid_pmpm_cost',
      'rx_generic_pmpm_cost',
      'rx_maint_mbr_resp_pmpm_cost_6to9m_b4',
      'rx_generic_pmpm_cost_6to9m_b4',
      'rx_gpi2_49_pmpm_cost_0to3m_b4',
      'rx_overall_net_paid_pmpm_cost_6to9m_b4',
  )

  running_total = 0
  for column in cost_columns:
    column_mean = df[column].mean()
    imputed = df[column].fillna(value=column_mean)
    df[column] = imputed
    # Accumulate column by column, in list order.
    running_total = running_total + imputed

  df['money_spent'] = running_total
  return df

def make_rx_count_feature(df):
  """Add a 'pharmacy_usage' column: the row-wise total of a fixed set of rx count columns.

  In each listed column the placeholder "*" is replaced by 0 and the column is
  cast to float64 in place, so `df` is modified and also returned.
  """
  rx_count_columns = (
      'rx_gpi4_6110_pmpm_ct',
      'rx_bh_pmpm_ct_0to3m_b4',
      'rx_gpi2_34_dist_gpi6_pmpm_ct',
      'rx_hum_16_pmpm_ct',
      'rx_nonotc_dist_gpi6_pmpm_ct',
      'rx_nonmaint_pmpm_ct',
      'rx_gpi2_72_pmpm_ct_6to9m_b4',
      'rx_gpi4_3400_pmpm_ct',
      'rx_generic_pmpm_ct_0to3m_b4',
      'rx_tier_2_pmpm_ct_3to6m_b4',
      'rx_maint_pmpm_ct_9to12m_b4',
      'rx_nonbh_pmpm_ct_0to3m_b4',
      'rx_tier_1_pmpm_ct_0to3m_b4',
      'rx_gpi2_34_pmpm_ct',
      'rx_gpi2_90_dist_gpi6_pmpm_ct_9to12m_b4',
      'rx_tier_2_pmpm_ct',
      'rx_gpi2_56_dist_gpi6_pmpm_ct_3to6m_b4',
      'rx_gpi2_33_pmpm_ct_0to3m_b4',
      'rx_gpi2_66_pmpm_ct',
  )

  usage_total = 0
  for column in rx_count_columns:
    without_placeholder = df[column].replace(to_replace="*", value=0)
    numeric = without_placeholder.astype('float64')
    df[column] = numeric
    # Accumulate column by column, in list order.
    usage_total = usage_total + numeric

  df['pharmacy_usage'] = usage_total
  return df

def make_data_saturation_feature(df):
  """Add a 'data_saturation' column holding the number of missing values in each row.

  Despite the name, a higher value means more missing cells. `df` is modified
  in place and also returned.
  """
  df['data_saturation'] = df.isnull().sum(axis=1)
  return df

# Imports

In [None]:
# Mount Google Drive at /content/drive (Colab only); force_remount=True remounts
# even if the drive is already mounted. experiments_path points inside this mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Install CatBoost, which the imports cell needs for CatBoostClassifier.
# NOTE(review): the version is unpinned, so results may differ between runs — consider pinning.
!pip install catboost



In [None]:
import os
import numpy as np
import pandas as pd
from pandas.api.types import infer_dtype

import json

from matplotlib import pyplot as plt
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils import resample, shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE, SMOTENC, BorderlineSMOTE, SVMSMOTE, ADASYN
from sklearn.feature_selection import SelectFromModel

import seaborn as sns

import pickle
from pandas.api.types import infer_dtype



# Functions

## Processing functions

In [None]:
def encode_age(df, bins=[0,20,40,60,80,110], labels=[0,1,2,3,4]):
  """Bucket 'est_age' into labelled intervals and store the result in 'est_age_enc'.

  Args:
    df: frame with an 'est_age' column; modified in place.
    bins: interval edges handed to pd.cut (the lowest edge is inclusive).
    labels: one label per interval.

  Returns:
    The same `df`, with the categorical 'est_age_enc' column added.
  """
  age_bucket = pd.cut(df['est_age'], bins=bins, labels=labels, include_lowest=True)
  df['est_age_enc'] = age_bucket
  return df

def make_auth_sum_feature(df):
  """Add an 'auth_sum' column: the row-wise total of the informative 'auth*' columns.

  Columns whose name starts with 'auth' and does not contain 'mean' are
  considered. A column whose only values are '0' and/or '*' adds nothing to
  the total and is skipped. In the remaining columns '*' is treated as '0'
  and the values are cast to int before summing.

  The original columns of `df` are left untouched; only 'auth_sum' is added.
  `df` is modified in place and also returned.
  """
  uninformative_values = {'0', '*'}
  auth_col = [col for col in df if col.startswith('auth') and 'mean' not in col]

  # Work on an explicit copy so the assignments below do not trigger
  # SettingWithCopyWarning.
  auth_frame = df[auth_col].copy()

  filtered_col = []
  for col in auth_col:
    # A set comparison does not depend on the order or the number of unique
    # values, unlike an element-wise comparison against ['0', '*'].
    if set(auth_frame[col].unique()) <= uninformative_values:
      continue
    auth_frame[col] = auth_frame[col].replace(to_replace='*', value='0').astype(int)
    filtered_col.append(col)

  df['auth_sum'] = auth_frame[filtered_col].sum(axis=1)
  return df

def process_float(df, feature_cols_dict, fillna_mean=True):
  """Impute missing values in the columns listed under feature_cols_dict['float_features'].

  Args:
    df: frame to impute; modified in place.
    feature_cols_dict: mapping with a 'float_features' list of column names.
    fillna_mean: fill with each column's own mean when True, with 0 otherwise.

  Returns:
    The same `df` with the listed columns imputed.
  """
  float_cols = feature_cols_dict['float_features']
  float_block = df[float_cols]
  fill_value = float_block.mean() if fillna_mean else 0
  df[float_cols] = float_block.fillna(fill_value)
  return df
  
def one_hot_encode(df, columns_to_encode):
  """One-hot encode the given columns, keeping each original as '<name>_orig'.

  The '<name>_orig' copies are written into `df` itself; the encoded frame
  returned by pd.get_dummies is a new object.
  """
  for name in columns_to_encode:
    backup_name = name + '_orig'
    df[backup_name] = df[name]
  encoded = pd.get_dummies(df, columns=columns_to_encode)
  return encoded

## Helper functions

In [None]:
def make_pred_csv(df_holdout, model_pickle_path=os.path.join(experiments_path, mdl_name), normalize=NORMALIZE, fname=csv_name, cross_val=CROSS_VALIDATION):
  """Score the holdout frame with the pickled model(s) and write ID/SCORE/RANK to a CSV.

  Args:
    df_holdout: frame with an 'ID' column and every feature column stored in the model pickle.
    model_pickle_path: pickle holding a (feature_cols, model) tuple; `model` is a
      list of models when `cross_val` is True, a single model otherwise.
    normalize: when True, the scaler pickle (experiments_path/scaler_name) is loaded
      and applied before predicting; a list of scalers is expected when `cross_val` is True.
    fname: destination CSV path.
    cross_val: average the positive-class probability over all fold models.

  The default argument values are captured when this cell is executed; re-run it
  after changing the configuration.

  SCORE is the positive-class probability (float32). RANK is the rank of SCORE
  in descending order (1 = highest score); ties are broken by row order so the
  ranks are unique.
  """
  # NOTE(security): pickle.load can execute arbitrary code; only load artifacts
  # written by this notebook.
  with open(model_pickle_path, 'rb') as f:
    feature_cols, model = pickle.load(f)

  ids = df_holdout['ID'].values
  X_holdout = df_holdout[feature_cols].values

  scaler = None
  if normalize:
    with open(os.path.join(experiments_path, scaler_name), 'rb') as f:
      scaler = pickle.load(f)

  if cross_val:
    if normalize:
      # Each fold model is paired with the scaler fitted on that fold's training data.
      fold_probs = [mdl.predict_proba(scl.transform(X_holdout))[:,1] for mdl, scl in zip(model, scaler)]
    else:
      fold_probs = [mdl.predict_proba(X_holdout)[:,1] for mdl in model]
    probs = np.mean(fold_probs, 0)
  else:
    X_scored = scaler.transform(X_holdout) if normalize else X_holdout
    probs = model.predict_proba(X_scored)[:,1]

  df_pred = pd.DataFrame({'ID': ids, 'SCORE': probs})
  df_pred['SCORE'] = df_pred['SCORE'].astype('float32')
  # Rank by score rather than by row position.
  df_pred['RANK'] = df_pred['SCORE'].rank(method='first', ascending=False).astype('int32')

  df_pred.to_csv(fname, index=False)

# Feature lists

In [None]:
# Load the dictionary of column groups used by the cells below
# (keys read in this notebook: 'float_features', 'object_features', 'cols_to_drop').
# NOTE(security): pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(os.path.join(experiments_path, 'feature_cols_dict.pickle'), 'rb') as f:
  feature_cols_dict = pickle.load(f)

# Processing

In [None]:
if TRAINING:

  # Load the labelled training table; the first CSV column becomes the index.
  df = pd.read_csv(os.path.join(experiments_path, '2021_Competition_Training.csv'), index_col=[0])

  # Engineered features. data_saturation runs first so that it counts missing
  # values before make_cost_feature imputes some of them.
  df = make_data_saturation_feature(df)
  df = make_cost_feature(df)
  df = make_rx_count_feature(df)
  df = make_auth_sum_feature(df)

  # Integer-encode the target before its column is dropped below.
  y = df['covid_vaccination'].factorize()[0]

  # Drop every non-numeric column plus the configured drop list and the ID.
  df_cols_to_drop = feature_cols_dict['object_features']+feature_cols_dict['cols_to_drop'] + ['ID'] + list(df.select_dtypes('object').columns)
  df = df.drop(columns=df_cols_to_drop)
  df = process_float(df, feature_cols_dict, fillna_mean=True)

  col_list_to_save = list(df.columns)
  X = df.values
  #if SMOTE:
  #  sm = SVMSMOTE(random_state=27)
  #  X, y = sm.fit_resample(X, y)
  if SELECT_BEST:
    if not NEW_SELECTOR:
      # Re-use the previously fitted selector and keep only the columns it selects.
      with open(os.path.join(experiments_path, selector_name), 'rb') as f:
        sel = pickle.load(f)
      X = sel.transform(X)
      support_mask = sel.get_support()  # computed once instead of once per column
      col_list_to_save = [col for col, keep in zip(df.columns, support_mask) if keep]

  if TEST_DATA_LEAK:
    # Set aside 20% as an untouched validation set (X_val / y_val).
    X_ = X.copy()
    y_ = y.copy()
    X, X_val, y, y_val = train_test_split(X_, y_, stratify=y_, test_size=0.2, shuffle=True, random_state=27)

  if not CROSS_VALIDATION:
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, shuffle=True, random_state=27)

    if NORMALIZE:
      scaler = StandardScaler()
      X_train = scaler.fit_transform(X_train)
      X_test = scaler.transform(X_test)
      if TEST_DATA_LEAK:
        # The model is fitted on scaled features, so the validation set must be
        # scaled with the same scaler before it is scored.
        X_val = scaler.transform(X_val)

      with open(os.path.join(experiments_path, scaler_name), 'wb') as f:
        pickle.dump(scaler, f)

  interactivity=interactivity, compiler=compiler, result=result)
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Fit and save a new feature selector. X and y only exist after the processing
# cell has run in training mode, hence the TRAINING guard.
if TRAINING and SELECT_BEST and NEW_SELECTOR:
  # Use dedicated names for this split so that X_train / X_test prepared by the
  # processing cell (possibly already scaled) are not overwritten.
  X_sel_train, X_sel_test, y_sel_train, y_sel_test = train_test_split(
      X, y, stratify=y, test_size=0.2, shuffle=True, random_state=27)

  sel = SelectFromModel(RandomForestClassifier(n_estimators = 60, random_state=27, verbose=2, n_jobs=2))
  sel.fit(X_sel_train, y_sel_train)

  with open(os.path.join(experiments_path, selector_name), 'wb') as f:
    pickle.dump(sel, f)

# Training

In [None]:
if TRAINING:

  if not CROSS_VALIDATION:
    # Single model on the train/test split prepared by the processing cell.
    model = CatBoostClassifier(verbose=True, eval_metric='AUC', iterations=iter)
    model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

    if TEST_DATA_LEAK:
      # Score on the validation split; X_val is used as provided by the processing cell.
      val_score = roc_auc_score(y_val, model.predict_proba(X_val)[:,1]) # val (test) score
    else:
      val_score = roc_auc_score(y_test, model.predict_proba(X_test)[:,1]) # val (test) score

    print(val_score)
    if not TEST_DATA_LEAK:
      with open(os.path.join(experiments_path, mdl_name), 'wb') as f:
        pickle.dump((col_list_to_save, model), f)

  else:
    # One model (and, when NORMALIZE, one scaler) per stratified fold.
    scalers, models = [], []
    # No random_state: the folds are not shuffled, and scikit-learn rejects
    # random_state when shuffle is False.
    skf = StratifiedKFold(n_splits=NSPLITS)
    for train_index, test_index in skf.split(X, y):
      X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]

      if NORMALIZE:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        scalers.append(scaler)

      model = CatBoostClassifier(verbose=True, eval_metric='AUC', iterations=iter, task_type=TASK_TYPE)
      model.set_feature_names(col_list_to_save)
      model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

      if TEST_DATA_LEAK:
        # The fold model was fitted on scaled features, so scale the validation
        # set with this fold's scaler before scoring.
        X_val_fold = scaler.transform(X_val) if NORMALIZE else X_val
        val_score = roc_auc_score(y_val, model.predict_proba(X_val_fold)[:,1]) # val (test) score
      else:
        val_score = roc_auc_score(y_test, model.predict_proba(X_test)[:,1]) # val (test) score

      print(val_score)
      models.append(model)

      if not TEST_DATA_LEAK:
        # Re-written after every fold, so a partial run still leaves usable artifacts.
        with open(os.path.join(experiments_path, mdl_name), 'wb') as f:
          pickle.dump((col_list_to_save, models), f)

        with open(os.path.join(experiments_path, scaler_name), 'wb') as f:
          pickle.dump(scalers, f)

      else:
        # Running ensemble score over the folds trained so far.
        if NORMALIZE:
          probs = np.mean([mdl.predict_proba(scl.transform(X_val))[:,1] for mdl, scl in zip(models, scalers)], 0)
        else:
          probs = np.mean([mdl.predict_proba(X_val)[:,1] for mdl in models], 0)
        print('VALIDATION_CROSS_VAL_SCORE:')
        print(roc_auc_score(y_val, probs))



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
29:	learn: 0.6484871	test: 0.6477492	best: 0.6477492 (29)	total: 1.9s	remaining: 4m 11s
30:	learn: 0.6495778	test: 0.6488287	best: 0.6488287 (30)	total: 1.96s	remaining: 4m 11s
31:	learn: 0.6507004	test: 0.6499132	best: 0.6499132 (31)	total: 2.02s	remaining: 4m 10s
32:	learn: 0.6511157	test: 0.6502935	best: 0.6502935 (32)	total: 2.07s	remaining: 4m 9s
33:	learn: 0.6512305	test: 0.6504754	best: 0.6504754 (33)	total: 2.13s	remaining: 4m 8s
34:	learn: 0.6516973	test: 0.6509771	best: 0.6509771 (34)	total: 2.19s	remaining: 4m 8s
35:	learn: 0.6519379	test: 0.6511677	best: 0.6511677 (35)	total: 2.25s	remaining: 4m 7s
36:	learn: 0.6521471	test: 0.6513575	best: 0.6513575 (36)	total: 2.3s	remaining: 4m 6s
37:	learn: 0.6523450	test: 0.6515872	best: 0.6515872 (37)	total: 2.36s	remaining: 4m 5s
38:	learn: 0.6522744	test: 0.6515801	best: 0.6515872 (37)	total: 2.43s	remaining: 4m 6s
39:	learn: 0.6522619	test: 0.6515934	best: 0.6515934 (

# Prediction

In [None]:
if not TRAINING:
  df_holdout = pd.read_csv(os.path.join(experiments_path, '2021_Competition_Holdout.csv'), index_col=[0])

  # Rebuild the engineered columns in the same order as the training cell.
  # The feature list stored with the model may contain them, and selecting a
  # missing column in make_pred_csv raises KeyError.
  df_holdout = make_data_saturation_feature(df_holdout)
  df_holdout = make_cost_feature(df_holdout)
  df_holdout = make_rx_count_feature(df_holdout)
  df_holdout = make_auth_sum_feature(df_holdout)

  df_holdout = process_float(df_holdout, feature_cols_dict, fillna_mean=True)
  make_pred_csv(df_holdout, model_pickle_path=os.path.join(experiments_path, mdl_name), normalize=NORMALIZE, fname=csv_name, cross_val=CROSS_VALIDATION)

# Make feature_cols_dict (remove the surrounding triple quotes in the cell below to regenerate feature_cols_dict.pickle)

In [None]:
'''#df_holdout = pd.read_csv(os.path.join(experiments_path, f'2021_Competition_Holdout.csv'), index_col=[0])

obj_string, obj_int, obj_mixed, obj_float = [], [], [], []
for col in list(df_holdout.select_dtypes('object').columns):
  col_type = infer_dtype(df_holdout[col])
  if col_type == 'string' and col_type != 'ID':
    obj_string.append(col)
  if col_type == 'integer':
    obj_int.append(col)
  if col_type == 'mixed':
    obj_mixed.append(col)
  if col_type == 'floating':
    obj_float.append(col)

feature_cols_dict = {}
feature_cols_dict['all_features'] = list(df_holdout.columns)
feature_cols_dict['float_features'] = list(df_holdout.select_dtypes('float').columns)
feature_cols_dict['int_features'] = list(df_holdout.select_dtypes('int').columns)
feature_cols_dict['object_features'] = [el for el in list(df_holdout.select_dtypes('object').columns) if 'ID' not in el]
feature_cols_dict['object_string'] = obj_string
feature_cols_dict['object_int'] = obj_int
feature_cols_dict['object_mixed'] = obj_mixed
feature_cols_dict['object_float'] = obj_float
feature_cols_dict['target'] = ['covid_vaccination']
feature_cols_dict['ID'] = ['ID']

cols_to_drop = []
for col in df_holdout[feature_cols_dict['int_features']]:
  col_unique_vals = df_holdout[col].unique()
  if len(col_unique_vals) == 1:
    cols_to_drop.append(col)

cols_to_replace_nan_with_stars = []
for col in df_holdout[feature_cols_dict['object_string']]:
  col_unique_vals = df_holdout[col].unique()
  if df_holdout[col].isnull().values.any():
    cols_to_replace_nan_with_stars.append(col)

feature_cols_dict['cols_to_replace_nan_with_stars'] = cols_to_replace_nan_with_stars
feature_cols_dict['cols_to_drop'] = cols_to_drop

with open(os.path.join(experiments_path, 'feature_cols_dict.pickle'), 'wb') as f:
  pickle.dump(feature_cols_dict, f)'''

In [None]:
'''model_pickle_path=os.path.join(experiments_path, mdl_name)

with open(model_pickle_path, 'rb') as f:
  feature_cols, model = pickle.load(f)

with open(os.path.join(experiments_path, scaler_name), 'rb') as f:
  scaler = pickle.load(f)

df = pd.read_csv(os.path.join(experiments_path, f'2021_Competition_Training.csv'), index_col=[0])

y = df['covid_vaccination'].factorize()[0]

df_cols_to_drop = feature_cols_dict['object_features']+feature_cols_dict['cols_to_drop'] + ['ID'] + list(df.select_dtypes('object').columns) 
df = df.drop(columns=df_cols_to_drop)
df = process_float(df, feature_cols_dict, fillna_mean=True)

col_list_to_save = list(df.columns)
X = df.values

skf = StratifiedKFold(n_splits=NSPLITS, random_state=27)
probs = []
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
  print(i)
  X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
  prob = model[i].predict_proba(scaler[i].transform(X_test))[:,1]
  probs.append(np.sum(y_test==(prob > 0.5).astype(int))/len(y_test))'''