In [1]:
########################
### Create Dataframe ###
########################

import pandas as pd
from pathlib import Path

workspace_dir = "/workspaces/chris_parrish/sas_viya/data/annuity_advisors"
data_table = "annuity_advisors_prep.csv"

dm_inputdf = pd.read_csv(Path(workspace_dir) / data_table, header=0)
print(dm_inputdf.dtypes)
print("Dimenstions of Data:", dm_inputdf.shape)

advisor                            int64
advisor_event_indicator            int64
sf_face_2_face                     int64
sf_call_outbound                   int64
sf_call_inbound                    int64
sf_email_inbound                   int64
channel_bank                       int64
channel_wirehouse                  int64
channel_ria                        int64
primary_prod_sold_fixed            int64
primary_prod_sold_va               int64
sf_email_campaigns                 int64
advisor_hh_children                int64
annuity_mkt_opp                  float64
advisor_advising_years           float64
advisor_aum                      float64
advisor_annuity_selling_years    float64
advisor_age                      float64
advisor_net_worth                float64
advisor_credit_hist_mos          float64
advisor_firm_changes               int64
advisor_credit_score             float64
wholesaler                         int64
region_ca                          int64
region_ny       

In [2]:
########################
### Model Parameters ###
########################

### import python libraries
import numpy as np
from sklearn.utils import shuffle

### model manager information
metadata_output_dir = 'outputs'
model_name = 'logit_python_annuity_workbench'
project_name = 'Annuity Advisors'
description = 'Logistic Regression'
model_type = 'logistic_regression'
model_function = 'Classification'
predict_syntax = 'predict_proba'

### define macro variables for model
dm_dec_target = 'advisor_event_indicator'
dm_partitionvar = 'analytic_partition'
create_new_partition = 'no' # 'yes', 'no'
dm_key = 'advisor' 
dm_classtarget_level = ['0', '1']
dm_partition_validate_val, dm_partition_train_val, dm_partition_test_val = [0, 1, 2]
dm_partition_validate_perc, dm_partition_train_perc, dm_partition_test_perc = [0.3, 0.6, 0.1]

### create list of regressors
keep_predictors = [
    ]
rejected_predictors = [
    'channel_ria',
    'region_we',
    'primary_prod_sold_fixed'
    ] 

### mlflow
use_mlflow = 'no' # 'yes', 'no'
mlflow_run_to_use = 0
mlflow_class_labels =['TENSOR']
mlflow_predict_syntax = 'predict'

### var to consider in bias assessment
bias_vars = []

### var to consider in partial dependency
pd_var1 = ''
pd_var2 = ''

### create partition column, if not already in dataset
if create_new_partition == 'yes':
    dm_inputdf = shuffle(dm_inputdf)
    dm_inputdf.reset_index(inplace=True, drop=True)
    validate_rows = round(len(dm_inputdf)*dm_partition_validate_perc)
    train_rows = round(len(dm_inputdf)*dm_partition_train_perc) + validate_rows
    test_rows = len(dm_inputdf)-train_rows
    dm_inputdf.loc[0:validate_rows,dm_partitionvar] = dm_partition_validate_val
    dm_inputdf.loc[validate_rows:train_rows,dm_partitionvar] = dm_partition_train_val
    dm_inputdf.loc[train_rows:,dm_partitionvar] = dm_partition_test_val

In [3]:
##############################
### Final Modeling Columns ###
##############################

### create list of model variables
dm_input = list(dm_inputdf.columns.values)
macro_vars = (dm_dec_target + ' ' + dm_partitionvar + ' ' + dm_key).split()
#rejected_predictors = [i for i in dm_input if i not in keep_predictors]
rejected_vars = rejected_predictors + macro_vars #(include macro_vars if rejected_predictors are explicitly listed - not contra keep_predictors)
for i in rejected_vars:
    dm_input.remove(i)
print(dm_input)

### create prediction variables
dm_predictionvar = [str('P_') + dm_dec_target + dm_classtarget_level[0], str('P_') + dm_dec_target + dm_classtarget_level[1]]
dm_classtarget_intovar = str('I_') + dm_dec_target

##################
### Data Split ###
##################

### create train, test, validate datasets using existing partition column
dm_traindf = dm_inputdf[dm_inputdf[dm_partitionvar] == dm_partition_train_val]
X_train = dm_traindf.loc[:, dm_input]
y_train = dm_traindf[dm_dec_target]
dm_testdf = dm_inputdf.loc[(dm_inputdf[dm_partitionvar] == dm_partition_test_val)]
X_test = dm_testdf.loc[:, dm_input]
y_test = dm_testdf[dm_dec_target]
dm_validdf = dm_inputdf.loc[(dm_inputdf[dm_partitionvar] == dm_partition_validate_val)]
X_valid = dm_validdf.loc[:, dm_input]
y_valid = dm_validdf[dm_dec_target]
fullX = dm_inputdf.loc[:, dm_input]
fully = dm_inputdf[dm_dec_target]

['sf_face_2_face', 'sf_call_outbound', 'sf_call_inbound', 'sf_email_inbound', 'channel_bank', 'channel_wirehouse', 'primary_prod_sold_va', 'sf_email_campaigns', 'advisor_hh_children', 'annuity_mkt_opp', 'advisor_advising_years', 'advisor_aum', 'advisor_annuity_selling_years', 'advisor_age', 'advisor_net_worth', 'advisor_credit_hist_mos', 'advisor_firm_changes', 'advisor_credit_score', 'wholesaler', 'region_ca', 'region_ny', 'region_fl', 'region_tx', 'region_ne', 'region_so', 'region_mw', 'sf_email_responses']


In [4]:
from time import time

In [5]:
##########################
### Variable Selection ###
##########################

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import cross_val_score

### Recursive Feature Elimination (RFE) with Crossvalidation (auto-select number of variables)
models_for_rfe = [DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor()]
start = time()
rfe_cols_cv = []
for i in models_for_rfe:
    rfe_cv = RFECV(estimator=i, step=1, cv=10, min_features_to_select=1)
    rfe_cv.fit(fullX,fully)
    rfe_cols_cv.append(list(rfe_cv.get_feature_names_out()))

finish = time()

time_to_complete = finish-start
print("Time to complete feature selection with Python:", time_to_complete)

Time to complete feature selection with Python: 1334.2509982585907


In [11]:
print ("Selected variables using Scikit-Learn Decision Tree:", rfe_cols_cv[0])
print ("Selected variables using Scikit-Learn Gradient Boosting:", rfe_cols_cv[1])
print ("Selected variables using Scikit-Learn Random Forest:", rfe_cols_cv[2])

Selected variables using Scikit-Learn Decision Tree: ['sf_call_inbound', 'sf_email_campaigns', 'advisor_hh_children', 'annuity_mkt_opp', 'advisor_advising_years', 'advisor_aum', 'advisor_age', 'advisor_net_worth', 'advisor_credit_hist_mos', 'advisor_credit_score']
Selected variables using Scikit-Learn Gradient Boosting: ['sf_face_2_face', 'sf_call_inbound', 'sf_email_campaigns', 'advisor_hh_children', 'annuity_mkt_opp', 'advisor_advising_years', 'advisor_aum', 'advisor_age', 'advisor_net_worth', 'advisor_credit_hist_mos', 'advisor_firm_changes', 'advisor_credit_score', 'sf_email_responses']
Selected variables using Scikit-Learn Random Forest: ['sf_face_2_face', 'sf_call_outbound', 'sf_call_inbound', 'sf_email_inbound', 'channel_wirehouse', 'primary_prod_sold_va', 'sf_email_campaigns', 'advisor_hh_children', 'annuity_mkt_opp', 'advisor_advising_years', 'advisor_aum', 'advisor_annuity_selling_years', 'advisor_age', 'advisor_net_worth', 'advisor_credit_hist_mos', 'advisor_firm_changes', '

In [7]:
#####################
### Training Code ###
#####################

from sasviya.ml.linear_model import LogisticRegression

models_for_training_list = [LogisticRegression()]
model_results_list = []
model_list = []

for i in models_for_training_list:
    for j in range(0, len(rfe_cols_cv)):
        X_train = dm_traindf.loc[:, rfe_cols_cv[j]]
        X_test = dm_testdf.loc[:, rfe_cols_cv[j]]
        X_valid = dm_validdf.loc[:, rfe_cols_cv[j]]
        dm_model = i
        dm_model.fit(X_train, y_train)
        #cross_val_score(dm_model, X_train, y_train, cv=10, n_jobs=1)
        score = dm_model.score(X_valid, y_valid)
        model_results_list.append(score)
        name = [str(i)[0:20]+str('_varlist')+str(j)]
        model_list.append(name)
        print('%s %.4f' % (name, score))

['LogisticRegression()_varlist0'] 0.9544
['LogisticRegression()_varlist1'] 0.9609
['LogisticRegression()_varlist2'] 0.9609
