In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
#from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, fbeta_score
from sklearn import metrics
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder

### Import Subsampled Data

In [None]:
# The archive names below are relative, so move into the data directory first.
os.chdir('../data')

# Read each subsample archive and drop its leading column in the same step
# (presumably a saved index column -- confirm against how the files were written).
X = pd.read_csv('X_3specialties_equalWeight_subsample.zip', index_col=False, compression='zip').iloc[:, 1:]
y = pd.read_csv('y_3specialties_equalWeight_subsample.zip', compression='zip').iloc[:, 1:]
groups = pd.read_csv('groups_3specialties_equalWeight_subsample.zip', compression='zip').iloc[:, 1:]

# Column labels of the target frame, kept for later reference.
# The targets are used exactly as loaded; no label encoding is applied.
y_columns = y.columns


In [None]:
# Display (n_rows, n_columns) of the feature frame after the leading column was dropped.
X.shape

## Logistic Regression with Imputation and without Regularization

In [None]:
def ML_LogReg_noPenalty_kfold(X, y, groups, random_state,n_folds):
    """Fit an unpenalized logistic regression with grouped CV and score it on a held-out split.

    The data is first divided once with ``GroupShuffleSplit`` (``test_size=0.2``)
    so that no group appears in both the "other" and the test portion. The
    "other" portion is then passed to ``GridSearchCV`` with ``GroupKFold``
    cross-validation, and the refit best pipeline is evaluated on the test
    portion.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain the numeric and categorical columns
        listed inside this function.
    y : pandas.DataFrame
        Target frame (flattened with ``.values.ravel()`` for fitting).
    groups : pandas.DataFrame
        Group labels used for both the test split and the CV folds.
    random_state : int
        Seed for the test split and for the iterative imputer.
    n_folds : int
        Number of ``GroupKFold`` splits.

    Returns
    -------
    tuple
        ``(grid, X_test, y_test, f_05_score, cm, accuracy, feature_names)``:
        the fitted ``GridSearchCV``, the held-out features and targets, the
        macro-averaged F-beta score (beta=0.5), the confusion matrix, the
        accuracy on the test portion, and the list of numeric plus one-hot
        feature names.
    """
    # create a test set: n_splits=1, so the generator yields exactly one split
    splitter = GroupShuffleSplit(n_splits=1,test_size=0.2,random_state=random_state)
    i_other, i_test = next(splitter.split(X, y, groups))
    X_other, y_other, groups_other = X.iloc[i_other], y.iloc[i_other], groups.iloc[i_other]
    X_test, y_test = X.iloc[i_test], y.iloc[i_test]

    kf = GroupKFold(n_splits=n_folds)

    # create the pipeline: preprocessor + supervised ML method

    categorical_ftrs = ['Prscrbr_City','Prscrbr_State_Abrvtn','Brnd_Name','Gnrc_Name']

    std_ftrs = ['Tot_Clms',  'Tot_30day_Fills', 'Tot_Day_Suply', 'Tot_Drug_Cst', 
                'Tot_Benes', 'GE65_Tot_Clms', 'GE65_Tot_30day_Fills', 'GE65_Tot_Drug_Cst',
                'GE65_Tot_Day_Suply', 'GE65_Tot_Benes']

    # numeric columns: regression-based imputation followed by standardization
    numeric_transformer = Pipeline(steps=[
        ('imputer', IterativeImputer(estimator = LinearRegression(), 
                                     random_state=random_state,max_iter=1000)),
        ('scaler', StandardScaler())])

    # NOTE(review): `sparse=` and penalty='none' are spellings from older
    # scikit-learn releases -- confirm against the installed version before upgrading.
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
        ('num', numeric_transformer, std_ftrs),
        ('onehot', categorical_transformer, categorical_ftrs)],
        remainder='passthrough')

    # second scaler: standardizes every column the preprocessor emits,
    # including the one-hot and any passthrough columns
    final_scaler = StandardScaler()

    pipe = make_pipeline(preprocessor,final_scaler, LogisticRegression(max_iter=10000))

    # the parameter(s) we want to tune (a single combination here)
    param_grid = {'logisticregression__solver': ['saga'],
                  'logisticregression__penalty' : ['none']}

    # prepare gridsearch
    grid = GridSearchCV(pipe, 
                        param_grid=param_grid,
                        scoring = 'accuracy',
                        cv=kf, 
                        return_train_score = True,
                        verbose=10)

    # do kfold CV on _other
    grid.fit(X_other, y_other.values.ravel(), groups=groups_other)

    print()
    means = grid.cv_results_['mean_test_score']
    stds = grid.cv_results_['std_test_score']

    print(f'Best params: {grid.best_params_}')

    print(f"mean CV: {means} +/ {stds}")

    y_test_pred = grid.predict(X_test)

    # Newer scikit-learn releases dropped `get_feature_names`; use the `_out`
    # variant when the encoder offers it and fall back to the old name otherwise.
    onehot_encoder = grid.best_estimator_[0].named_transformers_['onehot'][0]
    onehot_names = getattr(onehot_encoder, 'get_feature_names_out', None) or onehot_encoder.get_feature_names
    feature_names = std_ftrs + list(onehot_names(categorical_ftrs))

    # labels come from the full target so every class takes part in the macro average
    f_05_score = fbeta_score(y_test, y_test_pred, beta = 0.5, labels=sorted(np.unique(y)), average='macro')

    cm = confusion_matrix(y_test, y_test_pred)

    accuracy = accuracy_score(y_test, y_test_pred)
    print("accuracy:", accuracy)

    return grid, X_test, y_test, f_05_score, cm, accuracy, feature_names

In [None]:
%%time
# Accumulators, one entry appended per random state.
X_test_set_list, y_test_set_list = [], []
final_models_list, final_models = [], []
test_scores, accuracy_scores = [], []
confusion_mat, featname_list = [], []
best_params, class_met = [], []

# Repeat the split / CV / test evaluation for five seeds (0, 42, 84, 126, 168)
# with 4 CV folds each.
for i in range(5):
    print(f'Random State # {i}')
    print()

    run_outputs = ML_LogReg_noPenalty_kfold(X, y, groups, 42*i , 4)
    fin_grid, X_test_set, y_test_set, test_score, cmat, acc, feat_names = run_outputs

    final_models_list.append(fin_grid)
    X_test_set_list.append(X_test_set)
    y_test_set_list.append(y_test_set)
    test_scores.append(test_score)
    confusion_mat.append(cmat)
    accuracy_scores.append(acc)
    featname_list.append(feat_names)


In [None]:
# Per-seed test accuracies on the first line, their mean on the second.
print(accuracy_scores, np.mean(accuracy_scores), sep='\n')

In [None]:
# Persist everything collected for the unpenalized runs as one pickled tuple.
os.chdir('../results')

# `with` guarantees the handle is closed even if pickling raises part-way through.
with open('LogReg_noPenalty_grid.save', 'wb') as file:
    pickle.dump((X_test_set_list, 
                 y_test_set_list,
                 final_models_list,
                 confusion_mat,
                 test_scores,
                 accuracy_scores,
                 featname_list),file)

## Logistic Regression with Imputation and Regularization

In [None]:
def ML_LogReg_L1_kfold(X, y, groups, random_state,n_folds):
    """Tune an L1-penalized logistic regression with grouped CV and score it on a held-out split.

    The data is first divided once with ``GroupShuffleSplit`` (``test_size=0.2``)
    so that no group appears in both the "other" and the test portion. The
    "other" portion is then passed to ``GridSearchCV`` with ``GroupKFold``
    cross-validation over seven values of ``C`` (``np.logspace(-3, 3, 7)``),
    and the refit best pipeline is evaluated on the test portion.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain the numeric and categorical columns
        listed inside this function.
    y : pandas.DataFrame
        Target frame (flattened with ``.values.ravel()`` for fitting).
    groups : pandas.DataFrame
        Group labels used for both the test split and the CV folds.
    random_state : int
        Seed for the test split and for the iterative imputer.
    n_folds : int
        Number of ``GroupKFold`` splits.

    Returns
    -------
    tuple
        ``(grid, X_test, y_test, f_05_score, cm, accuracy, feature_names)``:
        the fitted ``GridSearchCV``, the held-out features and targets, the
        macro-averaged F-beta score (beta=0.5), the confusion matrix, the
        accuracy on the test portion, and the list of numeric plus one-hot
        feature names.
    """
    # create a test set: n_splits=1, so the generator yields exactly one split
    splitter = GroupShuffleSplit(n_splits=1,test_size=0.2,random_state=random_state)
    i_other, i_test = next(splitter.split(X, y, groups))
    X_other, y_other, groups_other = X.iloc[i_other], y.iloc[i_other], groups.iloc[i_other]
    X_test, y_test = X.iloc[i_test], y.iloc[i_test]

    kf = GroupKFold(n_splits=n_folds)

    # create the pipeline: preprocessor + supervised ML method

    categorical_ftrs = ['Prscrbr_City','Prscrbr_State_Abrvtn','Brnd_Name','Gnrc_Name']

    std_ftrs = ['Tot_Clms',  'Tot_30day_Fills', 'Tot_Day_Suply', 'Tot_Drug_Cst', 
                'Tot_Benes', 'GE65_Tot_Clms', 'GE65_Tot_30day_Fills', 'GE65_Tot_Drug_Cst',
                'GE65_Tot_Day_Suply', 'GE65_Tot_Benes']

    # numeric columns: regression-based imputation followed by standardization
    numeric_transformer = Pipeline(steps=[
        ('imputer', IterativeImputer(estimator = LinearRegression(), 
                                     random_state=random_state,max_iter=1000)),
        ('scaler', StandardScaler())])

    # NOTE(review): `sparse=` is the spelling from older scikit-learn
    # releases -- confirm against the installed version before upgrading.
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
        ('num', numeric_transformer, std_ftrs),
        ('onehot', categorical_transformer, categorical_ftrs)],
        remainder='passthrough')

    # second scaler: standardizes every column the preprocessor emits,
    # including the one-hot and any passthrough columns
    final_scaler = StandardScaler()

    pipe = make_pipeline(preprocessor,final_scaler, LogisticRegression(max_iter=10000))

    # the parameter(s) we want to tune: inverse regularization strength C
    param_grid = {'logisticregression__solver': ['saga'],
                  'logisticregression__penalty' : ['l1'], 
                  'logisticregression__C'       : np.logspace(-3,3,7)}

    # prepare gridsearch
    grid = GridSearchCV(pipe, 
                        param_grid=param_grid,
                        scoring = 'accuracy',
                        cv=kf, 
                        return_train_score = True, 
                        n_jobs=8, 
                        verbose=10)

    # do kfold CV on _other
    grid.fit(X_other, y_other.values.ravel(), groups=groups_other)

    print()
    means = grid.cv_results_['mean_test_score']
    stds = grid.cv_results_['std_test_score']

    print(f'Best params: {grid.best_params_}')

    print(f"mean CV: {means} +/ {stds}")

    y_test_pred = grid.predict(X_test)

    # Newer scikit-learn releases dropped `get_feature_names`; use the `_out`
    # variant when the encoder offers it and fall back to the old name otherwise.
    onehot_encoder = grid.best_estimator_[0].named_transformers_['onehot'][0]
    onehot_names = getattr(onehot_encoder, 'get_feature_names_out', None) or onehot_encoder.get_feature_names
    feature_names = std_ftrs + list(onehot_names(categorical_ftrs))

    # labels come from the full target so every class takes part in the macro average
    f_05_score = fbeta_score(y_test, y_test_pred, beta = 0.5, labels=sorted(np.unique(y)), average='macro')

    cm = confusion_matrix(y_test, y_test_pred)

    accuracy = accuracy_score(y_test, y_test_pred)
    print("accuracy:", accuracy)

    return grid, X_test, y_test, f_05_score, cm, accuracy, feature_names

In [None]:
%%time
# Accumulators, one entry appended per random state.
X_test_set_list, y_test_set_list = [], []
final_models_list, final_models = [], []
test_scores, accuracy_scores = [], []
confusion_mat, featname_list = [], []
best_params, class_met = [], []

# Repeat the split / CV / test evaluation for five seeds (0, 42, 84, 126, 168)
# with 4 CV folds each.
for i in range(5):
    print(f'Random State # {i}')
    print()

    run_outputs = ML_LogReg_L1_kfold(X, y, groups, 42*i , 4)
    fin_grid, X_test_set, y_test_set, test_score, cmat, acc, feat_names = run_outputs

    final_models_list.append(fin_grid)
    X_test_set_list.append(X_test_set)
    y_test_set_list.append(y_test_set)
    test_scores.append(test_score)
    confusion_mat.append(cmat)
    accuracy_scores.append(acc)
    featname_list.append(feat_names)


### Save results with pickle

In [None]:
# Persist everything collected for the L1 runs as one pickled tuple.
os.chdir('../results')

# `with` guarantees the handle is closed even if pickling raises part-way through.
with open('LogReg_L1_grid.save', 'wb') as file:
    pickle.dump((X_test_set_list, 
                 y_test_set_list,
                 final_models_list,
                 confusion_mat,
                 test_scores,
                 accuracy_scores,
                 featname_list),file)

### Analyzing Coefficients for Feature Importance

In [None]:
# The fitted searches and their feature names only exist inside the per-seed
# result lists; use the run from the last random state.
best_pipeline = final_models_list[-1].best_estimator_

# Step 2 of the pipeline is the logistic regression; row 0 of `coef_` is the
# coefficient vector for the first class only.
coefs = best_pipeline[2].coef_[0]
ftr_names = np.array(featname_list[-1])
# NOTE(review): the names cover the numeric and one-hot columns only; if the
# passthrough remainder adds columns, `coefs` is longer than `ftr_names` -- confirm.

# Rank by absolute size and keep the largest ones (at most 20).
sorted_indcs = np.argsort(np.abs(coefs))
n_top = min(20, len(coefs))
top_indcs = sorted_indcs[-n_top:]

plt.figure(figsize=(6.4,4.8))
plt.rcParams.update({'font.size': 13})
plt.barh(np.arange(n_top),coefs[top_indcs])
plt.yticks(np.arange(n_top),ftr_names[top_indcs])
plt.xlabel('coefficient')
plt.title('Logistic Regression (L1) Coefficients (After Scaling)')
plt.tight_layout()
plt.savefig('LRC_coefs_scaled.png',dpi=1200)
plt.show()