In [10]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

warnings.simplefilter(action='ignore', category=FutureWarning)
import xgboost
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import confusion_matrix

In [11]:
# NOTE: this changes the kernel's working directory, so every relative path
# used afterwards is resolved against ../data.
os.chdir('../data')

# Read each zipped CSV and drop its first column in the same expression.
# NOTE(review): the first column is presumably a saved row index — confirm.
X_subsample2 = pd.read_csv('X_subsample_round2_split6.zip',
                           compression='zip',
                           index_col=False).iloc[:, 1:]
y_subsample2 = pd.read_csv('y_subsample_round2_split6.zip',
                           compression='zip').iloc[:, 1:]
groups_subsample2 = pd.read_csv('groups_subsample_round2_split6.zip',
                                compression='zip').iloc[:, 1:]

# Remember the target column name(s) so frames rebuilt later can be relabelled.
y_subsample2_columns = y_subsample2.columns

In [12]:
# Display the first target row to check what the label column contains.
y_subsample2.iloc[0]

Prscrbr_Type    Psychiatry
Name: 0, dtype: object

In [13]:
# Columns routed to the one-hot encoder.
categorical_ftrs = [
    'Prscrbr_City',
    'Prscrbr_State_Abrvtn',
    'Brnd_Name',
    'Gnrc_Name',
]

# Columns routed to the standard scaler.
std_ftrs = [
    'Tot_Clms',
    'Tot_30day_Fills',
    'Tot_Day_Suply',
    'Tot_Drug_Cst',
    'Tot_Benes',
    'GE65_Tot_Clms',
    'GE65_Tot_30day_Fills',
    'GE65_Tot_Drug_Cst',
    'GE65_Tot_Day_Suply',
    'GE65_Tot_Benes',
]

# Dense one-hot output; categories unseen during fit are ignored at transform
# time instead of raising.
preprocessor = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_ftrs),
    ('std', StandardScaler(), std_ftrs),
])


In [14]:
# Boolean frame with the same shape as X_subsample2: True where a value is missing.
X_subsample2.isnull()

Unnamed: 0,Prscrbr_City,Prscrbr_State_Abrvtn,Brnd_Name,Gnrc_Name,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,Tot_Benes,GE65_Tot_Clms,GE65_Tot_30day_Fills,GE65_Tot_Drug_Cst,GE65_Tot_Day_Suply,GE65_Tot_Benes
0,False,False,False,False,False,False,False,False,True,False,False,False,False,True
1,False,False,False,False,False,False,False,False,True,True,True,True,True,True
2,False,False,False,False,False,False,False,False,True,False,False,False,False,True
3,False,False,False,False,False,False,False,False,True,True,True,True,True,True
4,False,False,False,False,False,False,False,False,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4626,False,False,False,False,False,False,False,False,True,False,False,False,False,True
4627,False,False,False,False,False,False,False,False,True,True,True,True,True,True
4628,False,False,False,False,False,False,False,False,False,False,False,False,False,True
4629,False,False,False,False,False,False,False,False,True,False,False,False,False,True


In [15]:
# True wherever a feature value is missing.
mask = X_subsample2.isnull()

# Collapse the rows of the mask into its distinct missingness patterns and
# count how many rows share each pattern.
unique_rows, counts = np.unique(mask, axis=0, return_counts=True)
print(unique_rows.shape) # (number of distinct patterns, number of features)

# One line per pattern: the boolean pattern followed by its row count.
for i, n_rows in enumerate(counts):
    print(unique_rows[i], n_rows)

(5, 14)
[False False False False False False False False False False False False
 False False] 360
[False False False False False False False False False False False False
 False  True] 575
[False False False False False False False False False  True  True  True
  True  True] 824
[False False False False False False False False  True False False False
 False  True] 1754
[False False False False False False False False  True  True  True  True
  True  True] 1118


In [27]:
def xgb_model(X_train, Y_train, X_CV, y_CV, X_test, y_test, i, verbose=4):
    """Grid-search an XGBoost classifier on a validation set, then predict the test set.

    Every parameter combination is fitted on the training data with early
    stopping monitored on (X_CV, y_CV). The combination with the highest
    macro-averaged F1 on the validation set is selected, refitted with the
    same sample weights, and used to predict X_test.

    Parameters
    ----------
    X_train, Y_train : training features and labels.
    X_CV, y_CV : validation features and labels (early stopping + model selection).
    X_test : features to predict.
    y_test : accepted for interface compatibility; not used by this function.
    i : random state / seed handed to XGBoost.
    verbose : int
        >=1 prints the validation F1, >=2 prints the test predictions,
        >=4 prints the best score and parameters, >=5 prints grid progress.

    Returns
    -------
    tuple
        (validation F1 of the selected model, test-set predictions,
        feature importances of the selected model).
    """
    # The target has several classes, so F1 needs an explicit averaging mode
    # (the 'binary' default raises on multiclass labels). Macro averaging
    # weights every class equally, matching the balanced sample weights below.
    f1_average = 'macro'

    # flatten the labels to 1-D vectors to avoid an obnoxious sklearn/xgb warning
    Y_train = np.asarray(Y_train).ravel()
    y_CV = np.asarray(y_CV).ravel()

    XGB = xgboost.XGBClassifier(num_class=3,
                                eval_metric='merror',
                                random_state=i,
                                use_label_encoder=False)

    # candidate parameter sets
    param_grid = {"learning_rate": [0.01, 0.03],
                  "n_estimators": [1000],
                  "seed": [i],
                  "reg_lambda": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
                  "missing": [np.nan],
                  "max_depth": [1, 3, 10],
                  "colsample_bytree": [0.9],
                  "subsample": [0.7, 0.9]}
    candidates = list(ParameterGrid(param_grid))
    scores_f1 = np.zeros(len(candidates))

    # balanced class weights, used for every fit including the final refit
    weights = compute_sample_weight(class_weight='balanced', y=Y_train)
    eval_set = [(X_CV, y_CV)]

    # `k` (not `i`) indexes the grid so the random-state argument is not shadowed
    for k, params in enumerate(candidates):
        if verbose >= 5:
            print("Param set " + str(k + 1) + " / " + str(len(candidates)))
        XGB.set_params(**params)
        XGB.fit(X_train, Y_train,
                early_stopping_rounds=50, eval_set=eval_set, verbose=False,
                sample_weight=weights)  # with early stopping
        y_CV_pred = XGB.predict(X_CV, iteration_range=(0, XGB.best_ntree_limit))
        scores_f1[k] = f1_score(y_CV, y_CV_pred, average=f1_average)

    # select on the same metric that is reported; ties keep every winner
    best_score = np.max(scores_f1)
    best_params = np.array(candidates)[scores_f1 == best_score]
    if verbose >= 4:
        # the score printed here is measured on the validation set (X_CV, y_CV)
        print('Test set max score and best parameters are:')
        print(best_score)
        print(best_params)
        print()

    # refit the first best parameter set exactly as it was evaluated
    XGB.set_params(**best_params[0])
    XGB.fit(X_train, Y_train,
            early_stopping_rounds=50, eval_set=eval_set, verbose=False,
            sample_weight=weights)
    best_range = (0, XGB.best_ntree_limit)
    y_test_pred = XGB.predict(X_test, iteration_range=best_range)

    # validation F1 of the selected model, not of the last grid point
    y_CV_pred = XGB.predict(X_CV, iteration_range=best_range)
    cv_f1 = f1_score(y_CV, y_CV_pred, average=f1_average)

    if verbose >= 1:
        print ('The F1 is:', cv_f1)
        print()
    if verbose >= 2:
        print ('The predictions are:')
        print (y_test_pred)
        print()

    return (cv_f1, y_test_pred, XGB.feature_importances_)


In [28]:
def reduced_feature_xgb(X_train, Y_train, X_CV, y_CV, X_test, y_test, i):
    """Reduced-feature XGB model: one model per missing-value pattern of the test set.

    For every distinct pattern of missing values found in X_test, the features
    missing in that pattern are dropped, the training/validation rows that
    still contain NaNs in the remaining features are discarded, and
    ``xgb_model`` is run on that reduced problem to predict the matching test
    rows. Predictions are then put back into the original test-row order to
    compute global scores.

    Parameters
    ----------
    X_train, Y_train, X_CV, y_CV, X_test, y_test : pandas objects
        All inputs need to be pandas DataFrames (single-column frames for the
        labels). Features and labels are matched by row position.
    i : random state forwarded to ``xgb_model`` for every pattern.

    Returns
    -------
    tuple
        (global accuracy, global macro-averaged F1, confusion matrix) computed
        over the whole test set.
    """
    # Multiclass labels need an explicit F1 averaging mode; the 'binary'
    # default raises on them.
    f1_average = 'macro'

    # find all unique patterns of missing values in the test set
    test_mask = X_test.isnull().to_numpy()
    unique_rows = np.unique(test_mask, axis=0)
    y_true = np.asarray(y_test).ravel()
    pred_parts = []

    print('    There are', len(unique_rows), 'unique missing value patterns.')
    print()

    # `p` (not `i`) indexes the patterns so the random-state argument survives
    for p, pattern in enumerate(unique_rows):
        print ('    *** Working on unique pattern', p, ' ***')

        # positions of the test rows with exactly this pattern, and of the
        # feature columns that are present in it
        row_pos = np.flatnonzero((test_mask == pattern).all(axis=1))
        col_pos = np.flatnonzero(~pattern)

        sub_X_test = X_test.iloc[row_pos, col_pos]
        sub_y_test = y_test.iloc[row_pos]
        sub_y_true = y_true[row_pos]

        # 1. keep only the feature columns available for this pattern
        sub_X_train = X_train.iloc[:, col_pos]
        sub_X_CV = X_CV.iloc[:, col_pos]
        # 2. keep only the rows that are complete on those columns; positional
        #    boolean masks keep features and labels aligned whatever their index
        train_ok = sub_X_train.notna().all(axis=1).to_numpy()
        cv_ok = sub_X_CV.notna().all(axis=1).to_numpy()
        sub_X_train = sub_X_train.iloc[train_ok]
        sub_X_CV = sub_X_CV.iloc[cv_ok]
        # 3. cut the labels accordingly
        sub_Y_train = Y_train.iloc[train_ok]
        sub_y_CV = y_CV.iloc[cv_ok]

        # run XGB; element 1 of the returned tuple holds the test predictions
        sub_result = xgb_model(sub_X_train, sub_Y_train, sub_X_CV,
                               sub_y_CV, sub_X_test, sub_y_test, i, verbose=4)
        sub_y_test_pred = np.asarray(sub_result[1]).ravel()

        print()
        print('   Accuracy:', accuracy_score(sub_y_true, sub_y_test_pred))
        print('   F1 Score:', f1_score(sub_y_true, sub_y_test_pred, average=f1_average))

        # remember which test rows these predictions belong to
        pred_parts.append(pd.Series(sub_y_test_pred, index=row_pos))

    # the patterns partition the test rows, so sorting by row position restores
    # the original order and aligns the predictions with y_true
    all_y_test_pred = pd.concat(pred_parts).sort_index().to_numpy()

    # get global scores
    total_accuracy = accuracy_score(y_true, all_y_test_pred)
    f1 = f1_score(y_true, all_y_test_pred, average=f1_average)
    cm = confusion_matrix(y_true, all_y_test_pred)

    return total_accuracy, f1, cm

In [25]:
%%time
from sklearn.metrics import f1_score
results = []
f1_scores = []
cm = []

# label encoder for XGBoost
le = LabelEncoder()
y_subsample2 = le.fit_transform(y_subsample2)


## Perform 60-20-20 Split
# splitter for subsampled data
stratGroupKFold2 = StratifiedGroupKFold(n_splits=5)

# splitter for other
stratGroupKFold3 = StratifiedGroupKFold(n_splits=4)


nr_states = [0,1,2] #,1,2
counter = 0
counter2 = 0
final_models = []
best_parameters = []

for i_other2,i_test2 in stratGroupKFold2.split(X_subsample2.values, y_subsample2, groups_subsample2.values):
    
    counter = counter + 1
    
    X_other2, y_other2, groups_other2 = X_subsample2.values[i_other2], y_subsample2[i_other2], groups_subsample2.values[i_other2]
    X_test, y_test, groups_test = X_subsample2.values[i_test2], y_subsample2[i_test2], groups_subsample2.values[i_test2]
    
    
    # Reshape the data
    
    X_other2 = pd.DataFrame(X_other2)
    X_other2.columns = X_subsample2.columns
    
    X_test = pd.DataFrame(X_test)
    X_test.columns = X_subsample2.columns

    y_other2 = pd.DataFrame(y_other2)
    y_other2.columns = y_subsample2_columns
    
    y_test = pd.DataFrame(y_test)
    y_test.columns = y_subsample2_columns
    
    groups_other2 = pd.DataFrame(groups_other2)
    groups_other2.columns = groups_subsample2.columns
    
    
    
    print(f'Test Set #{counter}')

    print("    Test Set Size:", len(y_test))

    print()
    
    
    
    #for i in range(len(nr_states)):

        #print("         Random State:", i)
        #print()
        
    counter2 = 0
    
    for i in range(len(nr_states)):
    
        for i_train3,i_val3 in stratGroupKFold3.split(X_other2.values, y_other2.values, groups_other2.values):
            
            counter2 = counter2 + 1

            # Perform n-Fold CV
            #cv = stratGroupKFold3.split(X_other2, y_other2, groups_other2)

            X_train, y_train, groups_train = X_other2.iloc[i_train3], y_other2.iloc[i_train3], groups_other2.iloc[i_train3]
            X_val, y_val, groups_val = X_other2.iloc[i_val3], y_other2.iloc[i_val3], groups_other2.iloc[i_val3]

            print(f"    Train Set #{counter2} Size: {len(X_train)}")
            print()


            X_prep = preprocessor.fit_transform(X_train)

            # collect feature names
            feature_names = preprocessor.get_feature_names_out()

            df_train = pd.DataFrame(data=X_prep,columns=feature_names)
            print(f"    Train Set Shape after preprocessing: {df_train.shape}")
            print()
            
            y_train = pd.DataFrame(y_train)
            y_train.columns = y_subsample2_columns

            # transform the CV
            df_CV = preprocessor.transform(X_val)
            df_CV = pd.DataFrame(data=df_CV,columns = feature_names)
            print(f"    Validation Set Shape after preprocessing: {df_CV.shape}")
            print()
            
            y_CV = pd.DataFrame(y_val)
            y_CV.columns = y_subsample2_columns


            # transform the test
            df_test = preprocessor.transform(X_test)
            df_test = pd.DataFrame(data=df_test,columns = feature_names)
            print(f"    Test Set Shape after preprocessing: {df_test.shape}")
            print()

            

            total_accuracy, f1, cm = reduced_feature_xgb(df_train, y_train, df_CV, y_CV, df_test, y_test, i)
            results.append(total_accuracy)
            
            f1_scores.append(f1)
    

Test Set #1
    Test Set Size: 927

    Train Set #1 Size: 2778

    Train Set Shape after preprocessing: (2778, 903)

    Validation Set Shape after preprocessing: (926, 903)

    Test Set Shape after preprocessing: (927, 903)

    There are 5 unique missing value patterns.

    *** Working on unique pattern 0  ***
Test set max score and best parameters are:
0.8441558441558441
[{'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 1, 'missing': nan, 'n_estimators': 1000, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'seed': 0, 'subsample': 0.9}
 {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 1, 'missing': nan, 'n_estimators': 1000, 'reg_alpha': 0.0, 'reg_lambda': 0.01, 'seed': 0, 'subsample': 0.9}
 {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 1, 'missing': nan, 'n_estimators': 1000, 'reg_alpha': 0.01, 'reg_lambda': 0.0, 'seed': 0, 'subsample': 0.9}
 {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 1, 'missing': nan, 'n_estimators': 1000, 're

KeyboardInterrupt: 

In [None]:
%%time

test_scores = []
best_params = []
confusion_mat = []
class_met = []

for i in range(1):
    print(f'Random State # {i}')
    print()
    
    grid, score, cm, class_metrics = ML_pipeline_groups_GridSearchCV(X, y, groups, i*42, 2)
    
    confusion_mat.append(cm)
    
    class_met.append(class_metrics)
    
    print(grid.best_params_)
    
    best_params.append(grid.best_params_)
    print()
    print('best CV score:',grid.best_score_)
    print()
    print('test score:', score)
    test_scores.append(score)
    print()
    
print('test accuracy:',np.around(np.mean(test_scores),2),'+/-',np.around(np.std(test_scores),2))