In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline
import xgboost as xgb
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit

  from pandas import MultiIndex, Int64Index


In [2]:
# Columns handed to the OneHotEncoder step of the preprocessing
# ColumnTransformer built in ML_pipeline_groups_GridSearchCV.
categorical_ftrs = [
    'Prscrbr_City',
    'Prscrbr_State_Abrvtn',
    'Brnd_Name',
    'Gnrc_Name',
]

# Columns handed to the StandardScaler step of the same ColumnTransformer.
# Written as two groups (overall totals, then the GE65_* counterparts);
# the resulting order is exactly the order the scaler receives.
std_ftrs = [
    'Tot_Clms',
    'Tot_30day_Fills',
    'Tot_Day_Suply',
    'Tot_Drug_Cst',
    'Tot_Benes',
] + [
    'GE65_Tot_Clms',
    'GE65_Tot_30day_Fills',
    'GE65_Tot_Drug_Cst',
    'GE65_Tot_Day_Suply',
    'GE65_Tot_Benes',
]



#clf = Pipeline(steps=[('preprocessor', preprocessor)])                                               


In [3]:
# Hyper-parameter grids for a pipeline whose final step is named
# "xgbclassifier" (the name make_pipeline gives an XGBClassifier step).
# NOTE: ML_pipeline_groups_GridSearchCV defines its own local grid, so the
# two grids below are not what that function searches over.

# Full search space: 3 * 1 * 3 * 4 * 1 * 3 = 108 candidate settings.
param_grid = {
    "xgbclassifier__subsample": [0.5, 0.7, 0.9],
    "xgbclassifier__missing": [np.nan],
    "xgbclassifier__max_depth": [1, 3, 10],
    "xgbclassifier__learning_rate": [0.001, 0.01, 0.1, 0.3],
    "xgbclassifier__n_estimators": [1000],
    "xgbclassifier__gamma": [1, 5, 10],
}

# Reduced space: only the subsample ratio varies (3 candidate settings).
param_grid1 = {
    "xgbclassifier__subsample": [0.5, 0.7, 0.9],
    "xgbclassifier__missing": [np.nan],
    "xgbclassifier__learning_rate": [0.01],
    "xgbclassifier__max_depth": [1],
    "xgbclassifier__gamma": [5],
    "xgbclassifier__n_estimators": [1000],
}

# Extra keyword arguments intended for the classifier's fit call.
# Not referenced by any of the cells visible in this notebook.
fit_params = {"xgbclassifier__early_stopping_rounds": 50}

# Named scorers: plain accuracy, macro-averaged precision/recall/F1,
# and support-weighted F1.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    precision=make_scorer(precision_score, average='macro'),
    recall=make_scorer(recall_score, average='macro'),
    f1_macro=make_scorer(f1_score, average='macro'),
    f1_weighted=make_scorer(f1_score, average='weighted'),
)

In [5]:
# All file names below are resolved relative to the data directory.
os.chdir('../data')

# Read the three archives and discard the leading column of each
# (presumably a row index written out with the CSV -- TODO confirm).
X = pd.read_csv('X_4specialties_equalWeight_subsample.zip',
                compression='zip', index_col=False).iloc[:, 1:]
y = pd.read_csv('y_4specialties_equalWeight_subsample.zip',
                compression='zip').iloc[:, 1:]
groups = pd.read_csv('groups_4specialties_equalWeight_subsample.zip',
                     compression='zip').iloc[:, 1:]

# Keep the target's column labels so they can be restored after encoding.
y_columns = y.columns

# Replace the class labels with integer codes; `le` retains the mapping
# (le.classes_) for translating predictions back to the original labels.
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(y.values.ravel()))
y.columns = y_columns

In [10]:

def ML_pipeline_groups_GridSearchCV(X,y,groups,random_state,n_folds):
    """Tune and score an XGBoost classifier using group-aware splits.

    A single GroupShuffleSplit holds out 20% of the data as a test set,
    split by group. The remaining rows are used for a GroupKFold grid
    search over a pipeline of (ColumnTransformer -> XGBClassifier). The
    refit best estimator is then scored on the held-out test set with
    macro-averaged F1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is indexed positionally with ``.iloc`` and must
        contain the columns listed in the module-level ``categorical_ftrs``
        and ``std_ftrs``.
    y : pandas.DataFrame or pandas.Series
        Target labels, row-aligned with ``X``.
    groups : pandas.DataFrame or pandas.Series
        Group label of every row, row-aligned with ``X``.
    random_state : int
        Seed used for BOTH the train/test group split and the classifier,
        so one argument fully determines the run.
    n_folds : int
        Number of GroupKFold folds used during the grid search.

    Returns
    -------
    grid : GridSearchCV
        The fitted search object (``best_params_``, ``best_score_``, ...).
    score : float
        Macro-averaged F1 of the best estimator on the held-out test set.
    """
    # Hold out a test set by group. n_splits=1, so take the only split.
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
    i_other, i_test = next(splitter.split(X, y, groups))
    X_other, y_other, groups_other = X.iloc[i_other], y.iloc[i_other], groups.iloc[i_other]
    X_test, y_test = X.iloc[i_test], y.iloc[i_test]

    # Group-aware splitter for cross-validation on the non-test rows.
    kf = GroupKFold(n_splits=n_folds)

    # Seed the model from the function argument. The previous code read a
    # global loop variable `i`, which ignored `random_state` and raised
    # NameError whenever the function was called outside that loop.
    clf = xgb.XGBClassifier(num_class=4,
                            eval_metric="mlogloss",
                            objective="multi:softprob",
                            random_state=random_state,
                            use_label_encoder=False)

    # One-hot encode the categorical columns and standardise the numeric
    # ones; categories unseen at fit time are encoded as all zeros.
    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_ftrs),
            ('std', StandardScaler(), std_ftrs)])

    pipe = make_pipeline(preprocessor, clf)

    # Single-candidate grid. Kept local (and named differently) so it does
    # not shadow the module-level param_grid / param_grid1.
    search_grid = {"xgbclassifier__subsample": [0.8],
                   "xgbclassifier__missing": [np.nan],
                   "xgbclassifier__learning_rate": [0.1],
                   "xgbclassifier__max_depth": [15],
                   "xgbclassifier__gamma": [5],
                   "xgbclassifier__n_estimators": [1000]}

    grid = GridSearchCV(pipe,
                        param_grid=search_grid,
                        scoring="f1_macro",
                        cv=kf,
                        return_train_score=True,
                        n_jobs=1,
                        verbose=10)

    # Group-aware k-fold CV on the non-test rows; GridSearchCV refits the
    # best candidate on all of them afterwards (refit defaults to True).
    grid.fit(X_other, y_other, groups=groups_other)

    # Score the refit best estimator on the held-out groups.
    y_test_pred = grid.predict(X_test)
    score = f1_score(y_test, y_test_pred, average="macro")

    return grid, score



In [11]:
%%time

# Macro-F1 on the held-out test set, one entry per random state.
test_scores = []

for i in range(1):
    print(f'Random State # {i}')
    print()

    # Seed i*42 drives the split inside the pipeline function; 5 is the
    # number of cross-validation folds.
    grid, score = ML_pipeline_groups_GridSearchCV(X, y, groups, i*42, 5)
    print(grid.best_params_)
    print('best CV score:', grid.best_score_)
    print()
    print('test score:', score)
    test_scores.append(score)
    print()

# The collected scores are macro-averaged F1 values, not accuracies, so the
# summary line is labelled accordingly.
print('test f1_macro:', np.around(np.mean(test_scores), 2), '+/-', np.around(np.std(test_scores), 2))

Random State # 0

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START xgbclassifier__gamma=5, xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=15, xgbclassifier__missing=nan, xgbclassifier__n_estimators=1000, xgbclassifier__subsample=0.8
[CV 1/5; 1/1] END xgbclassifier__gamma=5, xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=15, xgbclassifier__missing=nan, xgbclassifier__n_estimators=1000, xgbclassifier__subsample=0.8;, score=(train=0.980, test=0.391) total time=11.5min
[CV 2/5; 1/1] START xgbclassifier__gamma=5, xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=15, xgbclassifier__missing=nan, xgbclassifier__n_estimators=1000, xgbclassifier__subsample=0.8
[CV 2/5; 1/1] END xgbclassifier__gamma=5, xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=15, xgbclassifier__missing=nan, xgbclassifier__n_estimators=1000, xgbclassifier__subsample=0.8;, score=(train=0.977, test=0.584) total time=13.0min
[CV 3/5; 1/1] START xgbclassi

KeyboardInterrupt: 