In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedGroupKFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   OrdinalEncoder, StandardScaler)
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_sample_weight

  from pandas import MultiIndex, Int64Index


In [2]:
# Columns that are one-hot encoded by the preprocessor.
categorical_ftrs = [
    'Prscrbr_City',
    'Prscrbr_State_Abrvtn',
    'Brnd_Name',
    'Gnrc_Name',
]

# Columns that are standardised (StandardScaler) by the preprocessor.
std_ftrs = [
    'Tot_Clms',
    'Tot_30day_Fills',
    'Tot_Day_Suply',
    'Tot_Drug_Cst',
    'Tot_Benes',
    'GE65_Tot_Clms',
    'GE65_Tot_30day_Fills',
    'GE65_Tot_Drug_Cst',
    'GE65_Tot_Day_Suply',
    'GE65_Tot_Benes',
]

# Column-wise preprocessing shared by every pipeline in this notebook.
# Categories unseen during fit are ignored at transform time
# (handle_unknown='ignore') and the encoder emits a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot',
         OneHotEncoder(sparse=False, handle_unknown='ignore'),
         categorical_ftrs),
        ('std',
         StandardScaler(),
         std_ftrs),
    ]
)


In [127]:
# All reads below are relative to the data directory.
# NOTE(review): the chdir is relative, so running this cell a second time in
# the same kernel moves one directory further up before looking for 'data'.
os.chdir('../data')

# Load features, targets and group labels for round 1 / split 2. The first
# column of each file is dropped right after reading.
# NOTE(review): presumably that column is a saved index - confirm.
X_subsample2 = pd.read_csv(
    'X_subsample_round1_split2.zip', compression='zip', index_col=False
).iloc[:, 1:]
y_subsample2 = pd.read_csv(
    'y_subsample_round1_split2.zip', compression='zip'
).iloc[:, 1:]
groups_subsample2 = pd.read_csv(
    'groups_subsample_round1_split2.zip', compression='zip'
).iloc[:, 1:]

# Remember the target column labels while y_subsample2 is still a DataFrame.
y_subsample2_columns = y_subsample2.columns

In [128]:
# Hyper-parameter grids for GridSearchCV. Keys carry the "xgbclassifier__"
# prefix because the classifier is addressed as the `xgbclassifier` step of a
# pipeline built with make_pipeline.
#
# Candidate values explored in the full grid:
#   subsample      [0.4, 0.6, 0.8]
#   max depth      [1, 3, 10, 30]
#   learning rate  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]

# Full search space: 3 * 4 * 6 * 3 = 216 candidates.
param_grid = {"xgbclassifier__subsample": [0.4, 0.6, 0.8],
              "xgbclassifier__missing": [np.nan],
              "xgbclassifier__max_depth": [1, 3, 10, 30],
              "xgbclassifier__learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
              "xgbclassifier__n_estimators": [500],
              "xgbclassifier__colsample_bytree": [0.4, 0.6, 0.8]}


# Single-candidate grid for quick end-to-end runs.
# The original literal listed "xgbclassifier__max_depth" twice ([1], then [3]);
# Python keeps only the last value for a repeated key, so [3] was the value
# actually in effect. It is now stated once to make that explicit.
param_grid1 = {"xgbclassifier__subsample": [0.4],
               "xgbclassifier__missing": [np.nan],
               "xgbclassifier__max_depth": [3],
               "xgbclassifier__learning_rate": [0.01]}


In [123]:
# Show the first ten labels held in y_other2.
# NOTE(review): in the visible notebook y_other2 is first assigned inside the
# cross-validation loop cells further down, so this cell raises NameError on a
# fresh kernel unless one of those cells has been run first.
pd.Series(y_other2).head(10)

0    1
1    1
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
dtype: int64

In [None]:
# Distinct label values in y_other2 together with how often each occurs.
# NOTE(review): relies on y_other2 left in the kernel by a cross-validation
# loop cell that appears later in the notebook.
np.unique(y_other2, return_counts=True)

In [129]:
# Per-sample weights computed with class_weight='balanced' from the labels in
# y_other2; only the first ten are displayed.
# compute_sample_weight is imported in the notebook's top import cell rather
# than here, so every cell that needs it works after Restart & Run All.
# NOTE(review): y_other2 is assigned by the cross-validation loop cells further
# down; this cell needs one of them to have run first.
weights = compute_sample_weight(class_weight='balanced', y=y_other2)
pd.Series(weights).head(10)

0    7.011776
1    7.011776
2    7.011776
3    7.011776
4    0.221865
5    0.221865
6    0.221865
7    0.221865
8    0.221865
9    0.221865
dtype: float64

In [None]:
%%time




# splitter for subsampled data
stratGroupKFold2 = StratifiedGroupKFold(n_splits=2)

# splitter for other
stratGroupKFold3 = StratifiedGroupKFold(n_splits=2)

# label encoder for XGBoost
le = LabelEncoder()
y_subsample2 = le.fit_transform(y_subsample2)

# Initialize lists
nr_states = [0]
test_scores = []
final_models = []
best_params = []
test_scores = []
counter = 0

# num of lists should equal number of test sets that the StratifiedGroupKFold will generate
# each list should contain entries with the same number of random state
    
list1_params = []
list1_grid = []
list1_pred = []
list1_score = []

list2_params = []
list2_grid = []
list2_pred = []
list2_score = []

list3_params = []
list3_grid = []
list3_pred = []
list3_score = []

for i_other2,i_test2 in stratGroupKFold2.split(X_subsample2.values, y_subsample2, groups_subsample2.values):
    
    counter = counter + 1
    
    X_other2, y_other2, groups_other2 = X_subsample2.values[i_other2], y_subsample2[i_other2], groups_subsample2.values[i_other2]
    X_test2, y_test2, groups_test2 = X_subsample2.values[i_test2], y_subsample2[i_test2], groups_subsample2.values[i_test2]

    # Reshape the data
    
    X_other2 = pd.DataFrame(X_other2)
    X_other2.columns = X_subsample2.columns

    y_other2 = pd.DataFrame(y_other2)
    y_other2.columns = y_subsample2_columns
    

    groups_other2 = pd.DataFrame(groups_other2)
    groups_other2.columns = groups_subsample2.columns

    y_other2 = np.reshape(np.array(y_other2), (1, -1)).ravel()

    y_test2 = np.reshape(np.array(y_test2), (1, -1)).ravel()
    
    
    
    print(f'Test Set #{counter}')

    print("    Test Set Size:", len(y_test2))

    print()
    
    for i in range(len(nr_states)):

        print("         Random State:", i)
        print()

        # Perform n-Fold CV
        cv = stratGroupKFold3.split(X_other2, y_other2, groups_other2)


        # Initialize XGBoost Classifier
        clf = xgb.XGBClassifier(num_class=10, eval_metric = 'mlogloss', 
                                random_state=i, use_label_encoder=False, )

        pipe = make_pipeline(preprocessor,clf)

        grid = GridSearchCV(pipe, param_grid=param_grid1,scoring ='accuracy', 
                            cv=cv, return_train_score = True, n_jobs=1, verbose=10)
        
        # Compute sample weights
        weights = compute_sample_weight(class_weight='balanced', y = y_other2)

        grid_result = grid.fit(X_other2, y_other2, groups = groups_other2,
                              xgbclassifier__sample_weight = weights) #xgbclassifier__early_stopping_rounds=50
        
        print('best model parameters:', grid.best_params_)

        print()

        print('validation score:',grid.best_score_)
        
        print()
        
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        
        y_test_pred = grid.predict(X_test2)
        score = accuracy_score(y_test,y_test_pred)
        
        if counter == 1:
            list1_params.append(grid.best_params_)

            list1_grid.append(grid)

            list1_pred.append(y_test_pred)
            
            list1_score.append(score)

        if counter == 2:
            list2_params.append(grid.best_params_)

            list2_grid.append(grid)
            
            list2_pred.append(y_test_pred)
            
            list2_score.append(score)
            
        if counter == 2:
            list3_params.append(grid.best_params_)

            list3_grid.append(grid)

            list3_pred.append(y_test_pred)

            list3_score.append(score)
            
            
    
        #print('test score:', score)

        #test_scores.append(score)


Test Set #1
    Test Set Size: 37513

         Random State: 0

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 1/2; 1/1] START xgbclassifier__learning_rate=0.01, xgbclassifier__max_depth=3, xgbclassifier__missing=nan, xgbclassifier__subsample=0.4
[CV 1/2; 1/1] END xgbclassifier__learning_rate=0.01, xgbclassifier__max_depth=3, xgbclassifier__missing=nan, xgbclassifier__subsample=0.4;, score=(train=0.550, test=0.219) total time= 2.9min
[CV 2/2; 1/1] START xgbclassifier__learning_rate=0.01, xgbclassifier__max_depth=3, xgbclassifier__missing=nan, xgbclassifier__subsample=0.4
[CV 2/2; 1/1] END xgbclassifier__learning_rate=0.01, xgbclassifier__max_depth=3, xgbclassifier__missing=nan, xgbclassifier__subsample=0.4;, score=(train=0.521, test=0.214) total time= 3.3min
best model parameters: {'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__max_depth': 3, 'xgbclassifier__missing': nan, 'xgbclassifier__subsample': 0.4}

validation score: 0.21640490585654448

Test Set #2
    Te

In [78]:
%%time

stratGroupKFold2 = StratifiedGroupKFold(n_splits=5)
stratGroupKFold3 = StratifiedGroupKFold(n_splits=2)

#clf = Pipeline(steps=[('preprocessor', preprocessor)])                                               
counter = 0
for i_other2,i_test2 in stratGroupKFold2.split(X_subsample2.values, y_subsample2.values, groups_subsample2.values):
    
    counter = counter + 1
    print(f'Test Set #{counter}')

    
    X_other2, y_other2, groups_other2 = X_subsample2.values[i_other2], y_subsample2.values[i_other2], groups_subsample2.values[i_other2]
    X_test2, y_test2, groups_test2 = X_subsample2.values[i_test2], y_subsample2.values[i_test2], groups_subsample2.values[i_test2]

    #print("Size of OTHER:", len(y_other))
    print("Size of OTHER:", len(y_other2))
    print("Size of OTHER:", len(y_other2)/len(y_subsample2))
    print()

    #print("Size of TEST:", len(y_test))
    print("Size of TEST:", len(y_test2))
    print("Size of TEST:", len(y_test2)/len(y_subsample2))
    print()
    

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [12]:
# Prepare the current fold for the pipeline.

# Features and groups are wrapped as DataFrames carrying the original column
# labels (the preprocessor selects its input columns by name).
X_other2 = pd.DataFrame(X_other2)
X_other2.columns = X_subsample2.columns

groups_other2 = pd.DataFrame(groups_other2)
groups_other2.columns = groups_subsample2.columns

# Targets only need to be flat 1-D arrays. The original first wrapped y_other2
# in a DataFrame labelled with y_subsample2.columns and then flattened it
# again; that intermediate frame was discarded, and the `.columns` lookup fails
# once y_subsample2 has been label-encoded into a NumPy array.
y_other2 = np.asarray(y_other2).ravel()

y_test2 = np.asarray(y_test2).ravel()

In [23]:
%%time
cv = stratGroupKFold2.split(X_other2, y_other2, groups_other2)

clf = xgb.XGBClassifier(use_label_encoder=False) # use_label_encoder=False

pipe = make_pipeline(preprocessor,clf)

grid = GridSearchCV(pipe, param_grid=param_grid,scoring ='accuracy', 
                        cv=cv, return_train_score = True, n_jobs=1, verbose=4)

CPU times: user 391 µs, sys: 7.9 ms, total: 8.29 ms
Wall time: 19.3 ms


In [20]:
# Sorted distinct label values present in y_other2.
# NOTE(review): presumably a check that the labels are the contiguous integers
# 0..n_classes-1 before fitting XGBoost - confirm.
np.unique(y_other2)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [21]:
%%time

grid_result = grid.fit(X_other2, y_other2, groups=groups_other2)

#results = pd.DataFrame(grid.cv_results_)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#means = grid_result.cv_results_['mean_test_score']
#stds = grid_result.cv_results_['std_test_score']
#params = grid_result.cv_results_['params']

Fitting 4 folds for each of 1 candidates, totalling 4 fits




[CV 1/4] END xgbclassifier__max_depth=1, xgbclassifier__missing=nan, xgbclassifier__subsample=0.4;, score=(train=0.676, test=0.446) total time=15.6min




[CV 2/4] END xgbclassifier__max_depth=1, xgbclassifier__missing=nan, xgbclassifier__subsample=0.4;, score=(train=0.693, test=0.498) total time=15.4min




[CV 3/4] END xgbclassifier__max_depth=1, xgbclassifier__missing=nan, xgbclassifier__subsample=0.4;, score=(train=0.696, test=0.466) total time=15.0min




[CV 4/4] END xgbclassifier__max_depth=1, xgbclassifier__missing=nan, xgbclassifier__subsample=0.4;, score=(train=0.678, test=0.469) total time=15.2min




Best: 0.469953 using {'xgbclassifier__max_depth': 1, 'xgbclassifier__missing': nan, 'xgbclassifier__subsample': 0.4}
CPU times: user 6h 59s, sys: 1h 45min 55s, total: 7h 46min 55s
Wall time: 1h 31min 55s


1h 31min

In [22]:
# Mean validation score of every candidate in the grid search (one entry per
# parameter combination), displayed as the cell output.
means = grid_result.cv_results_['mean_test_score']
means

array([0.46995263])