In [15]:
import pandas as pd
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from xgboost.sklearn import XGBClassifier  
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, ShuffleSplit, GridSearchCV, ParameterGrid

player = pd.read_csv('./data/player.csv')
playerScale = pd.read_csv('./data/playerscale.csv')

In [2]:
player.head()
player = player.drop(['Unnamed: 0'], axis=1)
playerScale.head()
playerScale = playerScale.drop(['Unnamed: 0'], axis=1)

In [3]:
print(playerScale.info())
playerScale.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19820 entries, 0 to 19819
Data columns (total 40 columns):
G_x               19820 non-null float64
AB                19820 non-null float64
R                 19820 non-null float64
H                 19820 non-null float64
2B                19820 non-null float64
3B                19820 non-null float64
HR                19820 non-null float64
RBI               19820 non-null float64
SB_x              19820 non-null float64
CS_x              19820 non-null float64
BB                19820 non-null float64
SO                19820 non-null float64
IBB               19820 non-null float64
HBP               19820 non-null float64
SH                19820 non-null float64
SF                19820 non-null float64
GIDP              19820 non-null float64
stint             19820 non-null float64
G_y               19820 non-null float64
GS                19820 non-null float64
InnOuts           19820 non-null float64
PO                19820 non-nu

G_x               0
AB                0
R                 0
H                 0
2B                0
3B                0
HR                0
RBI               0
SB_x              0
CS_x              0
BB                0
SO                0
IBB               0
HBP               0
SH                0
SF                0
GIDP              0
stint             0
G_y               0
GS                0
InnOuts           0
PO                0
A                 0
E                 0
DP                0
PB                0
SB_y              0
CS_y              0
salary            0
MinSalaries       0
adjSalary         0
adjMinSalaries    0
adjSalary2        0
Bavg              0
Slug              0
OBP               0
FPct              0
yearID            0
playerID          0
Primary           0
dtype: int64

In [4]:
#Create Mapping to Positions
def format_cat(df):
    cat_cols = ['Primary']
    for i in cat_cols:
        df[i] = df[i].map({'C':2, '1B':3, '2B': 4, '3B':5, 'SS':6, 'OF':7, 'DH':10})

format_cat(playerScale)

In [5]:
playerScale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19820 entries, 0 to 19819
Data columns (total 40 columns):
G_x               19820 non-null float64
AB                19820 non-null float64
R                 19820 non-null float64
H                 19820 non-null float64
2B                19820 non-null float64
3B                19820 non-null float64
HR                19820 non-null float64
RBI               19820 non-null float64
SB_x              19820 non-null float64
CS_x              19820 non-null float64
BB                19820 non-null float64
SO                19820 non-null float64
IBB               19820 non-null float64
HBP               19820 non-null float64
SH                19820 non-null float64
SF                19820 non-null float64
GIDP              19820 non-null float64
stint             19820 non-null float64
G_y               19820 non-null float64
GS                19820 non-null float64
InnOuts           19820 non-null float64
PO                19820 non-nu

In [6]:
#Seperate Predicter 
X = playerScale.drop(['Primary', 'playerID'], axis=1)
y = playerScale['Primary'].values
yhat = np.empty(y.shape,dtype=str)

In [7]:
#Get training and test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simple XGBoost Model

In [9]:
xgbModel = XGBClassifier()
xgbModel.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [10]:
y_pred = xgbModel.predict(X_test)

In [11]:
accuracy = accuracy_score(y_test, y_pred)
print("The Accuracy of the XGBoost Model: %.2f%%" % (accuracy * 100.0))

The Accuracy of the XGBoost Model: 87.61%


# XG Boost Model with Cross Validation

### With Cross Validation

In [17]:
gbm_param_grid = {
     'colsample_bytree': np.linspace(0.5, 0.9, 5),
     'n_estimators':[100],
     'max_depth': [10, 15, 20]
}

gbm = xgb.XGBRegressor()
gbm_mse = GridSearchCV(estimator = gbm, param_grid = gbm_param_grid, cv = 5, verbose = 1)
gbm_mse.fit(X_train, y_train)
print("Lowest RMSE found: ", np.sqrt(np.abs(gbm_mse.best_score_)))

pred = gbm_mse.predict(X_test)
print("Root mean square error for test dataset: {}".format(np.round(np.sqrt(mean_squared_error(y_test, pred)), 2)))


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 11.1min finished


Lowest RMSE found:  0.949646796858
Root mean square error for test dataset: 0.61


In [30]:
gbm_mse.grid_scores_



[mean: 0.89919, std: 0.00149, params: {'colsample_bytree': 0.5, 'max_depth': 10, 'n_estimators': 100},
 mean: 0.89162, std: 0.00177, params: {'colsample_bytree': 0.5, 'max_depth': 15, 'n_estimators': 100},
 mean: 0.88774, std: 0.00279, params: {'colsample_bytree': 0.5, 'max_depth': 20, 'n_estimators': 100},
 mean: 0.90052, std: 0.00320, params: {'colsample_bytree': 0.59999999999999998, 'max_depth': 10, 'n_estimators': 100},
 mean: 0.89578, std: 0.00398, params: {'colsample_bytree': 0.59999999999999998, 'max_depth': 15, 'n_estimators': 100},
 mean: 0.89388, std: 0.00329, params: {'colsample_bytree': 0.59999999999999998, 'max_depth': 20, 'n_estimators': 100},
 mean: 0.90155, std: 0.00181, params: {'colsample_bytree': 0.69999999999999996, 'max_depth': 10, 'n_estimators': 100},
 mean: 0.89922, std: 0.00303, params: {'colsample_bytree': 0.69999999999999996, 'max_depth': 15, 'n_estimators': 100},
 mean: 0.89688, std: 0.00254, params: {'colsample_bytree': 0.69999999999999996, 'max_depth': 20,

### Optimizing Accuracy

In [21]:
cv_params = {'max_depth': [3,5,7]}

ind_params = {'learning_rate': 0.1, 'n_estimators': 500, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}
accuracy_gbm = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1)

accuracy_gbm.fit(X_train, y_train)

pred = accuracy_gbm.predict(X_test)

print("Lowest RMSE found: ", np.sqrt(np.abs(accuracy_gbm.best_score_)))

Lowest RMSE found:  0.942868500419




{'mean_fit_time': array([ 105.16629562,  159.56041799,  176.3811502 ]),
 'mean_score_time': array([ 1.74261189,  2.3094954 ,  2.39035378]),
 'mean_test_score': array([ 0.88893794,  0.88900101,  0.88799193]),
 'mean_train_score': array([ 0.97608164,  1.        ,  1.        ]),
 'param_max_depth': masked_array(data = [3 5 7],
              mask = [False False False],
        fill_value = ?),
 'params': [{'max_depth': 3}, {'max_depth': 5}, {'max_depth': 7}],
 'rank_test_score': array([2, 1, 3], dtype=int32),
 'split0_test_score': array([ 0.88755906,  0.8888189 ,  0.88661417]),
 'split0_train_score': array([ 0.97579055,  1.        ,  1.        ]),
 'split1_test_score': array([ 0.88965952,  0.88965952,  0.89281211]),
 'split1_train_score': array([ 0.97713655,  1.        ,  1.        ]),
 'split2_test_score': array([ 0.88832808,  0.89022082,  0.88864353]),
 'split2_train_score': array([ 0.97595775,  1.        ,  1.        ]),
 'split3_test_score': array([ 0.88580442,  0.88233438,  0.88233438

In [29]:
accuracy_gbm.grid_scores_




[mean: 0.88894, std: 0.00253, params: {'max_depth': 3},
 mean: 0.88900, std: 0.00377, params: {'max_depth': 5},
 mean: 0.88799, std: 0.00346, params: {'max_depth': 7}]

In [28]:
from xgboost import plot_importance
plot_importance(accuracy_gbm, )

ValueError: tree must be Booster, XGBModel or dict instance

In [31]:
xgdmat = xgb.DMatrix(X_train, y_train)
final_gb = xgb.train(ind_params, xgdmat, num_boost_round = 432)
xgb.plot_importance(final_gb)

XGBoostError: b'[15:08:57] src/objective/regression_obj.cc:98: Check failed: Loss::CheckLabel(y) label must be in [0,1] for logistic regression\n\nStack trace returned 8 entries:\n[bt] (0) 0   libxgboost.dylib                    0x0000000109d1d0e0 dmlc::StackTrace() + 288\n[bt] (1) 1   libxgboost.dylib                    0x0000000109d1ce7f dmlc::LogMessageFatal::~LogMessageFatal() + 47\n[bt] (2) 2   libxgboost.dylib                    0x0000000109d9b46b xgboost::obj::RegLossObj<xgboost::obj::LogisticClassification>::LazyCheckLabels(std::__1::vector<float, std::__1::allocator<float> > const&) + 235\n[bt] (3) 3   libxgboost.dylib                    0x0000000109d9aa4c xgboost::obj::RegLossObj<xgboost::obj::LogisticClassification>::GetGradient(xgboost::HostDeviceVector<float>*, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*) + 668\n[bt] (4) 4   libxgboost.dylib                    0x0000000109d19239 xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*) + 1017\n[bt] (5) 5   libxgboost.dylib                    0x0000000109d33b47 XGBoosterUpdateOneIter + 87\n[bt] (6) 6   libffi.6.dylib                      0x00000001045d3884 ffi_call_unix64 + 76\n[bt] (7) 7   ???                                 0x00007ffeecd49250 0x0 + 140732871774800\n\n'

In [40]:
def get_xgb_imp(xgb, feat_names):
    from numpy import array
    imp_vals = xgb.booster().get_fscore()
    imp_dict = {feat_names[i]:float(imp_vals.get('f'+str(i),0.)) for i in range(len(feat_names))}
    total = array(imp_dict.values()).sum()
    return {k:v/total for k,v in imp_dict.items()}

feat_names = ['G_x' , 'AB' ,'R', 'H','2B','3B' ,'HR' ,'RBI' ,'SB_x', 'CS_x','BB' ,'SO' ,'IBB' ,'HBP','SH' ,'SF' ,'GIDP' ,'stint','G_y' ,'GS' ,'InnOuts','PO' ,'A' ,'E' ,'DP' ,'PB' ,'SB_y' ,'CS_y'  ,'salary' ,'MinSalaries' ,'adjSalary' ,'adjMinSalaries','adjSalary2','Bavg' ,'Slug' , 'OBP','FPct' ,'yearID' ]


get_xgb_imp(accuracy_gbm, feat_names)

AttributeError: 'GridSearchCV' object has no attribute 'booster'