In [16]:
import pandas as pd
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from xgboost.sklearn import XGBClassifier  
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, ShuffleSplit, GridSearchCV, ParameterGrid

player = pd.read_csv('./data/player.csv')
playerScale = pd.read_csv('./data/playerscale.csv')

In [3]:
player.head()
player = player.drop(['Unnamed: 0'], axis=1)
playerScale.head()
playerScale = playerScale.drop(['Unnamed: 0'], axis=1)

ValueError: labels ['Unnamed: 0'] not contained in axis

In [4]:
print(playerScale.info())
playerScale.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19820 entries, 0 to 19819
Data columns (total 40 columns):
G_x               19820 non-null float64
AB                19820 non-null float64
R                 19820 non-null float64
H                 19820 non-null float64
2B                19820 non-null float64
3B                19820 non-null float64
HR                19820 non-null float64
RBI               19820 non-null float64
SB_x              19820 non-null float64
CS_x              19820 non-null float64
BB                19820 non-null float64
SO                19820 non-null float64
IBB               19820 non-null float64
HBP               19820 non-null float64
SH                19820 non-null float64
SF                19820 non-null float64
GIDP              19820 non-null float64
stint             19820 non-null float64
G_y               19820 non-null float64
GS                19820 non-null float64
InnOuts           19820 non-null float64
PO                19820 non-nu

G_x               0
AB                0
R                 0
H                 0
2B                0
3B                0
HR                0
RBI               0
SB_x              0
CS_x              0
BB                0
SO                0
IBB               0
HBP               0
SH                0
SF                0
GIDP              0
stint             0
G_y               0
GS                0
InnOuts           0
PO                0
A                 0
E                 0
DP                0
PB                0
SB_y              0
CS_y              0
salary            0
MinSalaries       0
adjSalary         0
adjMinSalaries    0
adjSalary2        0
Bavg              0
Slug              0
OBP               0
FPct              0
yearID            0
playerID          0
Primary           0
dtype: int64

In [5]:
#Create Mapping to Positions
def format_cat(df):
    cat_cols = ['Primary']
    for i in cat_cols:
        df[i] = df[i].map({'C':2, '1B':3, '2B': 4, '3B':5, 'SS':6, 'OF':7, 'DH':10})

format_cat(playerScale)

In [6]:
playerScale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19820 entries, 0 to 19819
Data columns (total 40 columns):
G_x               19820 non-null float64
AB                19820 non-null float64
R                 19820 non-null float64
H                 19820 non-null float64
2B                19820 non-null float64
3B                19820 non-null float64
HR                19820 non-null float64
RBI               19820 non-null float64
SB_x              19820 non-null float64
CS_x              19820 non-null float64
BB                19820 non-null float64
SO                19820 non-null float64
IBB               19820 non-null float64
HBP               19820 non-null float64
SH                19820 non-null float64
SF                19820 non-null float64
GIDP              19820 non-null float64
stint             19820 non-null float64
G_y               19820 non-null float64
GS                19820 non-null float64
InnOuts           19820 non-null float64
PO                19820 non-nu

In [7]:
#Seperate Predicter 
X = playerScale.drop(['Primary', 'playerID'], axis=1)
y = playerScale['Primary'].values
yhat = np.empty(y.shape,dtype=str)

In [8]:
#Get training and test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simple XGBoost Model

In [9]:
xgbModel = XGBClassifier()
xgbModel.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [10]:
y_pred = xgbModel.predict(X_test)

In [11]:
accuracy = accuracy_score(y_test, y_pred)
print("The Accuracy of the XGBoost Model: %.2f%%" % (accuracy * 100.0))

The Accuracy of the XGBoost Model: 87.61%


# XG Boost Model with Cross Validation

### Optimizing MSE

In [13]:
gbm_param_grid = {
     'colsample_bytree': np.linspace(0.5, 0.9, 5),
     'n_estimators':[100],
     'max_depth': [10, 15, 20]
}

gbm = xgb.XGBRegressor()
gbm_mse = GridSearchCV(estimator = gbm, param_grid = gbm_param_grid, scoring = 'neg_mean_squared_error', cv = 5, verbose = 1)
gbm_mse.fit(X_train, y_train)
print("Lowest RMSE found: ", np.sqrt(np.abs(gbm_mse.best_score_)))

pred = gbm_mse.predict(X_test)
print("Root mean square error for test dataset: {}".format(np.round(np.sqrt(mean_squared_error(y_test, pred)), 2)))


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 11.0min finished


Lowest RMSE found:  0.626728568939


NameError: name 'mean_squared_error' is not defined

In [24]:
gbm_mse.grid_scores_
gbm_mse.accuracy_score



AttributeError: 'GridSearchCV' object has no attribute 'accuracy_score'

### Optimizing Accuracy

In [26]:
cv_params = {'max_depth': [3,5,7]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 500, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}
accuracy_gbm = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1)
accuracy_gbm.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_child_weight': [1, 3, 5], 'max_depth': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

pred = accuracy_gbm.predict(X_test)

print("Lowest RMSE found: ", np.sqrt(np.abs(accuracy_gbm.best_score_)))

accuracy_gbm.grid_scores_





Lowest RMSE found:  0.942868500419




[mean: 0.88894, std: 0.00253, params: {'max_depth': 3},
 mean: 0.88900, std: 0.00377, params: {'max_depth': 5},
 mean: 0.88799, std: 0.00346, params: {'max_depth': 7}]