In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
import graphviz
from sklearn.metrics import confusion_matrix
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

Using TensorFlow backend.


In [2]:
pitchers_df = pd.read_csv('Data/pitchers_complete.csv')

In [3]:
pitchers_train_model = pitchers_df[pitchers_df['eligible_for_hall']=='eligible']
scale_pos_weight = (pitchers_train_model.inducted.value_counts()[0])/pitchers_train_model.inducted.value_counts()[1]
data = pitchers_train_model.drop(['inducted', 'playerID', 'nameFirst', 'nameLast', 'finalGame','bats', 
                                  'throws','weight', 'height','yearid','years_since_final_game', 
                                  'eligible_for_hall'], axis=1)
data.fillna(0, inplace=True)
data['Batters_Faced']=pd.to_numeric(data['Batters_Faced'])
target = pitchers_train_model['inducted']
target.fillna('N', inplace=True)
feature_names = data.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [5]:
model = XGBClassifier(scale_pos_weight=1)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 97.92%


In [6]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[324   3]
 [  4   5]]
              precision    recall  f1-score   support

           N       0.99      0.99      0.99       327
           Y       0.62      0.56      0.59         9

   micro avg       0.98      0.98      0.98       336
   macro avg       0.81      0.77      0.79       336
weighted avg       0.98      0.98      0.98       336



In [7]:
classification_report(y_test, y_pred, output_dict=True)

{'N': {'precision': 0.9878048780487805,
  'recall': 0.9908256880733946,
  'f1-score': 0.9893129770992366,
  'support': 327},
 'Y': {'precision': 0.625,
  'recall': 0.5555555555555556,
  'f1-score': 0.5882352941176471,
  'support': 9},
 'micro avg': {'precision': 0.9791666666666666,
  'recall': 0.9791666666666666,
  'f1-score': 0.9791666666666666,
  'support': 336},
 'macro avg': {'precision': 0.8064024390243902,
  'recall': 0.7731906218144751,
  'f1-score': 0.7887741356084419,
  'support': 336},
 'weighted avg': {'precision': 0.9780868902439024,
  'recall': 0.9791666666666666,
  'f1-score': 0.9785698248765154,
  'support': 336}}

In [8]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5, 6, 7, 8, 9, 10],
        'scale_pos_weight':[7, 9, 11, 13, 15, 18]
        }

In [9]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

In [23]:
grid = GridSearchCV(estimator=xgb, param_grid=params, scoring='roc_auc', n_jobs=4, verbose=3 )
grid.fit(X_train_scaled, y_train)
print('\n All results:')
print(grid.cv_results_)
print('\n Best estimator:')
print(grid.best_estimator_)
print('\n Best score:')
print(grid.best_score_ * 2 - 1)
print('\n Best parameters:')
print(grid.best_params_)
results = pd.DataFrame(grid.cv_results_)
results.to_csv('xgb-grid-search-results-01.csv', index=False)

y_test = grid.best_estimator_.predict_proba(y_test)
results_df = pd.DataFrame(data={'id':test_df['id'], 'target':y_test[:,1]})

Fitting 3 folds for each of 2700 candidates, totalling 8100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   17.1s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   38.7s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  3.3min
[Parallel(n_jobs=4)]: Done 1144 tasks      | elapsed:  4.7min
[Parallel(n_jobs=4)]: Done 1560 tasks      | elapsed:  6.3min
[Parallel(n_jobs=4)]: Done 2040 tasks      | elapsed:  8.1min
[Parallel(n_jobs=4)]: Done 2584 tasks      | elapsed: 10.2min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 12.9min
[Parallel(n_jobs=4)]: Done 3864 tasks      | elapsed: 15.9min
[Parallel(n_jobs=4)]: Done 4600 tasks      | elapsed: 19.1min
[Parallel(n_jobs=4)]: Done 5400 tasks      | elapsed: 22.8min
[Parallel(n_jobs=4)]: Done 6264 tasks      | elapsed: 27.2min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 31.8min



 All results:
{'mean_fit_time': array([0.95052457, 0.96168168, 1.03920158, ..., 1.11256989, 1.35795752,
       1.37728198]), 'std_fit_time': array([0.02868096, 0.07281955, 0.06776853, ..., 0.1735907 , 0.12472777,
       0.11738205]), 'mean_score_time': array([0.01570336, 0.01419727, 0.01196766, ..., 0.01179409, 0.01010998,
       0.00706697]), 'std_score_time': array([0.00313268, 0.00245483, 0.00013902, ..., 0.00353323, 0.0010884 ,
       0.00070269]), 'param_colsample_bytree': masked_array(data=[0.6, 0.6, 0.6, ..., 1.0, 1.0, 1.0],
             mask=[False, False, False, ..., False, False, False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[0.5, 0.5, 0.5, ..., 5, 5, 5],
             mask=[False, False, False, ..., False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[5, 5, 5, ..., 8, 8, 8],
             mask=[False, False, False, ..., False, False, False],
       fill_value='?',
           



TypeError: can not initialize DMatrix from Series

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_max_depth,param_min_child_weight,param_scale_pos_weight,param_subsample,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.950525,0.028681,0.015703,0.003133,0.6,0.5,5,1,9,0.6,...,0.945728,0.987778,0.972599,0.019067,1338,1.0,1.0,1.0,1.0,0.0
1,0.961682,0.07282,0.014197,0.002455,0.6,0.5,5,1,9,0.8,...,0.94462,0.985397,0.970803,0.018567,1828,1.0,1.0,1.0,1.0,6.409876000000001e-17
2,1.039202,0.067769,0.011968,0.000139,0.6,0.5,5,1,9,1.0,...,0.939082,0.985238,0.966685,0.01991,2470,1.0,1.0,1.0,1.0,6.409876000000001e-17
3,0.989514,0.044479,0.013269,0.000449,0.6,0.5,5,1,11,0.6,...,0.94731,0.98746,0.973338,0.018439,938,1.0,1.0,1.0,1.0,6.409876000000001e-17
4,0.965075,0.071037,0.013797,0.000649,0.6,0.5,5,1,11,0.8,...,0.943829,0.985873,0.970592,0.019,1884,1.0,1.0,1.0,1.0,9.064933000000001e-17
