In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import EnsembleVoteClassifier, StackingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
# pre-processing
df = pd.read_csv("master_data.csv")
df['RS%'] = df['RS%'].str.rstrip('%').astype('float') / 100.0
df['SB%'] = df['SB%'].str.rstrip('%').astype('float') / 100.0
df = df.dropna()
train, test = train_test_split(df, random_state=123)
xcols = ["Age", "G", "R", "H", "HR", "RBI", "SB", "SO", "BA", "SLG", "OPS", "Value Ranking", "WPA", "RS%", "SB%", "ISO"]
xcols_alt = ["R", "H", "HR", "RBI", "SO", "OPS", "Value Ranking"]
X_train = train[xcols_alt]
y_train = train['allstars']
X_test = test[xcols_alt]
y_test = test['allstars']

In [3]:
# Logistic regression

log_reg = LogisticRegression(random_state=3)
params =  {
    'penalty': ['none', 'l2', 'l1', 'elasticnet'],
    'solver': ['newton-cg', 'lbfgs', 'saga', 'sag']
}
grid = GridSearchCV(estimator=log_reg,
                    param_grid=params,
                    cv=10,
                    n_jobs=1,
                    verbose=0)
grid.fit(X_train, y_train)
p = grid.best_params_
print(p)

# Logistic Reg post tuning

log_reg = Pipeline([
    ("std", StandardScaler()),
    ("logr", LogisticRegression(penalty = p['penalty'], solver = p['solver'], random_state=4))
])

{'penalty': 'none', 'solver': 'newton-cg'}


In [4]:
# k-Nearest Neighbors
knn = KNeighborsClassifier()
params = {'n_neighbors': [6, 8, 9, 10, 12]}
grid = GridSearchCV(estimator=knn,
                    param_grid=params,
                    cv=10,
                    n_jobs=1,
                    verbose=0)
grid.fit(X_train, y_train)
print(grid.best_params_)
knn = KNeighborsClassifier(n_neighbors = grid.best_params_['n_neighbors'])

{'n_neighbors': 8}


In [5]:
# Random Forest
rf = RandomForestClassifier(random_state = 5)
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [9, 10, 11],
    'max_features': ['auto', 'sqrt', 'log2']
}
grid = GridSearchCV(estimator=rf,
                    param_grid=params,
                    cv=10,
                    n_jobs=1,
                    verbose=0)
grid.fit(X_train, y_train)
p = grid.best_params_
print(p)
rf = RandomForestClassifier(random_state = 5, criterion = p['criterion'], max_depth = p['max_depth'], max_features = p['max_features'])

{'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto'}


In [6]:
# Decision Tree
dtc = DecisionTreeClassifier(random_state = 7)
params =  {
    'min_samples_split': [1, 2, 3],
    'max_depth': [2, 4, 6, 8, None],
    'max_features': ["auto","sqrt", "log2", None],
    'criterion': ["gini", "entropy"]
}
grid = GridSearchCV(estimator=dtc,
                    param_grid=params,
                    cv=10,
                    n_jobs=1,
                    verbose=0)
grid.fit(X_train, y_train)
p = grid.best_params_
print(p)
dtc = DecisionTreeClassifier(random_state = 7,
                             max_depth = grid.best_params_['max_depth'],
                             max_features = grid.best_params_['max_features'],
                             min_samples_split = grid.best_params_['min_samples_split'], 
                             criterion = grid.best_params_['criterion'])

{'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'min_samples_split': 2}


In [7]:
# Multi-Layer Perceptron
# note: gridsearch takes forever
mlp = MLPClassifier(random_state=0)

In [8]:
# AdaBoost
ada = AdaBoostClassifier()

In [10]:
# Vote Classifier (Ensemble)
clfs = [log_reg, knn, rf, dtc, mlp, ada]
ens = EnsembleVoteClassifier(clfs = clfs)
labels = ['Logistic Regression', "k Nearest Neighbors", "Random Forest", "Decision Tree", "MLP", "AdaBoost", "Vote Ensemble"]
for clf, label in zip(clfs + [ens], labels):
    clf.fit(X_train, y_train)
    print("Test accuracy: %0.3f [%s]" % (clf.score(X_test, y_test) * 100, label))

# Stacking Classifier
meta_clfs = [LogisticRegression(random_state=69), KNeighborsClassifier(), RandomForestClassifier(random_state = 5),
             DecisionTreeClassifier(random_state = 7), MLPClassifier(random_state=0), AdaBoostClassifier()]
for meta_clf in meta_clfs:
    print("\n" + "Stacking with meta clf: " + str(meta_clf))
    stack = StackingClassifier(classifiers = clfs, meta_classifier = meta_clf)
    stack.fit(X_train, y_train)
    print("Train accuracy: %0.3f" % (stack.score(X_train, y_train) * 100))
    print("Test accuracy: %0.3f" % (stack.score(X_test, y_test) * 100))

Test accuracy: 92.044 [Logistic Regression]
Test accuracy: 91.084 [k Nearest Neighbors]
Test accuracy: 91.632 [Random Forest]
Test accuracy: 91.770 [Decision Tree]
Test accuracy: 90.261 [MLP]
Test accuracy: 91.632 [AdaBoost]
Test accuracy: 91.907 [Vote Ensemble]

Stacking with meta clf: LogisticRegression(random_state=69)
LogisticRegression(random_state=69)
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: KNeighborsClassifier()
KNeighborsClassifier()
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: RandomForestClassifier(random_state=5)
RandomForestClassifier(random_state=5)
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: DecisionTreeClassifier(random_state=7)
DecisionTreeClassifier(random_state=7)
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: MLPClassifier(random_state=0)
MLPClassifier(random_state=0)
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: AdaBoostClassifier()
AdaBoos

In [11]:
# Vote Classifier (Ensemble)
clfs = [log_reg, rf, mlp]
ens = EnsembleVoteClassifier(clfs = clfs)
labels = ['Logistic Regression', "Random Forest", "MLP", "Vote Ensemble"]
for clf, label in zip(clfs + [ens], labels):
    clf.fit(X_train, y_train)
    print("Test accuracy: %0.3f [%s]" % (clf.score(X_test, y_test) * 100, label))

# Stacking Classifier
meta_clfs = [LogisticRegression(random_state=69), KNeighborsClassifier(), RandomForestClassifier(random_state = 5),
             DecisionTreeClassifier(random_state = 7), MLPClassifier(random_state=0), AdaBoostClassifier()]
for meta_clf in meta_clfs:
    print("\n" + "Stacking with meta clf: " + str(meta_clf))
    stack = StackingClassifier(classifiers = clfs, meta_classifier = meta_clf)
    stack.fit(X_train, y_train)
    print("Train accuracy: %0.3f" % (stack.score(X_train, y_train) * 100))
    print("Test accuracy: %0.3f" % (stack.score(X_test, y_test) * 100))

Test accuracy: 92.044 [Logistic Regression]
Test accuracy: 91.632 [Random Forest]
Test accuracy: 90.261 [MLP]
Test accuracy: 91.770 [Vote Ensemble]

Stacking with meta clf: LogisticRegression(random_state=69)
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: KNeighborsClassifier()
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: RandomForestClassifier(random_state=5)
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: DecisionTreeClassifier(random_state=7)
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: MLPClassifier(random_state=0)
Train accuracy: 98.764
Test accuracy: 91.632

Stacking with meta clf: AdaBoostClassifier()
Train accuracy: 98.764
Test accuracy: 91.632
