In [10]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from IPython.display import clear_output
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from scipy.stats import randint, uniform, mstats
import xgboost as xgb
import process_data
import classifier
import csv
%matplotlib inline

In [4]:
# Pull the feature frame, the x / y arrays used for fitting further down,
# and the per-row game ids out of the project's loader.
df_x, x, y, game_ids = classifier.load_data('output_7.pkl')

# Sanity check: show the names of the first 30 feature columns.
print(df_x.columns.values[:30])

['home_start_p1_FG3_PCT' 'home_start_p1_FG_PCT' 'home_start_p1_FT_PCT'
 'home_start_p1_MIN' 'home_start_p1_PF' 'home_start_p1_PLUS_MINUS'
 'home_start_p1_PTS' 'home_start_p1_3PAr' 'home_start_p1_ASTr'
 'home_start_p1_BLKr' 'home_start_p1_DRBr' 'home_start_p1_DRtg'
 'home_start_p1_DWS' 'home_start_p1_FTr' 'home_start_p1_ORBr'
 'home_start_p1_ORtg' 'home_start_p1_OWS' 'home_start_p1_STLr'
 'home_start_p1_TOVr' 'home_start_p1_TRBr' 'home_start_p1_TS'
 'home_start_p1_USGr' 'home_start_p1_WS' 'home_start_p1_WS48'
 'home_start_p1_atb3_freq' 'home_start_p1_atb3_pps' 'home_start_p1_c3_freq'
 'home_start_p1_c3_pps' 'home_start_p1_mid_freq' 'home_start_p1_mid_pps']


In [17]:
# Build every transformer, estimator and the CV splitter used by the
# training cell. Nothing is fitted here; objects are only constructed.

# --- Feature selection / reduction -----------------------------------------
# Univariate F-test selectors of different widths, plus a 3-component PCA.
xgb_select = SelectKBest(score_func=f_classif, k=200)
selection = SelectKBest(score_func=f_classif, k=120)
selection2 = SelectKBest(score_func=f_classif, k=80)
selection3 = SelectKBest(score_func=f_classif, k=40)
pca = PCA(n_components=3)
# Concatenates the PCA components with the 40 best univariate features.
union3 = FeatureUnion(transformer_list=[('pca', pca), ('select', selection3)])

# --- Extra-trees models ----------------------------------------------------
# clf and clf2 differ only in their seed; clf3 keeps the library defaults for
# min_samples_split / min_samples_leaf.
clf = ExtraTreesClassifier(
    n_estimators=1100,
    n_jobs=-1,
    min_samples_split=6,
    min_samples_leaf=3,
    random_state=3333,
)
clf2 = ExtraTreesClassifier(
    n_estimators=1100,
    n_jobs=-1,
    min_samples_split=6,
    min_samples_leaf=3,
    random_state=12,
)
clf3 = ExtraTreesClassifier(
    n_estimators=1100,
    n_jobs=-1,
    random_state=1337,
)

# Selector -> extra-trees pipeline (not referenced by the cells visible here).
pipeline = make_pipeline(selection, clf)

# Shuffled 5-fold split over all rows, with a fixed seed for repeatability.
kf = KFold(df_x.shape[0], n_folds=5, shuffle=True, random_state=12323)

# Parameter dict in the native xgboost style (not referenced by the cells
# visible here; the sklearn-style wrappers below are what get trained).
params = {
    'max_depth': 6,
    'eta': 0.005,
    'silent': 1,
    'objective': 'binary:logistic',
    'colsample_bytree': 0.3,
    'min_child_weight': 37,
    'seed': 233,
    'gamma': 4,
}

# --- Gradient-boosted models -----------------------------------------------
# Six variants that vary the column subsampling, learning rate, tree count,
# gamma, min_child_weight and seed.
xgb_clf = xgb.XGBClassifier(
    n_estimators=200, learning_rate=0.005, max_depth=6,
    colsample_bytree=0.3, min_child_weight=37, seed=233, gamma=4,
)

xgb_clf2 = xgb.XGBClassifier(
    n_estimators=200, learning_rate=0.005, max_depth=6,
    colsample_bytree=0.5, min_child_weight=37, seed=3213, gamma=4,
)

xgb_clf3 = xgb.XGBClassifier(
    n_estimators=150, learning_rate=0.01, max_depth=6,
    colsample_bytree=0.5, min_child_weight=37, seed=12323, gamma=1,
)

xgb_clf4 = xgb.XGBClassifier(
    n_estimators=250, learning_rate=0.005, max_depth=6,
    colsample_bytree=0.5, min_child_weight=37, seed=1337, gamma=4,
)

xgb_clf5 = xgb.XGBClassifier(
    n_estimators=150, learning_rate=0.01, max_depth=6,
    colsample_bytree=0.5, min_child_weight=37, seed=4,
)

xgb_clf6 = xgb.XGBClassifier(
    n_estimators=150, learning_rate=0.01, max_depth=6,
    colsample_bytree=0.6, subsample=0.9, min_child_weight=23, seed=422,
)


In [18]:
# Train every base model on each CV fold and write its out-of-fold
# probabilities, plus the blended prediction, to predictions.csv.
#
# The three lists below are parallel: clf_names[i] labels classifiers[i],
# which is fed through transformers[i] (None means "use the raw features").
# NOTE(review): the third label says "30pca" while union3 is built from a
# k=40 selector plus PCA - confirm which is intended. The label is kept
# unchanged because it is also the CSV column name.
clf_names = ['extratrees120', 'extratrees80', 'extratrees30pca', 'xgb_col0.3', 
             'xgb_col0.5', 'xgb_eta0.01', 'xgbcol0.5v2', 'xgb_eta0.01v2', 'xgb_minchild23']
classifiers = [clf, clf2, clf3, xgb_clf, xgb_clf2, 
               xgb_clf3, xgb_clf4, xgb_clf5, xgb_clf6]
transformers = [selection, selection2, union3, 
                None, None, None, None, None, None]

# 'wb' is the mode the Python 2 csv module expects. The context manager
# guarantees the file is flushed and closed even if a fit raises mid-run.
with open('predictions.csv', 'wb') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(['game_id', 'result', 'avg_prob', 'final_pred', 'won_bet'] + clf_names)

    for train_index, test_index in kf:
        x_train = x[train_index]
        y_train = y[train_index]
        x_test = x[test_index]
        y_test = y[test_index]

        test_ids = game_ids[test_index]
        ensemble_preds = []

        # The loop variable is called `model`, not `classifier`, so that the
        # imported `classifier` module is not overwritten by an estimator.
        for name, transformer, model in zip(clf_names, transformers, classifiers):
            print('Training...' + name)

            # Explicit None check: do not rely on the truthiness of an
            # estimator object to decide whether a transformer was supplied.
            if transformer is not None:
                # Fit on the training fold only, then apply to both folds.
                fitted_transformer = transformer.fit(x_train, y_train)
                x_train_transform = fitted_transformer.transform(x_train)
                x_test_transform = fitted_transformer.transform(x_test)
            else:
                x_train_transform = x_train
                x_test_transform = x_test

            fitted_classifier = model.fit(x_train_transform, y_train)

            preds = fitted_classifier.predict_proba(x_test_transform)

            # Keep the first predict_proba column, i.e. the probability of
            # the first entry in the estimator's classes_.
            ensemble_preds.append(preds[:, 0])

        # Blend the models with a per-row geometric mean of that probability.
        average_preds = mstats.gmean(np.transpose(np.array(ensemble_preds)), axis=1)

        # A blended first-class probability below 0.5 becomes prediction 1.
        final_pred = (average_preds < 0.50).astype(int)
        wins = (final_pred == y_test).astype(int)
        f_csv.writerows(zip(test_ids, y_test, average_preds, final_pred, wins, *ensemble_preds))
print('Done')

Training...extratrees120
Training...extratrees80
Training...extratrees30pca
Training...xgb_col0.3
Training...xgb_col0.5
Training...xgb_eta0.01
Training...xgbcol0.5v2
Training...xgb_eta0.01v2
Training...xgb_minchild23
Training...extratrees120
Training...extratrees80
Training...extratrees30pca
Training...xgb_col0.3
Training...xgb_col0.5
Training...xgb_eta0.01
Training...xgbcol0.5v2
Training...xgb_eta0.01v2
Training...xgb_minchild23
Training...extratrees120
Training...extratrees80
Training...extratrees30pca
Training...xgb_col0.3
Training...xgb_col0.5
Training...xgb_eta0.01
Training...xgbcol0.5v2
Training...xgb_eta0.01v2
Training...xgb_minchild23
Training...extratrees120
Training...extratrees80
Training...extratrees30pca
Training...xgb_col0.3
Training...xgb_col0.5
Training...xgb_eta0.01
Training...xgbcol0.5v2
Training...xgb_eta0.01v2
Training...xgb_minchild23
Training...extratrees120
Training...extratrees80
Training...extratrees30pca
Training...xgb_col0.3
Training...xgb_col0.5
Training...x

In [27]:
from sklearn.linear_model import LogisticRegression


In [107]:
# Second-level model: cross-validate a logistic regression on the per-model
# probabilities stored in predictions.csv.
#
# The frame is loaded here rather than taken from leftover kernel state, so
# the cell also works after a kernel restart.
df_ensemble = pd.read_csv('predictions.csv')

target = df_ensemble['result'].values
# The CSV header written with the predictions starts with five bookkeeping
# columns (game_id, result, avg_prob, final_pred, won_bet); everything from
# position 5 onwards is one column of probabilities per base model.
features = df_ensemble.iloc[:, 5:].values

en_clf = LogisticRegression(penalty='l2', C=1.565)

# 5-fold cross-validation using the estimator's default scorer.
scores = cross_val_score(en_clf, features, target, cv=5)
# Single formatted string: prints the per-fold scores followed by their mean.
print('%s %s' % (scores, scores.mean()))

[ 0.53813738  0.53265908  0.53855879  0.52001686  0.53372681] 0.532619783128


[ 0.53327717  0.54085931  0.52569503  0.53074979  0.53917439  0.53625632
  0.51096121  0.52445194  0.53204047  0.53248945] 0.530595508749
