# Load Data, Data Wrangling, Train Dataset and Test Dataset Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, zero_one_loss, precision_recall_fscore_support
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

a = time.time()

In [2]:
RegularSeason_df = pd.read_csv('Data/RegularSeasonDetailedResults.csv')
Tourney_df = pd.read_csv('Data/TourneyDetailedResults.csv')

In [3]:
RegularSeason_df = RegularSeason_df.drop(['Season', 'Daynum', 'Wloc', 'Numot'], axis = 1)
Tourney_df = Tourney_df.drop(['Season', 'Daynum', 'Wloc', 'Numot'], axis = 1)

In [4]:
# Training Data
WinLosePair_df = RegularSeason_df
WinLosePair_df

Unnamed: 0,Wteam,Wscore,Lteam,Lscore,Wfgm,Wfga,Wfgm3,Wfga3,Wftm,Wfta,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,1104,68,1328,62,27,58,3,14,11,18,...,10,16,22,10,22,8,18,9,2,20
1,1272,70,1393,63,26,62,8,20,10,19,...,24,9,20,20,25,7,12,8,6,16
2,1266,73,1437,61,24,58,8,18,17,29,...,26,14,23,31,22,9,12,2,5,23
3,1296,56,1457,50,18,38,3,9,17,31,...,22,8,15,17,20,9,19,4,3,23
4,1400,77,1208,71,30,61,6,14,11,13,...,16,17,27,21,15,12,10,7,1,14
5,1458,81,1186,55,26,57,6,12,23,27,...,11,12,17,6,22,8,19,4,3,25
6,1161,80,1236,62,23,55,2,8,32,39,...,15,20,28,9,21,11,30,10,4,28
7,1186,75,1457,61,28,62,4,14,15,21,...,17,17,23,8,25,10,15,14,8,18
8,1194,71,1156,66,28,58,5,11,10,18,...,18,12,27,13,26,13,25,8,2,18
9,1458,84,1296,56,32,67,5,17,15,19,...,14,7,12,9,23,10,18,1,3,18


In [5]:
TrainDataOriginal_1_df = pd.DataFrame(WinLosePair_df[\
                                           ['Wfgm', 'Wfga', 'Wfgm3', 'Wfga3', 'Wftm', 'Wfta', 'Wor', 'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf',\
                                           'Lfgm', 'Lfga', 'Lfgm3', 'Lfga3', 'Lftm', 'Lfta', 'Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf']])\
.rename(columns = {'Wfgm':'fgm_x', 'Wfga':'fga_x', 'Wfgm3':'fgm3_x', 'Wfga3':'fga3_x', 'Wftm':'ftm_x', 'Wfta':'fta_x', 'Wor':'or_x', 'Wdr':'dr_x', 'Wast':'ast_x', 'Wto':'to_x', 'Wstl':'stl_x', 'Wblk':'blk_x', 'Wpf':'pf_x',\
                  'Lfgm':'fgm_y', 'Lfga':'fga_y', 'Lfgm3':'fgm3_y', 'Lfga3':'fga3_y', 'Lftm':'ftm_y', 'Lfta':'fta_y', 'Lor':'or_y', 'Ldr':'dr_y', 'Last':'ast_y', 'Lto':'to_y', 'Lstl':'stl_y', 'Lblk':'blk_y', 'Lpf':'pf_y'})\
.reset_index(drop = True)

TrainDataOriginal_2_df = pd.DataFrame(WinLosePair_df[\
                                           ['Lfgm', 'Lfga', 'Lfgm3', 'Lfga3', 'Lftm', 'Lfta', 'Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf',\
                                           'Wfgm', 'Wfga', 'Wfgm3', 'Wfga3', 'Wftm', 'Wfta', 'Wor', 'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf']])\
.rename(columns = {'Lfgm':'fgm_x', 'Lfga':'fga_x', 'Lfgm3':'fgm3_x', 'Lfga3':'fga3_x', 'Lftm':'ftm_x', 'Lfta':'fta_x', 'Lor':'or_x', 'Ldr':'dr_x', 'Last':'ast_x', 'Lto':'to_x', 'Lstl':'stl_x', 'Lblk':'blk_x', 'Lpf':'pf_x',\
                  'Wfgm':'fgm_y', 'Wfga':'fga_y', 'Wfgm3':'fgm3_y', 'Wfga3':'fga3_y', 'Wftm':'ftm_y', 'Wfta':'fta_y', 'Wor':'or_y', 'Wdr':'dr_y', 'Wast':'ast_y', 'Wto':'to_y', 'Wstl':'stl_y', 'Wblk':'blk_y', 'Wpf':'pf_y'})\
.reset_index(drop = True)


TrainData_df = TrainDataOriginal_1_df.append(TrainDataOriginal_2_df).reset_index(drop = True)
TrainData = TrainData_df.values
TrainLabel = np.ones((len(WinLosePair_df)*2))
TrainLabel[len(WinLosePair_df):] = 2

In [6]:
# Test Data
Season2018_df = pd.read_csv('Data/2018.csv', skiprows = 1)
NCAA_df = Season2018_df[Season2018_df['School'].str.contains('NCAA')]
NCAA_df['School'] = NCAA_df['School'].map(lambda x: x.rstrip(' NCAA'))
NCAA_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Rk,School,G,W,L,W-L%,SRS,SOS,W.1,L.1,...,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF,Unnamed: 34
6,7,Alabama,36,20,16,0.556,12.34,10.64,8,10,...,828,0.670,358,1290,458,228,192,513,676,
11,12,Arizona State,32,20,12,0.625,14.37,6.56,8,10,...,793,0.733,315,1127,458,209,114,343,604,
12,13,Arizona,35,27,8,0.771,15.67,6.84,14,4,...,741,0.760,352,1277,531,173,158,426,603,
16,17,Arkansas,35,23,12,0.657,14.76,9.85,10,8,...,793,0.681,350,1208,501,218,163,381,700,
18,19,Auburn,34,26,8,0.765,15.97,7.29,13,5,...,836,0.774,403,1290,481,252,180,411,672,
33,34,Bucknell,35,25,10,0.714,4.48,-3.46,16,2,...,872,0.719,324,1281,505,194,152,437,655,
34,35,Buffalo,36,27,9,0.750,7.99,0.25,15,3,...,695,0.699,426,1388,597,225,146,446,755,
35,36,Butler,35,21,14,0.600,16.92,10.55,9,9,...,562,0.776,319,1198,489,232,100,391,642,
38,39,Cal State Fullerton,32,20,12,0.625,-0.92,-1.02,10,6,...,745,0.729,276,1123,403,201,106,455,578,
55,56,Cincinnati,36,31,5,0.861,20.25,3.53,16,2,...,756,0.692,477,1413,570,256,194,404,561,


In [7]:
ncaa2018_df = pd.read_csv('Data/ncaa2018.csv')
ncaa2018_df

Unnamed: 0,School_x,School_y,Results
0,Kansas,Penn,1
1,Seton Hall,North Carolina State,1
2,Clemson,New Mexico State,1
3,Auburn,College of Charleston,1
4,Texas Christian,Syracuse,2
5,Michigan State,Bucknell,1
6,Rhode Island,Oklahoma,1
7,Duke,Iona,1
8,Kansas,Seton Hall,1
9,Clemson,Auburn,1


## Testing on the all the tournaments together using best performing Adaboost classifier

In [8]:
Season2018_stat_df = NCAA_df[['G', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'TRB', 'AST', 
                              'TOV', 'STL', 'BLK', 'PF']]
Season2018_stat_df['TRB'] = Season2018_stat_df['TRB'] - Season2018_df['ORB']
Season2018_stat_df.rename(columns = {'TRB': 'DRB'}, inplace = True)
Season2018_avg_df = Season2018_stat_df.div(Season2018_stat_df.G, axis = 0).join(NCAA_df['School'])
Season2018_avg_df.drop('G', axis=1, inplace=True)
Season2018_avg_df.columns = ['fgm', 'fga', 'fgm3', 'fga3', 'ftm', 'fta', 'or', 'dr', 'ast', 'to', 
                            'stl', 'blk', 'pf', 'School']
Season2018_avg_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,fgm,fga,fgm3,fga3,ftm,fta,or,dr,ast,to,stl,blk,pf,School
6,25.277778,54.861111,6.361111,19.500000,15.416667,23.000000,9.944444,25.888889,12.722222,14.250000,6.333333,5.333333,18.777778,Alabama
11,27.781250,60.000000,9.000000,24.781250,18.156250,24.781250,9.843750,25.375000,14.312500,10.718750,6.531250,3.562500,18.875000,Arizona State
12,28.828571,57.257143,6.771429,18.342857,16.085714,21.171429,10.057143,26.428571,15.171429,12.171429,4.942857,4.514286,17.228571,Arizona
16,28.714286,60.542857,7.685714,19.457143,15.428571,22.657143,10.000000,24.514286,14.314286,10.885714,6.228571,4.657143,20.000000,Arkansas
18,26.647059,61.823529,9.529412,26.705882,19.029412,24.588235,11.852941,26.088235,14.147059,12.088235,7.411765,5.294118,19.764706,Auburn
33,27.571429,58.571429,8.000000,22.971429,17.914286,24.914286,9.257143,27.342857,14.428571,12.485714,5.542857,4.342857,18.714286,Bucknell
34,30.694444,64.833333,9.750000,26.333333,13.500000,19.305556,11.833333,26.722222,16.583333,12.388889,6.250000,4.055556,20.972222,Buffalo
35,29.142857,61.571429,8.200000,23.000000,12.457143,16.057143,9.114286,25.114286,13.971429,11.171429,6.628571,2.857143,18.342857,Butler
38,25.125000,53.781250,5.218750,15.625000,16.968750,23.281250,8.625000,26.468750,12.593750,14.218750,6.281250,3.312500,18.062500,Cal State Fullerton
55,26.222222,58.333333,7.277778,20.611111,14.527778,21.000000,13.250000,26.000000,15.833333,11.222222,7.111111,5.388889,15.583333,Cincinnati


In [9]:
# Tourney_test_df = Tourney_df[['Wfgm', 'Wfga', 'Wfgm3', 'Wfga3', 'Wftm', 'Wfta', 'Wor', 'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf',\
#                                            'Lfgm', 'Lfga', 'Lfgm3', 'Lfga3', 'Lftm', 'Lfta', 'Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf']]\
# .rename(columns = {'Wfgm':'fgm_x', 'Wfga':'fga_x', 'Wfgm3':'fgm3_x', 'Wfga3':'fga3_x', 'Wftm':'ftm_x', 'Wfta':'fta_x', 'Wor':'or_x', 'Wdr':'dr_x', 'Wast':'ast_x', 'Wto':'to_x', 'Wstl':'stl_x', 'Wblk':'blk_x', 'Wpf':'pf_x',\
#                   'Lfgm':'fgm_y', 'Lfga':'fga_y', 'Lfgm3':'fgm3_y', 'Lfga3':'fga3_y', 'Lftm':'ftm_y', 'Lfta':'fta_y', 'Lor':'or_y', 'Ldr':'dr_y', 'Last':'ast_y', 'Lto':'to_y', 'Lstl':'stl_y', 'Lblk':'blk_y', 'Lpf':'pf_y'})\
# .reset_index(drop = True)
# TestLabel = np.ones((len(Tourney_test_df)))
# Tourney_test_df.head(n=30)

# half with first team winning, half with second team winning, to make the class more balanced
Tourney_df1 = Tourney_df.iloc[0 : int(len(Tourney_df)/2)]
Tourney_df2 = Tourney_df.iloc[int(len(Tourney_df)/2) : ]
TestDataOriginal_1_df = pd.DataFrame(Tourney_df1[\
                                           ['Wfgm', 'Wfga', 'Wfgm3', 'Wfga3', 'Wftm', 'Wfta', 'Wor', 'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf',\
                                           'Lfgm', 'Lfga', 'Lfgm3', 'Lfga3', 'Lftm', 'Lfta', 'Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf']])\
.rename(columns = {'Wfgm':'fgm_x', 'Wfga':'fga_x', 'Wfgm3':'fgm3_x', 'Wfga3':'fga3_x', 'Wftm':'ftm_x', 'Wfta':'fta_x', 'Wor':'or_x', 'Wdr':'dr_x', 'Wast':'ast_x', 'Wto':'to_x', 'Wstl':'stl_x', 'Wblk':'blk_x', 'Wpf':'pf_x',\
                  'Lfgm':'fgm_y', 'Lfga':'fga_y', 'Lfgm3':'fgm3_y', 'Lfga3':'fga3_y', 'Lftm':'ftm_y', 'Lfta':'fta_y', 'Lor':'or_y', 'Ldr':'dr_y', 'Last':'ast_y', 'Lto':'to_y', 'Lstl':'stl_y', 'Lblk':'blk_y', 'Lpf':'pf_y'})\
.reset_index(drop = True)

TestDataOriginal_2_df = pd.DataFrame(Tourney_df2[\
                                           ['Lfgm', 'Lfga', 'Lfgm3', 'Lfga3', 'Lftm', 'Lfta', 'Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf',\
                                           'Wfgm', 'Wfga', 'Wfgm3', 'Wfga3', 'Wftm', 'Wfta', 'Wor', 'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf']])\
.rename(columns = {'Lfgm':'fgm_x', 'Lfga':'fga_x', 'Lfgm3':'fgm3_x', 'Lfga3':'fga3_x', 'Lftm':'ftm_x', 'Lfta':'fta_x', 'Lor':'or_x', 'Ldr':'dr_x', 'Last':'ast_x', 'Lto':'to_x', 'Lstl':'stl_x', 'Lblk':'blk_x', 'Lpf':'pf_x',\
                  'Wfgm':'fgm_y', 'Wfga':'fga_y', 'Wfgm3':'fgm3_y', 'Wfga3':'fga3_y', 'Wftm':'ftm_y', 'Wfta':'fta_y', 'Wor':'or_y', 'Wdr':'dr_y', 'Wast':'ast_y', 'Wto':'to_y', 'Wstl':'stl_y', 'Wblk':'blk_y', 'Wpf':'pf_y'})\
.reset_index(drop = True)

TestData_df = TestDataOriginal_1_df.append(TestDataOriginal_2_df).reset_index(drop = True)
TestData_df

Unnamed: 0,fgm_x,fga_x,fgm3_x,fga3_x,ftm_x,fta_x,or_x,dr_x,ast_x,to_x,...,fga3_y,ftm_y,fta_y,or_y,dr_y,ast_y,to_y,stl_y,blk_y,pf_y
0,32,69,11,29,17,26,14,30,17,12,...,31,14,31,17,28,16,15,5,0,22
1,31,66,7,23,11,14,11,36,22,16,...,16,7,7,8,26,12,17,10,3,15
2,31,59,6,14,16,22,10,27,18,9,...,28,14,21,20,22,11,12,2,5,18
3,29,53,3,7,18,25,11,20,15,18,...,17,12,17,14,17,20,21,6,6,21
4,27,64,7,20,15,23,18,20,17,13,...,21,15,20,10,26,16,14,5,8,19
5,17,52,4,14,20,27,12,29,8,14,...,17,11,13,15,26,11,11,8,4,22
6,19,54,4,13,25,31,13,27,4,16,...,11,18,22,11,24,8,19,5,4,19
7,20,47,6,14,28,37,8,28,12,12,...,27,7,10,13,22,13,10,7,6,24
8,24,56,5,14,12,14,15,23,15,14,...,24,8,13,17,18,10,14,6,5,16
9,28,51,2,6,6,11,7,20,13,11,...,17,9,10,13,19,13,13,6,1,15


In [10]:

Test_df = pd.merge(ncaa2018_df, Season2018_avg_df, left_on = 'School_x', right_on = 'School', how = 'inner')
Test_df = pd.merge(Test_df, Season2018_avg_df, left_on = 'School_y', right_on = 'School', how = 'inner')
Results = Test_df['Results'].values

# creating labels for tournament testing data
TestLabel1 = np.ones((1, int(TestData_df.shape[0] / 2)))
TestLabel2 = np.ones((1, int(TestData_df.shape[0] / 2))) * 2
TestLabel = np.append(TestLabel1 , TestLabel2)
TestLabel = np.append(Results, TestLabel)

Test_df = Test_df.drop(['School_x', 'School_y', 'Rk_x', 'G_x', 'Results', 'Rk_y', 'G_y'], axis = 1)

# add the past tournament data
Test_df = Test_df.append(TestData_df)

Test_df = Test_df.astype(int)
TestData = Test_df.values
Test_df

Unnamed: 0,fgm_x,fga_x,fgm3_x,fga3_x,ftm_x,fta_x,or_x,dr_x,ast_x,to_x,...,fga3_y,ftm_y,fta_y,or_y,dr_y,ast_y,to_y,stl_y,blk_y,pf_y
0,29,60,10,24,11,16,9,26,16,11,...,20,14,21,12,26,15,12,6,3,18
1,29,60,10,24,11,16,9,26,16,11,...,22,14,18,8,27,13,11,5,4,16
2,29,60,10,24,11,16,9,26,16,11,...,22,15,21,13,27,17,12,7,5,15
3,28,61,6,19,12,18,11,23,15,10,...,22,15,21,13,27,17,12,7,5,15
4,22,53,5,17,16,21,11,25,10,12,...,22,15,21,13,27,17,12,7,5,15
5,28,60,7,20,14,21,12,26,15,12,...,21,14,21,12,23,16,12,7,3,18
6,25,56,8,22,14,18,8,27,13,11,...,22,11,17,12,28,14,12,5,3,17
7,25,56,8,22,14,18,8,27,13,11,...,26,19,24,11,26,14,12,7,5,19
8,26,61,9,26,19,24,11,26,14,12,...,21,15,20,8,24,11,9,5,3,16
9,29,59,8,21,14,20,11,25,18,12,...,17,16,21,11,25,10,12,6,5,16


## Visualization of the classifier performance

## Comparing four classifiers

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_error_mean = np.mean(1-train_scores, axis=1)
    train_error_std = np.std(1-train_scores, axis=1)
    test_error_mean = np.mean(1-test_scores, axis=1)
    test_error_std = np.std(1-test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_error_mean - train_error_std,
                     train_error_mean + train_error_std, alpha=0.1, color="r"
                     )
    # color="r"
    plt.fill_between(train_sizes, test_error_mean - test_error_std,
                     test_error_mean + test_error_std, alpha=0.1, color="g")
    # , color="g"
    plt.plot(train_sizes, train_error_mean, 'o-', color="r",
             label=" Training error")
    # color="r"
    plt.plot(train_sizes, test_error_mean, 'o-', color="g",
             label=" Cross-validation error")
    plt.legend(loc="best")
    plt.show()
#     return plt


X_test,y_test = TestData, TestLabel
X_train,y_train = TrainData, TrainLabel

print("start")

title = "Learning Curves for Random Forest"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = RandomForestClassifier()
plot_learning_curve(estimator, title, X_train, y_train, ylim=(0, 0.35), cv=cv, n_jobs=4)

print("rf finished")


title = "Learning Curves for Adaboost Classifier"
estimator = AdaBoostClassifier()
plot_learning_curve(estimator, title, X_train, y_train, ylim=(0, 0.35), cv=cv, n_jobs=4)

print("ab finished")

title = "Learning Curves for Gradient Boosting Tree"
#cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = GradientBoostingClassifier()
plot_learning_curve(estimator, title, X_train, y_train, ylim=(0, 0.35), cv=cv, n_jobs=4)

print("gbm finished")

title = "Learning Curves for Logistic Regression"
estimator = LogisticRegression()
plot_learning_curve(estimator, title, X_train, y_train, ylim=(0, 0.35), cv=cv, n_jobs=-1)

print("lr finished")

# plt.show()

start


## Comparing adaboost and gradient boosting tree

In [None]:
n_estimators = 400
learning_rate = 1
X_test,y_test = TestData, TestLabel
X_train,y_train = TrainData, TrainLabel

dt_stump=DecisionTreeClassifier(max_depth=1,min_samples_leaf=1)
dt_stump.fit(X_train,y_train)
dt_stump_err=1.0-dt_stump.score(X_test,y_test)
 
# dt=DecisionTreeClassifier(max_depth=9,min_samples_leaf=1)
# dt.fit(X_train,y_train)
# dt_err=1.0-dt.score(X_test,y_test)
 
# ada_discrete=AdaBoostClassifier(base_estimator=dt_stump,learning_rate=learning_rate,n_estimators=n_estimators,algorithm='SAMME')
# ada_discrete.fit(X_train,y_train)

# Random Forest
rf_clf = RandomForestClassifier(n_estimators=n_estimators)
rf_clf.fit(X_train,y_train)

# Gradient Boosting Tree
gbm_clf = GradientBoostingClassifier(n_estimators=n_estimators) # default n_estimator is 100
gbm_clf.fit(X_train,y_train)
 
# base_estimator=dt_stump
ada_real=AdaBoostClassifier(learning_rate=learning_rate,n_estimators=n_estimators,algorithm='SAMME.R')
ada_real.fit(X_train,y_train)
 
fig=plt.figure()
ax=fig.add_subplot(111)
# ax.plot([1,n_estimators],[dt_stump_err]*2,'k-',label='Decision Stump Error')
# ax.plot([1,n_estimators],[dt_err]*2,'k--',label='Decision Tree Error')
 
# ada_discrete_err=np.zeros((n_estimators,))
# for i,y_pred in enumerate(ada_discrete.staged_predict(X_test)):
#     ada_discrete_err[i]=zero_one_loss(y_pred,y_test)    ######zero_one_loss
# ada_discrete_err_train=np.zeros((n_estimators,))
# for i,y_pred in enumerate(ada_discrete.staged_predict(X_train)):
#     ada_discrete_err_train[i]=zero_one_loss(y_pred,y_train)
    
ada_real_err=np.zeros((n_estimators,))
for i,y_pred in enumerate(ada_real.staged_predict(X_test)):
    ada_real_err[i]=zero_one_loss(y_pred,y_test)
ada_real_err_train=np.zeros((n_estimators,))
for i,y_pred in enumerate(ada_real.staged_predict(X_train)):
    ada_real_err_train[i]=zero_one_loss(y_pred,y_train)
    
gbm_err=np.zeros((n_estimators,))
for i,y_pred in enumerate(gbm_clf.staged_predict(X_test)):
    gbm_err[i]=zero_one_loss(y_pred,y_test)
gbm_err_train=np.zeros((n_estimators,))
for i,y_pred in enumerate(gbm_clf.staged_predict(X_train)):
    gbm_err_train[i]=zero_one_loss(y_pred,y_train)

ax.plot(np.arange(n_estimators)+1,ada_real_err,label='Real AdaBoost Test Error',color='orange')
ax.plot(np.arange(n_estimators)+1,ada_real_err_train,label='Real AdaBoost Train Error',color='green')
# ax.plot(np.arange(n_estimators)+1,rf_err,label='Random Forest Test Error',color='red')
# ax.plot(np.arange(n_estimators)+1,rf_err_train,label='Random Forest Train Error',color='blue')
ax.plot(np.arange(n_estimators)+1,gbm_err,label='Gradient Boosting Tree Test Error',color='black')
ax.plot(np.arange(n_estimators)+1,gbm_err_train,label='Gradient Boosting Tree Train Error',color='yellow')
 
ax.set_ylim((0.0,0.5))
ax.set_xlabel('n_estimators')
ax.set_ylabel('error rate')
 
leg=ax.legend(loc='upper right',fancybox=True)
leg.get_frame().set_alpha(0.7)
b=time.time()
print('total running time of this example is :',b-a)
plt.show()



## ROC curve for logistic and adaboost

In [None]:
log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)

In [None]:
fpr, tpr, thresholds = roc_curve(TestLabel, log_clf.predict(TestData), pos_label=2)

roc_auc = auc(fpr, tpr)

plt.title('ROC')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)

plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
fpr, tpr, thresholds = roc_curve(TestLabel, ada_real.predict(TestData), pos_label=2)

roc_auc = auc(fpr, tpr)

plt.title('ROC')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)

plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# precision, recall, fscore
precision_recall_fscore_support(TestLabel, log_clf.predict(TestData), average = "weighted")

In [None]:
precision_recall_fscore_support(TestLabel, ada_real.predict(TestData), average = "weighted")

In [None]:
confusion_matrix(TestLabel, log_clf.predict(TestData)) 

In [None]:
confusion_matrix(TestLabel, ada_real.predict(TestData)) 

In [None]:
import scikitplot as skplt
import matplotlib.pyplot as plt

y_true = TestLabel
y_probas = ada_real.predict_proba(TestData)
skplt.metrics.plot_roc_curve(y_true, y_probas)
plt.show()

In [None]:
import scikitplot as skplt
import matplotlib.pyplot as plt

y_true = TestLabel
y_probas = log_clf.predict_proba(TestData)
skplt.metrics.plot_roc_curve(y_true, y_probas)
plt.show()

## Testing on this year's tournament on some single games

In [None]:
# random game from this year's tournament (UMBC vs. Virginia)
t1 = Season2018_avg_df[Season2018_avg_df['School'] == 'Maryland-Baltimore County']
t2 = Season2018_avg_df[Season2018_avg_df['School'] == 'Virginia']
columns_use = t1.columns.tolist()
#columns_use.remove('G')
columns_use.remove('School')
t1_test = t1[columns_use].reset_index(drop=True)
t2_test = t2[columns_use].reset_index(drop=True)
test_game = pd.concat([t1_test,t2_test], axis=1)

In [None]:
ada_real.predict(test_game) # UMBC won, predicted correctly

In [None]:
# random game from this year's tournament (Missouri vs. Florida State)
t1 = Season2018_avg_df[Season2018_avg_df['School'] == 'Missouri']
t2 = Season2018_avg_df[Season2018_avg_df['School'] == 'Florida State']
columns_use = t1.columns.tolist()
columns_use.remove('School')
t1_test = t1[columns_use].reset_index(drop=True)
t2_test = t2[columns_use].reset_index(drop=True)
test_game = pd.concat([t1_test,t2_test], axis=1)

In [None]:
ada_real.predict(test_game) # FSU won, predicted correctly

In [None]:
# random game from this year's tournament (Duke vs. Iona)
t1 = Season2018_avg_df[Season2018_avg_df['School'] == 'Duke']
t2 = Season2018_avg_df[Season2018_avg_df['School'] == 'Iona']
columns_use = t1.columns.tolist()
#columns_use.remove('G')
columns_use.remove('School')
t1_test = t1[columns_use].reset_index(drop=True)
t2_test = t2[columns_use].reset_index(drop=True)
test_game = pd.concat([t1_test,t2_test], axis=1)

In [None]:
ada_real.predict(test_game) # Duke won, predicted correctly

In [None]:
# random game from this year's tournament (Arizona vs. Buffalo)
t1 = Season2018_avg_df[Season2018_avg_df['School'] == 'Arizona']
t2 = Season2018_avg_df[Season2018_avg_df['School'] == 'Buffalo']
columns_use = t1.columns.tolist()
#columns_use.remove('G')
columns_use.remove('School')
t1_test = t1[columns_use].reset_index(drop=True)
t2_test = t2[columns_use].reset_index(drop=True)
test_game = pd.concat([t1_test,t2_test], axis=1)

In [None]:
ada_real.predict(test_game) # Buffalo won, predicted correctly

In [None]:
# random game from this year's tournament (Villanova vs. Radford)
t1 = Season2018_avg_df[Season2018_avg_df['School'] == 'Radford']
t2 = Season2018_avg_df[Season2018_avg_df['School'] == 'Villanova']
columns_use = t1.columns.tolist()
#columns_use.remove('G')
columns_use.remove('School')
t1_test = t1[columns_use].reset_index(drop=True)
t2_test = t2[columns_use].reset_index(drop=True)
test_game = pd.concat([t1_test,t2_test], axis=1)

In [None]:
ada_real.predict(test_game) # Villanova won, predicted correctly

In [None]:
plotlist = ['fgm', 'fga', 'fgm3', 'fga3', 'ftm', 'fta', 'or', 'dr', 'ast', 'to', 'stl', 'blk', 'pf']

for key in plotlist:
    x = WinLosePair_df['W' + key]
    y = WinLosePair_df['L' + key]
    plt.figure()
    plt.plot(range(len(WinLosePair_df['W' + key])), x, 'o-', color = "r", label=" Win")
    plt.plot(range(len(WinLosePair_df['L' + key])), y, '*-', color = "g", label=" Lose")
    plt.xlabel("Team")
    plt.ylabel("Value")
    plt.legend(loc="best")
    plt.title(key + '_Comparison')
    plt.savefig(key + '_Comparison')

In [None]:
x = WinLosePair_df['Wfgm'] / WinLosePair_df['Wfga']
y = WinLosePair_df['Lfgm'] / WinLosePair_df['Lfga']
plt.figure()
plt.plot(range(len(WinLosePair_df['Wfgm'])), x, 'o-', color = "r", label=" Win")
plt.plot(range(len(WinLosePair_df['Lfgm'])), y, '*-', color = "g", label=" Lose")
plt.xlabel("Team")
plt.ylabel("Value")
plt.legend(loc="best")
plt.title('ShootingAverage_Comparison')
plt.savefig('ShootingAverage_Comparison')