In [42]:
#import ML frameworks
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = RidgeClassifier(alpha=1)

lreg = LogisticRegression(solver='liblinear')
svm = SVC()
knn = KNeighborsClassifier()
decTree = DecisionTreeClassifier()
rForest = RandomForestClassifier()
gaussNB = GaussianNB()
logReg = LogisticRegression()

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=30, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

In [43]:
#adds "TARGET" column to data
def addTarget(group):
    group["TARGET"] = group["WIN"]
    return group

In [44]:
#use previous season data to predict future seasons
def backtest(data, model, predictors, start=2, step=1):
    #list of dataframes where each df is the prediction for a season
    allPreds = []
    
    szns = sorted(data["YEAR"].unique())
    
    for sznIndex in range(start, len(szns), step):
        szn = szns[sznIndex]
        
        train = data[data["YEAR"] < szn]
        test = data[data["YEAR"] == szn]
        
        model.fit(train[predictors], train["TARGET"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test["TARGET"], preds], axis=1)
        combined.columns = ["ACTUAL", "PREDICTION"]
        
        allPreds.append(combined)
    
    return pd.concat(allPreds)

In [45]:
#runs given prediction model on given data
def fullTest(data, model, predictors, start=2, step=1):
    #list of dataframes where each df is the prediction for a season
    allPreds = []
    
    szns = sorted(data["YEAR"].unique())
    
    for sznIndex in range(start, len(szns), step):
        szn = szns[sznIndex]
        
        train = data[data["YEAR"] < szn]
        test = data[data["YEAR"] == szn]
        
        model.fit(data[predictors], data["TARGET"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test["TARGET"], preds], axis=1)
        combined.columns = ["ACTUAL", "PREDICTION"]
        
        allPreds.append(combined)
    
    return pd.concat(allPreds)

In [46]:
from sklearn.metrics import accuracy_score
modelsNames = ["Ridge", "LinReg", "SVM", "KNN", "DecisionTree", "RandomForest", "GaussianNB", "LogisticRegression"]
def runAllModels(df):
    models = [rr, lreg, svm, knn, decTree, rForest, gaussNB, logReg]
#     modelsNames = ["Ridge", "LinReg", "SVM", "KNN", "DecisionTree", "RandomForest", "GaussianNB"]
    accs = []
    for i in range(len(models)):
        predictions = backtest(df, models[i], predictors)
        predictions2 = fullTest(df, models[i], predictors)
        # predictions

        acc1 = accuracy_score(predictions["ACTUAL"], predictions["PREDICTION"])
        acc2 = accuracy_score(predictions2["ACTUAL"], predictions2["PREDICTION"])
        print(modelsNames[i])
        print(round(acc1,5))
        print(round(acc2,5))
        print("more accurate:", "backTest" if acc1 > acc2 else "fullTest")
        accs.append(acc1 if acc1 > acc2 else acc2)
        print("======")
    return(accs)

In [47]:
#import data

import pandas as pd
filePath = "C:/Users/danna/Documents/GitHub/WBBTournamentPredictions/MachineLearning/LearningData/learningData8.csv"
df = pd.read_csv(filePath, index_col=0)
df

Unnamed: 0,TEAM,FT,FTA,ORB,DRB,AST,STL,BLK,TOV,PF,...,PF_MAX_OPP,PTS_MAX_OPP,FT/FGA_MAX_OPP,AST/TOV_MAX_OPP,FG%_MAX_OPP,2P%_MAX_OPP,3P%_MAX_OPP,FT%_MAX_OPP,WIN,GAME_ID
0,Austin Peay,7,9,8,12,4,4,3,20,13,...,5,18,0.571429,2.000000,0.818182,1.000000,0.333333,0.666667,False,0
1,Tennessee,8,13,21,25,18,6,5,13,14,...,4,21,0.222222,0.333333,0.571429,0.545455,0.666667,1.000000,True,0
2,Baylor,15,20,5,23,11,5,5,20,17,...,4,34,0.625000,2.000000,0.538462,0.611111,0.375000,0.625000,False,1
3,Connecticut,11,19,12,23,20,6,3,7,15,...,5,14,0.400000,1.500000,0.500000,0.750000,0.000000,1.000000,True,1
4,Baylor,13,20,8,28,12,4,9,12,18,...,5,16,0.388889,1.000000,0.375000,0.375000,0.500000,1.000000,True,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
947,Connecticut,11,17,8,24,14,6,7,11,15,...,4,16,0.375000,3.000000,0.375000,0.454545,1.000000,1.000000,True,473
948,UCLA,20,21,25,19,12,6,2,12,16,...,4,23,0.300000,1.500000,0.875000,0.875000,0.166667,0.750000,True,474
949,Maryland,15,25,17,21,12,7,8,13,19,...,3,30,0.625000,2.000000,0.521739,0.666667,0.666667,1.000000,False,474
950,Wright State,5,7,17,21,10,6,1,11,25,...,3,27,1.750000,3.500000,0.545455,0.500000,0.555556,1.000000,False,475


In [48]:
#add target column
df = addTarget(df)

In [49]:
#handles null values in target column
df["TARGET"][pd.isnull(df["TARGET"])] = 2
df["TARGET"] = df["TARGET"].astype(int, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["TARGET"][pd.isnull(df["TARGET"])] = 2


In [50]:
#select needed columns
removed_columns = ["YEAR", "WIN", "TEAM", "TEAM_OPP", "GAME_ID", "TARGET", "PTS", "PTS_OPP"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [51]:
#preprocess with Min Max Scaler
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [52]:
#handle unneeded columns
df[selected_columns][pd.isnull(df[selected_columns])] = 2
df[selected_columns] = df[selected_columns].astype(int, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_columns][pd.isnull(df[selected_columns])] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_columns][pd.isnull(df[selected_columns])] = 2


In [53]:
#fit the data 
sfs.fit(df[selected_columns], df["TARGET"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30, n_jobs=1)

In [54]:
#get predictor columns
predictors = list(selected_columns[sfs.get_support()])
predictors

['FTA',
 'ORB',
 'DRB',
 'AST',
 'STL',
 'TOV',
 'PF',
 'AST/TOV',
 'FG%',
 '2P%',
 '3P%',
 'FT_MAX',
 'FTA_MAX',
 'ORB_MAX',
 'DRB_MAX',
 'AST_MAX',
 'STL_MAX',
 'TOV_MAX',
 'PF_MAX',
 'PTS_MAX',
 'FT/FGA_MAX',
 'AST/TOV_MAX',
 '2P%_MAX',
 '3P%_MAX',
 'FTA_OPP',
 'ORB_OPP',
 'AST_MAX_OPP',
 'PF_MAX_OPP',
 'FG%_MAX_OPP',
 '3P%_MAX_OPP']

In [41]:
accs = runAllModels(df)
print(accs)
maxIndex = accs.index(max(accs))
topModels = []
for i in range(len(accs)):
    if(accs[i] == max(accs)):
        topModels.append(models[i])
print(topModels)

Ridge
0.66854
0.67416
more accurate: fullTest
LinReg
0.66573
0.66994
more accurate: fullTest
SVM
0.63904
0.67416
more accurate: fullTest


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN
0.62079
0.66573
more accurate: fullTest
DecisionTree
0.64045
0.67275
more accurate: fullTest
RandomForest
0.64607
0.67275
more accurate: fullTest
GaussianNB
0.56882
0.51826
more accurate: backTest
LogisticRegression
0.66573
0.66994
more accurate: fullTest
[0.6741573033707865, 0.6699438202247191, 0.6741573033707865, 0.6657303370786517, 0.672752808988764, 0.672752808988764, 0.5688202247191011, 0.6699438202247191]
[RidgeClassifier(alpha=1), SVC()]


In [21]:
model = 
predictions2 = fullTest(df, model, predictors)
predictions2

Unnamed: 0,ACTUAL,PREDICTION
240,0,0
241,1,0
242,0,0
243,1,0
244,0,0
...,...,...
947,1,0
948,1,0
949,0,0
950,0,0


In [13]:
#cross validate
from sklearn.model_selection import cross_val_predict, cross_val_score
# models = [rr, lreg, svm, knn, decTree, rForest, gaussNB, logReg]
for model in models:
    cvPred = cross_val_predict(model, df[predictors], df["TARGET"], cv=3)
    cvScore = cross_val_score(model, df[predictors], df["TARGET"], cv=3)
    print(cvScore)

[0.68553459 0.67823344 0.64037855]
[0.68553459 0.67507886 0.64037855]
[0.68238994 0.66561514 0.6214511 ]


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


[0.68553459 0.66876972 0.56782334]
[0.68867925 0.66561514 0.60252366]
[0.68238994 0.67192429 0.57728707]
[0.50314465 0.51104101 0.50473186]
