In [1]:
## our imports for the model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sc
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from itertools import combinations

In [2]:
# Plan: fit the model on the 2015 through 2018 seasons, then present findings by
# applying it to the 2019 data to predict the results of the conference tournaments.

#### variables  ###
numIt = 100

# Both input tables are tab-separated, carry a UTF-8 BOM ("utf-8-sig") and use
# their first column as the row index, so they share one set of reader options.
csvOptions = {'delimiter': '\t', 'encoding': "utf-8-sig", 'index_col': [0]}

totalDataset = pd.read_csv('PowerSixBball.csv', **csvOptions)   # later this can be made for any title
seasonStats = pd.read_csv('SeasonStatistics.csv', **csvOptions)

Now we need to generate the target column that the model will be trained to predict. Assign 1 to games where the first team wins, 0 to games where the second team wins, and 0.5 to games that end in a tie.

In [3]:
# Score differential: points of team 1 minus points of team 2.
# NOTE(review): the two score columns are picked by position (the last two columns
# of the games table) - confirm this matches the CSV layout.
totalDataset['Score_Diff'] = totalDataset[totalDataset.columns[-2]] - totalDataset[totalDataset.columns[-1]]

## Author's note: the columns dropped below were removed because the model was
## overfitting, i.e. several data columns were devoted to FG, 2P, 3P and FT, and
## MP and G are redundant as well.
seasonStats.drop(['MP', 'Team_ID', 'FG', '2P', '3P', 'FT'], axis=1, inplace=True)
totalDataset.drop(['MP', 'Team_ID', 'FG', '2P', '3P', 'FT', 'Team_ID.1', 'MP.1', 'FG.1', '2P.1', '3P.1', 'FT.1'], axis=1, inplace=True)

## Winning_Team is the training target, from team 1's point of view:
##   1   -> team 1 won  (Score_Diff > 0)
##   0   -> team 2 won  (Score_Diff < 0)
##   0.5 -> anything else (a tie, or a missing score differential)
## Computed in one vectorized pass so it does not depend on the index labels
## being exactly 0..n-1, which the previous row-by-row .loc[i, ...] loop required.
scoreDiff = totalDataset['Score_Diff']
totalDataset['Winning_Team'] = np.select([scoreDiff < 0, scoreDiff > 0], [0.0, 1.0], default=0.5)


## Remove the 2019 data from the data frame; the 2019 season will be used as a case study of our model.

In [4]:
##### we aren't building the model with the 2019 data

# Rows are selected with a boolean mask on 'Date' rather than with head()/tail(),
# so the split stays correct even if the 2019 rows are not the last rows of the file.
isCurrYearGame = totalDataset['Date'] == 2019

currYearIndex = totalDataset.index[isCurrYearGame].values.astype(int)   # index labels of the 2019 games
currGamesNum = len(currYearIndex)               # how many 2019 games there are
totalGamesNum = len(totalDataset.index)         # how many games in total

# split the games into the 2019 hold-out and the remaining (training) seasons
currYearGames = totalDataset[isCurrYearGame].reset_index(drop=True)   # 2019, renumbered from 0
dataset = totalDataset[~isCurrYearGame]                               # every other season, original labels kept

## we need the 2019 season statistics data as well
isCurrYearStat = seasonStats['Date'] == 2019

currYearSSIndex = seasonStats.index[isCurrYearStat].values.astype(int)   # index labels of the 2019 stats rows
currSSNum = len(currYearSSIndex)                # how many team rows in 2019
# renumbered from 0 so later cells can address rows by position
currYearData = seasonStats[isCurrYearStat].reset_index(drop=True)


## Determine the Games which We will predict

In [5]:
##### group the 2019 teams by conference: one inner list of team names per conference
numConferences = 6    # how many conferences under consideration

conferenceTeams = [[] for _ in range(numConferences)]   # slot k holds the teams whose Conf_ID is k + 1

for rowLabel in range(len(currYearData.index)):
    confId = currYearData.loc[rowLabel, 'Conf_ID']
    for slot in range(numConferences):
        if confId == slot + 1:
            conferenceTeams[slot].append(currYearData.loc[rowLabel, 'Team'])

##### the games we need odds for: every pair of teams within the same conference

gamesToPredict = []
for members in conferenceTeams:
    gamesToPredict.extend(combinations(members, 2))


In [6]:
###### need to compile the dataframe  #####

# For every game to predict, put the 2019 season stats of both teams side by side:
# team 1 keeps the plain column names, team 2 gets a ".1" suffix.
# The frames are built with a single positional selection per side instead of
# DataFrame.append in a loop: append was removed in pandas 2.0 and row-by-row
# growth is quadratic.

# position of the first row for each team in the 2019 stats table
teamRow = {}
for pos, team in enumerate(currYearData['Team']):
    teamRow.setdefault(team, pos)

team1DataDF = currYearData.iloc[[teamRow[first] for first, _ in gamesToPredict]].reset_index(drop=True)
team2DataDF = currYearData.iloc[[teamRow[second] for _, second in gamesToPredict]].reset_index(drop=True)

# Add .1 to all the second team columns, except 'Date'. Each column is renamed
# in place, so the labels stay aligned wherever 'Date' sits in the column order.
names = [col if col == 'Date' else col + ".1" for col in team2DataDF.columns]
team2DataDF.columns = names

# concatenate them together in the correct order
predictedGames = pd.concat((team1DataDF, team2DataDF), axis=1, sort=False)   # we have one extra date column
# drop the duplicated column label; this is the reason 'Date' kept its name above
predictedGames = predictedGames.loc[:, ~predictedGames.columns.duplicated()]

# team and conference names are identifiers, not model features
predictedGames2 = predictedGames.drop(['Team', 'Team.1', 'Conf', 'Conf.1'], axis=1)


## Use Machine Learning to generate Odds for each possible conference Tournament Game 

In [7]:

odds = []          # one array per fitted model: its prediction for every game in predictedGames2
percentages = []   # one value per fitted model: share of held-out games predicted exactly

# do not want any score information in the model or team names -> only data from each team's season
nonFeatureCols = ['Team', 'Team.1', 'Conf', 'Conf.1', 'Score', 'Score_Diff', 'Score.1', 'Winning_Team']

for j in range(numIt):
    # split our data into train and test sets, 20 percent for test
    # (adjust test_size to change model accuracy). Seeding with the iteration
    # number gives every model a different split while keeping a re-run reproducible.
    train, test = train_test_split(dataset, test_size=0.2, random_state=j)

    x_train = train.drop(nonFeatureCols, axis=1)
    x_test = test.drop(nonFeatureCols, axis=1)
    # the target is the winning-team encoding
    y_train = train['Winning_Team']
    y_test = test['Winning_Team']

    ## we now build the Decision tree model (seeded for the same reason as the split)
    model = DecisionTreeRegressor(random_state=j)
    model.fit(x_train, y_train)     # fit it to the training set

    # NOTE(review): predictedGames2 is assumed to carry the same feature columns,
    # in the same order, as x_train - confirm.
    predicted1 = model.predict(predictedGames2)   # predictions for the games we want odds for
    predicted = model.predict(x_test)             # predictions for the held-out games

    odds.append(predicted1)

    ## accuracy: a prediction counts as correct only when it equals the target exactly
    correct = int(np.sum(predicted == y_test.values))
    percentage = correct / len(predicted)
    percentages.append(percentage)

aveCorrect = np.mean(percentages)			# interested in the correct percentage on average
print('On average, the decision tree models had an accuracy of:   ', aveCorrect)

### use the models to calculate the odds for each game -> this will be the odds the first team wins
# mean prediction per game across all fitted models; empty when no model was fitted
finalOdds = list(np.mean(odds, axis=0)) if odds else []



On average, the decision tree models had an accuracy of:    0.706


In [8]:
### assemble the dataframe we will export
# split each (first team, second team) pairing into two parallel lists
firstTeamIDList = [pairing[0] for pairing in gamesToPredict]
secondTeamIDList = [pairing[1] for pairing in gamesToPredict]
team1List = []
team2List = []

# one dictionary entry per output column
d = {
    'First_Team': firstTeamIDList,
    'Second_Team': secondTeamIDList,
    'Probability_First_Team': finalOdds,
}
# build the dataframe from the dictionary, fixing the column order explicitly
oddsDataframe = pd.DataFrame(d, columns=['First_Team', 'Second_Team', 'Probability_First_Team'])


## Visual Representation of the Dataframe built 

In [9]:
# Bare last expression: the notebook renders the odds table as this cell's output.
oddsDataframe

Unnamed: 0,First_Team,Second_Team,Probability_First_Team
0,villanova,butler,0.80
1,villanova,georgetown,0.82
2,villanova,providence,0.87
3,villanova,st-johns-ny,0.92
4,villanova,xavier,0.62
5,villanova,seton-hall,0.87
6,villanova,depaul,0.45
7,villanova,creighton,0.61
8,villanova,marquette,0.46
9,butler,georgetown,0.53


## export to csv 

In [10]:
# Write both result tables as tab-separated, UTF-8 encoded files.
exportOptions = {'sep': '\t', 'encoding': 'utf-8'}

predictedGames2.to_csv('2019_games_predicted.csv', **exportOptions)   # feature rows fed to the model
oddsDataframe.to_csv('2019_tournament_o2odds.csv', **exportOptions)   # per-game odds for the first team