# MARCH MADNESS PREDICTOR

Authors: Connor Finn, Riley Greene <br>
Date: 1/24/20 <br>
Warren Buffett is still paying 1 billion for a perfect bracket

In [6]:
# our imports for the model
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup 
import lxml.html as lh

# a few constants
# First tournament season to keep: the tournament results are cut down to the
# rows from this season onward when ncaa_short is built.
start_year = 2009

Because basketball has changed so much in recent years, we believe that data prior to 2009 could be detrimental to our model. For this reason, the next block of code slims the data down from the 1985 - 2018 seasons to a dataframe including only the 2009 - 2018 NCAA tournaments.

In [7]:
# read ncaa tournament results into dataframe (data obtained from Kaggle)
ncaa_total = pd.read_csv('NCAATourneyCompactResults.csv')
# read team ID dataframe (from Kaggle)
team_data = pd.read_csv('teams.csv')
# Keep every tournament game played in start_year or later.
# Filtering with a boolean mask (rather than locating the first matching index
# label and slicing by position) does not depend on the file being sorted by
# season or on the frame having a default 0..n-1 index, and it does not raise
# if start_year itself has no games.
# .copy() makes ncaa_short an independent frame instead of a slice of ncaa_total.
ncaa_short = ncaa_total[ncaa_total["Season"] >= start_year].copy()

The team_data dataframe provides names written like 'Penn State'. The data we will be collecting is from Sports-reference.com. The url for Penn State's data is 'https://www.sports-reference.com/cbb/schools/penn-state/2020.html' <br>
1) the name needs to be converted to all lowercase, with spaces replaced by hyphens <br>
2) the overall framework is 'https://www.sports-reference.com/cbb/schools/NAME/YEAR.html'<br>

In [8]:
"""
* this function will be used to transform a provided name to one that can be input into a sports-reference url
* this will likely need to be adjusted as new teams are included
* this needs a unit test (i.e. run all our team names through once to see if they pass)
"""

# Characters rewritten before any name-specific rule runs:
# spaces become hyphens; parentheses, periods, apostrophes and ampersands are dropped.
_SR_CHAR_TABLE = str.maketrans({
    ' ': '-',
    '(': None,
    ')': None,
    '.': None,
    "'": None,
    '&': None,
})

# Ordered (old, new) substring rewrites. A leading/trailing '!' in a pattern
# anchors it to the start/end of the name (see clean_team_name).
# ORDER MATTERS: later rules see the output of earlier ones (e.g. '!loy-' must
# run before '!loyola-chicago!', and '!s-illinois!' before the generic '!s-').
_SR_NAME_RULES = (
    ('!southern-univ!', '!southern!'),
    ('!w-texas-am!', '!west-texas-am!'),
    ('!armstrong-st!', '!armstrong!'),
    ('ark-', 'arkansas-'),
    ('-st!', '-state!'),
    ('!n-', '!north-'),
    ('!e-', '!eastern-'),
    ('!cs-', '!cal-state-'),
    ('!s-illinois!', '!southern-illinois!'),
    ('!s-', '!south-'),
    ('!w-', '!western-'),
    ('!nc-', '!north-carolina-'),
    ('chr!', 'christian!'),
    ('-so!', '-southern!'),
    ('!fl-', '!florida-'),
    ('!uc-', '!california-'),
    ('intl!', 'international!'),
    ('cal-state-sacramento', 'sacramento-state'),
    ('univ!', 'university!'),
    ('miss!', 'mississippi!'),
    ('-slo', ''),
    ('car!', 'carolina!'),
    ('!ne-omaha!', '!nebraska-omaha!'),
    ('!ne-', '!northeastern-'),
    ('!ut-', '!texas-'),
    ('okla-', 'oklahoma-'),
    ('!penn!', '!pennsylvania!'),
    ('!unc', '!north-carolina-'),
    ('!american-university!', '!american!'),
    ('!detroit!', '!detroit-mercy!'),
    ('!loy-', '!loyola-'),
    ('!loyola-chicago!', '!loyola-il!'),
    ('!north-kentucky!', '!northern-kentucky!'),
    ('!north-illinois!', '!northern-illinois!'),
    ('!north-colorado!', '!northern-colorado!'),
    ('!cal-baptist!', '!california-baptist!'),
    ('!augusta!', '!augusta-state!'),
    ('!etsu!', '!east-tennessee-state!'),
    ('!mt-', '!mount-'),
    ('!g-washington!', '!george-washington!'),
    ('!ga-', '!georgia-'),
    ('!il-', '!illinois-'),
    ('!houston-bap!', '!houston-baptist!'),
    ('!kennesaw!', '!kennesaw-state!'),
    ('!bowling-green!', '!bowling-green-state!'),
    ('!col-charleston!', '!college-of-charleston!'),
    ('!cent-arkansas!', '!central-arkansas!'),
    ('!central-conn!', '!central-connecticut-state!'),
    ('!kent!', '!kent-state!'),
    ('!lsu!', '!louisiana-state!'),
    ('!ms-', '!mississippi-'),
    ('!f-dickinson!', '!fairleigh-dickinson!'),
    ('!byu!', '!brigham-young!'),
    ('!ma-', '!massachusetts-'),
    ('!northwestern-la!', '!northwestern-state!'),
    ('!long-island!', '!long-island-university!'),
    ('!wi-', '!'),
    ('!c-', '!central-'),
    ('!md-e-shore!', '!maryland-eastern-shore!'),
    ('!st-johns!', '!st-johns-ny!'),
    ('!tcu!', '!texas-christian!'),
    ('!tx-', '!texas-'),
    ('!va-', '!virginia-'),
    # spelling fixed: was 'virginia-millitary-institute', which is not a valid slug
    ('!vmi!', '!virginia-military-institute!'),
    ('!wku!', '!western-kentucky!'),
    ('!utep!', '!texas-el-paso!'),
    ('!st-marys-ca!', '!saint-marys-ca!'),
    ('!santa-barbara!', '!california-santa-barbara!'),
    ('!unlv!', '!nevada-las-vegas!'),
    ('-pa!', '!'),
    ('!uab!', '!alabama-birmingham!'),
    ('!mtsu!', '!middle-tennessee!'),
    ('!smu!', '!southern-methodist!'),
    ('!sf-austin!', '!stephen-f-austin!'),
    ('!st-josephs!', '!saint-josephs!'),
    ('!umbc!', '!maryland-baltimore-county!'),
    ('!usc!', '!southern-california!'),
    ('!st-peters!', '!saint-peters!'),
    ('!st-louis!', '!saint-louis!'),
    ('!ull!', '!louisiana-lafayette!'),
    # final tidy-up of artifacts produced by the rules above
    ('--', '-'),
    ('university-of-', ''),
)


def clean_team_name(name):
    """Convert a team name into the slug used in sports-reference.com URLs.

    Parameters
    ----------
    name : str
        Team name as it appears in the team list (e.g. 'Penn St', 'UNC Wilmington').

    Returns
    -------
    str
        Lower-case, hyphen-separated slug (e.g. 'penn-state',
        'north-carolina-wilmington').

    Notes
    -----
    The name is wrapped in '!' sentinels so that rules can anchor to the start
    or end of the name ('!n-' only matches a leading 'n-', '-st!' only a
    trailing '-st'). The sentinels are stripped before returning. Rules are
    applied in the order listed in _SR_NAME_RULES; names not covered by any
    rule pass through with only the character clean-up and lower-casing.
    """
    team = ('!' + name + '!').translate(_SR_CHAR_TABLE).lower()
    for old, new in _SR_NAME_RULES:
        team = team.replace(old, new)
    return team[1:-1]   # drop the '!' sentinels
def test_team_names(team_names, timeout=10):
    """Check that every slug resolves to a sports-reference.com school page.

    A slug passes when its '/cbb/schools/<slug>/index.html' page contains at
    least one <table> element. A slug whose request fails (timeout, connection
    error, ...) is reported and counted as a failure instead of aborting the
    whole run.

    Parameters
    ----------
    team_names : list of str
        Slugs to check (as produced by clean_team_name).
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up. Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    int
        Number of slugs that failed the check.
    """
    errors = 0
    for name in team_names:
        url_test = "https://www.sports-reference.com/cbb/schools/" + name + "/index.html"
        try:
            page_test = requests.get(url_test, timeout=timeout)   # scrape
        except requests.RequestException as exc:
            # network problem for this one slug: record it and keep going
            print(name , " Fail (" + type(exc).__name__ + ")")
            errors += 1
            continue
        soup_test = BeautifulSoup(page_test.content, "lxml")   # parse
        tables_test = soup_test.find_all('table')    # search for tables
        if len(tables_test) != 0:
            print("                                 " , name , " Pass")
        else:
            print(name , " Fail")
            errors += 1
    print("finished unit test. There were " , errors , " errors.")
    return errors

team_data is a dataframe which has the columns: 'TeamID', 'TeamName', 'FirstD1Season', 'LastD1Season'.  We want to create a new column which has the team names in accordance with the sports-reference.com framework. 

First, reduce the team_data dataframe to only those teams that actually competed in the tournament

In [9]:
# IDs of every team that appears as a winner or a loser in the retained games.
# pd.concat is used because Series.append was removed in pandas 2.0.
tourney_team_ids = set(pd.concat([ncaa_short.WTeamID, ncaa_short.LTeamID]))

# Keep only the rows of team_data for teams that played in those tournaments.
# One vectorised mask replaces the per-ID drop loop and keeps the same rows
# (with their original index labels). .copy() gives an independent frame so
# adding columns to it later does not trigger SettingWithCopyWarning.
team_data = team_data[team_data.TeamID.isin(tourney_team_ids)].copy()


In [10]:
# Raw team names, in the same row order as team_data
team_names = team_data["TeamName"].tolist()

# Translate each raw name into its sports-reference slug
sr_names = [clean_team_name(raw_name) for raw_name in team_names]

# Validate every slug against the live site; the column is attached only when
# all of them resolve (zero errors reported)
num_errors = test_team_names(sr_names)
if not num_errors:
    team_data["SrNames"] = sr_names

                                  akron  Pass
                                  alabama  Pass
                                  alabama-state  Pass
                                  albany-ny  Pass
                                  american  Pass
                                  arizona  Pass
                                  arizona-state  Pass
                                  arkansas-little-rock  Pass
                                  arkansas-pine-bluff  Pass
                                  arkansas  Pass
                                  auburn  Pass
                                  austin-peay  Pass
                                  baylor  Pass
                                  belmont  Pass
                                  binghamton  Pass
                                  boise-state  Pass
                                  boston-college  Pass
                                  boston-university  Pass
                                  bucknell  Pass
                      

                                  south-carolina  Pass
                                  south-florida  Pass
                                  southern-mississippi  Pass
                                  southern  Pass
                                  st-bonaventure  Pass
                                  st-johns-ny  Pass
                                  saint-josephs  Pass
                                  saint-louis  Pass
                                  saint-marys-ca  Pass
                                  saint-peters  Pass
                                  stanford  Pass
                                  stony-brook  Pass
                                  syracuse  Pass
                                  texas-christian  Pass
                                  temple  Pass
                                  tennessee  Pass
                                  texas  Pass
                                  texas-am  Pass
                                  texas-tech  Pass
           

Goal: compile season-long data for the teams who competed in the 2009 - 2018 NCAA tournaments. The idea is that the season-long data is the information we will have in the future for creating predictions.<br>
- in the ncaa_short, we have the year the game was played, and the id's of both teams
- in team_data we have the team id, along with the team name (cleaned for sports-reference use).

In [11]:
# 2 x n_games arrays: row 0 is the season, row 1 the winning / losing team ID
winner_array = np.vstack((ncaa_short.Season.values , ncaa_short.WTeamID.values))
loser_array = np.vstack((ncaa_short.Season.values , ncaa_short.LTeamID.values))

# List of years
years = ncaa_short.Season.unique()

# Build a 2 x n array of every unique (year, school_id) pair that played in a
# tournament: row 0 holds the year, row 1 the team ID.
# Starting from np.empty((2, 0)) makes all_teams a float array, so values read
# back from it are floats (cast with int() where an integer is needed).
all_teams = np.empty((2, 0))
a_team_list = []   # flat list of the same team IDs, season after season
for year in years:
    season_games = ncaa_short.loc[ncaa_short.Season == year]
    # winners first, then losers, de-duplicated in order of first appearance
    # (pd.concat replaces Series.append, which was removed in pandas 2.0)
    season_team_ids = pd.concat([season_games.WTeamID, season_games.LTeamID]).unique()
    a_team_list = a_team_list + season_team_ids.tolist()
    # builtin int replaces the np.int alias, which was removed in NumPy 1.24
    season_col = np.full(shape=len(season_team_ids), fill_value=year, dtype=int)
    all_teams = np.hstack((all_teams, np.vstack((season_col, season_team_ids))))


We now want to build a dataframe which has a list of the teams and their season long statistics. <br> 
all_teams is now a 2xn array of unique teams (year, school_id). We will gain access to the team name using: <br>
- team_name = team_data.SrNames.loc[(team_data.TeamID == INSERT TEAM ID HERE)].values.tolist()[0]

In [13]:
# Collect the season-long stats row for every (year, team) column of all_teams
# by scraping that team's season page on sports-reference.com.
# Each team's frame is appended to a list and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies every previous
# row on each iteration.
season_frames = []

for i in range(all_teams.shape[1]):   # one iteration per (year, team) pair
    # look up the sports-reference slug for this team ID
    team_name = team_data.SrNames.loc[(team_data.TeamID == all_teams[1 , i])].values.tolist()[0]
    url3 = "https://www.sports-reference.com/cbb/schools/" + str(team_name) + "/" + str(int(all_teams[0 , i])) + ".html" # season data
    pageSe = requests.get(url3, timeout=30)   # scrape; the timeout stops a stalled connection from hanging the run
    soupSe = BeautifulSoup(pageSe.content, "lxml")   # parse
    tablesSe = soupSe.find_all('table')    # search for tables
    print("Team number " ,  i)   # this shows progress
    # the second table on the page is taken to be the season totals table
    dfSe = pd.read_html(str(tablesSe[1]))[0]
    dfSe = dfSe.drop([1 , 2, 3])   # only want the team data. (although might consider fouls against too)
    print(team_name)
    dfSe = dfSe.replace('Team' , team_name)   # want the name of the team
    dfSe['Team_ID'] = int(all_teams[1 , i])   # put in the team ID's
    dfSe['Date'] = all_teams[0 , i]   # only want the year in this column
    season_frames.append(dfSe)

# an empty all_teams yields an empty frame (pd.concat rejects an empty list)
seasonStats = pd.concat(season_frames , axis=0) if season_frames else pd.DataFrame()
seasonStats = seasonStats.rename(columns={'Unnamed: 0': 'Team'})  # simply clean up the columns
    
    

Team number  0
morehead-state
Team number  1
connecticut
Team number  2
duke
Team number  3
gonzaga
Team number  4
louisiana-state
Team number  5
maryland
Team number  6
memphis
Team number  7
michigan
Team number  8
north-carolina
Team number  9
oklahoma
Team number  10
purdue
Team number  11
texas
Team number  12
texas-am
Team number  13
ucla
Team number  14
villanova
Team number  15
western-kentucky
Team number  16
washington
Team number  17
arizona
Team number  18
arizona-state
Team number  19
cleveland-state
Team number  20
dayton
Team number  21
kansas
Team number  22
louisville
Team number  23
marquette
Team number  24
michigan-state
Team number  25
missouri
Team number  26
oklahoma-state
Team number  27
pittsburgh
Team number  28
siena
Team number  29
syracuse
Team number  30
southern-california
Team number  31
wisconsin
Team number  32
xavier
Team number  33
alabama-state
Team number  34
chattanooga
Team number  35
binghamton
Team number  36
akron
Team number  37
butler
Team n

Team number  292
iowa-state
Team number  293
kansas
Team number  294
miami-fl
Team number  295
minnesota
Team number  296
mississippi
Team number  297
north-carolina
Team number  298
ohio-state
Team number  299
san-diego-state
Team number  300
temple
Team number  301
liberty
Team number  302
middle-tennessee
Team number  303
long-island-university
Team number  304
boise-state
Team number  305
belmont
Team number  306
bucknell
Team number  307
nevada-las-vegas
Team number  308
missouri
Team number  309
southern
Team number  310
new-mexico
Team number  311
davidson
Team number  312
south-dakota-state
Team number  313
valparaiso
Team number  314
oklahoma-state
Team number  315
new-mexico-state
Team number  316
montana
Team number  317
akron
Team number  318
pittsburgh
Team number  319
cincinnati
Team number  320
albany-ny
Team number  321
georgetown
Team number  322
northwestern-state
Team number  323
colorado
Team number  324
notre-dame
Team number  325
western-kentucky
Team number  326


Team number  580
florida-gulf-coast
Team number  581
south-dakota-state
Team number  582
nevada
Team number  583
minnesota
Team number  584
vanderbilt
Team number  585
princeton
Team number  586
vermont
Team number  587
virginia-commonwealth
Team number  588
north-carolina-wilmington
Team number  589
bucknell
Team number  590
virginia-tech
Team number  591
maryland
Team number  592
seton-hall
Team number  593
new-mexico-state
Team number  594
troy
Team number  595
northern-kentucky
Team number  596
jacksonville-state
Team number  597
oklahoma-state
Team number  598
miami-fl
Team number  599
texas-southern
Team number  600
iona
Team number  601
creighton
Team number  602
marquette
Team number  603
kent-state
Team number  604
southern-methodist
Team number  605
dayton
Team number  606
radford
Team number  607
st-bonaventure
Team number  608
syracuse
Team number  609
texas-southern
Team number  610
alabama
Team number  611
buffalo
Team number  612
duke
Team number  613
florida
Team number

In [15]:
# preview the first rows only rather than rendering the whole scraped frame
seasonStats.head(10)

Unnamed: 0,Team,G,MP,FG,FGA,FG%,2P,2PA,2P%,3P,...,TRB,AST,STL,BLK,TOV,PF,PTS,PTS/G,Team_ID,Date
0,morehead-state,36.0,,865,1978,.437,677,1413,.479,188,...,1400,464,239,139,557,657,2511,69.8,1287,2009.0
0,connecticut,36.0,7375.0,996,2102,.474,831,1618,.514,165,...,1559,567,209,280,458,471,2819,78.3,1163,2009.0
0,duke,37.0,7425.0,971,2186,.444,704,1420,.496,267,...,1348,492,311,146,454,671,2867,77.5,1181,2009.0
0,gonzaga,34.0,6850.0,967,1982,.488,716,1339,.535,251,...,1276,512,246,174,386,551,2684,78.9,1211,2009.0
0,louisiana-state,35.0,7050.0,937,2093,.448,724,1521,.476,213,...,1365,529,267,212,425,609,2617,74.8,1261,2009.0
0,maryland,35.0,7075.0,924,2179,.424,723,1575,.459,201,...,1274,511,268,145,428,582,2512,71.8,1268,2009.0
0,memphis,37.0,7425.0,979,2176,.450,757,1495,.506,222,...,1452,526,328,222,464,654,2779,75.1,1272,2009.0
0,michigan,35.0,7100.0,812,1910,.425,507,998,.508,305,...,1087,542,223,92,402,535,2341,66.9,1276,2009.0
0,north-carolina,38.0,7625.0,1205,2509,.480,941,1827,.515,264,...,1594,685,325,196,472,608,3413,89.8,1314,2009.0
0,oklahoma,36.0,7225.0,982,1997,.492,740,1308,.566,242,...,1402,521,249,156,500,616,2833,78.7,1328,2009.0


Save the dataframes locally to CSV.

In [16]:
# Write each dataframe to its own CSV file in the working directory,
# leaving the index column out of every file.
for frame, csv_name in (
    (seasonStats, 'season_data.csv'),
    (team_data, 'team_list.csv'),
    (ncaa_short, 'ncaa_short.csv'),
):
    frame.to_csv(csv_name, index=False)