# MARCH MADNESS PREDICTOR

Authors: Connor Finn, Riley Greene <br>
Date: 1/24/20 <br>
Warren Buffett is still paying $1 billion for a perfect bracket

In [11]:
# our imports for the model
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup 
import lxml.html as lh

# Configuration constants.
# start_year: first NCAA tournament season kept when ncaa_total is trimmed down to ncaa_short.
start_year = 2010

Because basketball has changed so much over the years, we believe that data prior to 2010 could be detrimental to our model. For this reason, the next block of code slims the data down from the 1985 - 2018 seasons to a dataframe including only the 2010 - 2018 NCAA tournaments.

In [12]:
# read ncaa tournament results into dataframe (data obtained from Kaggle)
ncaa_total = pd.read_csv('NCAATourneyCompactResults.csv')
# read team ID dataframe (from Kaggle)
team_data = pd.read_csv('teams.csv')

# Keep only the seasons from start_year onward. Filtering on the Season value
# (instead of slicing from the first matching row) does not depend on the file
# being sorted by season or on the frame having a default 0..n-1 index.
recent_mask = ncaa_total['Season'] >= start_year
if not recent_mask.any():
    raise ValueError("no tournament games found for seasons >= " + str(start_year))

# .copy() makes ncaa_short an independent frame, so columns can be added to it
# later without pandas raising SettingWithCopyWarning.
ncaa_short = ncaa_total.loc[recent_mask].copy()

# index label of the first retained row (kept for any code that still reads it)
start = ncaa_short.index.min()

The team_id provides names written as 'Penn State'. The data we will be collecting is from Sports-reference.com. The url for penn state's data is 'https://www.sports-reference.com/cbb/schools/penn-state/2020.html' <br>
1) the name needs to be converted to lowercase, with spaces replaced by hyphens and punctuation removed <br>
2) the overall framework is 'https://www.sports-reference.com/cbb/schools/NAME/YEAR.html'<br>

In [13]:
"""
* this function will be used to transform a provided name to one that can be input into a sports-reference url
* this will likely need to be adjusted as new teams are included
* this needs a unit test (i.e. run all our team names through once to see if they pass)
"""

# Character-level cleanup applied before lower-casing: spaces become hyphens,
# punctuation is dropped.
_SR_CHAR_RULES = (
    (' ', '-'),
    ('(', ''),
    (')', ''),
    ('.', ''),
    ("'", ''),
    ('&', ''),
)

# Ordered (old, new) rewrites applied to the lower-cased, '!'-wrapped name.
# ORDER MATTERS: specific rules must come before the generic prefix/suffix rule
# that would otherwise consume them (e.g. '!s-illinois!' before '!s-').
# A leading/trailing '!' in a pattern anchors it to the start/end of the name.
_SR_NAME_RULES = (
    ('!southern-univ!', '!southern-university!'),
    ('!w-texas-am!', '!west-texas-am!'),
    ('!armstrong-st!', '!armstrong!'),
    ('ark-', 'arkansas-'),
    ('-st!', '-state!'),
    ('!n-', '!north-'),
    ('!e-', '!eastern-'),
    ('!cs-', '!cal-state-'),
    ('!s-illinois!', '!southern-illinois!'),
    ('!s-', '!south-'),
    ('!w-', '!western-'),
    ('!nc-', '!north-carolina-'),
    ('chr!', 'christian!'),
    ('-so!', '-southern!'),
    ('!fl-', '!florida-'),
    ('!uc-', '!california-'),
    ('intl!', 'international!'),
    ('cal-state-sacramento', 'sacramento-state'),
    ('univ!', 'university!'),
    ('miss!', 'mississippi!'),
    ('-slo', ''),
    ('car!', 'carolina!'),
    ('!ne-omaha!', '!nebraska-omaha!'),
    ('!ne-', '!northeastern-'),
    ('!ut-', '!texas-'),
    ('okla-', 'oklahoma-'),
    ('!penn!', '!pennsylvania!'),
    ('!unc', '!north-carolina-'),
    ('!american-university!', '!american!'),
    ('!detroit!', '!detroit-mercy!'),
    ('!loy-', '!loyola-'),
    ('!loyola-chicago!', '!loyola-il!'),
    ('!north-kentucky!', '!northern-kentucky!'),
    ('!north-illinois!', '!northern-illinois!'),
    ('!north-colorado!', '!northern-colorado!'),
    ('!cal-baptist!', '!california-baptist!'),
    ('!augusta!', '!augusta-state!'),
    ('!etsu!', '!east-tennessee-state!'),
    ('!mt-', '!mount-'),
    ('!g-washington!', '!george-washington!'),
    ('!ga-', '!georgia-'),
    ('!il-', '!illinois-'),
    ('!houston-bap!', '!houston-baptist!'),
    ('!kennesaw!', '!kennesaw-state!'),
    ('!bowling-green!', '!bowling-green-state!'),
    ('!col-charleston!', '!college-of-charleston!'),
    ('!cent-arkansas!', '!central-arkansas!'),
    ('!central-conn!', '!central-connecticut-state!'),
    ('!kent!', '!kent-state!'),
    ('!lsu!', '!louisiana-state!'),
    ('!ms-', '!mississippi-'),
    ('!f-dickinson!', '!fairleigh-dickinson!'),
    ('!byu!', '!brigham-young!'),
    ('!ma-', '!massachusetts-'),
    ('!northwestern-la!', '!northwestern-state!'),
    ('!long-island!', '!long-island-university!'),
    ('!wi-', '!'),
    ('!c-', '!central-'),
    ('!md-e-shore!', '!maryland-eastern-shore!'),
    ('!st-johns!', '!st-johns-ny!'),
    ('!tcu!', '!texas-christian!'),
    ('!tx-', '!texas-'),
    ('!va-', '!virginia-'),
    # fixed spelling: the slug was previously 'virginia-millitary-institute'
    ('!vmi!', '!virginia-military-institute!'),
    ('!wku!', '!western-kentucky!'),
    ('!utep!', '!texas-el-paso!'),
    ('!st-marys-ca!', '!saint-marys-ca!'),
    ('!santa-barbara!', '!california-santa-barbara!'),
    ('!unlv!', '!nevada-las-vegas!'),
    ('-pa!', '!'),
    ('!uab!', '!alabama-birmingham!'),
    ('!mtsu!', '!middle-tennessee!'),
    ('!smu!', '!southern-methodist!'),
    ('!sf-austin!', '!stephen-f-austin!'),
    ('!st-josephs!', '!saint-josephs!'),
    ('!umbc!', '!maryland-baltimore-county!'),
    ('!usc!', '!southern-california!'),
    ('!st-peters!', '!saint-peters!'),
    ('!st-louis!', '!saint-louis!'),
    ('!ull!', '!louisiana-lafayette!'),
    # final tidy-up: collapse doubled hyphens left by the rewrites above
    ('--', '-'),
    ('university-of-', ''),
)


def clean_team_name(name):
    """Convert a Kaggle team name into its sports-reference.com URL slug.

    The name is wrapped in '!' sentinels so that rules can be anchored to the
    first or last character, stripped of spaces/punctuation, lower-cased, and
    then passed through the ordered rewrites in _SR_NAME_RULES.

    Parameters
    ----------
    name : str
        Team name as written in the team data, e.g. 'Penn State' or 'Kansas St'.

    Returns
    -------
    str
        The slug, e.g. 'penn-state' or 'kansas-state'.

    Notes
    -----
    New teams may need new entries in _SR_NAME_RULES; add specific rules above
    any generic rule that would match the same text.
    """
    team = '!' + name + '!'                   # sentinels mark first and last character
    for old, new in _SR_CHAR_RULES:
        team = team.replace(old, new)
    team = team.lower()
    for old, new in _SR_NAME_RULES:
        team = team.replace(old, new)
    return team[1:-1]                         # drop the sentinels again
def test_team_names(team_names):
    # team_names: list of strings 
    errors = 0
    for name in team_names:
        url_test =  "https://www.sports-reference.com/cbb/schools/" + name + "/index.html"
        page_test = requests.get(url_test)   # scrape
        soup_test = BeautifulSoup(page_test.content, "lxml")   # parse
        tables_test = soup_test.findAll('table')    # search for tables 
        if len(tables_test) != 0:
            print("                                 " , name , " Pass")
        else:
            print(name , " Fail")
            errors += 1
    print("finished unit test. There were " , errors , " errors.")
    return errors

team_data is a dataframe which has the columns: 'TeamID', 'TeamName', 'FirstD1Season', 'LastD1Season'.  We want to create a new column which has the team names in accordance with the sports-reference.com framework. 

In [6]:
# raw team names, in team_data row order
team_names = team_data.TeamName.values.tolist()
# translate each raw name into its sports-reference slug
sr_names = [clean_team_name(raw_name) for raw_name in team_names]
# validate the slugs against the site; the column is only attached when all of them resolve
num_errors = test_team_names(sr_names)
if not num_errors:
    team_data["SrNames"] = sr_names

                                  abilene-christian  Pass
                                  air-force  Pass
                                  akron  Pass
                                  alabama  Pass
                                  alabama-am  Pass
                                  alabama-state  Pass
                                  albany-ny  Pass
                                  alcorn-state  Pass
                                  alliant-international  Pass
                                  american  Pass
                                  appalachian-state  Pass
                                  arizona  Pass
                                  arizona-state  Pass
                                  arkansas-little-rock  Pass
                                  arkansas-pine-bluff  Pass


KeyboardInterrupt: 

Goal: compile season long data for the teams who competed in the 2010 - 2018 NCAA tournaments. The idea is that the season long data is the information we will have in the future for creating predictions.<br>
- in the ncaa_short, we have the year the game was played, and the id's of both teams
- in team_data we have the team id, along with the team name (cleaned for sports-reference use).

In [14]:
# 2 x n_games arrays of (season, team id) for the winner and loser of every game
winner_array = np.vstack((ncaa_short.Season.values, ncaa_short.WTeamID.values))
loser_array = np.vstack((ncaa_short.Season.values, ncaa_short.LTeamID.values))

# List of years
years = ncaa_short.Season.unique()

# Build, season by season, the unique teams that appeared in that tournament.
#   a_team_list : flat list of team ids (a team repeats once per season it played)
#   all_teams   : 2 x n array, row 0 = season, row 1 = team id
a_team_list = []
# Seeding with an empty float block keeps all_teams a float array (and a valid
# 2 x 0 array when there are no seasons), which is why later code casts with int().
team_blocks = [np.empty((2, 0))]
for year in years:
    in_season = ncaa_short.Season == year
    # pd.concat replaces Series.append, which was removed in pandas 2.0
    season_team_ids = pd.concat(
        [ncaa_short.WTeamID.loc[in_season], ncaa_short.LTeamID.loc[in_season]]
    ).unique()
    a_team_list.extend(season_team_ids.tolist())
    # builtin int replaces the np.int alias, which was removed in NumPy 1.24
    season_row = np.full(shape=len(season_team_ids), fill_value=year, dtype=int)
    team_blocks.append(np.vstack((season_row, season_team_ids)))

# stack once at the end instead of re-allocating the array on every iteration
all_teams = np.hstack(team_blocks)


In [15]:
# look up the Kaggle team name for every tournament team id
real_team_list = [
    team_data.TeamName.loc[team_data.TeamID == team_id].values.tolist()[0]
    for team_id in a_team_list
]

# convert those names to sports-reference slugs and check that each one resolves
sr_names_new = [clean_team_name(raw_name) for raw_name in real_team_list]
test_team_names(sr_names_new)


                                  arkansas-pine-bluff  Pass
                                  baylor  Pass
                                  butler  Pass
                                  brigham-young  Pass
                                  kansas  Pass
                                  kansas-state  Pass
                                  kentucky  Pass
                                  murray-state  Pass
                                  new-mexico  Pass
                                  northern-iowa  Pass
                                  ohio  Pass
                                  old-dominion  Pass
                                  saint-marys-ca  Pass
                                  tennessee  Pass
                                  villanova  Pass
                                  wake-forest  Pass
                                  washington  Pass
                                  california  Pass
                                  cornell  Pass
                              

                                  michigan-state  Pass
                                  north-carolina-state  Pass
                                  norfolk-state  Pass
                                  north-carolina  Pass
                                  ohio  Pass
                                  purdue  Pass
                                  saint-louis  Pass
                                  xavier  Pass
                                  iona  Pass
                                  mississippi-valley-state  Pass
                                  california  Pass
                                  lamar  Pass
                                  south-dakota-state  Pass
                                  nevada-las-vegas  Pass
                                  west-virginia  Pass
                                  new-mexico-state  Pass
                                  connecticut  Pass
                                  southern-mississippi  Pass
                                  dav

                                  milwaukee  Pass
                                  american  Pass
                                  weber-state  Pass
                                  nebraska  Pass
                                  louisiana-lafayette  Pass
                                  oklahoma-state  Pass
                                  north-carolina-central  Pass
                                  eastern-kentucky  Pass
                                  kansas-state  Pass
                                  george-washington  Pass
                                  duke  Pass
                                  providence  Pass
                                  virginia-commonwealth  Pass
                                  new-mexico  Pass
                                  massachusetts  Pass
                                  tulsa  Pass
                                  coastal-carolina  Pass
                                  hampton  Pass
                                  missis

                                  florida  Pass
                                  florida-state  Pass
                                  gonzaga  Pass
                                  iowa-state  Pass
                                  middle-tennessee  Pass
                                  northwestern  Pass
                                  notre-dame  Pass
                                  purdue  Pass
                                  saint-marys-ca  Pass
                                  villanova  Pass
                                  virginia  Pass
                                  west-virginia  Pass
                                  wisconsin  Pass
                                  xavier  Pass
                                  arkansas  Pass
                                  baylor  Pass
                                  cincinnati  Pass
                                  duke  Pass
                                  kansas  Pass
                                  kentucky  Pas

2

In [9]:
# inspect the tournament team names that were looked up from team_data
real_team_list


['Ark Pine Bluff',
 'Baylor',
 'Butler',
 'BYU',
 'Kansas',
 'Kansas St',
 'Kentucky',
 'Murray St',
 'New Mexico',
 'Northern Iowa',
 'Ohio',
 'Old Dominion',
 "St Mary's CA",
 'Tennessee',
 'Villanova',
 'Wake Forest',
 'Washington',
 'California',
 'Cornell',
 'Duke',
 'Georgia Tech',
 'Gonzaga',
 'Maryland',
 'Michigan St',
 'Missouri',
 'Ohio St',
 'Pittsburgh',
 'Purdue',
 'Syracuse',
 'Texas A&M',
 'West Virginia',
 'Wisconsin',
 'Xavier',
 'Winthrop',
 'Sam Houston St',
 'UTEP',
 'Florida',
 'Lehigh',
 'North Texas',
 'ETSU',
 'Vanderbilt',
 'Montana',
 'UNLV',
 'Georgetown',
 'Notre Dame',
 'Richmond',
 'San Diego St',
 'Robert Morris',
 'Texas',
 'Marquette',
 'Louisville',
 'Temple',
 'Oklahoma St',
 'Florida St',
 'Houston',
 'New Mexico St',
 'Clemson',
 'Santa Barbara',
 'Oakland',
 'Siena',
 'Vermont',
 'Utah St',
 'Morgan St',
 'Wofford',
 'Minnesota',
 'Clemson',
 'UNC Asheville',
 'UT San Antonio',
 'VA Commonwealth',
 'Butler',
 'BYU',
 'Cincinnati',
 'Connecticut',


We now want to build a dataframe which has a list of the teams and their season long statistics. <br> 
all_teams is now a 2xn array of unique teams (year, school_id). We will gain access to the team name using: <br>
- team_name = team_data.SrNames.loc[(team_data.TeamID == INSERT TEAM ID HERE)].values.tolist()[0]

In [175]:
# this will collect the season data for every team we selected in the list, for the number of indicated years
seasonStats = pd.DataFrame()   # start with an empty dataframe

for i in range(all_teams.shape[1]):   # this is the years 
	team_name = team_data.SrNames.loc[(team_data.TeamID == all_teams[1 , i])].values.tolist()[0] # get the team Name
	url3 = "https://www.sports-reference.com/cbb/schools/" + str(team_name) + "/" + str(int(all_teams[0 , i])) + ".html" # season data
	pageSe = requests.get(url3)   # scrape
	soupSe = BeautifulSoup(pageSe.content, "lxml")   # parse
	tablesSe = soupSe.findAll('table')    # search for tables 
	print(tablesSe)
	# convert to a dataframe and label the data
	print("Team number " ,  i)   # this shows progress
	dfSe = pd.read_html(str(tablesSe[1]))[0]      # select the table of interest into a pandas dataframe
	dfSe.drop([1 , 2, 3] , inplace=True)	      # only want the team data. (although might consider fouls against too)
	dfSe = dfSe.replace('Team' , powerSixTeams[j])   # want the name of the team 
	numRows = len(dfSe.index) # get the number of rows...... should just be one here
	dfSe['Team_ID'] = powerSixTeamID[j]        # put in the team ID's
	dfSe['Conf_ID'] = powerSixTeamsAndConf[j][1]        # put in the conference ID's  
	dfSe['Conf'] = powerSixConf[powerSixTeamsAndConf[j][1] - 1]       # put in the team's conference
	currYear = [i] * numRows  ## get the correct number 
	dfSe['Date'] = currYear   # only want the year in this column
	seasonStats = seasonStats.append(dfSe , ignore_index=True) # add this to the season stats empty dataframe we started with 
	seasonStats = seasonStats.rename(columns={'Unnamed: 0': 'Team'})  # simply clean up the columns 
    
    

https://www.sports-reference.com/cbb/schools/ark-pine-bluff/2010.html
[]
Team number  0


IndexError: list index out of range

We are interested in using a regression ML model. To do this, we need a continuous target (i.e. not just win or lose), so we will add a ScoreDiff column: the winning score minus the losing score.

In [None]:
# Regression target: margin of victory (winner's score minus loser's score).
# .assign builds a new frame, so the column is not written onto a slice of
# ncaa_total (which would raise SettingWithCopyWarning).
ncaa_short = ncaa_short.assign(ScoreDiff=ncaa_short['WScore'] - ncaa_short['LScore'])

0          Abilene Chr
1            Air Force
2                Akron
3              Alabama
4          Alabama A&M
5           Alabama St
6            Albany NY
7            Alcorn St
8         Alliant Intl
9        American Univ
10      Appalachian St
11             Arizona
12          Arizona St
13     Ark Little Rock
14      Ark Pine Bluff
15            Arkansas
16         Arkansas St
17        Armstrong St
18                Army
19              Auburn
20             Augusta
21         Austin Peay
22             Ball St
23              Baylor
24             Belmont
25     Bethune-Cookman
26          Binghamton
27       Birmingham So
28            Boise St
29      Boston College
            ...       
336          Villanova
337           Virginia
338      Virginia Tech
339                VMI
340         W Carolina
341         W Illinois
342                WKU
343         W Michigan
344         W Salem St
345        W Texas A&M
346             Wagner
347        Wake Forest
348        

In [207]:
# scratch value for checking the [1:-1] slice that clean_team_name uses to strip its '!' sentinels
v = 'dog'


In [208]:
# display the scratch string
v

'dog'

In [211]:
# [1:-1] drops the first and last character
v[1:-1]

'o'

In [7]:
# inspect the trimmed tournament results
ncaa_short


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
1584,2010,134,1115,61,1457,44,N,0
1585,2010,136,1124,68,1358,59,N,0
1586,2010,136,1139,77,1431,59,N,0
1587,2010,136,1140,99,1196,92,N,2
1588,2010,136,1242,90,1250,74,N,0
1589,2010,136,1243,82,1317,62,N,0
1590,2010,136,1246,100,1190,71,N,0
1591,2010,136,1293,66,1435,65,N,0
1592,2010,136,1307,62,1285,57,N,0
1593,2010,136,1320,69,1424,66,N,0
