# <center>NCAA 2017 Basketball Model Setup</center>

This notebook reads the regular-season data and the tournament data and stores the information in an easy-to-use multilevel dictionary. The first level is indexed by a tuple of <i><b>(Season, Team ID)</b></i>. The value it returns is another dictionary, which maps a given <b>Stat Label</b> to a list that records that stat for every game. This dictionary also has an additional key called <b>'GameDict'</b>, which returns another dictionary that maps the day on which the team played to the number of that game within the team's season. This is useful when we want a stat from a specific game based on a given <b>Daynum</b>. At the end, the dictionary is written to a <i>'statsdict.p'</i> file using pickle so that it can be used in notebooks and programs other than this one.

In [1]:
from matplotlib import pyplot as ply
import numpy as np
import pandas as pd
from IPython.display import display

In [2]:
# Load the detailed per-game results: one frame for the regular season and
# one for the tournament. Paths are relative to the notebook's working directory.
regseasons_d = pd.read_csv("2017_Data/RegularSeasonDetailedResults.csv")
tourney_d = pd.read_csv("2017_Data/TourneyDetailedResults.csv")

In [4]:
# Preview the first rows of each frame to confirm that both files loaded
display(regseasons_d.head())
display(tourney_d.head())


Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [7]:
# Tag every row with its origin so the two kinds of game can still be told
# apart after the frames are combined: 'S' = regular season, 'T' = tournament
regseasons_d['GameType'] = 'S'
tourney_d['GameType'] = 'T'

In [8]:
# Concatenate the regular-season and tournament data into a new data frame
# called 'games_d', ordered by season and then by day within the season
games_d = pd.concat([regseasons_d, tourney_d]).sort_values(['Season', 'Daynum'])

In [9]:
# Display the late-season rows (Daynum > 131) of 2003 and 2004 to make sure the
# regular-season and tournament games are properly sorted by Season and Daynum.
# The row limit is raised so the slice is not truncated; the trailing 60
# appears to be the earlier setting.
pd.options.display.max_rows=400#60
games_d[((games_d.Season == 2003)|(games_d.Season ==2004)) & (games_d.Daynum>131)]

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf,GameType
4612,2003,132,1181,84,1301,77,N,0,24,56,...,20,26,12,25,13,15,6,3,26,S
4613,2003,132,1228,72,1326,59,N,0,26,59,...,11,14,11,25,13,11,4,3,17,S
4614,2003,132,1246,64,1280,57,N,0,22,44,...,9,14,14,19,9,14,11,2,19,S
4615,2003,132,1328,49,1281,47,N,0,18,48,...,1,2,8,25,8,13,2,2,11,S
0,2003,134,1421,92,1411,84,N,1,32,69,...,14,31,17,28,16,15,5,0,22,T
1,2003,136,1112,80,1436,51,N,0,31,66,...,7,7,8,26,12,17,10,3,15,T
2,2003,136,1113,84,1272,71,N,0,31,59,...,14,21,20,22,11,12,2,5,18,T
3,2003,136,1141,79,1166,73,N,0,29,53,...,12,17,14,17,20,21,6,6,21,T
4,2003,136,1143,76,1301,74,N,1,27,64,...,15,20,10,26,16,14,5,8,19,T
5,2003,136,1163,58,1140,53,N,0,17,52,...,11,13,15,26,11,11,8,4,22,T


## START PROGRAM HERE

#### Calculating and Extending Data Frame for Free Throw and Field Goal Percentage

In [10]:
# Shooting percentages (made / attempted) for the winning (W) and losing (L)
# side of every game, computed column-wise over the whole frame.

# Field goal percentage
Wfgp = games_d['Wfgm'] / games_d['Wfga']
Lfgp = games_d['Lfgm'] / games_d['Lfga']

# 3-point percentage
Wfgp3 = games_d['Wfgm3'] / games_d['Wfga3']
Lfgp3 = games_d['Lfgm3'] / games_d['Lfga3']

# Free-throw percentage
Wftp = games_d['Wftm'] / games_d['Wfta']
Lftp = games_d['Lftm'] / games_d['Lfta']

In [11]:
# Attach the percentage Series to games_d as new columns. The order below
# (field goal, 3-point, free throw; winner before loser) is the order in which
# the columns are appended to the frame.
for _label, _series in (
    ('Wfgp', Wfgp),
    ('Lfgp', Lfgp),
    ('Wfgp3', Wfgp3),
    ('Lfgp3', Lfgp3),
    ('Wftp', Wftp),
    ('Lftp', Lftp),
):
    games_d[_label] = _series

#### Creating list of Stat Labels For Wteam and Lteam

In [12]:
# List every column now present in games_d so the relevant stat labels can be
# picked out for the program
print(games_d.columns)

Index(['Season', 'Daynum', 'Wteam', 'Wscore', 'Lteam', 'Lscore', 'Wloc',
       'Numot', 'Wfgm', 'Wfga', 'Wfgm3', 'Wfga3', 'Wftm', 'Wfta', 'Wor', 'Wdr',
       'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf', 'Lfgm', 'Lfga', 'Lfgm3', 'Lfga3',
       'Lftm', 'Lfta', 'Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf',
       'GameType', 'Wfgp', 'Lfgp', 'Wfgp3', 'Lfgp3', 'Wftp', 'Lftp'],
      dtype='object')


In [13]:
# Every per-game stat column to record, in alphabetical order. Because 'L'
# sorts before 'W', all losing-team labels come first and all winning-team
# labels second.
stats = sorted([
    # winning team
    'Wscore', 'Wfgm', 'Wfga', 'Wfgm3', 'Wfga3', 'Wftm', 'Wfta',
    'Wor', 'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf',
    # losing team
    'Lscore', 'Lfgm', 'Lfga', 'Lfgm3', 'Lfga3', 'Lftm', 'Lfta',
    'Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf',
    # derived shooting percentages
    'Wfgp', 'Lfgp', 'Wfgp3', 'Lfgp3', 'Wftp', 'Lftp',
])

In [14]:
# Setting the stat labels for each respective team.
# The labels are selected by their 'W'/'L' prefix rather than by cutting the
# sorted list in half, so the split stays correct even if the two sides ever
# end up with a different number of labels. The order of `stats` is preserved.
team1_stats = [label for label in stats if label.startswith('W')]  # Team 1 is the Winning Team
team2_stats = [label for label in stats if label.startswith('L')]  # Team 2 is the Losing Team

In [15]:
# Display the stat labels for each team, followed by the length of that same
# label list (each list is paired with its own length)
print(team1_stats, len(team1_stats))
print(team2_stats, len(team2_stats))

['Wast', 'Wblk', 'Wdr', 'Wfga', 'Wfga3', 'Wfgm', 'Wfgm3', 'Wfgp', 'Wfgp3', 'Wfta', 'Wftm', 'Wftp', 'Wor', 'Wpf', 'Wscore', 'Wstl', 'Wto'] 17
['Last', 'Lblk', 'Ldr', 'Lfga', 'Lfga3', 'Lfgm', 'Lfgm3', 'Lfgp', 'Lfgp3', 'Lfta', 'Lftm', 'Lftp', 'Lor', 'Lpf', 'Lscore', 'Lstl', 'Lto'] 17


#### Initialize an Empty Stats Dictionary

In [32]:
# Module-level registry keyed by (Season, Team ID). It is filled in by
# add_stat() and read by get_elo() and has_team().
statsdict = {}

#### Creating Elo Functions

In [43]:
def get_K(wteam_elo, lteam_elo, wteam_score, lteam_score):
    """Pick the Elo K-factor for a game from its margin of victory.

    Parameters:
        wteam_elo: winner's rating (accepted but not used).
        lteam_elo: loser's rating (accepted but not used).
        wteam_score: points scored by the winning team.
        lteam_score: points scored by the losing team.

    Returns:
        40 when the margin is above 30 points, 30 when it is above 15,
        and 20 otherwise.
    """
    margin = wteam_score - lteam_score
    if margin > 30:
        k_factor = 40
    elif margin > 15:
        k_factor = 30
    else:
        k_factor = 20
    return k_factor

def new_elo(wteam_elo=1600, lteam_elo=1600, wteam_score=80, lteam_score=80, game_type='S'):
    """Return the updated (winner, loser) Elo ratings after one game.

    Parameters:
        wteam_elo: winner's rating before the game.
        lteam_elo: loser's rating before the game.
        wteam_score: points scored by the winner (used to choose K).
        lteam_score: points scored by the loser (used to choose K).
        game_type: 'T' scales K by 1.5; any other value leaves K as is.

    Returns:
        A tuple ``(wteam_elo + delta, lteam_elo - delta)`` where ``delta`` is
        ``round(K * (1 - expected))`` and ``expected`` is the logistic
        expectation for the winner based on the rating gap.
    """
    rating_gap = wteam_elo - lteam_elo
    expected = 1 / (10**(-rating_gap/400) + 1)

    k_factor = get_K(wteam_elo, lteam_elo, wteam_score, lteam_score)
    if game_type == 'T':
        k_factor *= 1.5

    # The same number of points is added to the winner and taken from the loser
    delta = round(k_factor * (1 - expected))
    return (wteam_elo + delta, lteam_elo - delta)

def get_elo(season, teamnum, game_day=-1):
    """Look up a team's Elo rating in the global ``statsdict``.

    Parameters:
        season: season half of the ``statsdict`` key.
        teamnum: team-id half of the ``statsdict`` key.
        game_day: day number of a game the team played; a negative value
            (the default) asks for the most recent rating.

    Returns:
        * If the team has an 'Elo' list for ``season``: its last entry when
          ``game_day`` is negative, otherwise the entry at the position that
          the team's 'GameDict' stores for ``game_day``.
        * Otherwise, if the team has an 'Elo' list for ``season - 1``: one
          quarter of 1600 plus three quarters of that season's last rating
          (not rounded).
        * Otherwise 1600.

    Raises:
        KeyError: if ``game_day`` is non-negative and not in the team's
            'GameDict' for ``season``.
    """
    this_key = (season, teamnum)
    if this_key in statsdict and 'Elo' in statsdict[this_key]:
        record = statsdict[this_key]
        if game_day < 0:
            return record['Elo'][-1]
        game_number = record['GameDict'][game_day]
        return record['Elo'][game_number]

    last_key = (season-1, teamnum)
    if last_key in statsdict and 'Elo' in statsdict[last_key]:
        # Carry last season's final rating forward, pulled a quarter of the way to 1600
        return (1600*(1/4)) + (3/4)*statsdict[last_key]['Elo'][-1]

    return 1600

#### Creating Functions to Add and Get Stats

In [42]:
def has_team(stat, season, teamnum):
    """Return True if ``statsdict`` holds an entry for (season, teamnum) that contains ``stat``."""
    key = (season, teamnum)
    if key not in statsdict:
        return False
    return stat in statsdict[key]

def add_stat(stat, stat_value, season, teamnum, daynum):
    """Append one value of ``stat`` to a team's season record in ``statsdict``.

    The first time a (season, teamnum) pair is seen, its entry is created with
    an empty 'GameDict' and an 'Elo' list seeded with a single starting
    rating: one quarter of 1600 plus three quarters of
    ``get_elo(season - 1, teamnum)``, rounded.

    Parameters:
        stat: label of the list to extend (for example 'Elo').
        stat_value: value to append to that list.
        season: season half of the ``statsdict`` key.
        teamnum: team-id half of the ``statsdict`` key.
        daynum: accepted but not used by this function.

    Returns:
        None. The global ``statsdict`` is modified in place.
    """
    key = (season, teamnum)
    if key not in statsdict:
        seed_elo = round(1600*(1/4) + get_elo(season-1, teamnum)*(3/4))
        statsdict[key] = {'GameDict': {}, 'Elo': [seed_elo]}
    # Appending to the existing list (created on demand) instead of assigning a
    # fresh one keeps the seed at Elo[0] even when 'Elo' is the first stat
    # recorded for a new team; get_elo() indexes 'Elo' by game number and
    # relies on that leading entry.
    statsdict[key].setdefault(stat, []).append(stat_value)



#### Creating our Stats Dictionary
Here we iterate through the games DataFrame so that we can build our data structure game by game. Adjust `game_filter` if you don't want to iterate through, or build from, all of the games.

In [44]:
# Start from an empty registry: add_stat() appends to the global statsdict, so
# re-running this cell without the reset would record every game twice.
statsdict = {}
# Seasons to include; narrow the bounds to build from a subset of the games.
game_filter = (games_d.Season<=2017)&(games_d.Season>=2003)
for index, row in games_d[game_filter].iterrows():
    #Building Base Stats from the Winning Team/ Team 1
    # stat[1:] drops the leading 'W'/'L' so both sides are stored under the same labels
    for stat in team1_stats:
        add_stat(stat[1:],row[stat], row['Season'], row['Wteam'], row['Daynum'])
    #Building Base Stats from the Losing Team/ Team 2
    for stat in team2_stats:
        add_stat(stat[1:],row[stat], row['Season'], row['Lteam'], row['Daynum'])
    #Create Dictionary to Map Daynum to the Game Number of the Team in that Season
    # n1/n2 are list lengths taken AFTER this game was appended, i.e. a 1-based
    # game count. Because add_stat() seeds 'Elo' with one starting entry,
    # Elo[n] is the rating after this game, while the per-game stat lists hold
    # this game at index n-1.
    n1 = len(statsdict[row['Season'], row['Wteam']][team1_stats[0][1:]])
    n2 = len(statsdict[row['Season'], row['Lteam']][team2_stats[0][1:]])
    statsdict[row['Season'], row['Wteam']]['GameDict'][row['Daynum']] = n1
    statsdict[row['Season'], row['Lteam']]['GameDict'][row['Daynum']] = n2
    #Create Elo Scores For Teams
    # Both current ratings are read before either one is updated, then the new
    # ratings are appended to each team's 'Elo' list.
    old_elo1 = get_elo(row['Season'], row['Wteam'])
    old_elo2 = get_elo(row['Season'], row['Lteam'])
    new_elo1, new_elo2 = new_elo(old_elo1, old_elo2, row['Wscore'], row['Lscore'], game_type=row.GameType)
    add_stat('Elo', new_elo1, row['Season'], row['Wteam'], row['Daynum'])
    add_stat('Elo', new_elo2, row['Season'], row['Lteam'], row['Daynum'])
    

#### Exporting our Dictionary to p File for Later/Independent Use

In [55]:
import pickle

# Write the finished dictionary to 'statsdict.p'. The context manager closes
# the file as soon as the dump completes (or fails), so the data is flushed to
# disk and no file handle is left open in the notebook kernel.
with open("statsdict.p", "wb") as outfile:
    pickle.dump(statsdict, outfile)