# <center>NCAA 2017 Basketball Predictive Model Creation</center>

This notebook loads <b>statsdict</b> from the pickle file created by <i>NCAAModelSetup.ipynb</i> and uses it to build a predictive model that estimates the win probabilities of future NCAA matchups. The model uses Logistic Regression and is trained on two collections, <b>X_train</b> and <b>y_train</b>, which are respectively our features and labels. The features are the average stats of the two teams from their previous games, and the labels are either <b>1</b> or <b>0</b>: <b>1</b> indicates the first team won, while <b>0</b> indicates the second team won. Afterwards, a submission file compatible with Kaggle is created.

In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
import pickle
from sklearn import linear_model, metrics, neural_network

In [3]:
# Load the detailed box scores and tag every row with its game type
# ('S' = regular season, 'T' = tournament) so the origin survives the merge below.
regseasons_d = pd.read_csv("2017_Data/RegularSeasonDetailedResults.csv").assign(GameType='S')
tourney_d = pd.read_csv("2017_Data/TourneyDetailedResults.csv").assign(GameType='T')
teams = pd.read_csv("2017_Data/Teams.csv")

# Single table of all games, ordered by season and then by day number.
games_d = pd.concat([regseasons_d, tourney_d]).sort_values(['Season', 'Daynum'])

In [4]:
# Box-score column suffixes; every one exists twice in the data, once with a
# 'W' (winning team) prefix and once with an 'L' (losing team) prefix.
_base_stats = ['score', 'fgm', 'fga', 'fgm3', 'fga3', 'ftm', 'fta', 'or', 'dr',
               'ast', 'to', 'stl', 'blk', 'pf', 'fgp', 'fgp3', 'ftp']

# Alphabetical order puts all 'L*' labels first and all 'W*' labels second.
stats = sorted(prefix + suffix for prefix in ('W', 'L') for suffix in _base_stats)

In [5]:
# Split the sorted labels down the middle: the 'L*' names occupy the first half,
# the 'W*' names the second half.
half = len(stats) // 2
team1_stats = stats[half:]  # Team 1 is the winning team ('W' labels)
team2_stats = stats[:half]  # Team 2 is the losing team ('L' labels)

In [6]:
# Display the stat labels for each team, followed by the length of that same list
# (previously each list was printed next to the other list's length).
print(team1_stats, len(team1_stats))
print(team2_stats, len(team2_stats))

['Wast', 'Wblk', 'Wdr', 'Wfga', 'Wfga3', 'Wfgm', 'Wfgm3', 'Wfgp', 'Wfgp3', 'Wfta', 'Wftm', 'Wftp', 'Wor', 'Wpf', 'Wscore', 'Wstl', 'Wto'] 17
['Last', 'Lblk', 'Ldr', 'Lfga', 'Lfga3', 'Lfgm', 'Lfgm3', 'Lfgp', 'Lfgp3', 'Lfta', 'Lftm', 'Lftp', 'Lor', 'Lpf', 'Lscore', 'Lstl', 'Lto'] 17


#### Loading Our Stats Dictionary

In [332]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# statsdict.p that you produced yourself (e.g. with NCAAModelSetup.ipynb).
# The context manager guarantees the file handle is closed after loading.
with open('statsdict.p', 'rb') as stats_file:
    statsdict = pickle.load(stats_file)

#### Elo Functions

In [333]:
def get_K(wteam_elo, lteam_elo, wteam_score, lteam_score):
    """Return the Elo K-factor for a game, scaled by the margin of victory.

    Parameters:
        wteam_elo, lteam_elo: accepted for interface symmetry; not used here.
        wteam_score, lteam_score: final scores of the winning and losing team.

    Returns:
        30 when the margin exceeds 25 points, 25 when it exceeds 15 points,
        and 20 otherwise.
    """
    margin = wteam_score - lteam_score
    # Thresholds are checked from the largest margin down; first match wins.
    for threshold, k_factor in ((25, 30), (15, 25)):
        if margin > threshold:
            return k_factor
    return 20

def new_elo(wteam_elo=1600, lteam_elo=1600, wteam_score=80, lteam_score=80):
    """Compute both teams' Elo ratings after a game.

    Parameters:
        wteam_elo, lteam_elo: pre-game ratings of the winner and the loser.
        wteam_score, lteam_score: final scores, used only to pick the K-factor.

    Returns:
        (winner_rating, loser_rating) after the update; the winner gains exactly
        the number of points the loser gives up.
    """
    rating_gap = wteam_elo - lteam_elo
    # Winner's expected win probability from the rating gap (400-point scale).
    expected_win = 1 / (10**(-rating_gap/400) + 1)
    k_factor = get_K(wteam_elo, lteam_elo, wteam_score, lteam_score)
    # The less expected the win, the larger the (rounded) transfer of points.
    points_moved = round(k_factor * (1-expected_win))
    return (wteam_elo + points_moved, lteam_elo - points_moved)

def get_elo(season, teamnum, game_day=-1):
    """Look up a team's Elo rating in the global `statsdict`.

    Parameters:
        season: season of interest.
        teamnum: team id.
        game_day: day number of a game the team played that season; a negative
            value (the default) selects the last stored rating of the season.

    Returns:
        - the stored rating for `game_day` (or the last stored one) when the
          team has an 'Elo' list for `season`;
        - otherwise 1/4 of 1600 plus 3/4 of the previous season's last rating
          when one exists;
        - otherwise 1600.
    """
    this_key = (season, teamnum)
    if this_key in statsdict and 'Elo' in statsdict[this_key]:
        team_record = statsdict[this_key]
        if game_day < 0:
            return team_record['Elo'][-1]
        # GameDict maps a day number to the position used to index the Elo list.
        game_index = team_record['GameDict'][game_day]
        return team_record['Elo'][game_index]

    prev_key = (season - 1, teamnum)
    if prev_key in statsdict and 'Elo' in statsdict[prev_key]:
        # Pull last season's final rating a quarter of the way back toward 1600.
        return (1600 * (1/4)) + (3/4) * statsdict[prev_key]['Elo'][-1]

    return 1600

#### Setting Up a Stat Average Function

In [334]:
def get_stat_average(stat, season, teamnum, gamenum=-1, ngames=4):
    """Average a team's stat over the games immediately before game `gamenum`.

    Parameters:
        stat: key of the stat list inside statsdict[season, teamnum].
        season, teamnum: identify the team-season entry in the global statsdict.
        gamenum: number of the upcoming game (looks 1-based: 1 means nothing has
            been played yet - TODO confirm against NCAAModelSetup). A negative
            value means "after the last recorded game".
        ngames: maximum number of preceding games included in the average.

    Returns:
        The NaN-ignoring mean of the selected window, or -1 when there is no
        previous game or the window contains no usable values.
    """
    if gamenum < 0:
        # No game given: position ourselves one past the last recorded entry.
        gamenum = len(statsdict[season, teamnum][stat]) + 1
    if gamenum == 1:
        return -1
    window_start = max(0, gamenum - ngames - 1)
    window = statsdict[season, teamnum][stat][window_start:gamenum - 1]
    avg = np.nanmean(window)
    return -1 if np.isnan(avg) else avg

## Creating Training and Test Data

In [383]:
def _team_feature_row(season, teamnum, daynum, stat_labels, ngames=7):
    """Build one team's pre-game feature list for the game played on `daynum`.

    Returns (features, has_gap): `features` holds the rolling average of every
    stat in `stat_labels` (the leading 'W'/'L' is stripped before lookup)
    followed by the team's Elo for that day; `has_gap` is True when any average
    came back negative, i.e. get_stat_average had nothing to average.
    """
    game_number = statsdict[season, teamnum]['GameDict'][daynum]
    averages = [get_stat_average(label[1:], season, teamnum, game_number, ngames=ngames)
                for label in stat_labels]
    has_gap = any(value < 0 for value in averages)
    averages.append(get_elo(season, teamnum, daynum))
    return averages, has_gap


X = []
y = []
tourney_indexes = []

training_games = games_d[(games_d.Season <= 2015) & (games_d.Season >= 2003)]
for _, game in training_games.iterrows():
    season = game['Season']
    winner_row, winner_gap = _team_feature_row(season, game.Wteam, game.Daynum, team1_stats)
    loser_row, loser_gap = _team_feature_row(season, game.Lteam, game.Daynum, team2_stats)

    # Drop games where either team has no usable history yet.
    if winner_gap or loser_gap:
        continue

    # Remember which rows of X come from tournament games (two rows per game).
    if game.GameType == 'T':
        tourney_indexes.extend((len(X), len(X) + 1))

    # Every game is added twice, once per team ordering, so the model sees
    # both label values: 1 = first team won, 0 = second team won.
    X.append(winner_row + loser_row)
    X.append(loser_row + winner_row)
    y.extend((1, 0))



In [384]:
# Hold out the most recent tournament rows as the test set. Every kept game
# contributes two rows to X, so 69*4 = 276 rows corresponds to the 138 most
# recent tournament games that survived the filtering above.
tourney_indexes2 = tourney_indexes[-(69*4):]

# Set membership keeps the split linear in len(X) instead of scanning the
# held-out list once per row.
held_out = set(tourney_indexes2)

# NOTE(review): despite its name, season_indexes is simply every row that is not
# held out, so it also contains the earlier tournament games.
season_indexes = [i for i in range(len(X)) if i not in held_out]

X_train = [X[i] for i in season_indexes]
X_test = [X[i] for i in tourney_indexes2]
y_train = [y[i] for i in season_indexes]
y_test = [y[i] for i in tourney_indexes2]

## Creating A Predictive Model

#### Fitting Training Data

In [385]:
# Model 1: logistic regression with scikit-learn's default settings,
# fitted on the training rows (label 1 = first team won, 0 = second team won).
model = linear_model.LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [386]:
# Model 2: the cross-validated variant of logistic regression, with the
# solver's iteration cap raised to 300; fitted on the same training rows.
model2 = linear_model.LogisticRegressionCV(max_iter = 300)
model2.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=300,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

#### Scoring Test Data

In [388]:
# Model 1: accuracy and log loss on the held-out test rows
model_accuracy = model.score(X_test, y_test)
print('Score', model_accuracy)
# Column 1 of predict_proba is the probability of label 1 (first team won).
model_win_probs = model.predict_proba(X_test)[:, 1]
print('Log Loss:', metrics.log_loss(y_test, model_win_probs))

Score 0.826086956522
Log Loss: 0.418535822257


In [387]:
# Model 2: accuracy and log loss on the held-out test rows
model2_accuracy = model2.score(X_test, y_test)
print('Score', model2_accuracy)
# Column 1 of predict_proba is the probability of label 1 (first team won).
model2_win_probs = model2.predict_proba(X_test)[:, 1]
print('Log Loss:', metrics.log_loss(y_test, model2_win_probs))


Score 0.826086956522
Log Loss: 0.41690919678


#### Exporting Model to p file for Later use

In [389]:
# Persist the fitted model; the context manager flushes and closes the file
# even if pickling fails part-way.
# NOTE(review): this exports `model`, while the submission cell predicts with
# `model2` - confirm which one is meant to be saved.
with open('model2017.p', 'wb') as model_file:
    pickle.dump(model, model_file)

#### Creating Submission File

In [390]:
def get_last_gamenum(season, teamnum, game_day=-1):
    """Return the game number of the team's first game on or after `game_day`.

    The day numbers stored as keys of statsdict[season, teamnum]['GameDict']
    are scanned in ascending order. If the team has no game on or after
    `game_day`, the game number of its latest recorded day is returned instead.

    NOTE(review): with the default game_day=-1 the earliest recorded day is
    selected (assuming day numbers are non-negative), which does not match the
    function's name - confirm the intended default.
    """
    game_dict = statsdict[season, teamnum]['GameDict']
    ordered_days = sorted(game_dict)
    on_or_after = [day for day in ordered_days if day >= game_day]
    chosen_day = on_or_after[0] if on_or_after else ordered_days[-1]
    return game_dict[chosen_day]

In [391]:
def _matchup_team_features(season, teamnum, stat_labels, cutoff_day=134, ngames=7):
    """Build one team's feature list for a hypothetical tournament matchup.

    Parameters:
        season, teamnum: identify the team-season.
        stat_labels: 'W'/'L'-prefixed stat names; the prefix is stripped before
            the stat is looked up.
        cutoff_day: day number handed to get_last_gamenum (134 here - presumably
            the first tournament day; TODO confirm).
        ngames: size of the rolling-average window.

    Returns:
        The rolling average of every stat followed by the team's Elo rating, in
        the same layout as one half of a training row.

    NOTE(review): get_elo is called without a game day, so it returns the most
    recent rating stored for the season, whereas the stat averages are cut off
    at `cutoff_day` - confirm this is intended for seasons whose tournament has
    already been played.
    """
    # The game number depends only on the team, so resolve it once rather than
    # once per stat.
    gamenum = get_last_gamenum(season, teamnum, cutoff_day)
    features = [get_stat_average(label[1:], season, teamnum, gamenum=gamenum, ngames=ngames)
                for label in stat_labels]
    features.append(get_elo(season, teamnum))
    return features


teams = pd.read_csv('2017_Data/Teams.csv')
t_seeds = pd.read_csv('2017_Data/TourneySeeds.csv')

# Every unordered pair of seeded teams within a season, as (team1, team2, season).
arr = []
for season in [2016, 2017]:
    season_seeds = t_seeds[t_seeds.Season == season]
    for index, seed1 in season_seeds.iterrows():
        # Pair each team only with the teams listed after it, so a pair appears once.
        for index2, seed2 in season_seeds.loc[index+1:].iterrows():
            arr.append((seed1.Team, seed2.Team, season))

# One entry per matchup: [team1, team2, predict_proba output, season].
submission = []
for team1, team2, season in arr:
    X_matchup = (_matchup_team_features(season, team1, team1_stats)
                 + _matchup_team_features(season, team2, team2_stats))
    submission.append([team1, team2, model2.predict_proba([X_matchup]), season])


In [392]:
# Sanity check: number of (team1, team2, season) matchups that were scored.
print('Size of Matchup:', len(submission))

Size of Matchup: 4556


In [393]:
import csv

# newline='' is what the csv module expects for files it writes; without it,
# platforms that translate line endings produce a blank line after every row.
with open('Submission_2017.csv', 'w', newline='') as csvfile:
    fieldnames = ['Id', 'Pred']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for team1, team2, proba, season in submission:
        # predict_proba yields one row per sample with one column per label:
        # column 0 -> label 0 (second team won), column 1 -> label 1 (first team won).
        prob_team2_wins = proba[0][0]
        prob_team1_wins = proba[0][1]
        low_id, high_id = min(team1, team2), max(team1, team2)
        # The Id lists the lower team id first, so report that team's win probability.
        pred = prob_team2_wins if team1 > team2 else prob_team1_wins
        writer.writerow({'Id': '_'.join(str(part) for part in (season, low_id, high_id)),
                         'Pred': pred})