## Overview ##

This is a notebook derived from a starter notebook provided by Kaggle. The starter notebook was inspired by last year's [Logistic Regression on Tournament Seeds by Kasper P. Lauritzen](https://www.kaggle.com/kplauritzen/notebookde27b18258?scriptVersionId=804590) starter kernel. It created a basic logistic regression model based on the seed differences between teams. 

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV

from subprocess import check_output
# Show which input files are available in the data directory.
data_listing = check_output(["ls", "DataFiles"])
print(data_listing.decode("utf8"))


Cities.csv
ConferenceTourneyGames.csv
Conferences.csv
GameCities.csv
NCAATourneyCompactResults.csv
NCAATourneyDetailedResults.csv
NCAATourneySeedRoundSlots.csv
NCAATourneySeeds.csv
NCAATourneySlots.csv
RegularSeasonCompactResults.csv
RegularSeasonDetailedResults.csv
SampleSubmissionStage1.csv
SampleSubmissionStage2.csv
SampleSubmissionStage2_SampleTourney2018.csv
Seasons.csv
SecondaryTourneyCompactResults.csv
SecondaryTourneyTeams.csv
TeamCoaches.csv
TeamConferences.csv
TeamSpellings.csv
Teams.csv
Teams.xlsx



## Load the training data ##

In [2]:
# Directory (relative to the notebook) that holds the competition CSV files.
data_dir = 'DataFiles/'

# Read the four tables used below, in order: teams, tournament seeds,
# tournament results and regular-season results.
df_teams, df_seeds, df_tourn, df_season = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in (
        'Teams.csv',
        'NCAATourneySeeds.csv',
        'NCAATourneyCompactResults.csv',
        'RegularSeasonCompactResults.csv',
    )
)

In [3]:
# Preview the first five rows of the teams table.
df_teams.head(n=5)

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2018
1,1102,Air Force,1985,2018
2,1103,Akron,1985,2018
3,1104,Alabama,1985,2018
4,1105,Alabama A&M,2000,2018


In [5]:
# Preview the first five rows of the tournament seeds table.
df_seeds.head(n=5)

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [6]:
# Preview the first five rows of the tournament results table.
df_tourn.head(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [7]:
# Preview the first five rows of the regular-season results table.
df_season.head(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


## Clean up the data ##

Determine regular-season data for each team.

In [8]:
def get_season_games(season, team, df):
    """Split one team's games in one season into wins and losses.

    Args:
        season: Value matched against the 'Season' column.
        team: Team ID matched against 'WTeamID' (wins) and 'LTeamID' (losses).
        df: Game results frame with 'Season', 'WTeamID' and 'LTeamID' columns.

    Returns:
        A ``(winning_games, losing_games)`` pair of DataFrames holding the
        rows of ``df`` that the team won and lost in that season.
    """
    in_season = df['Season'] == season
    team_won = df['WTeamID'] == team
    team_lost = df['LTeamID'] == team
    return df.loc[team_won & in_season], df.loc[team_lost & in_season]

def get_win_percentage(season, team, df):
    """Return the share of games a team won in one season.

    Despite the name, the result is a fraction between 0.0 and 1.0; the
    caller scales it by 100 to obtain a percentage.

    Args:
        season: Season to evaluate.
        team: Team ID to evaluate.
        df: Game results frame accepted by ``get_season_games``.

    Returns:
        ``wins / (wins + losses)`` as a float, or ``nan`` when the team has
        no games in ``df`` for that season (previously this case raised
        ``ZeroDivisionError``).
    """
    winning_games, losing_games = get_season_games(season, team, df)
    wins = len(winning_games)
    games_played = wins + len(losing_games)
    if games_played == 0:
        # No games on record: the ratio is undefined, so report "missing"
        # instead of crashing the whole row-wise apply.
        return float('nan')
    return wins / games_played

def get_points(season, team, df):
    """Return the total points a team scored in one season.

    Args:
        season: Season to evaluate.
        team: Team ID to evaluate.
        df: Game results frame with 'WScore' and 'LScore' columns, accepted
            by ``get_season_games``.

    Returns:
        Sum of 'WScore' over the team's wins plus 'LScore' over its losses.
    """
    winning_games, losing_games = get_season_games(season, team, df)
    # The team's own score is WScore in games it won and LScore in games it lost.
    points_on_won_games = winning_games['WScore'].sum()
    points_on_lost_games = losing_games['LScore'].sum()
    return points_on_won_games + points_on_lost_games

def get_points_allowed(season, team, df):
    """Return the total points a team conceded in one season.

    Args:
        season: Season to evaluate.
        team: Team ID to evaluate.
        df: Game results frame with 'WScore' and 'LScore' columns, accepted
            by ``get_season_games``.

    Returns:
        Sum of 'LScore' over the team's wins plus 'WScore' over its losses.
    """
    winning_games, losing_games = get_season_games(season, team, df)
    # The opponent's score is LScore in games the team won and WScore in games it lost.
    allowed_on_won_games = winning_games['LScore'].sum()
    allowed_on_lost_games = losing_games['WScore'].sum()
    return allowed_on_won_games + allowed_on_lost_games

Convert the seedings to the needed format (stripping the regional abbreviation in front of the seed).

In [9]:
def seed_to_int(seed):
    """Extract the numeric seed from a seed label such as 'W01'.

    The first character (the region prefix) is skipped and the next two
    characters are parsed as an integer; anything after them is ignored.
    """
    seed_digits = seed[1:3]
    return int(seed_digits)

# Numeric seed parsed from the string label.
df_seeds['seed_int'] = df_seeds['Seed'].apply(seed_to_int)

# Regular-season win rate of each seeded team, scaled from a fraction to a percentage.
df_seeds['win_percentage'] = 100 * df_seeds.apply(
    lambda row: get_win_percentage(row['Season'], row['TeamID'], df_season),
    axis=1,
)

# Regular-season points scored and conceded by each seeded team.
for stat_name, stat_fn in (('points', get_points), ('points_allowed', get_points_allowed)):
    df_seeds[stat_name] = df_seeds.apply(
        lambda row: stat_fn(row['Season'], row['TeamID'], df_season),
        axis=1,
    )

# The string label is no longer needed once seed_int exists.
df_seeds = df_seeds.drop(columns=['Seed'])

df_seeds.tail()

Unnamed: 0,Season,TeamID,seed_int,win_percentage,points,points_allowed
2213,2018,1422,13,77.419355,2236,1963
2214,2018,1285,14,77.419355,2420,2133
2215,2018,1252,15,68.965517,2345,2276
2216,2018,1300,16,51.612903,2182,2192
2217,2018,1411,16,44.117647,2640,2711


Drop unneeded data from tournament dataset.

In [10]:
# Keep only the season and the winning/losing team IDs of each tournament game.
unused_result_cols = ['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT']
df_tourn = df_tourn.drop(columns=unused_result_cols)
df_tourn.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1116,1234
1,1985,1120,1345
2,1985,1207,1250
3,1985,1229,1425
4,1985,1242,1325


## Merge data for each team ##
Merge the seed and regular-season data points with their corresponding TeamIDs in the compact results dataframe.

In [11]:
# Two views of the per-team table, keyed for the winning and the losing side.
df_winseeds = df_seeds.rename(columns={'TeamID': 'WTeamID', 'seed_int': 'WSeed'})
df_lossseeds = df_seeds.rename(columns={'TeamID': 'LTeamID', 'seed_int': 'LSeed'})

# Attach the winner's stats (left join), then the loser's stats (default inner join).
# The stat columns shared by both sides receive the '_x' (winner) and '_y' (loser) suffixes.
df_dummy = df_tourn.merge(df_winseeds, how='left', on=['Season', 'WTeamID'])
df_concat = df_dummy.merge(df_lossseeds, on=['Season', 'LTeamID'])

# Each feature is "winner minus loser".
diff_specs = (
    ('SeedDiff', 'WSeed', 'LSeed'),
    ('WinPercentageDiff', 'win_percentage_x', 'win_percentage_y'),
    ('PointsDiff', 'points_x', 'points_y'),
    ('PointsAllowedDiff', 'points_allowed_x', 'points_allowed_y'),
)
for diff_name, winner_col, loser_col in diff_specs:
    df_concat[diff_name] = df_concat[winner_col] - df_concat[loser_col]

df_concat.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,win_percentage_x,points_x,points_allowed_x,LSeed,win_percentage_y,points_y,points_allowed_y,SeedDiff,WinPercentageDiff,PointsDiff,PointsAllowedDiff
0,1985,1116,1234,9,63.636364,2156,2036,8,66.666667,2092,1778,1,-3.030303,64,258
1,1985,1120,1345,11,62.068966,2040,1933,6,68.0,1728,1633,5,-5.931034,312,300
2,1985,1207,1250,1,92.592593,2045,1622,16,37.931034,1907,2036,-15,54.661558,138,-414
3,1985,1229,1425,9,74.074074,1933,1772,8,67.857143,1915,1809,1,6.216931,18,-37
4,1985,1242,1325,3,76.666667,2281,2112,14,74.074074,1824,1701,-11,2.592593,457,411


Create a dataframe that summarizes wins & losses along with their corresponding seed and regular-season stat differences. This is ultimately the training and validation data that will be used.

In [12]:
diff_feature_cols = ['SeedDiff', 'WinPercentageDiff', 'PointsDiff', 'PointsAllowedDiff']

# Winner's perspective: the differences as computed, labelled as a win.
df_wins = df_concat[diff_feature_cols].copy()
df_wins['Result'] = 1

# Loser's perspective: the same games with every difference negated, labelled as a loss.
df_losses = -df_concat[diff_feature_cols]
df_losses['Result'] = 0

# Stack both perspectives; the original row index is kept, so each label appears twice.
df_predictions = pd.concat((df_wins, df_losses))
df_predictions.head()

Unnamed: 0,SeedDiff,WinPercentageDiff,PointsDiff,PointsAllowedDiff,Result
0,1,-3.030303,64,258,1
1,5,-5.931034,312,300,1
2,-15,54.661558,138,-414,1
3,1,6.216931,18,-37,1
4,-11,2.592593,457,411,1


Pull the outcomes into their own series to validate against. Randomize the order of the data, and then set aside 20 percent for testing.

In [13]:
# Separate the label column from the feature columns.
outcomes = df_predictions['Result']
data = df_predictions.drop('Result', axis = 1)

# `sklearn.cross_validation` has been removed from scikit-learn;
# `train_test_split` is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# NOTE(review): every game contributes two mirrored rows (win and negated loss), so a
# game's mirror image can land on the other side of the split - confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(data, outcomes, test_size=0.2, random_state=12)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 3387 samples.
Testing set has 847 samples.




## Train the model ##
Test a few basic models to see which delivers the best performance on the testing data. Hyperparameters can be tuned to improve performance.

In [14]:
from sklearn.metrics import fbeta_score, accuracy_score

def train_predict(learner, X_train, y_train, X_test, y_test):
    """Fit a classifier and score it on both the training and the test data.

    Args:
        learner: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        dict with keys 'acc_train', 'acc_test' (accuracy) and
        'f_train', 'f_test' (F-beta score with beta=0.5).
    """
    results = {}
    learner = learner.fit(X_train, y_train)
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    results['acc_train'] = accuracy_score(y_train, predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    # `beta` is keyword-only in current scikit-learn releases; passing it by
    # name works on old and new versions alike.
    results['f_train'] = fbeta_score(y_train, predictions_train, beta=0.5)
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)
    return results

# Candidate models to compare.
clf_A = LogisticRegression(C=.01)
clf_B = RandomForestClassifier(
    random_state=47,
    max_depth=5,
    min_samples_leaf=4,
)
clf_C = GaussianNB()

# Train every candidate on the same split and collect its scores under its class name.
results = {}
for clf in (clf_A, clf_B, clf_C):
    clf_name = type(clf).__name__
    results[clf_name] = train_predict(clf, X_train, y_train, X_test, y_test)

print(results)

{'LogisticRegression': {'acc_train': 0.7118393858872158, 'acc_test': 0.7119244391971665, 'f_train': 0.7155123068116771, 'f_test': 0.6890547263681593}, 'RandomForestClassifier': {'acc_train': 0.723354000590493, 'acc_test': 0.7083825265643447, 'f_train': 0.7306914197310485, 'f_test': 0.6859296482412061}, 'GaussianNB': {'acc_train': 0.7068201948627104, 'acc_test': 0.7024793388429752, 'f_train': 0.7117995127044902, 'f_test': 0.6776379477250727}}


## Make Predictions ##

Pull in a list of possible matchups for the 2018 tournament. 

In [15]:
# The sample submission lists every matchup ID that needs a prediction.
df_sample_sub = pd.read_csv(data_dir + 'SampleSubmissionStage2.csv')
n_test_games = df_sample_sub.shape[0]

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`.

    `ID` is an underscore-separated string such as '2018_1104_1112'. A real
    tuple is returned (not a one-shot generator), so the result can be
    indexed, compared and iterated more than once.

    Raises:
        ValueError: if any underscore-separated part is not an integer.
    """
    return tuple(int(x) for x in ID.split('_'))

Build an array containing the data required to make predictions about 2018 games. For each matchup, this requires obtaining the 2018 tournament seed and selected 2018 regular-season stats, then finding the difference in each data point between team 1 and team 2.

In [16]:
# Per-team columns, in the same order as the difference columns the models were
# trained on (SeedDiff, WinPercentageDiff, PointsDiff, PointsAllowedDiff).
feature_cols = ['seed_int', 'win_percentage', 'points', 'points_allowed']

# Build a (Season, TeamID) -> feature values lookup once, instead of filtering
# df_seeds repeatedly for every matchup. setdefault keeps the first row for a
# key, matching the previous `.values[0]` behaviour.
team_features = {}
for seed_row in df_seeds[['Season', 'TeamID'] + feature_cols].itertuples(index=False, name=None):
    team_features.setdefault((seed_row[0], seed_row[1]), seed_row[2:])

# NOTE: this rebinds X_test, replacing the hold-out split created earlier;
# the name is kept because the prediction cell reads X_test.
X_test = np.zeros(shape=(n_test_games, len(feature_cols)))
for pos, game_id in enumerate(df_sample_sub['ID']):
    year, t1, t2 = get_year_t1_t2(game_id)

    # Previously a team without a seed row silently reused the values left over
    # from the preceding matchup (or raised NameError on the first row).
    # Fail loudly instead of predicting from unrelated data.
    missing = [key for key in ((year, t1), (year, t2)) if key not in team_features]
    if missing:
        raise ValueError(
            "No seed/regular-season stats for (season, team) {} needed by game {}".format(missing, game_id)
        )

    # Row = team 1's features minus team 2's features. Rows are filled by
    # position, so a non-default index on df_sample_sub cannot misplace them.
    X_test[pos] = np.subtract(team_features[(year, t1)], team_features[(year, t2)])

print(X_test)

[[   5.          -23.52941176 -290.          -41.        ]
 [  -2.           -8.63377609 -129.           44.        ]
 [   2.          -11.76470588 -297.         -189.        ]
 ...
 [ -10.            6.25        374.          166.        ]
 [   3.           -6.72348485 -125.         -175.        ]
 [  13.          -12.97348485 -499.         -341.        ]]


Create predictions on this year's potential tournament matches using the trained model. Each game ID contains the year, the ID for Team 1 and the ID for Team 2. The prediction is the probability that Team 1 will win. 

In [17]:
# Column 1 of predict_proba is the probability of class 1, i.e. Result == 1
# (the first team of the matchup wins).
preds = clf_A.predict_proba(X_test)[:,1]

# Keep predictions away from 0 and 1.
# NOTE(review): presumably this caps the penalty for a confident miss - confirm
# against the competition's scoring metric.
clipped_preds = np.clip(preds, 0.05, 0.95)

# Item assignment always writes the column; attribute assignment (`df.Pred = ...`)
# only does so when a 'Pred' column already exists.
df_sample_sub['Pred'] = clipped_preds
df_sample_sub.head()

Unnamed: 0,ID,Pred
0,2018_1104_1112,0.271933
1,2018_1104_1113,0.486218
2,2018_1104_1116,0.401414
3,2018_1104_1120,0.275527
4,2018_1104_1137,0.602658


Export predictions to a csv file. 

In [131]:
# Write the submission without the DataFrame index column.
submission_path = 'logreg_seed_starter.csv'
df_sample_sub.to_csv(submission_path, index=False)