# Data Exploration

In [1]:
import pandas as pd
import numpy as np

In [2]:
team_names = pd.read_csv("./data/WDataFiles/WTeams.csv")
seasons = pd.read_csv("./data/WDataFiles/WSeasons.csv")
seeds = pd.read_csv("./data/WDataFiles/WNCAATourneySeeds.csv")
season_results = pd.read_csv("./data/WDataFiles/WRegularSeasonCompactResults.csv")
tourney_results = pd.read_csv("./data/WDataFiles/WNCAATourneyCompactResults.csv")
cities = pd.read_csv("./data/WDataFiles/WCities.csv")
game_cities = pd.read_csv("./data/WDataFiles/WGameCities.csv")

In [3]:
seasons.head()

Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1998,10/27/1997,East,Midwest,Mideast,West
1,1999,10/26/1998,East,Mideast,Midwest,West
2,2000,11/1/1999,East,Midwest,Mideast,West
3,2001,10/30/2000,East,Midwest,Mideast,West
4,2002,10/29/2001,East,West,Mideast,Midwest


In [4]:
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1998,W01,3330
1,1998,W02,3163
2,1998,W03,3112
3,1998,W04,3301
4,1998,W05,3272


In [126]:
tourney_results.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
1255,2017,147,3163,90,3332,52,H,0
1256,2017,147,3376,71,3199,64,N,0
1257,2017,151,3280,66,3163,64,N,1
1258,2017,151,3376,62,3390,53,N,0
1259,2017,153,3376,67,3280,55,N,0


### Intuitions to follow up on

* Does being on home court increase probability of a win?
* Does the outcome of the prior game for this team give them "momentum"?
* Do Win/Loss ratios for the season so far correlate with wins?
* Does seed distance from opponent correlate with winning?
* Does prior season normalized points scored with respect to total distribution correlate with winning?
* same question for season-to-date
* same question for historical and season-to-date defense
* can we easily determine distances from all cities to all other cities?  Do we know the home city for each team?  If so can we use distance from home city to game city as a proxy for travel fatigue? And does it affect winning outcomes?
* are relative win-ratios from regular season play predictive of tournament wins?
* are there different regional strengths?

## Stage 1 Predictions

The first portion of this exercise involves successfully predicting matchups from past tournaments.  The goal is to take the 64 teams in each tournament, and predict each possible matchup.  To get the formatting right, I'll build a model that extracts the seed for each tournament, and tries to learn a naive predictor just based on seed difference between teams.  Steps:

* 1: Make training set from tournament results.
  - extract seed 1 and seed 2
  - make Y vector, 1 for a team 1 victory, 0 for team 2 victory
  - teams in a match ordered by numeric ID

* 2: train model to convergence on entire batch

* 3: Make a test set from seeds list for all possible combinations and seed numbers

* 4: predict results and confirm an output size of 8,064 data rows.


In [6]:
from sklearn.linear_model import LogisticRegression
import itertools
import csv
    
def figure_team_1(row):
    if row['Team1Win']:
        return row['WTeamID']
    else:
        return row['LTeamID']

def figure_team_2(row):
    if row['Team1Win']:
        return row['LTeamID']
    else:
        return row['WTeamID']

def training_set_baseline(tourney_results_df, seeds_df):
    tourney_df = tourney_results_df[['Season','WTeamID','LTeamID']]
    tourney_df['Team1Win'] = tourney_df['WTeamID'] < tourney_df['LTeamID']
    tourney_df['Team1'] = tourney_df.apply(figure_team_1,axis=1)
    tourney_df['Team2'] = tourney_df.apply(figure_team_2,axis=1)
    training_df = pd.merge(tourney_df, seeds_df, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
    training_df = training_df.rename(index=str, columns={"Seed": "Team1Seed"})
    training_df = training_df.drop(['TeamID'], axis=1)
    training_df = pd.merge(training_df, seeds_df, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
    training_df = training_df.rename(index=str, columns={"Seed": "Team2Seed"})
    training_df = training_df.drop(['TeamID'], axis=1)
    training_df['SeedNum1'] = training_df.apply(lambda row: int(row['Team1Seed'][1:3]), axis=1)
    training_df['SeedNum2'] = training_df.apply(lambda row: int(row['Team2Seed'][1:3]), axis=1)
    training_df['SeedDelta'] = (training_df['SeedNum1'] - training_df['SeedNum2'])
    seed_mu = training_df['SeedDelta'].mean()
    seed_sigma = training_df['SeedDelta'].std()
    training_df['NormalizedSeedDelta'] = (training_df['SeedDelta'] - seed_mu) / seed_sigma
    return training_df, { 'seed_mu': seed_mu, 'seed_sigma': seed_sigma }

def test_set_frame(seeds_df, training_meta):
    test_seeds = seeds_df[['Season', 'Seed', 'TeamID']]
    test_seeds['SeedNum'] = test_seeds.apply(lambda row: int(row['Seed'][1:3]), axis=1)
    seasons = np.unique(test_seeds.Season.values)
    test_set = [] # output_key (SSSS_XXXX_YYYY), seed1, seed2, seed_delta
    seasons = [ss for ss in seasons if ss >= 2014 and ss < 2018]
    for s in seasons:
        seed_set = test_seeds[test_seeds.Season == s].values
        for seed_pair in itertools.combinations(seed_set, 2):
            id_a = seed_pair[0][2]
            id_b = seed_pair[1][2]
            t1 = 0
            t2 = 0
            s1 = 0
            s2 = 0
            if id_a < id_b:
                t1 = id_a
                t2 = id_b
                s1 = seed_pair[0][3]
                s2 = seed_pair[1][3]
            else:
                t1 = id_b
                t2 = id_a
                s1 = seed_pair[1][3]
                s2 = seed_pair[0][3]
            output_key = "%s_%s_%s" % (s, t1, t2)
            test_set.append([output_key, s, t1, t2, s1, s2])
    tdf = pd.DataFrame(data=test_set, columns=['Key', 'Season', 'Team1', 'Team2', 'SeedNum1', 'SeedNum2'])
    tdf['SeedDelta'] = (tdf['SeedNum1'] - tdf['SeedNum2'])
    tdf['NormalizedSeedDelta'] = (tdf['SeedDelta'] - training_meta['seed_mu']) / training_meta['seed_sigma']
    return tdf
    
def test_set_baseline(seeds_df, training_meta):
    test_df = test_set_frame(seeds_df, training_meta)
    test_set = test_df[['Key', 'SeedNum1', 'SeedNum2', 'NormalizedSeedDelta']].values
    return test_set

def predict_with_naive_model(test_ds, model):
    test_x = test_ds[:, 3].astype(float)
    test_x = test_x.reshape(len(test_x), 1)
    predictions = model.predict_proba(test_x)[:, 1]
    return predictions
    
def build_naive_model(training_df):
    x_train = training_df['NormalizedSeedDelta'].values
    y_train = np.vectorize(lambda x: 1 if x else 0)(training_df['Team1Win'].values)
    x_train = x_train.reshape(len(x_train), 1)
    y_train = y_train.reshape(len(y_train), )
    naive_model = LogisticRegression()
    naive_model = naive_model.fit(x_train, y_train)
    print("SCORE: ", naive_model.score(x_train, y_train))
    return naive_model

def write_predictions(test_set, preds):
    with open("./data/outputs/stage_1_predictions.csv", "w") as f:
        wtr = csv.writer(f)
        wtr.writerow(['ID','Pred'])
        for i in range(len(test_set)):
            out_key = test_set[i][0]
            prediction = preds[i]
            wtr.writerow([out_key,prediction])
    

In [8]:
training_df, training_meta = training_set_baseline(tourney_results, seeds)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
training_df.head(10)

Unnamed: 0,Season,WTeamID,LTeamID,Team1Win,Team1,Team2,Team1Seed,Team2Seed,SeedNum1,SeedNum2,SeedDelta,NormalizedSeedDelta
0,1998,3104,3422,True,3104,3422,X02,X15,2,15,-13,-1.751175
1,1998,3104,3417,True,3104,3417,X02,X07,2,7,-5,-0.672212
2,1998,3417,3276,False,3276,3417,X10,X07,10,7,3,0.406752
3,1998,3256,3104,False,3104,3256,X02,X03,2,3,-1,-0.13273
4,1998,3256,3155,False,3155,3256,X06,X03,6,3,3,0.406752
5,1998,3256,3221,False,3221,3256,X14,X03,14,3,11,1.485716
6,1998,3112,3365,True,3112,3365,W03,W14,3,14,-11,-1.481434
7,1998,3112,3438,True,3112,3438,W03,W06,3,6,-3,-0.402471
8,1998,3438,3374,False,3374,3438,W11,W06,11,6,5,0.676493
9,1998,3163,3112,False,3112,3163,W03,W02,3,2,1,0.137011


In [20]:
current_model = build_naive_model(training_df)
current_test_set = test_set_baseline(seeds, training_meta)
predictions = predict_with_naive_model(current_test_set, current_model)

SCORE:  0.778571428571


## write first naive predictions file to get the format down right

In [33]:
write_predictions(current_test_set, predictions)        

## RESULTS For Naive Model:
* ACCURACY ON TRAINING: 0.7786
* TESTING SCORE: 0.4626

## including other features

Some other features that are likely to be helpful are offensive and defensive information for each team, so lets' try median offensive and defensive points for the regular season as a feature.

In [12]:
def build_all_scores(season_df):
    w_scores = season_df[['Season', 'WTeamID', 'WScore']].rename(index=str, columns={"WTeamID": "TeamID", "WScore": "Score"})
    l_scores = season_df[['Season', 'LTeamID', 'LScore']].rename(index=str, columns={"LTeamID": "TeamID", "LScore": "Score"})
    all_scores = pd.concat([w_scores, l_scores])
    grouped_scores = all_scores.groupby(['Season', 'TeamID'], as_index=False).aggregate(np.median)
    return grouped_scores
    
def training_set_with_scores(tourney_results_df, seeds_df, season_df):
    baseline_df, training_meta = training_set_baseline(tourney_results_df, seeds_df)
    grouped_scores = build_all_scores(season_df)
    score_mu = grouped_scores['Score'].mean()
    score_sigma = grouped_scores['Score'].std()
    training_meta['score_mu'] = score_mu
    training_meta['score_sigma'] = score_sigma

    training_df = pd.merge(baseline_df, grouped_scores, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
    training_df = training_df.rename(index=str, columns={"Score": "Team1MedianScore"})
    training_df = training_df.drop(['TeamID'], axis=1)
    training_df = pd.merge(training_df, grouped_scores, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
    training_df = training_df.rename(index=str, columns={"Score": "Team2MedianScore"})
    training_df = training_df.drop(['TeamID'], axis=1)
    training_df['Team1NormedScore'] = (training_df['Team1MedianScore'] - score_mu) / (score_sigma ** 2)
    training_df['Team2NormedScore'] = (training_df['Team2MedianScore'] - score_mu) / (score_sigma ** 2)
    return training_df, training_meta

def build_multifeature_naive_model(training_df):
    x_train = training_df[['NormalizedSeedDelta', 'Team1NormedScore', 'Team2NormedScore']].values
    y_train = np.vectorize(lambda x: 1 if x else 0)(training_df['Team1Win'].values)
    x_train = x_train.reshape(len(x_train), 3)
    y_train = y_train.reshape(len(y_train), )
    naive_model = LogisticRegression()
    naive_model = naive_model.fit(x_train, y_train)
    print("SCORE: ", naive_model.score(x_train, y_train))
    return naive_model

def predict_with_multi_naive_model(test_df, model):
    test_x = test_df[['NormalizedSeedDelta', 'Team1NormedScore', 'Team2NormedScore']].values
    test_x.reshape(len(test_x), 3)
    predictions = model.predict_proba(test_x)[:, 1]
    return predictions

def test_set_with_scores(seeds_df, season_df, training_meta):
    tdf = test_set_frame(seeds_df, training_meta)
    grouped_scores = build_all_scores(season_df)
    
    score_mu = training_meta['score_mu']
    score_sigma = training_meta['score_sigma']

    test_df = pd.merge(tdf, grouped_scores, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
    test_df = test_df.rename(index=str, columns={"Score": "Team1MedianScore"})
    test_df = test_df.drop(['TeamID'], axis=1)
    test_df = pd.merge(test_df, grouped_scores, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
    test_df = test_df.rename(index=str, columns={"Score": "Team2MedianScore"})
    test_df = test_df.drop(['TeamID'], axis=1)
    test_df['Team1NormedScore'] = (test_df['Team1MedianScore'] - score_mu) / (score_sigma ** 2)
    test_df['Team2NormedScore'] = (test_df['Team2MedianScore'] - score_mu) / (score_sigma ** 2)
    
    return test_df

In [24]:
training_set, meta = training_set_with_scores(tourney_results, seeds,season_results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [91]:
training_set.head()

Unnamed: 0,Season,WTeamID,LTeamID,Team1Win,Team1,Team2,Team1Seed,Team2Seed,SeedNum1,SeedNum2,SeedDelta,NormalizedSeedDelta,Team1MedianScore,Team2MedianScore,Team1NormedScore,Team2NormedScore
0,1998,3104,3422,True,3104,3422,X02,X15,2,15,-13,-1.751175,74.0,75.0,0.197045,0.217213
1,1998,3104,3417,True,3104,3417,X02,X07,2,7,-5,-0.672212,74.0,80.0,0.197045,0.31805
2,1998,3417,3276,False,3276,3417,X10,X07,10,7,3,0.406752,77.0,80.0,0.257547,0.31805
3,1998,3256,3104,False,3104,3256,X02,X03,2,3,-1,-0.13273,74.0,86.0,0.197045,0.439054
4,1998,3256,3155,False,3155,3256,X06,X03,6,3,3,0.406752,73.0,86.0,0.176878,0.439054


In [25]:
multifeature_model = build_multifeature_naive_model(training_set)

SCORE:  0.786507936508


In [29]:
current_test_set = test_set_with_scores(seeds, season_results, meta)
predictions = predict_with_multi_naive_model(current_test_set, multifeature_model)

In [32]:
write_predictions(current_test_set['Key'].values.reshape((len(current_test_set), 1)), predictions)

## RESULTS For Model 2 (seeds and scoring history):
* ACCURACY ON TRAINING: 0.7865
* TESTING SCORE: 0.4582

In [112]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

def build_deep_model(input_features):
    model = Sequential()
    model.add(Dense(128, input_dim=input_features))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_deep_model(training_df):
    x_train = training_df[['NormalizedSeedDelta', 'Team1NormedScore', 'Team2NormedScore']].values
    y_train = np.vectorize(lambda x: 1 if x else 0)(training_df['Team1Win'].values)
    x_train = x_train.reshape(len(x_train), 3)
    y_train = y_train.reshape(len(y_train), )
    model = build_deep_model(3)
    model.fit(x_train, y_train, epochs=10000, batch_size=len(x_train), verbose=2)
    return model

def score_deep_model(training_df, model):
    x_train = training_df[['NormalizedSeedDelta', 'Team1NormedScore', 'Team2NormedScore']].values
    y_train = np.vectorize(lambda x: 1 if x else 0)(training_df['Team1Win'].values)
    x_train = x_train.reshape(len(x_train), 3)
    y_train = y_train.reshape(len(y_train), )
    return model.evaluate(x_train, y_train)

def predict_with_deep_model(test_df, model):
    test_x = test_df[['NormalizedSeedDelta', 'Team1NormedScore', 'Team2NormedScore']].values
    test_x.reshape(len(test_x), 3)
    predictions = model.predict(test_x)[:, 0]
    return predictions

In [48]:
d_model = train_deep_model(training_set)

In [54]:
d_predictios = predict_with_deep_model(current_test_set, d_model)

In [55]:
write_predictions(current_test_set['Key'].values.reshape((len(current_test_set), 1)), d_predictios)

In [59]:
score_deep_model(training_set, d_model)



[0.34670683334744168, 0.83174603193525287]

## First Deep Model Results

- ACCURACY ON TRAINING: 0.8317
- KAGGLE TESTING SCORE: 0.3683

***THESE ARE INVALID, THE TEST SET IS CONTAINED IN THE TRAINING DATA, IT'S SUPER OVER-FIT***

#### Improvement Ideas:
- Include mean spread
- Include exponentially-weighted average of point spread over the season
- Include spread variance
- Include Normalized Seed Num
- Batch Norm Model

In [155]:
def calculate_spreads(season_df):
    score_df = season_df[['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore']]
    score_df['WSpread'] = score_df['WScore'] - score_df['LScore']
    score_df['LSpread'] = score_df['WSpread'] * -1
    w_spreads = score_df[['Season','DayNum','WTeamID', 'WSpread']].rename(index=str, columns={'WTeamID': 'TeamID', 'WSpread': 'Spread'})
    l_spreads = score_df[['Season','DayNum','LTeamID', 'LSpread']].rename(index=str, columns={'LTeamID': 'TeamID', 'LSpread': 'Spread'})
    spreads_df = pd.concat([w_spreads, l_spreads]).sort_values(['Season','DayNum'])
    seasons = spreads_df['Season'].unique()
    teams = spreads_df['TeamID'].unique()
    join_frame = []
    for season in seasons:
        for team in teams:
            spreads = spreads_df[(spreads_df.Season == season) & (spreads_df.TeamID == team)].Spread
            mean_spread = spreads.mean()
            std_spread = spreads.std()
            ew_beta = 0.8
            expo_spread = 0
            for spread in spreads.values:
                expo_spread = (ew_beta * expo_spread) + (1 - ew_beta) * spread
            join_frame.append([season, team, mean_spread, std_spread, expo_spread])
    return pd.DataFrame(data=join_frame, columns=['Season', 'TeamID', 'MeanSpread', 'StdSpread', 'ExpoSpread'])
    

def training_set_with_spreads(tourney_df, seeds_df, season_df, spreads_df):
    train_df, meta = training_set_with_scores(tourney_df, seeds_df, season_df)
    train_df['Team1NormedSeed'] = (train_df['SeedNum1'] - 8) / 8
    train_df['Team2NormedSeed'] = (train_df['SeedNum2'] - 8) / 8
    
    meta['spread_mean_mu'] = spreads_df['MeanSpread'].mean()
    meta['spread_std_mu'] = spreads_df['StdSpread'].mean()
    meta['spread_expo_mu'] = spreads_df['ExpoSpread'].mean()
    meta['spread_mean_sigma'] = spreads_df['MeanSpread'].std() ** 2
    meta['spread_std_sigma'] = spreads_df['StdSpread'].std() ** 2
    meta['spread_expo_sigma'] = spreads_df['ExpoSpread'].std() ** 2

    training_df = pd.merge(train_df, spreads_df, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
    training_df = training_df.rename(index=str, columns={"MeanSpread": "Team1MeanSpread", "StdSpread": "Team1StdSpread", "ExpoSpread": "Team1ExpoSpread"})
    training_df = training_df.drop(['TeamID'], axis=1)
    training_df = pd.merge(training_df, spreads_df, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
    training_df = training_df.rename(index=str, columns={"MeanSpread": "Team2MeanSpread", "StdSpread": "Team2StdSpread", "ExpoSpread": "Team2ExpoSpread"})
    training_df = training_df.drop(['TeamID'], axis=1)
    
    training_df['Team1NormMeanSpread'] = (training_df['Team1MeanSpread'] - meta['spread_mean_mu']) / meta['spread_mean_sigma']
    training_df['Team2NormMeanSpread'] = (training_df['Team2MeanSpread'] - meta['spread_mean_mu']) / meta['spread_mean_sigma']
    training_df['Team1NormStdSpread'] = (training_df['Team1StdSpread'] - meta['spread_std_mu']) / meta['spread_std_sigma']
    training_df['Team2NormStdSpread'] = (training_df['Team2StdSpread'] - meta['spread_std_mu']) / meta['spread_std_sigma']
    training_df['Team1NormExpSpread'] = (training_df['Team1ExpoSpread'] - meta['spread_expo_mu']) / meta['spread_expo_sigma']
    training_df['Team2NormExpSpread'] = (training_df['Team2ExpoSpread'] - meta['spread_expo_mu']) / meta['spread_expo_sigma']
    
    return training_df.sample(frac=1), meta

def test_set_with_spreads(seeds_df, season_df, training_meta, spreads_df):
    test_df = test_set_with_scores(seeds_df, season_df, training_meta)
    grouped_scores = build_all_scores(season_df)
    
    spread_mean_mu = training_meta['spread_mean_mu']
    spread_std_mu = training_meta['spread_std_mu']
    spread_expo_mu = training_meta['spread_expo_mu']
    spread_mean_sigma = training_meta['spread_mean_sigma']
    spread_std_sigma = training_meta['spread_std_sigma']
    spread_expo_sigma = training_meta['spread_expo_sigma']
    
    test_df['Team1NormedSeed'] = (test_df['SeedNum1'] - 8) / 8
    test_df['Team2NormedSeed'] = (test_df['SeedNum2'] - 8) / 8
    
    test_df = pd.merge(test_df, spreads_df, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
    test_df = test_df.rename(index=str, columns={"MeanSpread": "Team1MeanSpread", "StdSpread": "Team1StdSpread", "ExpoSpread": "Team1ExpoSpread"})
    test_df = test_df.drop(['TeamID'], axis=1)
    test_df = pd.merge(test_df, spreads_df, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
    test_df = test_df.rename(index=str, columns={"MeanSpread": "Team2MeanSpread", "StdSpread": "Team2StdSpread", "ExpoSpread": "Team2ExpoSpread"})
    test_df = test_df.drop(['TeamID'], axis=1)
    
    test_df['Team1NormMeanSpread'] = (test_df['Team1MeanSpread'] - spread_mean_mu) / spread_mean_sigma
    test_df['Team2NormMeanSpread'] = (test_df['Team2MeanSpread'] - spread_mean_mu) / spread_mean_sigma
    test_df['Team1NormStdSpread'] = (test_df['Team1StdSpread'] - spread_std_mu) / spread_std_sigma
    test_df['Team2NormStdSpread'] = (test_df['Team2StdSpread'] - spread_std_mu) / spread_std_sigma
    test_df['Team1NormExpSpread'] = (test_df['Team1ExpoSpread'] - spread_expo_mu) / spread_expo_sigma
    test_df['Team2NormExpSpread'] = (test_df['Team2ExpoSpread'] - spread_expo_mu) / spread_expo_sigma
    
    return test_df

from keras.layers import BatchNormalization
from keras.regularizers import l2

def build_normed_deep_model(input_features, do_level=0.5, l2_lambda=0.01):
    model = Sequential()
    model.add(Dense(128, input_dim=input_features, kernel_regularizer=l2(l2_lambda)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(do_level))
    model.add(Dense(128, kernel_regularizer=l2(l2_lambda)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(do_level))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_normed_deep_model(training_df, epoch_count=500):
    features = [
        'NormalizedSeedDelta','Team1NormedScore','Team2NormedScore','Team1NormedSeed','Team2NormedSeed',
        'Team1NormMeanSpread','Team2NormMeanSpread','Team1NormStdSpread','Team2NormStdSpread',
        'Team1NormExpSpread','Team2NormExpSpread'
    ]
    x_train = training_df[features].values
    y_train = np.vectorize(lambda x: 1 if x else 0)(training_df['Team1Win'].values)
    x_train = x_train.reshape(len(x_train), len(features))
    y_train = y_train.reshape(len(y_train), )
    model = build_normed_deep_model(len(features))
    model.fit(x_train, y_train, epochs=epoch_count, batch_size=len(x_train), verbose=2, validation_split=0.2)
    return model

def score_normed_deep_model(training_df, model):
    features = [
        'NormalizedSeedDelta','Team1NormedScore','Team2NormedScore','Team1NormedSeed','Team2NormedSeed',
        'Team1NormMeanSpread','Team2NormMeanSpread','Team1NormStdSpread','Team2NormStdSpread',
        'Team1NormExpSpread','Team2NormExpSpread'
    ]
    x_train = training_df[features].values
    y_train = np.vectorize(lambda x: 1 if x else 0)(training_df['Team1Win'].values)
    x_train = x_train.reshape(len(x_train), len(features))
    y_train = y_train.reshape(len(y_train), )
    return model.evaluate(x_train, y_train)

def predict_with_normed_deep_model(test_df, model):
    features = [
        'NormalizedSeedDelta','Team1NormedScore','Team2NormedScore','Team1NormedSeed','Team2NormedSeed',
        'Team1NormMeanSpread','Team2NormMeanSpread','Team1NormStdSpread','Team2NormStdSpread',
        'Team1NormExpSpread','Team2NormExpSpread'
    ]
    test_x = test_df[features].values
    test_x.reshape(len(test_x), len(features))
    predictions = model.predict(test_x)[:, 0]
    return predictions

In [94]:
spreads_df = calculate_spreads(season_results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [156]:
training_set, meta = training_set_with_spreads(tourney_results, seeds,season_results,spreads_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [157]:
model = train_normed_deep_model(training_set)

Train on 1008 samples, validate on 252 samples
Epoch 1/500
 - 2s - loss: 2.4415 - acc: 0.4534 - val_loss: 2.1278 - val_acc: 0.5714
Epoch 2/500
 - 0s - loss: 2.3020 - acc: 0.5079 - val_loss: 2.0289 - val_acc: 0.7143
Epoch 3/500
 - 0s - loss: 2.1885 - acc: 0.5506 - val_loss: 1.9641 - val_acc: 0.7381
Epoch 4/500
 - 0s - loss: 2.1088 - acc: 0.6220 - val_loss: 1.9223 - val_acc: 0.7659
Epoch 5/500
 - 0s - loss: 2.0617 - acc: 0.6478 - val_loss: 1.8953 - val_acc: 0.7619
Epoch 6/500
 - 0s - loss: 2.0309 - acc: 0.6677 - val_loss: 1.8775 - val_acc: 0.7738
Epoch 7/500
 - 0s - loss: 1.9671 - acc: 0.7024 - val_loss: 1.8648 - val_acc: 0.7698
Epoch 8/500
 - 0s - loss: 1.9320 - acc: 0.7222 - val_loss: 1.8550 - val_acc: 0.7778
Epoch 9/500
 - 0s - loss: 1.9175 - acc: 0.7302 - val_loss: 1.8461 - val_acc: 0.7817
Epoch 10/500
 - 0s - loss: 1.9132 - acc: 0.7321 - val_loss: 1.8379 - val_acc: 0.7817
Epoch 11/500
 - 0s - loss: 1.8895 - acc: 0.7262 - val_loss: 1.8294 - val_acc: 0.7817
Epoch 12/500
 - 0s - loss: 

Epoch 97/500
 - 0s - loss: 0.8726 - acc: 0.7857 - val_loss: 0.8746 - val_acc: 0.7738
Epoch 98/500
 - 0s - loss: 0.8651 - acc: 0.7679 - val_loss: 0.8689 - val_acc: 0.7738
Epoch 99/500
 - 0s - loss: 0.8498 - acc: 0.7867 - val_loss: 0.8632 - val_acc: 0.7738
Epoch 100/500
 - 0s - loss: 0.8465 - acc: 0.7718 - val_loss: 0.8576 - val_acc: 0.7738
Epoch 101/500
 - 0s - loss: 0.8323 - acc: 0.7847 - val_loss: 0.8520 - val_acc: 0.7738
Epoch 102/500
 - 0s - loss: 0.8337 - acc: 0.7857 - val_loss: 0.8464 - val_acc: 0.7738
Epoch 103/500
 - 0s - loss: 0.8275 - acc: 0.7817 - val_loss: 0.8409 - val_acc: 0.7738
Epoch 104/500
 - 0s - loss: 0.8219 - acc: 0.7768 - val_loss: 0.8353 - val_acc: 0.7738
Epoch 105/500
 - 0s - loss: 0.8177 - acc: 0.7808 - val_loss: 0.8300 - val_acc: 0.7738
Epoch 106/500
 - 0s - loss: 0.8239 - acc: 0.7837 - val_loss: 0.8248 - val_acc: 0.7738
Epoch 107/500
 - 0s - loss: 0.8017 - acc: 0.7857 - val_loss: 0.8196 - val_acc: 0.7698
Epoch 108/500
 - 0s - loss: 0.7877 - acc: 0.7808 - val_lo

Epoch 193/500
 - 0s - loss: 0.5272 - acc: 0.7877 - val_loss: 0.6640 - val_acc: 0.7183
Epoch 194/500
 - 0s - loss: 0.5210 - acc: 0.8036 - val_loss: 0.6653 - val_acc: 0.7222
Epoch 195/500
 - 0s - loss: 0.5319 - acc: 0.7897 - val_loss: 0.6668 - val_acc: 0.7222
Epoch 196/500
 - 0s - loss: 0.5223 - acc: 0.7966 - val_loss: 0.6679 - val_acc: 0.7183
Epoch 197/500
 - 0s - loss: 0.5104 - acc: 0.8036 - val_loss: 0.6683 - val_acc: 0.7183
Epoch 198/500
 - 0s - loss: 0.5009 - acc: 0.8065 - val_loss: 0.6679 - val_acc: 0.7222
Epoch 199/500
 - 0s - loss: 0.5197 - acc: 0.8016 - val_loss: 0.6668 - val_acc: 0.7302
Epoch 200/500
 - 0s - loss: 0.5209 - acc: 0.8006 - val_loss: 0.6656 - val_acc: 0.7302
Epoch 201/500
 - 0s - loss: 0.5156 - acc: 0.7946 - val_loss: 0.6650 - val_acc: 0.7302
Epoch 202/500
 - 0s - loss: 0.5099 - acc: 0.7897 - val_loss: 0.6643 - val_acc: 0.7262
Epoch 203/500
 - 0s - loss: 0.4999 - acc: 0.7976 - val_loss: 0.6641 - val_acc: 0.7262
Epoch 204/500
 - 0s - loss: 0.5096 - acc: 0.8016 - val

Epoch 289/500
 - 0s - loss: 0.4182 - acc: 0.8175 - val_loss: 0.6557 - val_acc: 0.7063
Epoch 290/500
 - 0s - loss: 0.4308 - acc: 0.8175 - val_loss: 0.6555 - val_acc: 0.7063
Epoch 291/500
 - 0s - loss: 0.4273 - acc: 0.8155 - val_loss: 0.6552 - val_acc: 0.7024
Epoch 292/500
 - 0s - loss: 0.4163 - acc: 0.8065 - val_loss: 0.6558 - val_acc: 0.7024
Epoch 293/500
 - 0s - loss: 0.4377 - acc: 0.8155 - val_loss: 0.6565 - val_acc: 0.7024
Epoch 294/500
 - 0s - loss: 0.4295 - acc: 0.8155 - val_loss: 0.6573 - val_acc: 0.7063
Epoch 295/500
 - 0s - loss: 0.4311 - acc: 0.8204 - val_loss: 0.6580 - val_acc: 0.7024
Epoch 296/500
 - 0s - loss: 0.4315 - acc: 0.8065 - val_loss: 0.6586 - val_acc: 0.7024
Epoch 297/500
 - 0s - loss: 0.4391 - acc: 0.8145 - val_loss: 0.6593 - val_acc: 0.7024
Epoch 298/500
 - 0s - loss: 0.4222 - acc: 0.8155 - val_loss: 0.6602 - val_acc: 0.6944
Epoch 299/500
 - 0s - loss: 0.4247 - acc: 0.8194 - val_loss: 0.6611 - val_acc: 0.6944
Epoch 300/500
 - 0s - loss: 0.4356 - acc: 0.7917 - val

Epoch 385/500
 - 0s - loss: 0.3976 - acc: 0.8313 - val_loss: 0.6353 - val_acc: 0.7143
Epoch 386/500
 - 0s - loss: 0.3965 - acc: 0.8313 - val_loss: 0.6329 - val_acc: 0.7143
Epoch 387/500
 - 0s - loss: 0.3930 - acc: 0.8224 - val_loss: 0.6299 - val_acc: 0.7222
Epoch 388/500
 - 0s - loss: 0.3965 - acc: 0.8403 - val_loss: 0.6264 - val_acc: 0.7143
Epoch 389/500
 - 0s - loss: 0.3918 - acc: 0.8333 - val_loss: 0.6208 - val_acc: 0.7143
Epoch 390/500
 - 0s - loss: 0.3841 - acc: 0.8254 - val_loss: 0.6162 - val_acc: 0.7222
Epoch 391/500
 - 0s - loss: 0.4022 - acc: 0.8264 - val_loss: 0.6140 - val_acc: 0.7222
Epoch 392/500
 - 0s - loss: 0.4073 - acc: 0.8155 - val_loss: 0.6125 - val_acc: 0.7222
Epoch 393/500
 - 0s - loss: 0.3925 - acc: 0.8204 - val_loss: 0.6121 - val_acc: 0.7262
Epoch 394/500
 - 0s - loss: 0.4035 - acc: 0.8125 - val_loss: 0.6127 - val_acc: 0.7222
Epoch 395/500
 - 0s - loss: 0.3935 - acc: 0.8294 - val_loss: 0.6145 - val_acc: 0.7143
Epoch 396/500
 - 0s - loss: 0.4004 - acc: 0.8383 - val

Epoch 481/500
 - 0s - loss: 0.3808 - acc: 0.8313 - val_loss: 0.5864 - val_acc: 0.7421
Epoch 482/500
 - 0s - loss: 0.3880 - acc: 0.8145 - val_loss: 0.5842 - val_acc: 0.7460
Epoch 483/500
 - 0s - loss: 0.3812 - acc: 0.8323 - val_loss: 0.5813 - val_acc: 0.7381
Epoch 484/500
 - 0s - loss: 0.3853 - acc: 0.8244 - val_loss: 0.5770 - val_acc: 0.7421
Epoch 485/500
 - 0s - loss: 0.3919 - acc: 0.8234 - val_loss: 0.5745 - val_acc: 0.7500
Epoch 486/500
 - 0s - loss: 0.3793 - acc: 0.8403 - val_loss: 0.5742 - val_acc: 0.7540
Epoch 487/500
 - 0s - loss: 0.3830 - acc: 0.8363 - val_loss: 0.5729 - val_acc: 0.7540
Epoch 488/500
 - 0s - loss: 0.3814 - acc: 0.8254 - val_loss: 0.5709 - val_acc: 0.7579
Epoch 489/500
 - 0s - loss: 0.3746 - acc: 0.8363 - val_loss: 0.5710 - val_acc: 0.7540
Epoch 490/500
 - 0s - loss: 0.3882 - acc: 0.8224 - val_loss: 0.5723 - val_acc: 0.7540
Epoch 491/500
 - 0s - loss: 0.3726 - acc: 0.8353 - val_loss: 0.5753 - val_acc: 0.7540
Epoch 492/500
 - 0s - loss: 0.3797 - acc: 0.8433 - val

In [121]:
test_set = test_set_with_spreads(seeds,season_results, meta, spreads_df)

In [152]:
score_normed_deep_model(training_set, model)



[0.51849292800540014, 0.79126984108062015]

In [153]:
predictions = predict_with_normed_deep_model(test_set, model)

In [154]:
write_predictions(test_set['Key'].values.reshape((len(test_set), 1)), predictions)

## Regularized Deep Model Results With Better Features

- ACCURACY ON TRAINING: 0.7913
- TRAINING LOSS: 0.5185
- KAGGLE TESTING LOSS: 0.4930