In [1]:
import march_madness_classes as mmc
import march_madness_games as mmg
import march_madness_models as mmm
import march_madness_train_and_tune as mmtt

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as LogReg

In [2]:
# Load the raw Kaggle tournament tables: seeds, bracket slots and compact
# game results (per the file names).
seeds, slots, games = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/kaggle_data/TourneySeeds.csv",
        "datasets/kaggle_data/TourneySlots.csv",
        "datasets/kaggle_data/TourneyCompactResults.csv",
    )
)

# Pass each table through mmg.filter_into_seasons; later cells index the
# results with `year - 1985`.
seeds_arr, slots_arr, games_arr = (
    mmg.filter_into_seasons(table) for table in (seeds, slots, games)
)

In [3]:
# Predictors chosen in the variable selection notebook, followed by the seed
# matrix. Every file is read with its first column as the index.
markov, rpi, bad_losses, seed_matrix_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "datasets/our_data/stationary",
        "datasets/our_data/rpi",
        "datasets/our_data/bad_losses",
        "datasets/our_data/team_summary_data/seeds_matrix",
    )
)

In [4]:
# Predictor tables handed to mmtt.train_test_split and mmm.ModelPredictor.
predictor_dfs = [
    markov,
    rpi,
    bad_losses,
]

# Names handed to mmtt.train_test_split alongside predictor_dfs: two id
# entries followed by one name per predictor table.
# NOTE(review): the id entries are presumably team-id columns - confirm in
# march_madness_train_and_tune.
predictor_names = [
    "min_index_id",
    "max_index_id",
    "markov",
    "rpi",
    "bad_losses",
]

In [5]:
# Split configuration. `range` is half-open, so the seasons covered are
# min_year .. max_year - 1 (2014 and 2015 with the values below).
window   = 3
min_year = 2014
max_year = 2016

year_range = range(min_year, max_year)

# One mmtt.train_test_split result per season, appended in year_range order
# so that entry `year - min_year` belongs to `year`.
train_test_arr = []
for year in year_range:
    train_test_arr.append(
        mmtt.train_test_split(
            window,
            year,
            seeds_arr,
            slots_arr,
            games_arr,
            predictor_names,
            predictor_dfs,
        )
    )

In [6]:
# C for the logistic regression, cross validated in the variable selection
# notebook, and the predictor columns the model is trained on.
c = 1
variables = ["markov", "rpi", "bad_losses"]

# One fitted scaler and one fitted model per season, in year_range order.
scalers = []
models  = []

for year in year_range:
    # Entries 0-3 of the split are read as train x, train y,
    # cross-validation x and cross-validation y.
    split = train_test_arr[year - min_year]

    train_x = split[0][variables]
    train_y = split[1].values[:, 0]

    # The cross-validation data is prepared here but not used again in this cell.
    cross_x = split[2][variables]
    cross_y = split[3].values[:, 0]

    # Fit the scaler on the training rows only, then apply that same
    # transform to both the training and the cross-validation rows.
    scaler = StandardScaler()
    scaler.fit(train_x)
    scaled_train_x = scaler.transform(train_x)
    scaled_cross_x = scaler.transform(cross_x)

    # Logistic regression on the scaled training data.
    model = LogReg(C=c)
    model.fit(scaled_train_x, train_y)

    # Keep both: the scaler is passed along with the model to mmm.ModelPredictor.
    scalers.append(scaler)
    models.append(model)

In [24]:
# Development helper: re-import march_madness_models so that edits made to the
# module take effect in the running kernel (bare `reload` is the Python 2 builtin).
reload(mmm)

<module 'march_madness_models' from 'march_madness_models.py'>

In [27]:
# Build one tournament simulator per season in year_range.
simulators = []

for i, year in enumerate(year_range):
    # Per-season tables. The season lists are indexed by `year - 1985`
    # (presumably 1985 is the first season in the Kaggle files - confirm).
    season_idx = year - 1985
    seeds_year = seeds_arr[season_idx]
    slots_year = slots_arr[season_idx]
    games_year = games_arr[season_idx]

    # Head-to-head predictor wrapping this season's fitted model and scaler,
    # created in simulation mode with the higher-seed bias switched off.
    head_to_head_model_year = mmm.ModelPredictor(
        models[i],
        scalers[i],
        predictor_dfs,
        year,
        simulation=True,
        higher_seed_bias=False,
        seeds_df=seeds_year,
        higher_seed_bias_delta=.04
    )

    # Simulator over this season's seeds and slots, driven by that predictor.
    simulators.append(mmc.Simulator(seeds_year, slots_year, head_to_head_model_year))

In [28]:
# Run every season's simulator in year_range order.
# `points` and `bracket` are overwritten on each pass, so only the values for
# the last season remain after the loop.
for i, year in enumerate(year_range):
    season_simulator = simulators[i]

    # The argument 200 is presumably the number of simulated tournaments -
    # confirm against mmc.Simulator.simulate_tournament.
    points = season_simulator.simulate_tournament(200)

    # Predict a bracket from the simulation results.
    bracket = season_simulator.predict_tournament()

    # Progress marker: the season that just finished.
    print(year)

2014
2015


In [30]:
# Score and accuracy of each simulator's predicted bracket, one slot per season.
simulator_scores = np.zeros(len(year_range))
simulator_accuracys = np.zeros(len(year_range))

# The simulators were appended in year_range order, so the season is tracked
# by counting up from min_year alongside them.
year = min_year
for i, simulator in enumerate(simulators):
    # This season's tables, indexed by offset from 1985.
    season_idx = year - 1985
    seeds_year = seeds_arr[season_idx]
    slots_year = slots_arr[season_idx]
    games_year = games_arr[season_idx]

    # Tournament built from the recorded games, used as the scoring reference.
    actual_model = mmm.ActualTournament(games_year)
    actual_tourney = mmc.Tournament(seeds_year, slots_year, actual_model, include_scoring_dif=False)

    # score_tournament returns a (score, accuracy) pair.
    simulator_scores[i], simulator_accuracys[i] = simulator.score_tournament(actual_tourney, print_res=False)

    year += 1

In [31]:
# Simulator bracket scores, one entry per season in year_range order.
print(simulator_scores)

[ 570.  970.]


In [32]:
# Build three brackets per season for scoring: the actual tournament, the
# greedy bracket from the fitted model, and the top-seed baseline.
# Each list is filled in year_range order, so index i matches models[i] and
# scalers[i].
greedy_tourneys   = []
top_seed_tourneys = []
actual_tourneys   = []

# The loop itself supplies both the season and its position; neither is
# advanced by hand inside the body.
for i, year in enumerate(year_range):
    # This season's tables. The season lists are indexed by `year - 1985`
    # (presumably 1985 is the first season in the Kaggle files - confirm).
    seeds_year = seeds_arr[year-1985]
    slots_year = slots_arr[year-1985]
    games_year = games_arr[year-1985]

    # Actual tournament, built from the recorded games.
    actual_model = mmm.ActualTournament(games_year)
    actual_tourneys.append(mmc.Tournament(seeds_year, slots_year, actual_model, include_scoring_dif=False))

    # Greedy bracket: the fitted model used with simulation switched off.
    greedy_model = mmm.ModelPredictor(models[i], scalers[i], predictor_dfs, year, simulation=False)
    greedy_tourneys.append(mmc.Tournament(seeds_year, slots_year, greedy_model, include_scoring_dif=False))

    # Top-seed baseline bracket from mmm.BasicPredictor.
    top_seed_model = mmm.BasicPredictor()
    top_seed_tourneys.append(mmc.Tournament(seeds_year, slots_year, top_seed_model, include_scoring_dif=False))

In [33]:
# Result buffers: one score and one accuracy per season for the greedy
# bracket and for the top-seed baseline.
greedy_scores = np.zeros(len(year_range))
greedy_accuracys = np.zeros(len(year_range))

top_seed_scores = np.zeros(len(year_range))
top_seed_accuracys = np.zeros(len(year_range))

# Score both predicted brackets against the actual tournament of the same
# season. The three tournament lists share the same ordering, so position i
# is all that is needed; the loop advances i on its own.
for i in range(len(year_range)):
    actual_tourney = actual_tourneys[i]
    greedy_tourney = greedy_tourneys[i]
    top_seed_tourney = top_seed_tourneys[i]

    # score_tournament returns a (score, accuracy) pair.
    greedy_scores[i], greedy_accuracys[i] = greedy_tourney.score_tournament(actual_tourney, print_res=False)
    top_seed_scores[i], top_seed_accuracys[i] = top_seed_tourney.score_tournament(actual_tourney, print_res=False)

In [34]:
# Greedy bracket scores, one entry per season in year_range order.
print(greedy_scores)

[ 610.  980.]


In [51]:
# Load the celebrity bracket results for 2015 (per the file name) and display
# the table as the cell output.
celeb_data = pd.read_csv("datasets/our_data/celeb_brackets_2015.csv")
celeb_data

Unnamed: 0,person,points,percentile
0,Cousin Sal - ESPN,1200,92.1
1,J.A. Adande - ESPN,1090,87.0
2,MatthewBerry - ESPN,1000,77.7
3,Jeff Goodman - ESPN,990,76.5
4,Scott Van Pelt - ESPN,990,76.5
5,Vanessa Marshall - ESPN,970,73.9
6,Taylor Gray - ESPN,960,72.5
7,Mike Golic - ESPN,950,71.1
8,Dave Rothenberg - ESPN,920,66.6
9,Michael Smith - ESPN,910,65.1
