In [7]:
import march_madness_classes as mmc
import march_madness_games as mmg
import march_madness_models as mmm
import march_madness_train_and_tune as mmtt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as LogReg

In [2]:
# Load the three Kaggle tournament tables and split each one by season.
# Each raw frame is kept alongside its per-season version (`*_arr`).
seeds = pd.read_csv("datasets/kaggle_data/TourneySeeds.csv")
seeds_arr = mmg.filter_into_seasons(seeds)

slots = pd.read_csv("datasets/kaggle_data/TourneySlots.csv")
slots_arr = mmg.filter_into_seasons(slots)

games = pd.read_csv("datasets/kaggle_data/TourneyCompactResults.csv")
games_arr = mmg.filter_into_seasons(games)

In [3]:
# Load the precomputed predictor tables; the first column of each file is its index.
markov_data, rpi, bad_losses = [
    pd.read_csv(predictor_path, index_col=0)
    for predictor_path in ("datasets/our_data/stationary",
                           "datasets/our_data/rpi",
                           "datasets/our_data/bad_losses")
]

In [5]:
# Inputs for mmtt.train_test_split: column names plus the predictor frames.
# NOTE(review): the first two names look like team-id columns and the last three
# line up with predictor_dfs in order -- confirm against march_madness_train_and_tune.
predictor_names = [
    "min_index_id",
    "max_index_id",
    "markov",
    "rpi",
    "bad_losses",
]
predictor_dfs = [
    markov_data,
    rpi,
    bad_losses,
]

In [8]:
# train test split

window   = 3      # NOTE(review): presumably the number of prior seasons used for training -- confirm
min_year = 2001
max_year = 2003

# range() excludes its stop value, so only seasons min_year .. max_year - 1 are processed
year_range = range(min_year, max_year)

# one train/test tuple per season, stored in year_range order
train_test_arr = [
    mmtt.train_test_split(window,
                          year,
                          seeds_arr, slots_arr, games_arr,
                          predictor_names, predictor_dfs)
    for year in year_range
]

In [10]:
# fit head to head models

# regularisation strength; cross validated in the variable selection notebook
c = 1
variables = ["markov", "rpi", "bad_losses"]

# one fitted model and one fitted scaler per season, in year_range order
models  = []
scalers = []

for year in year_range:
    # train_test_arr is ordered like year_range, so offset by the first season
    split = train_test_arr[year - min_year]

    # training predictors and labels (first column of the label frame)
    train_x = split[0][variables]
    train_y = split[1].values[:, 0]

    # cross validation predictors and labels
    cross_x = split[2][variables]
    cross_y = split[3].values[:, 0]

    # standardise using statistics from the training data only
    scaler = StandardScaler()
    scaled_train_x = scaler.fit_transform(train_x)
    # the scaled cross validation set is computed but not used further in this cell
    scaled_cross_x = scaler.transform(cross_x)

    # fit() returns the estimator itself, so construction and fitting can be chained
    model = LogReg(C=c).fit(scaled_train_x, train_y)

    # keep the fitted pair for this season
    models.append(model)
    scalers.append(scaler)

In [92]:
# run tournaments

# the per-season arrays are indexed by offset from this season
# NOTE(review): presumably the first season in the Kaggle data -- confirm
first_season = 1985

unbiased_tourneys = []
top_seed_tourneys = []
actual_tourneys   = []

# Build three brackets per season: the real results, the model's picks, and
# the top-seed baseline. models/scalers were appended in year_range order, so
# enumerate() keeps index i aligned with the season being simulated.
for i, year in enumerate(year_range):
    season_idx = year - first_season

    # get data from our db
    seeds_year = seeds_arr[season_idx]
    slots_year = slots_arr[season_idx]
    games_year = games_arr[season_idx]

    # actual results of the tournament
    actual_model = mmm.ActualTournament(games_year)
    actual_tourneys.append(
        mmc.Tournament(seeds_year, slots_year, actual_model, include_scoring_dif=False))

    # greedy (non-simulated) predictions from the fitted head to head model
    unbiased_model = mmm.ModelPredictor(models[i], scalers[i], predictor_dfs, year,
                                        seeds_year, simulation=False)
    unbiased_tourneys.append(
        mmc.Tournament(seeds_year, slots_year, unbiased_model, include_scoring_dif=False))

    # top seed baseline
    top_seed_model = mmm.BasicPredictor()
    top_seed_tourneys.append(
        mmc.Tournament(seeds_year, slots_year, top_seed_model, include_scoring_dif=False))

In [128]:
# re-import march_madness_models so edits made to the module since the first import take effect
reload(mmm)

<module 'march_madness_models' from 'march_madness_models.py'>

In [171]:
# Second 2001 bracket: same fitted model, but the predictor is also handed the
# unbiased bracket through other_bracket_arr.
# NOTE(review): presumably this steers the new bracket away from the listed ones -- confirm in march_madness_models
idx_2001 = 2001 - 1985
test_dif_bracket = mmm.ModelPredictor(
    models[0],
    scalers[0],
    predictor_dfs,
    2001,
    seeds_arr[idx_2001],
    other_bracket_arr=[unbiased_tourneys[0].entire_bracket]
)
test_dif_tourney = mmc.Tournament(
    seeds_arr[idx_2001],
    slots_arr[idx_2001],
    test_dif_bracket,
    include_scoring_dif=False
)

In [172]:
# Third 2001 bracket: the predictor is given both earlier brackets (unbiased and
# test_dif_tourney) through other_bracket_arr.
idx_2001 = 2001 - 1985
earlier_brackets = [
    unbiased_tourneys[0].entire_bracket,
    test_dif_tourney.entire_bracket,
]
test_dif_bracket_2 = mmm.ModelPredictor(
    models[0],
    scalers[0],
    predictor_dfs,
    2001,
    seeds_arr[idx_2001],
    other_bracket_arr=earlier_brackets
)
test_dif_tourney_2 = mmc.Tournament(
    seeds_arr[idx_2001],
    slots_arr[idx_2001],
    test_dif_bracket_2,
    include_scoring_dif=False
)

In [176]:
# score each of the three 2001 brackets against the actual tournament, one result per line
for bracket in (unbiased_tourneys[0], test_dif_tourney, test_dif_tourney_2):
    # single-argument print(...) emits the same text as the print statement
    print(bracket.score_tournament(actual_tourneys[0]))

(1150, 0.6031746031746031)
(1040, 0.5396825396825397)
(1170, 0.5714285714285714)


In [181]:
# compare each 2001 bracket with the top seed bracket, relative to the actual tournament
for bracket in (unbiased_tourneys[0], test_dif_tourney, test_dif_tourney_2):
    print(bracket.compare_to_dif_tournament(actual_tourneys[0],
                                            top_seed_tourneys[0],
                                            print_res=False))

(9, 4, 4)
(18, 5, 9)
(8, 2, 4)


In [182]:
# compare the two alternative brackets with the unbiased bracket, relative to the actual tournament
for bracket in (test_dif_tourney, test_dif_tourney_2):
    print(bracket.compare_to_dif_tournament(actual_tourneys[0],
                                            unbiased_tourneys[0],
                                            print_res=False))

(16, 3, 7)
(7, 1, 3)


In [197]:
# compare the two alternative brackets with each other, relative to the actual tournament
dif_between_alternatives = test_dif_tourney_2.compare_to_dif_tournament(
    actual_tourneys[0],
    test_dif_tourney,
    print_res=False
)
print(dif_between_alternatives)

(12, 5, 3)


In [250]:
# pick up the latest march_madness_classes code before building the ensemble
reload(mmc)

# ensemble of 2001 brackets built from the first fitted model and scaler
idx_2001 = 2001 - 1985
ensemble_tournament = mmc.Ensemble(seeds_arr[idx_2001], slots_arr[idx_2001],
                                   models[0], scalers[0],
                                   predictor_dfs, 2001)

In [251]:
# Score the ensemble against the actual 2001 tournament.
# The recorded output is a 10-element array -- presumably one score per ensemble bracket (confirm in march_madness_classes).
ensemble_tournament.score_tournament(actual_tourneys[0])

array([ 1150.,  1180.,  1040.,  1040.,   620.,  1090.,  1170.,   590.,
        1080.,  1090.])

In [252]:
# Difference matrix for the ensemble, relative to the actual 2001 tournament.
# The recorded output is a 10x10 matrix that appears symmetric with a zero diagonal --
# presumably pairwise differences between ensemble brackets (confirm in march_madness_classes).
ensemble_tournament.compute_dif_matrix(actual_tourneys[0])

array([[  0.,  12.,  16.,  16.,  12.,   5.,   7.,   9.,  13.,   5.],
       [ 12.,   0.,   6.,   7.,   5.,  16.,  12.,  15.,   6.,  16.],
       [ 16.,   6.,   0.,   1.,   8.,  11.,  12.,  17.,   4.,  11.],
       [ 16.,   7.,   1.,   0.,   9.,  11.,  11.,  16.,   5.,  11.],
       [ 12.,   5.,   8.,   9.,   0.,  17.,  15.,  12.,   6.,  17.],
       [  5.,  16.,  11.,  11.,  17.,   0.,   5.,  12.,  12.,   0.],
       [  7.,  12.,  12.,  11.,  15.,   5.,   0.,  10.,  10.,   5.],
       [  9.,  15.,  17.,  16.,  12.,  12.,  10.,   0.,  18.,  12.],
       [ 13.,   6.,   4.,   5.,   6.,  12.,  10.,  18.,   0.,  12.],
       [  5.,  16.,  11.,  11.,  17.,   0.,   5.,  12.,  12.,   0.]])

In [253]:
# Average difference per ensemble bracket.
# NOTE(review): the recorded values equal each row sum of the dif matrix shown for the
# previous cell divided by 9 (i.e. the mean over the other brackets) -- confirm in march_madness_classes.
ensemble_tournament.avg_game_dif()

array([ 10.55555556,  10.55555556,   9.55555556,   9.66666667,
        11.22222222,   9.88888889,   9.66666667,  13.44444444,
         9.55555556,   9.88888889])

In [254]:
# Difference of each ensemble bracket from the top seed bracket, relative to the actual tournament.
# NOTE(review): the recorded 10-element output's first entry (9) matches the first value printed by
# unbiased_tourneys[0].compare_to_dif_tournament(...) earlier -- presumably the same measure per bracket; confirm.
ensemble_tournament.compute_dif_vect(actual_tourneys[0], top_seed_tourneys[0])

array([  9.,  19.,  18.,  18.,  21.,   7.,   8.,  17.,  16.,   7.])