# Data Exploration- Advanced Predictors
- Robert Shaw, Daniel Alpert, Spencer Evans, Sean Coleman
- CS109a Project - Data Driven March Madness

In this file, we generate and explore some more predictors by testing their relation to the response.

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.preprocessing import StandardScaler

import march_madness_classes as mmc
import march_madness_games as mmg
import march_madness_models as mmm

## 1) Confirm Baseline Model is Working

In [2]:
# read in the dataset
tournament_data = pd.read_csv("datasets/kaggle_data/TourneyCompactResults.csv")
teams = pd.read_csv("datasets/kaggle_data/Teams.csv")
seeds = pd.read_csv("datasets/kaggle_data/TourneySeeds.csv")
slots = pd.read_csv("datasets/kaggle_data/TourneySlots.csv")

In [3]:
games_arr = mmg.filter_into_seasons(tournament_data)
seeds_arr = mmg.filter_into_seasons(seeds)
slots_arr = mmg.filter_into_seasons(slots)

In [4]:
tourney_summary = pd.read_csv("datasets/our_data/team_summary_data/tourney_wins_matrix", index_col=0)

In [None]:
wins_in_prev_2_year = np.zeros(tourney_summary.shape)
wins_in_prev_2_year[0:2, :] = np.nan

In [None]:
for i in range(2, tourney_summary.shape[0]):
    wins_in_prev_2_year[i, :] = tourney_summary.values[i - 1, :] + tourney_summary.values[i - 2, :]

In [None]:
past_resul = pd.DataFrame(wins_in_prev_2_year, columns = tourney_summary.columns, index= tourney_summary.index)

In [None]:
past_resul.to_csv("datasets/our_data/past_results")

In [None]:
past_resul.head()

In [None]:
pred, resp = mmg.generate_multiple_years_of_games(range(1987, 2001), seeds_arr, slots_arr, games_arr, ["min_index_id", "max_index_id", "markov", "dominance", "consistency", "prev_resul"], [markov_data, dominance, consistency, wins_in_prev_year_df])

In [None]:
from sklearn.cross_validation import train_test_split

In [None]:
train_index, cross_index = train_test_split(pred.index, test_size = .25)

train_x = pred.loc[train_index]
train_y = resp.loc[train_index]
cross_x = pred.loc[cross_index]
cross_y = resp.loc[cross_index]

In [None]:
scaler = StandardScaler().fit(train_x.iloc[:, 2].reshape(-1,1))

model = LogReg(C = 1)
model.fit(scaler.transform(train_x.iloc[:, 2].reshape(-1,1)), train_y.values.T[0])
model.score(scaler.transform(cross_x.iloc[:, 2].reshape(-1,1)), cross_y)

In [None]:
scaler = StandardScaler().fit(train_x.iloc[:, 2:])

model = LogReg(C = 1)
model.fit(scaler.transform(train_x.iloc[:, 2:]), train_y.values.T[0])
model.score(scaler.transform(cross_x.iloc[:, 2:]), cross_y)

In [None]:
model.coef_

---

## 2) Generate Some New Predictors

In [4]:
regular_seasons = pd.read_csv("datasets/kaggle_data/RegularSeasonCompactResults.csv")

In [5]:
regular_data = mmg.filter_into_seasons(regular_seasons)

In [6]:
markov_data = pd.read_csv("datasets/our_data/stationary", index_col=0)
consistency = pd.read_csv("datasets/our_data/consistency", index_col=0)
dominance = pd.read_csv("datasets/our_data/dominance", index_col=0)
past_resul = pd.read_csv("datasets/our_data/past_results", index_col=0)

In [307]:
markov_data.head()

Unnamed: 0,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,...,1455,1456,1457,1458,1459,1460,1461,1462,1463,1464
1985,,0.001977,0.002041,0.00674,,0.002103,,0.004213,0.0007,0.002069,...,0.005063,0.003075,,0.003508,,,0.002656,0.004163,0.001704,0.003417
1986,,0.001891,0.003455,0.007511,,0.001762,,0.002528,0.001231,0.002317,...,0.003365,0.002522,,0.003353,,,0.003583,0.004989,0.001872,0.002665
1987,,0.002088,0.003639,0.008608,,0.001934,,0.001467,0.001235,0.002323,...,0.004792,0.001879,0.002204,0.003847,,,0.00477,0.003123,0.002368,0.002122
1988,,0.001889,0.003348,0.003962,,0.001699,,0.001919,0.001337,0.002366,...,0.004682,0.002291,0.002427,0.003201,,0.002231,0.00662,0.005202,0.001792,0.001372
1989,,0.00293,0.004787,0.004998,,0.002166,,0.001266,0.001543,0.002164,...,0.004399,0.001338,0.001499,0.005727,,0.001831,0.003816,0.004563,0.001729,0.001277


---

### Convert Seeds to Numeric Matrix

In [None]:
# convert seeds arr to numeric value
seed_matrix = np.zeros((2016 - 1985 + 1, teams.shape[0]))

i = 0
for year in range(1985, 2017):
    j = 0
    for team in teams["Team_Id"]:
        seeds_in_year_i = seeds_arr[i]
        team_seed_in_year_i = seeds_in_year_i.loc[seeds_in_year_i["Team"] == team, "Seed"]
        
        seed = np.nan
        if len(team_seed_in_year_i.values) != 0:
            seed = team_seed_in_year_i.values[0][1:3]
    
        seed_matrix[i, j] = seed
        
        j = j + 1
        
    i = i + 1

In [None]:
seed_matrix_df = pd.DataFrame(data=seed_matrix, columns=past_resul.columns, index=past_resul.index)

seed_matrix_df.to_csv("datasets/our_data/team_summary_data/seeds_matrix")

In [7]:
seed_matrix_df = pd.read_csv("datasets/our_data/team_summary_data/seeds_matrix", index_col=0)

---

### Weighted Wins

In [None]:
# weighted wins = \sum (wins / seed)
weighted_wins_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2017):
    j = 0
    # iterate teams
    for team in teams["Team_Id"]:
        # extract games the team won
        wins = regular_data[i].loc[regular_data[i]["Wteam"] == team]
        
        # start with 0
        weighted_wins = 0
        
        # iterate wins
        for index, game in wins.iterrows():
            l_team = game["Lteam"]
            l_team_seed = seed_matrix_df.loc[year, str(l_team)]
        
            # if the loser is in the tourney, then add to weighted wins
            if ~np.isnan(l_team_seed):
                # ww = 1 / (lteam seed)
                weighted_wins = weighted_wins + 1./ l_team_seed
                
        # put into our array
        weighted_wins_np[i, j] = weighted_wins
        
        j = j + 1
        
    i = i + 1

In [None]:
weighted_wins = pd.DataFrame(data=weighted_wins_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)

weighted_wins.to_csv("datasets/our_data/weighted_wins")

In [8]:
weighted_wins = pd.read_csv("datasets/our_data/weighted_wins", index_col=0)

---

### Momentum (Markov * Wins in Last 30 Days)

In [309]:
regular_seasons.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [None]:
momentum_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2017):
    j = 0
    # iterate teams
    for team in teams["Team_Id"]:
        season = regular_data[i]
        
        # get the last 30 days of the season
        max_day = season["Daynum"].max()
        last_month = season[season["Daynum"] >= max_day - 30]
        
        # wins
        wins = last_month[last_month["Wteam"] == team]
        weighted_wins = 0.
        
        for index, win in wins.iterrows():
            l_team = win["Lteam"]
            l_team_pi = markov_data.loc[year, str(l_team)]
            
            weighted_wins = weighted_wins + l_team_pi
        
        momentum_np[i, j] = weighted_wins
        j = j + 1
    i = i + 1

In [None]:
momentum = pd.DataFrame(data=momentum_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)

momentum.to_csv("datasets/our_data/momentum")

In [9]:
momentum = pd.read_csv("datasets/our_data/momentum", index_col=0)

---

### Close Win Percentage (OT Wins/ Wins By <= 1 Basket)

In [None]:
close_wins_np = np.zeros(seed_matrix_df.shape)
close_games_np = np.zeros(seed_matrix_df.shape)
close_game_win_percetage_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2017):
    j = 0
    # iterate teams
    for team in teams["Team_Id"]:
        season = regular_data[i]
        
        # games 
        wins = season.loc[season["Wteam"] == team]
        losses = season.loc[season["Lteam"] == team]
        
        close_wins = 0.
        close_losses = 0.
        
        # iter wins, count close wins
        for index, win in wins.iterrows():
            if win["Wscore"] - win["Lscore"] <= 3:
                close_wins = close_wins + 1.
            elif win["Numot"] > 0:
                close_wins = close_wins + 1.
        
        for index, loss in losses.iterrows():
            if loss["Wscore"] - loss["Lscore"] <= 3:
                close_losses = close_losses + 1.
            elif loss["Numot"] > 0:
                close_losses = close_losses + 1.
        
        close_wins_np[i, j] = close_wins
        close_games_np[i, j] = close_wins + close_losses
        
        if close_wins > 0:
            close_game_win_percetage_np[i,j] = close_wins / (close_wins +  close_losses)
             
        j = j + 1
    print year
    i = i + 1

In [None]:
# save to database
close_games = pd.DataFrame(data=close_games_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
close_games.to_csv("datasets/our_data/close_games")

close_wins = pd.DataFrame(data=close_wins_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
close_wins.to_csv("datasets/our_data/close_wins")

close_wins_perc = pd.DataFrame(data=close_game_win_percetage_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
close_wins_perc.to_csv("datasets/our_data/close_wins_perc")

In [10]:
close_games= pd.read_csv("datasets/our_data/close_games", index_col=0)
close_wins = pd.read_csv("datasets/our_data/close_wins",index_col=0)
close_wins_perc = pd.read_csv("datasets/our_data/close_wins_perc", index_col=0)

---

### Tempo (Possessions)

In [19]:
regular_seasons_detailed = pd.read_csv("datasets/kaggle_data/RegularSeasonDetailedResults.csv")

In [23]:
regular_seasons_detailed.columns

Index([u'Season', u'Daynum', u'Wteam', u'Wscore', u'Lteam', u'Lscore', u'Wloc',
       u'Numot', u'Wfgm', u'Wfga', u'Wfgm3', u'Wfga3', u'Wftm', u'Wfta',
       u'Wor', u'Wdr', u'Wast', u'Wto', u'Wstl', u'Wblk', u'Wpf', u'Lfgm',
       u'Lfga', u'Lfgm3', u'Lfga3', u'Lftm', u'Lfta', u'Lor', u'Ldr', u'Last',
       u'Lto', u'Lstl', u'Lblk', u'Lpf'],
      dtype='object')

In [21]:
regular_detailed = mmg.filter_into_seasons(regular_seasons_detailed)

In [93]:
tempo_np = np.zeros(seed_matrix_df.shape)
tempo_opp_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2017):
    j = 0
    
    if year < 2003:
        tempo_np[i, :] = np.nan
        tempo_opp_np[i,:] =np.nan
    else:
        # iterate teams
        for team in teams["Team_Id"]:
            season = regular_detailed[i - (2003 - 1985)]

            # games 
            wins = season.loc[season["Wteam"] == team]
            losses = season.loc[season["Lteam"] == team]

            possessions = 0.
            possessions_opp = 0.
            
            # iterate wins
            for index, win in wins.iterrows():
                # possessions ~ field goal attempts - offensive rebounds + turnovers + .475 * free throw attempts
                possessions = possessions + win['Wfga'] - win['Wor'] + win['Wto'] + .475 * win['Wfta']
                possessions_opp = possessions_opp + win['Lfga'] - win['Lor'] + win['Lto'] + .475 * win['Lfta']
                
            # iterate losses
            for index, loss in losses.iterrows():
                # possessions ~ field goal attempts - offensive rebounds + turnovers + .475 * free throw attempts
                possessions = possessions + loss['Lfga'] - loss['Lor'] + loss['Lto'] + .475 * loss['Lfta']
                possessions_opp = possessions_opp + loss['Wfga'] - loss['Wor'] + loss['Wto'] + .475 * loss['Wfta']
                
            # update the buffer
            if possessions == 0.:
                tempo_np[i,j] = np.nan
                tempo_opp_np[i,j] = np.nan
            else:
                tempo_np[i, j] = possessions
                tempo_opp_np[i,j] = possessions_opp
            
            j = j + 1
    i = i + 1
    print year

1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016


In [94]:
tempo = pd.DataFrame(data=tempo_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
tempo.to_csv("datasets/our_data/tempo")

tempo_opp = pd.DataFrame(data=tempo_opp_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
tempo_opp.to_csv("datasets/our_data/tempo_opp")

In [95]:
tempo = pd.read_csv("datasets/our_data/tempo", index_col=0)
tempo_opp = pd.read_csv("datasets/our_data/tempo_opp", index_col=0)

---

### Points For/Points Against/Win Ratio/Luck

In [106]:
points_for_np = np.zeros(seed_matrix_df.shape)
points_against_np = np.zeros(seed_matrix_df.shape)
i = 0
# iterate years
for year in range(1985, 2017):
    j = 0
    
    # iterate teams
    for team in teams["Team_Id"]:
        season = regular_data[i]

        # games 
        wins = season.loc[season["Wteam"] == team]
        losses = season.loc[season["Lteam"] == team]

        points_for = 0.
        points_against = 0.

        # iterate wins
        for index, win in wins.iterrows():
            # sum points
            points_for = points_for + win["Wscore"]
            points_against = points_against + win["Lscore"]

        # iterate losses
        for index, loss in losses.iterrows():
            # sum points
            points_for = points_for + loss["Lscore"]
            points_against = points_against + loss["Wscore"]

        # update the buffer
        if points_for == 0.:
            points_for_np[i,j] = np.nan
            points_against_np[i,j] = np.nan
        else:
            points_for_np[i,j] = points_for
            points_against_np[i,j] = points_against

        j = j + 1
    i = i + 1
    print year

1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016


In [107]:
points_for = pd.DataFrame(data=points_for_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
points_for.to_csv("datasets/our_data/points_for")

points_against = pd.DataFrame(data=points_against_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
points_against.to_csv("datasets/our_data/points_against")

In [251]:
# Pythagorean Expectation
win_ratio_np = (points_for_np ** 11.5) / (points_for_np ** 11.5 + points_against_np ** 11.5)

win_ratio = pd.DataFrame(data=win_ratio_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
win_ratio.to_csv("datasets/our_data/win_ratio")

In [252]:
points_for = pd.read_csv("datasets/our_data/points_for", index_col=0)
points_against = pd.read_csv("datasets/our_data/points_against", index_col=0)
win_ratio = pd.read_csv("datasets/our_data/win_ratio", index_col=0)
games = pd.read_csv("datasets/our_data/team_summary_data/games_matrix", index_col=0)
wins = pd.read_csv("datasets/our_data/team_summary_data/regular_season_wins", index_col=0)

In [276]:
# wins - expected wins
luck = wins - win_ratio * games

---

### Offensive and Defensive Efficiency

In [127]:
# ppp = total points for / total possessions
points_per_possesion_np = points_for_np / tempo_np
points_per_possesion_opp_np = points_against_np / tempo_opp_np

In [128]:
off_eff = pd.DataFrame(data=points_per_possesion_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
off_eff.to_csv("datasets/our_data/off_eff")

def_eff = pd.DataFrame(data=points_per_possesion_opp_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
def_eff.to_csv("datasets/our_data/def_eff")

In [129]:
off_eff = pd.read_csv("datasets/our_data/off_eff", index_col=0)
def_eff = pd.read_csv("datasets/our_data/def_eff", index_col=0)

---

### Adj Offensive And Defensive Efficiency

We will use an itertive process to find this.

Adj Off Eff = Mean Def Eff + Deviation

Adj Def Eff = Mean Off Eff + Deviation

Deviation = Sum_i = actual_ppp - opponent_def_ppp

In [295]:
mean_off_eff_start = off_eff.mean(axis =1).values
mean_def_eff_start = def_eff.mean(axis =1).values

In [299]:
# sums of the deviations
dev_off_np = np.zeros(seed_matrix_df.shape)
dev_def_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2017):
    if year < 2003:
        dev_off_np[i, :] = np.nan
        dev_def_np[i,:] =np.nan
        i = i + 1
    else:        
        j = 0
        # iterate teams
        for team in teams["Team_Id"]:
            season = regular_detailed[i - (2003 - 1985)]

            # games 
            wins = season.loc[season["Wteam"] == team]
            losses = season.loc[season["Lteam"] == team]

            deviation_off = 0.
            deviation_def = 0.    

            # iterate wins
            for index, win in wins.iterrows():
                # get other team if
                other_team = str(win["Lteam"])

                # points for and againsts
                points_for = win["Wscore"]
                points_against = win["Lscore"]

                # possessions
                possessions = win['Wfga'] - win['Wor'] + win['Wto'] + .475 * win['Wfta']
                possessions_op = win['Lfga'] - win['Lor'] + win['Lto'] + .475 * win['Lfta']

                # points per possession
                ppp = points_for / possessions
                ppp_op = points_for / possessions

                # dev_i = actual ppp - opponent_def_eff
                deviation_off = deviation_off + ppp - def_eff.loc[year, other_team]

                # dev_i = actual ppp - opponent_off_eff
                deviation_def = deviation_def + ppp_op - off_eff.loc[year, other_team]

            # iterate losses
            for index, loss in losses.iterrows():
                # get other team if
                other_team = str(loss["Wteam"])

                # points for and againsts
                points_for = loss["Lscore"]
                points_against = loss["Wscore"]

                # possessions
                possessions = loss['Lfga'] - loss['Lor'] + loss['Lto'] + .475 * loss['Lfta']
                possessions_op = loss['Wfga'] - loss['Wor'] + loss['Wto'] + .475 * loss['Wfta']

                # points per possession
                ppp = points_for / possessions
                ppp_op = points_for / possessions

                # dev_i = actual ppp - opponent_def_eff
                deviation_off = deviation_off + ppp - def_eff.loc[year, other_team]

                # dev_i = actual ppp - opponent_off_eff
                deviation_def = deviation_def + ppp_op - off_eff.loc[year, other_team]

            # update the buffer
            if wins.shape[0] + losses.shape[0] == 0:
                dev_off_np[i,j] = np.nan
                dev_def_np[i,j] = np.nan
            else:
                dev_off_np[i,j] = deviation_off
                dev_off_np[i,j] = deviation_def

            j = j + 1
        i = i + 1
        print year

2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016


In [300]:
dev_off_np

array([[        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       ..., 
       [-2.36625348, -1.82235537,  0.05247388, ...,  0.72738317,
        -0.77056818,  1.09995148],
       [-2.31914329,  1.75394989,  0.48256294, ...,  1.58026111,
         0.88063988,  0.30908171],
       [-0.46906026, -1.84807523,  2.12161277, ...,  2.49936939,
         1.86422628, -1.10592767]])

In [304]:
adj_off_eff = np.zeros(dev_off_np.shape)
for i in range(dev_off_np.shape[0]):
    adj_off_eff[i, :] = dev_off_np[i, :] + mean_def_eff_start[i]

In [305]:
adj_off_eff

array([[        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       ..., 
       [-1.32023344, -0.77633533,  1.09849391, ...,  1.7734032 ,
         0.27545186,  2.14597152],
       [-1.29807308,  2.77502011,  1.50363315, ...,  2.60133132,
         1.9017101 ,  1.33015193],
       [ 0.56847797, -0.81053701,  3.159151  , ...,  3.53690762,
         2.90176451, -0.06838945]])

In [306]:
mean_def_eff_start

array([        nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,  1.0110884 ,  1.00592829,
        1.00843796,  1.0120837 ,  1.01682775,  1.0153275 ,  1.01090295,
        1.01314026,  1.01711846,  1.01198424,  1.006411  ,  1.04602004,
        1.02107021,  1.03753823])

---

### Extract Predictors from Original Data Exploration

In [11]:
rpi = pd.read_csv("datasets/our_data/rpi", index_col=0)
bad_losses = pd.read_csv("datasets/our_data/bad_losses", index_col=0)
tough_wins = pd.read_csv("datasets/our_data/tough_wins", index_col=0)

### Model Head to Head Wins

In [16]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold

In [279]:
pred, resp = mmg.generate_multiple_years_of_games(range(2003, 2016), 
                                                  seeds_arr, 
                                                  slots_arr, 
                                                  games_arr, 
                                                  ["min_index_id", "max_index_id", "markov", "dominance", "rpi", "bad_losses", "tough_wins", "close_wins", "close_wins_perc", "weighted_wins", "past_resul", "momentum", "tempo", "off_eff", "def_eff", "luck"], 
                                                  [markov_data, dominance, rpi, bad_losses, tough_wins, close_wins, close_wins_perc, weighted_wins, past_resul, momentum, tempo, off_eff, def_eff, luck],
                                                  scoring_dif = False)

In [284]:
score = 0.
subset = [2,4,5,13,14,15]

for i in range(100):

    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x = pred.loc[train_index]
    train_y = resp.loc[train_index]
    cross_x = pred.loc[cross_index]
    cross_y = resp.loc[cross_index]

    scaler = StandardScaler().fit(train_x.iloc[:, subset])

    model = LogReg(C = 1)
    model.fit(scaler.transform(train_x.iloc[:, subset]), train_y.values.T[0])
    score = score +  model.score(scaler.transform(cross_x.iloc[:, subset]), cross_y)
    
print score / 100.


for i in range(len(pred.columns[subset].values)):
    print "\n{} : {}".format(pred.columns[subset].values[i], model.coef_[0, i])

0.715609756098

markov : 0.515462716526

rpi : 0.888394750474

bad_losses : -0.143162572217

off_eff : 0.18436225876

def_eff : -0.124765007925

luck : -0.250712783599


--- 

### Model Score Differential

In [19]:
from sklearn.linear_model import LinearRegression as ols
from sklearn.linear_model import Lasso as lasso
from sklearn.linear_model import Ridge as ridge

In [20]:
pred, resp = mmg.generate_multiple_years_of_games(range(1987, 2001), 
                                                  seeds_arr, 
                                                  slots_arr, 
                                                  games_arr, 
                                                  ["min_index_id", "max_index_id", "markov", "dominance", "rpi", "bad_losses", "tough_wins", "close_wins", "close_wins_perc", "weighted_wins", "past_resul", "momentum", "win_ratio"], 
                                                  [markov_data, dominance, rpi, bad_losses, tough_wins, close_wins, close_wins_perc, weighted_wins, past_resul, momentum, win_ratio],
                                                  scoring_dif = True)

#### Linear Regression

In [21]:
scores = 0
for i in range(500):
    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x = pred.loc[train_index]
    train_y = resp.loc[train_index]
    cross_x = pred.loc[cross_index]
    cross_y = resp.loc[cross_index]

    scaler = StandardScaler().fit(train_x.iloc[:, 2:])

    model = ols()
    model.fit(scaler.transform(train_x.iloc[:, 2:]), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_x.iloc[:, 2:]), cross_y)
    
print scores/500.
print model.coef_

0.454621409751
[  5.26574426e+00  -2.81475572e-01   6.77657015e+00  -1.52682092e-03
   1.00357205e+00   2.42975781e-01  -1.25747888e+00  -1.18497733e+00
  -3.62427489e-01  -6.12538062e-01]


#### Lasso Regression

In [22]:
scores = 0
for i in range(500):
    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x = pred.loc[train_index]
    train_y = resp.loc[train_index]
    cross_x = pred.loc[cross_index]
    cross_y = resp.loc[cross_index]

    scaler = StandardScaler().fit(train_x.iloc[:, 2:])

    model = lasso()
    model.fit(scaler.transform(train_x.iloc[:, 2:]), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_x.iloc[:, 2:]), cross_y)
    
print scores/500.
print model.coef_

0.455894317282
[ 4.87620608  0.          4.47165101 -0.49475219  0.         -0.         -0.1686631
  0.          0.          0.        ]


#### Ridge Regression

In [23]:
scores = 0
for i in range(500):
    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x = pred.loc[train_index]
    train_y = resp.loc[train_index]
    cross_x = pred.loc[cross_index]
    cross_y = resp.loc[cross_index]

    scaler = StandardScaler().fit(train_x.iloc[:, 2:])

    model = ridge()
    model.fit(scaler.transform(train_x.iloc[:, 2:]), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_x.iloc[:, 2:]), cross_y)
    
print scores/500.
print model.coef_

0.458373670232
[ 5.22297271  0.24896519  6.3009136  -0.42831358  0.36642889  0.4813125
 -1.37434403 -0.24265882 -0.27576285 -1.04558994]
