# Data Exploration- Advanced Predictors
- Robert Shaw, Daniel Alpert, Spencer Evans, Sean Coleman
- CS109a Project - Data Driven March Madness

In this file, we generate and explore some more predictors by testing their relation to the response.

In [93]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.preprocessing import StandardScaler

import march_madness_classes as mmc
import march_madness_games as mmg
import march_madness_models as mmm

## 1) Confirm Baseline Model is Working

In [94]:
# read in the dataset
tournament_data = pd.read_csv("datasets/kaggle_data_2021/MNCAATourneyCompactResults.csv")
teams = pd.read_csv("datasets/kaggle_data_2021/MTeams.csv")
seeds = pd.read_csv("datasets/kaggle_data_2021/MNCAATourneySeeds.csv")
slots = pd.read_csv("datasets/kaggle_data_2021/MNCAATourneySlots.csv")

In [95]:
games_arr = mmg.filter_into_seasons(tournament_data)
seeds_arr = mmg.filter_into_seasons(seeds)
slots_arr = mmg.filter_into_seasons(slots)

In [144]:
slots_arr

[    Season  Slot StrongSeed WeakSeed
 0     1985  R1W1        W01      W16
 1     1985  R1W2        W02      W15
 2     1985  R1W3        W03      W14
 3     1985  R1W4        W04      W13
 4     1985  R1W5        W05      W12
 ..     ...   ...        ...      ...
 58    1985  R4Y1       R3Y1     R3Y2
 59    1985  R4Z1       R3Z1     R3Z2
 60    1985  R5WX       R4W1     R4X1
 61    1985  R5YZ       R4Y1     R4Z1
 62    1985  R6CH       R5WX     R5YZ
 
 [63 rows x 4 columns],      Season  Slot StrongSeed WeakSeed
 63     1986  R1W1        W01      W16
 64     1986  R1W2        W02      W15
 65     1986  R1W3        W03      W14
 66     1986  R1W4        W04      W13
 67     1986  R1W5        W05      W12
 ..      ...   ...        ...      ...
 121    1986  R4Y1       R3Y1     R3Y2
 122    1986  R4Z1       R3Z1     R3Z2
 123    1986  R5WX       R4W1     R4X1
 124    1986  R5YZ       R4Y1     R4Z1
 125    1986  R6CH       R5WX     R5YZ
 
 [63 rows x 4 columns],      Season  Slot StrongS

In [104]:
tourney_summary = pd.read_csv("datasets/our_data/tourney_wins_matrix", index_col=0)

In [105]:
wins_in_prev_2_year = np.zeros(tourney_summary.shape)
wins_in_prev_2_year[0:2, :] = np.nan

In [106]:
for i in range(2, tourney_summary.shape[0]):
    wins_in_prev_2_year[i, :] = tourney_summary.values[i - 1, :] + tourney_summary.values[i - 2, :]

In [107]:
past_resul = pd.DataFrame(wins_in_prev_2_year, columns = tourney_summary.columns, index= tourney_summary.index)

In [108]:
past_resul.to_csv("datasets/our_data/past_results")

In [111]:
past_resul.head()

Unnamed: 0,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,...,1462,1463,1464,1465,1466,1467,1468,1469,1470,1471
1985,,,,,,,,,,,...,,,,,,,,,,
1986,,,,,,,,,,,...,,,,,,,,,,
1987,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1988,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1989,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2) Generate Some New Predictors

In [112]:
regular_seasons = pd.read_csv("datasets/kaggle_data_2021/MRegularSeasonCompactResults.csv")

In [113]:
regular_data = mmg.filter_into_seasons(regular_seasons)

In [136]:
markov_data = pd.read_csv("datasets/our_data/stationary", index_col=0)
consistency = pd.read_csv("datasets/our_data/regular_season_std_score_differential_matrix", index_col=0)
dominance = pd.read_csv("datasets/our_data/regular_season_avg_score_differential_matrix", index_col=0)
past_resul = pd.read_csv("datasets/our_data/past_results", index_col=0)

In [139]:
# sanity check
print(markov_data.shape)
print(consistency.shape)
print(dominance.shape)
print(past_resul.shape)

(37, 371)
(37, 371)
(37, 371)
(37, 371)


In [140]:
markov_data.head()

Unnamed: 0,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,...,1462,1463,1464,1465,1466,1467,1468,1469,1470,1471
1985,,0.001999,0.00205,0.006738,,0.0021,,0.004164,0.00071,0.002074,...,0.004162,0.001711,0.003414,,,,,,,
1986,,0.001911,0.00343,0.007474,,0.001758,,0.002514,0.001238,0.002316,...,0.004977,0.001879,0.002661,,,,,,,
1987,,0.002103,0.003632,0.008563,,0.001932,,0.001469,0.00125,0.002318,...,0.00313,0.002376,0.00213,,,,,,,
1988,,0.001908,0.003336,0.003977,,0.001696,,0.001915,0.001351,0.002371,...,0.005175,0.001801,0.001386,,,,,,,
1989,,0.00294,0.004772,0.004999,,0.002157,,0.001267,0.001553,0.002174,...,0.004556,0.001737,0.001287,,,,,,,


In [157]:
## maybe don't need the code below until "Convert Seeds..."

In [153]:
pred, resp = mmg.generate_multiple_years_of_games(range(1987, 2001), seeds_arr, slots_arr, games_arr, \
                                                  ["min_index_id", "max_index_id", "markov", "dominance", "consistency", "prev_resul"],  \
                                                  [markov_data, dominance, consistency, past_resul])

KeyError: 'Strongseed'

In [26]:
from sklearn.cross_validation import train_test_split

ModuleNotFoundError: No module named 'sklearn.cross_validation'

In [27]:
train_index, cross_index = train_test_split(pred.index, test_size = .25)

train_x = pred.loc[train_index]
train_y = resp.loc[train_index]
cross_x = pred.loc[cross_index]
cross_y = resp.loc[cross_index]

NameError: name 'train_test_split' is not defined

In [None]:
scaler = StandardScaler().fit(train_x.iloc[:, 2].reshape(-1,1))

model = LogReg(C = 1)
model.fit(scaler.transform(train_x.iloc[:, 2].reshape(-1,1)), train_y.values.T[0])
model.score(scaler.transform(cross_x.iloc[:, 2].reshape(-1,1)), cross_y)

In [None]:
scaler = StandardScaler().fit(train_x.iloc[:, 2:])

model = LogReg(C = 1)
model.fit(scaler.transform(train_x.iloc[:, 2:]), train_y.values.T[0])
model.score(scaler.transform(cross_x.iloc[:, 2:]), cross_y)

In [None]:
model.coef_

---

---

### Convert Seeds to Numeric Matrix

In [155]:
# convert seeds arr to numeric value
seed_matrix = np.zeros((2021 - 1985 + 1, teams.shape[0]))

i = 0
for year in range(1985, 2022):
    j = 0
    for team in teams["TeamID"]:
        seeds_in_year_i = seeds_arr[i]
        team_seed_in_year_i = seeds_in_year_i.loc[seeds_in_year_i["TeamID"] == team, "Seed"]
        
        seed = np.nan
        if len(team_seed_in_year_i.values) != 0:
            seed = team_seed_in_year_i.values[0][1:3]
    
        seed_matrix[i, j] = seed
        
        j = j + 1
        
    i = i + 1

In [156]:
teams.shape

(371, 4)

In [158]:
seed_matrix_df = pd.DataFrame(data=seed_matrix, columns=past_resul.columns, index=past_resul.index)

seed_matrix_df.to_csv("datasets/our_data/team_summary_data/seeds_matrix")

In [159]:
seed_matrix_df = pd.read_csv("datasets/our_data/team_summary_data/seeds_matrix", index_col=0)

---

### Weighted Wins

In [161]:
# weighted wins = \sum (wins / seed)
weighted_wins_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2022):
    j = 0
    # iterate teams
    for team in teams["TeamID"]:
        # extract games the team won
        wins = regular_data[i].loc[regular_data[i]["WTeamID"] == team]
        
        # start with 0
        weighted_wins = 0
        
        # iterate wins
        for index, game in wins.iterrows():
            l_team = game["LTeamID"]
            l_team_seed = seed_matrix_df.loc[year, str(l_team)]
        
            # if the loser is in the tourney, then add to weighted wins
            if ~np.isnan(l_team_seed):
                # ww = 1 / (lteam seed)
                weighted_wins = weighted_wins + 1./ l_team_seed
                
        # put into our array
        weighted_wins_np[i, j] = weighted_wins
        
        j = j + 1
        
    i = i + 1

In [162]:
weighted_wins = pd.DataFrame(data=weighted_wins_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)

weighted_wins.to_csv("datasets/our_data/weighted_wins")

In [163]:
weighted_wins = pd.read_csv("datasets/our_data/weighted_wins", index_col=0)

---

### Momentum (Markov * Wins in Last 30 Days)

In [164]:
regular_seasons.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [165]:
momentum_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2022):
    j = 0
    # iterate teams
    for team in teams["TeamID"]:
        season = regular_data[i]
        
        # get the last 30 days of the season
        max_day = season["DayNum"].max()
        last_month = season[season["DayNum"] >= max_day - 30]
        
        # wins
        wins = last_month[last_month["WTeamID"] == team]
        weighted_wins = 0.
        
        for index, win in wins.iterrows():
            l_team = win["LTeamID"]
            l_team_pi = markov_data.loc[year, str(l_team)]
            
            weighted_wins = weighted_wins + l_team_pi
        
        momentum_np[i, j] = weighted_wins
        j = j + 1
    i = i + 1

In [166]:
momentum = pd.DataFrame(data=momentum_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)

momentum.to_csv("datasets/our_data/momentum")

In [167]:
momentum = pd.read_csv("datasets/our_data/momentum", index_col=0)

---

### Close Win Percentage (OT Wins/ Wins By <= 1 Basket)

In [169]:
close_wins_np = np.zeros(seed_matrix_df.shape)
close_games_np = np.zeros(seed_matrix_df.shape)
close_game_win_percetage_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2022):
    j = 0
    # iterate teams
    for team in teams["TeamID"]:
        season = regular_data[i]
        
        # games 
        wins = season.loc[season["WTeamID"] == team]
        losses = season.loc[season["LTeamID"] == team]
        
        close_wins = 0.
        close_losses = 0.
        
        # iter wins, count close wins
        for index, win in wins.iterrows():
            if win["WScore"] - win["LScore"] <= 3:
                close_wins = close_wins + 1.
            elif win["NumOT"] > 0:
                close_wins = close_wins + 1.
        
        for index, loss in losses.iterrows():
            if loss["WScore"] - loss["LScore"] <= 3:
                close_losses = close_losses + 1.
            elif loss["NumOT"] > 0:
                close_losses = close_losses + 1.
        
        close_wins_np[i, j] = close_wins
        close_games_np[i, j] = close_wins + close_losses
        
        if close_wins > 0:
            close_game_win_percetage_np[i,j] = close_wins / (close_wins +  close_losses)
             
        j = j + 1
    print(year)
    i = i + 1

1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021


In [170]:
# save to database
close_games = pd.DataFrame(data=close_games_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
close_games.to_csv("datasets/our_data/close_games")

close_wins = pd.DataFrame(data=close_wins_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
close_wins.to_csv("datasets/our_data/close_wins")

close_wins_perc = pd.DataFrame(data=close_game_win_percetage_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
close_wins_perc.to_csv("datasets/our_data/close_wins_perc")

In [171]:
close_games= pd.read_csv("datasets/our_data/close_games", index_col=0)
close_wins = pd.read_csv("datasets/our_data/close_wins",index_col=0)
close_wins_perc = pd.read_csv("datasets/our_data/close_wins_perc", index_col=0)

---

### Tempo (Possessions)

In [172]:
regular_seasons_detailed = pd.read_csv("datasets/kaggle_data_2021/MRegularSeasonDetailedResults.csv")

In [173]:
regular_seasons_detailed.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

In [174]:
regular_detailed = mmg.filter_into_seasons(regular_seasons_detailed)

In [176]:
tempo_np = np.zeros(seed_matrix_df.shape)
tempo_opp_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2022):
    j = 0
    # tempo didn't exist before 2003
    if year < 2003:
        tempo_np[i, :] = np.nan
        tempo_opp_np[i,:] =np.nan
    else:
        # iterate teams
        for team in teams["TeamID"]:
            season = regular_detailed[i - (2003 - 1985)]

            # games 
            wins = season.loc[season["WTeamID"] == team]
            losses = season.loc[season["LTeamID"] == team]

            possessions = 0.
            possessions_opp = 0.
            
            # iterate wins
            for index, win in wins.iterrows():
                # possessions ~ field goal attempts - offensive rebounds + turnovers + .475 * free throw attempts
                possessions = possessions + win['WFGA'] - win['WOR'] + win['WTO'] + .475 * win['WFTA']
                possessions_opp = possessions_opp + win['LFGA'] - win['LOR'] + win['LTO'] + .475 * win['LFTA']
                
            # iterate losses
            for index, loss in losses.iterrows():
                # possessions ~ field goal attempts - offensive rebounds + turnovers + .475 * free throw attempts
                possessions = possessions + loss['LFGA'] - loss['LOR'] + loss['LTO'] + .475 * loss['LFTA']
                possessions_opp = possessions_opp + loss['WFGA'] - loss['WOR'] + loss['WTO'] + .475 * loss['WFTA']
                
            # update the buffer
            if possessions == 0.:
                tempo_np[i,j] = np.nan
                tempo_opp_np[i,j] = np.nan
            else:
                tempo_np[i, j] = possessions
                tempo_opp_np[i,j] = possessions_opp
            
            j = j + 1
    i = i + 1
    print(year)

1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021


In [177]:
tempo = pd.DataFrame(data=tempo_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
tempo.to_csv("datasets/our_data/tempo")

tempo_opp = pd.DataFrame(data=tempo_opp_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
tempo_opp.to_csv("datasets/our_data/tempo_opp")

In [178]:
tempo = pd.read_csv("datasets/our_data/tempo", index_col=0)
tempo_opp = pd.read_csv("datasets/our_data/tempo_opp", index_col=0)

---

### Points For/Points Against/Win Ratio/Luck

In [190]:
points_for_np = np.zeros(seed_matrix_df.shape)
points_against_np = np.zeros(seed_matrix_df.shape)
i = 0
# iterate years
for year in range(1985, 2022):
    j = 0
    
    # iterate teams
    for team in teams["TeamID"]:
        season = regular_data[i]

        # games 
        wins = season.loc[season["WTeamID"] == team]
        losses = season.loc[season["LTeamID"] == team]

        points_for = 0.
        points_against = 0.

        # iterate wins
        for index, win in wins.iterrows():
            # sum points
            points_for = points_for + win["WScore"]
            points_against = points_against + win["LScore"]

        # iterate losses
        for index, loss in losses.iterrows():
            # sum points
            points_for = points_for + loss["LScore"]
            points_against = points_against + loss["WScore"]

        # update the buffer
        if points_for == 0.:
            points_for_np[i,j] = np.nan
            points_against_np[i,j] = np.nan
        else:
            points_for_np[i,j] = points_for
            points_against_np[i,j] = points_against

        j = j + 1
    i = i + 1
    print(year)

1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021


In [191]:
points_for = pd.DataFrame(data=points_for_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
points_for.to_csv("datasets/our_data/points_for")

points_against = pd.DataFrame(data=points_against_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
points_against.to_csv("datasets/our_data/points_against")

In [192]:
# Pythagorean Expectation
win_ratio_np = (points_for_np ** 11.5) / (points_for_np ** 11.5 + points_against_np ** 11.5)

win_ratio = pd.DataFrame(data=win_ratio_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
win_ratio.to_csv("datasets/our_data/win_ratio")

In [193]:
points_for = pd.read_csv("datasets/our_data/points_for", index_col=0)
points_against = pd.read_csv("datasets/our_data/points_against", index_col=0)
win_ratio = pd.read_csv("datasets/our_data/win_ratio", index_col=0)
games = pd.read_csv("datasets/our_data/team_summary_data/games_matrix", index_col=0)
wins = pd.read_csv("datasets/our_data/team_summary_data/regular_season_wins", index_col=0)

In [194]:
# wins - expected wins
luck = wins - win_ratio * games

---

### Offensive and Defensive Efficiency

In [195]:
# ppp = total points for / total possessions
points_per_possesion_np = points_for_np / tempo_np
points_per_possesion_opp_np = points_against_np / tempo_opp_np

In [196]:
off_eff = pd.DataFrame(data=points_per_possesion_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
off_eff.to_csv("datasets/our_data/off_eff")

def_eff = pd.DataFrame(data=points_per_possesion_opp_np, columns=seed_matrix_df.columns, index=seed_matrix_df.index)
def_eff.to_csv("datasets/our_data/def_eff")

In [197]:
off_eff = pd.read_csv("datasets/our_data/off_eff", index_col=0)
def_eff = pd.read_csv("datasets/our_data/def_eff", index_col=0)

---

### Adj Offensive And Defensive Efficiency

We will use an itertive process to find this.

Adj Off Eff = Mean Def Eff + Deviation

Adj Def Eff = Mean Off Eff + Deviation

Deviation = Sum_i = actual_ppp - opponent_def_ppp

In [198]:
mean_off_eff_start = off_eff.mean(axis =1).values
mean_def_eff_start = def_eff.mean(axis =1).values

In [199]:
# sums of the deviations
dev_off_np = np.zeros(seed_matrix_df.shape)
dev_def_np = np.zeros(seed_matrix_df.shape)

i = 0
# iterate years
for year in range(1985, 2022):
    if year < 2003:
        dev_off_np[i, :] = np.nan
        dev_def_np[i,:] =np.nan
        i = i + 1
    else:        
        j = 0
        # iterate teams
        for team in teams["TeamID"]:
            season = regular_detailed[i - (2003 - 1985)]

            # games 
            wins = season.loc[season["WTeamID"] == team]
            losses = season.loc[season["LTeamID"] == team]

            deviation_off = 0.
            deviation_def = 0.    

            # iterate wins
            for index, win in wins.iterrows():
                # get other team if
                other_team = str(win["LTeamID"])

                # points for and againsts
                points_for = win["WScore"]
                points_against = win["LScore"]

                # possessions
                possessions = win['WFGA'] - win['WOR'] + win['WTO'] + .475 * win['WFTA']
                possessions_op = win['LFGA'] - win['LOR'] + win['LTO'] + .475 * win['LFTA']

                # points per possession
                ppp = points_for / possessions
                ppp_op = points_for / possessions

                # dev_i = actual ppp - opponent_def_eff
                deviation_off = deviation_off + ppp - def_eff.loc[year, other_team]

                # dev_i = actual ppp - opponent_off_eff
                deviation_def = deviation_def + ppp_op - off_eff.loc[year, other_team]

            # iterate losses
            for index, loss in losses.iterrows():
                # get other team if
                other_team = str(loss["WTeamID"])

                # points for and againsts
                points_for = loss["LScore"]
                points_against = loss["WScore"]

                # possessions
                possessions = loss['LFGA'] - loss['LOR'] + loss['LTO'] + .475 * loss['LFTA']
                possessions_op = loss['WFGA'] - loss['WOR'] + loss['WTO'] + .475 * loss['WFTA']

                # points per possession
                ppp = points_for / possessions
                ppp_op = points_for / possessions

                # dev_i = actual ppp - opponent_def_eff
                deviation_off = deviation_off + ppp - def_eff.loc[year, other_team]

                # dev_i = actual ppp - opponent_off_eff
                deviation_def = deviation_def + ppp_op - off_eff.loc[year, other_team]

            # update the buffer
            if wins.shape[0] + losses.shape[0] == 0:
                dev_off_np[i,j] = np.nan
                dev_def_np[i,j] = np.nan
            else:
                dev_off_np[i,j] = deviation_off
                dev_off_np[i,j] = deviation_def

            j = j + 1
        i = i + 1
        print(year)

2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021


In [200]:
dev_off_np

array([[        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       ...,
       [ 2.11843207, -1.20034863, -1.5610791 , ...,         nan,
                nan,         nan],
       [ 0.46463284,  0.70006535,  1.90902322, ...,         nan,
                nan,         nan],
       [ 1.93546791, -1.86434512,  0.76825779, ..., -1.9481178 ,
        -0.59667459, -0.60037378]])

In [201]:
adj_off_eff = np.zeros(dev_off_np.shape)
for i in range(dev_off_np.shape[0]):
    adj_off_eff[i, :] = dev_off_np[i, :] + mean_def_eff_start[i]

In [202]:
adj_off_eff

array([[        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       ...,
       [ 3.15037124, -0.16840946, -0.52913994, ...,         nan,
                nan,         nan],
       [ 1.47250569,  1.7079382 ,  2.91689607, ...,         nan,
                nan,         nan],
       [ 2.93828787, -0.86152516,  1.77107775, ..., -0.94529785,
         0.40614537,  0.40244618]])

In [203]:
mean_def_eff_start

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan, 1.0110884 , 1.00592829,
       1.00843796, 1.0120837 , 1.01682775, 1.0153275 , 1.01090295,
       1.01314026, 1.01711846, 1.01198424, 1.006411  , 1.04602004,
       1.02107021, 1.03753823, 1.0366957 , 1.04274647, 1.03193917,
       1.00787285, 1.00281996])

---

### Extract Predictors from Original Data Exploration

In [204]:
rpi = pd.read_csv("datasets/our_data/rpi", index_col=0)
bad_losses = pd.read_csv("datasets/our_data/bad_losses", index_col=0)
tough_wins = pd.read_csv("datasets/our_data/tough_wins", index_col=0)

### Model Head to Head Wins

In [206]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [208]:
pred, resp = mmg.generate_multiple_years_of_games(range(2003, 2016), 
                                                  seeds_arr, 
                                                  slots_arr, 
                                                  games_arr, 
                                                  ["min_index_id", "max_index_id", "markov", "dominance", "rpi", "bad_losses", "tough_wins", "close_wins", "close_wins_perc", "weighted_wins", "past_resul", "momentum", "tempo", "off_eff", "def_eff", "luck"], 
                                                  [markov_data, dominance, rpi, bad_losses, tough_wins, close_wins, close_wins_perc, weighted_wins, past_resul, momentum, tempo, off_eff, def_eff, luck],
                                                  scoring_dif = False)

KeyError: 'Strongseed'

In [284]:
score = 0.
subset = [2,4,5,13,14,15]

for i in range(100):

    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x = pred.loc[train_index]
    train_y = resp.loc[train_index]
    cross_x = pred.loc[cross_index]
    cross_y = resp.loc[cross_index]

    scaler = StandardScaler().fit(train_x.iloc[:, subset])

    model = LogReg(C = 1)
    model.fit(scaler.transform(train_x.iloc[:, subset]), train_y.values.T[0])
    score = score +  model.score(scaler.transform(cross_x.iloc[:, subset]), cross_y)
    
print score / 100.


for i in range(len(pred.columns[subset].values)):
    print "\n{} : {}".format(pred.columns[subset].values[i], model.coef_[0, i])

0.715609756098

markov : 0.515462716526

rpi : 0.888394750474

bad_losses : -0.143162572217

off_eff : 0.18436225876

def_eff : -0.124765007925

luck : -0.250712783599


--- 

### Model Score Differential

In [19]:
from sklearn.linear_model import LinearRegression as ols
from sklearn.linear_model import Lasso as lasso
from sklearn.linear_model import Ridge as ridge

In [20]:
pred, resp = mmg.generate_multiple_years_of_games(range(1987, 2001), 
                                                  seeds_arr, 
                                                  slots_arr, 
                                                  games_arr, 
                                                  ["min_index_id", "max_index_id", "markov", "dominance", "rpi", "bad_losses", "tough_wins", "close_wins", "close_wins_perc", "weighted_wins", "past_resul", "momentum", "win_ratio"], 
                                                  [markov_data, dominance, rpi, bad_losses, tough_wins, close_wins, close_wins_perc, weighted_wins, past_resul, momentum, win_ratio],
                                                  scoring_dif = True)

#### Linear Regression

In [21]:
scores = 0
for i in range(500):
    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x = pred.loc[train_index]
    train_y = resp.loc[train_index]
    cross_x = pred.loc[cross_index]
    cross_y = resp.loc[cross_index]

    scaler = StandardScaler().fit(train_x.iloc[:, 2:])

    model = ols()
    model.fit(scaler.transform(train_x.iloc[:, 2:]), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_x.iloc[:, 2:]), cross_y)
    
print scores/500.
print model.coef_

0.454621409751
[  5.26574426e+00  -2.81475572e-01   6.77657015e+00  -1.52682092e-03
   1.00357205e+00   2.42975781e-01  -1.25747888e+00  -1.18497733e+00
  -3.62427489e-01  -6.12538062e-01]


#### Lasso Regression

In [22]:
scores = 0
for i in range(500):
    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x = pred.loc[train_index]
    train_y = resp.loc[train_index]
    cross_x = pred.loc[cross_index]
    cross_y = resp.loc[cross_index]

    scaler = StandardScaler().fit(train_x.iloc[:, 2:])

    model = lasso()
    model.fit(scaler.transform(train_x.iloc[:, 2:]), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_x.iloc[:, 2:]), cross_y)
    
print scores/500.
print model.coef_

0.455894317282
[ 4.87620608  0.          4.47165101 -0.49475219  0.         -0.         -0.1686631
  0.          0.          0.        ]


#### Ridge Regression

In [23]:
scores = 0
for i in range(500):
    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x = pred.loc[train_index]
    train_y = resp.loc[train_index]
    cross_x = pred.loc[cross_index]
    cross_y = resp.loc[cross_index]

    scaler = StandardScaler().fit(train_x.iloc[:, 2:])

    model = ridge()
    model.fit(scaler.transform(train_x.iloc[:, 2:]), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_x.iloc[:, 2:]), cross_y)
    
print scores/500.
print model.coef_

0.458373670232
[ 5.22297271  0.24896519  6.3009136  -0.42831358  0.36642889  0.4813125
 -1.37434403 -0.24265882 -0.27576285 -1.04558994]
