# Data Exploration

In [1]:
import pandas as pd
import numpy as np

In [2]:
team_names = pd.read_csv("./data/WDataFiles/WTeams.csv")
seasons = pd.read_csv("./data/WDataFiles/WSeasons.csv")
seeds = pd.read_csv("./data/WDataFiles/WNCAATourneySeeds.csv")
season_results = pd.read_csv("./data/WDataFiles/WRegularSeasonCompactResults.csv")
tourney_results = pd.read_csv("./data/WDataFiles/WNCAATourneyCompactResults.csv")
cities = pd.read_csv("./data/WDataFiles/WCities.csv")
game_cities = pd.read_csv("./data/WDataFiles/WGameCities.csv")

In [13]:
len(team_names)

364

In [3]:
team_names.head()

Unnamed: 0,TeamID,TeamName
0,3101,Abilene Chr
1,3102,Air Force
2,3103,Akron
3,3104,Alabama
4,3105,Alabama A&M


In [4]:
seasons.head()

Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1998,10/27/1997,East,Midwest,Mideast,West
1,1999,10/26/1998,East,Mideast,Midwest,West
2,2000,11/1/1999,East,Midwest,Mideast,West
3,2001,10/30/2000,East,Midwest,Mideast,West
4,2002,10/29/2001,East,West,Mideast,Midwest


In [14]:
len(seeds)

1280

In [16]:
len(seeds.loc[seeds.Season == 1998])

64

In [5]:
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1998,W01,3330
1,1998,W02,3163
2,1998,W03,3112
3,1998,W04,3301
4,1998,W05,3272


In [7]:
season_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,18,3104,91,3202,41,H,0
1,1998,18,3163,87,3221,76,H,0
2,1998,18,3222,66,3261,59,H,0
3,1998,18,3307,69,3365,62,H,0
4,1998,18,3349,115,3411,35,H,0


In [8]:
tourney_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,137,3104,94,3422,46,H,0
1,1998,137,3112,75,3365,63,H,0
2,1998,137,3163,93,3193,52,H,0
3,1998,137,3198,59,3266,45,H,0
4,1998,137,3203,74,3208,72,A,0


In [9]:
cities.head()

Unnamed: 0,CityID,City,State
0,4001,Abilene,TX
1,4002,Akron,OH
2,4003,Albany,NY
3,4004,Albuquerque,NM
4,4005,Allentown,PA


In [10]:
game_cities.head()

Unnamed: 0,Season,DayNum,WTeamID,LTeamID,CRType,CityID
0,2015,11,3103,3127,Regular,4002
1,2015,11,3104,3197,Regular,4353
2,2015,11,3107,3383,Regular,4384
3,2015,11,3113,3292,Regular,4343
4,2015,11,3114,3261,Regular,4024


In [11]:
len(season_results)

96685

In [12]:
len(tourney_results)

1260

### Intuitions to follow up on

* Does being on home court increase probability of a win?
* Does the outcome of the prior game for this team give them "momentum"?
* Do Win/Loss ratios for the season so far correlate with wins?
* Does seed distance from opponent correlate with winning?
* Does prior season normalized points scored with respect to total distribution correlate with winning?
* same question for season-to-date
* same question for historical and season-to-date defense
* can we easily determine distances from all cities to all other cities?  Do we know the home city for each team?  If so can we use distance from home city to game city as a proxy for travel fatigue? And does it affect winning outcomes?
* are relative win-ratios from regular season play predictive of tournament wins?

## Stage 1 Predictions

The first portion of this exercise involves successfully predicting matchups from past tournaments.  The goal is to take the 64 teams in each tournament, and predict each possible matchup.  To get the formatting right, I'll build a model that extracts the seed for each tournament, and tries to learn a naive predictor just based on seed difference between teams.  Steps:

* 1: Make training set from tournament results.
  - extract seed 1 and seed 2
  - make Y vector, 1 for a team 1 victory, 0 for team 2 victory
  - teams in a match ordered by numeric ID

* 2: train model to convergence on entire batch

* 3: Make a test set from seeds list for all possible combinations and seed numbers

* 4: predict results and confirm an output size of 8,064 data rows.


In [82]:
def figure_team_1(row):
    if row['Team1Win']:
        return row['WTeamID']
    else:
        return row['LTeamID']

def figure_team_2(row):
    if row['Team1Win']:
        return row['LTeamID']
    else:
        return row['WTeamID']

tourney_df = tourney_results[['Season','WTeamID','LTeamID']]
tourney_df['Team1Win'] = tourney_df['WTeamID'] < tourney_df['LTeamID']
tourney_df['Team1'] = tourney_df.apply(figure_team_1,axis=1)
tourney_df['Team2'] = tourney_df.apply(figure_team_2,axis=1)
len(tourney_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


1260

In [43]:
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1998,W01,3330
1,1998,W02,3163
2,1998,W03,3112
3,1998,W04,3301
4,1998,W05,3272


In [110]:
training_df = pd.merge(tourney_df, seeds, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
training_df = training_df.rename(index=str, columns={"Seed": "Team1Seed"})
training_df = training_df.drop(['TeamID'], axis=1)
training_df = pd.merge(training_df, seeds, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
training_df = training_df.rename(index=str, columns={"Seed": "Team2Seed"})
training_df = training_df.drop(['TeamID'], axis=1)
training_df['SeedNum1'] = training_df.apply(lambda row: int(row['Team1Seed'][1:3]), axis=1)
training_df['SeedNum2'] = training_df.apply(lambda row: int(row['Team2Seed'][1:3]), axis=1)
training_df['SeedDelta'] = (training_df['SeedNum1'] - training_df['SeedNum2'])
seed_mu = training_df['SeedDelta'].mean()
seed_sigma = training_df['SeedDelta'].std()
training_df['NormalizedSeedDelta'] = (training_df['SeedDelta'] - seed_mu) / seed_sigma

In [84]:
len(training_df)

1260

In [79]:
training_df.head(10)

Unnamed: 0,Season,WTeamID,LTeamID,Team1Win,Team1,Team2,Team1Seed,Team2Seed,SeedNum1,SeedNum2,SeedDelta,NormalizedSeedDelta
0,1998,3104,3422,True,3104,3422,X02,X15,2,15,-13,-1.751175
1,1998,3104,3417,True,3104,3417,X02,X07,2,7,-5,-0.672212
2,1998,3417,3276,False,3276,3417,X10,X07,10,7,3,0.406752
3,1998,3256,3104,False,3104,3256,X02,X03,2,3,-1,-0.13273
4,1998,3256,3155,False,3155,3256,X06,X03,6,3,3,0.406752
5,1998,3256,3221,False,3221,3256,X14,X03,14,3,11,1.485716
6,1998,3112,3365,True,3112,3365,W03,W14,3,14,-11,-1.481434
7,1998,3112,3438,True,3112,3438,W03,W06,3,6,-3,-0.402471
8,1998,3438,3374,False,3374,3438,W11,W06,11,6,5,0.676493
9,1998,3163,3112,False,3112,3163,W03,W02,3,2,1,0.137011


In [97]:
x_train = training_df['NormalizedSeedDelta'].values
y_train = np.vectorize(lambda x: 1 if x else 0)(training_df['Team1Win'].values)
x_train = x_train.reshape(len(x_train), 1)
y_train = y_train.reshape(len(y_train), )
print(x_train.shape)
print(y_train.shape)

(1260, 1)
(1260,)


## Build an incredibly naive model

In [102]:
from sklearn.linear_model import LogisticRegression
naive_model = LogisticRegression()
naive_model = model.fit(x_train, y_train)
naive_model.score(x_train, y_train)

0.77857142857142858

## Construct Test Set from tourney combinations

In [118]:
test_seeds = seeds[['Season', 'Seed', 'TeamID']]
test_seeds['SeedNum'] = test_seeds.apply(lambda row: int(row['Seed'][1:3]), axis=1)

In [140]:
import itertools

seasons = np.unique(test_seeds.Season.values)
test_set = [] # output_key (SSSS_XXXX_YYYY), seed1, seed2, seed_delta
seasons = [ss for ss in seasons if ss >= 2014 and ss < 2018]
for s in seasons:
    seed_set = test_seeds[test_seeds.Season == s].values
    for seed_pair in itertools.combinations(seed_set, 2):
        id_a = seed_pair[0][2]
        id_b = seed_pair[1][2]
        t1 = 0
        t2 = 0
        s1 = 0
        s2 = 0
        if id_a < id_b:
            t1 = id_a
            t2 = id_b
            s1 = seed_pair[0][3]
            s2 = seed_pair[1][3]
        else:
            t1 = id_b
            t2 = id_a
            s1 = seed_pair[1][3]
            s2 = seed_pair[0][3]
        seed_delta = ((s1 - s2) - seed_mu) / seed_sigma
        output_key = "%s_%s_%s" % (s, t1, t2)
        test_set.append([output_key, s1, s2, seed_delta])

test_set = np.array(test_set)
    

In [141]:
test_x = test_set[:, 3].astype(float)
test_x = test_x.reshape(len(test_x), 1)

In [142]:
predictions = naive_model.predict_proba(test_x)[:, 1]

## write first naive predictions file to get the format down right

In [149]:
import csv

with open("./data/outputs/stage_1_predictions.csv", "w") as f:
    wtr = csv.writer(f)
    wtr.writerow(['ID','Pred'])
    for i in range(len(test_set)):
        out_key = test_set[i][0]
        prediction = predictions[i]
        wtr.writerow([out_key,prediction])
        