In [6]:
import sklearn 
from sklearn.linear_model import LogisticRegression
import pandas as pd 
import numpy as np
from statsmodels.tsa.arima_model import ARMA
import datetime


# Load CSVs

In [7]:
regular_season_results = pd.read_csv("../ncaa_data/RegularSeasonDetailedResults.csv")
tourney_results = pd.read_csv("../ncaa_data/NCAATourneyDetailedResults.csv")
results = pd.concat([regular_season_results, tourney_results])

In [8]:
seasons = pd.read_csv("../ncaa_data/Seasons.csv")
seasons = seasons[["Season","DayZero"]]
results = results.set_index("Season").join(seasons.set_index("Season"), how="inner", rsuffix="season").reset_index()

# Build Team/Season Features

In [14]:
winning_results = results[['Season', 'DayNum','DayZero', 'WTeamID', 'WScore', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]
losing_results = results[['Season', 'DayNum','DayZero', 'LTeamID', 'LScore', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']]
winning_results.columns = ['Season', 'DayNum','DayZero', 'TeamID', 'Score','FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
'Ast', 'TO', 'Stl', 'Blk', 'PF']
losing_results.columns = ['Season', 'DayNum','DayZero', 'TeamID', 'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
'Ast', 'TO', 'Stl', 'Blk', 'PF']
team_df = pd.concat([winning_results, losing_results])


# Bring in Massey Ordinals 

In [4]:
massey = pd.read_csv("../ncaa_data/MasseyOrdinals.csv")
massey = massey.rename(columns={"RankingDayNum":"DayNum"})
massey = massey.pivot_table(index=["Season","TeamID","DayNum"], columns="SystemName", values="OrdinalRank")
massey = massey.fillna(method="ffill")
massey = massey.drop("DC2", axis=1)

In [15]:
team_df = team_df.set_index(["Season","TeamID","DayNum"]).join(massey, how="left")


In [17]:
team_df = team_df.reset_index().sort_values(["Season","DayNum"]).set_index("DayNum").groupby(["Season","TeamID"]).apply(lambda x: x.reindex(range(0,150), method='ffill'))

# Clean Up and Make Some Columns MA

In [21]:
team_df = team_df.drop(["TeamID","Season"], axis=1)
deltas = team_df.reset_index()["DayNum"].apply(lambda x: datetime.timedelta(days=x))
team_df["date"] = pd.to_datetime(team_df["DayZero"]) + deltas.values

In [22]:
exclude = ["date","DayZero","Score","DayNum","Season"]
ma_columns = team_df.columns.tolist()
ma_columns = [item for item in ma_columns if item not in exclude]

In [23]:
ma_data = team_df.groupby(["Season","TeamID"])[ma_columns].apply(lambda x: x.rolling(window=10, min_periods=1).mean())

In [24]:
team_df[ma_columns] = ma_data

# Pickle Team DF

In [25]:
team_df.to_pickle("../ncaa_data/team_df.p")

# Build Game DF

In [26]:
games = results[["WTeamID","LTeamID","Season","DayNum","WLoc","DayZero","WScore","LScore"]]

In [27]:
games["Team1"] = games[["WTeamID","LTeamID"]].min(axis=1)
games["Team2"] = games[["WTeamID","LTeamID"]].max(axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [28]:
games["Team1Score"] = 0
games["Team2Score"] = 0
games["Team1"] = games[["WTeamID","LTeamID"]].min(axis=1)
games["Team2"] = games[["WTeamID","LTeamID"]].max(axis=1)
games.loc[games["WTeamID"] == games["Team1"],"Team1Score"] = games["WScore"]
games.loc[games["WTeamID"] != games["Team1"],"Team1Score"] = games[games["WTeamID"] != games["Team1"]]["LScore"]
games.loc[games["WTeamID"] == games["Team2"],"Team2Score"] = games["WScore"]
games.loc[games["WTeamID"] != games["Team2"],"Team2Score"] = games["LScore"]
# games["ScoreDiff"] = 
# games["Team1Home"] = (games["WLoc"]=="H") & (games["WTeamID"] == games["Team1"])
# games["Team2Home"] = (games["WLoc"]=="H") & (games["WTeamID"] == games["Team2"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [29]:
games = games.drop(["WTeamID","LTeamID","WLoc","DayZero"], axis=1)


# Increment the dayNum so that we only use results up through the day previous to the game

In [30]:
incremented = team_df.reset_index()
incremented["DayNum"] = incremented["DayNum"] + 1
incremented = incremented.set_index(["Season","TeamID","DayNum"])
full_df = games.join(incremented, on=["Season", "Team1","DayNum"], how="right").join(incremented, on=["Season", "Team2","DayNum"], rsuffix="2", how="inner")

In [31]:
games.shape

(77617, 8)

# Add Some Diff Columns

In [32]:
for column in ma_columns:
    try:
        full_df[column + "diff"] = full_df[column] - full_df[str(column +"2")]
    except Exception as e:
        print(e)

In [33]:
full_df.to_pickle("../ncaa_data/full_df.p")

# Match Ups to Predict

In [19]:
pairs_to_predict = pd.read_csv("../ncaa_data/SampleSubmissionStage1.csv")["ID"].str.split("_")
seasons = pairs_to_predict.apply(lambda x: int(x[0]))
team1 = pairs_to_predict.apply(lambda x: int(x[1]))
team2 = pairs_to_predict.apply(lambda x:int(x[2]))

In [57]:
tourney_matchups = pd.DataFrame()
tourney_matchups["Team1"] = team1
tourney_matchups["Team2"] = team2
tourney_matchups["Season"] = seasons
tourney_matchups["DayNum"] = 149

In [46]:
last_games = team_df.reset_index()[team_df.reset_index()["DayNum"]==149]

In [51]:
last_games = last_games.set_index(["Season","TeamID","DayNum"])

In [59]:
tourney_matchups = tourney_matchups.join(last_games, on=["Season", "Team1","DayNum"], how="right").join(last_games, on=["Season", "Team2","DayNum"], rsuffix="2", how="inner")

In [60]:
for column in ma_columns:
    try:
        tourney_matchups[column + "diff"] = tourney_matchups[column] - tourney_matchups[str(column +"2")]
    except Exception as e:
        print(e)

In [62]:
tourney_matchups.to_pickle("../ncaa_data/tourney_matchups.p")