In [85]:
import sklearn 
from sklearn.linear_model import LogisticRegression
import pandas as pd 
import numpy as np
from statsmodels.tsa.arima_model import ARMA
import datetime


# Load CSVs

In [86]:
regular_season_results = pd.read_csv("../ncaa_data/RegularSeasonDetailedResults.csv")
tourney_results = pd.read_csv("../ncaa_data/NCAATourneyDetailedResults.csv")
results = pd.concat([regular_season_results, tourney_results])

In [87]:
seasons = pd.read_csv("../ncaa_data/Seasons.csv")
seasons = seasons[["Season","DayZero"]]
results = results.set_index("Season").join(seasons.set_index("Season"), how="inner", rsuffix="season").reset_index()

# Build Team/Season Features

In [88]:
winning_results = results[['Season', 'DayNum','DayZero', 'WTeamID', 'WScore', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]
losing_results = results[['Season', 'DayNum','DayZero', 'LTeamID', 'LScore', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']]
winning_results.columns = ['Season', 'DayNum','DayZero', 'TeamID', 'Score','FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
'Ast', 'TO', 'Stl', 'Blk', 'PF']
losing_results.columns = ['Season', 'DayNum','DayZero', 'TeamID', 'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
'Ast', 'TO', 'Stl', 'Blk', 'PF']
team_df = pd.concat([winning_results, losing_results]).sort_values(["Season","DayNum"]).set_index("DayNum").groupby(["Season","TeamID"]).apply(lambda x: x.reindex(range(0,150), method='ffill'))


# Bring in Massey Ordinals 

In [89]:
massey = pd.read_csv("../ncaa_data/MasseyOrdinals.csv")
massey = massey.rename(columns={"RankingDayNum":"DayNum"})
massey = massey.pivot_table(index=["Season","TeamID","DayNum"], columns="SystemName", values="OrdinalRank")
massey = massey.fillna(method="ffill")

In [90]:
team_df = team_df.join(massey, how="inner")


# Clean Up and Make Some Columns MA

In [91]:
team_df = team_df.drop(["TeamID","Season"], axis=1)
deltas = team_df.reset_index()["DayNum"].apply(lambda x: datetime.timedelta(days=x))
team_df["date"] = pd.to_datetime(team_df["DayZero"]) + deltas.values

In [92]:
team_df.columns

Index(['DayZero', 'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR',
       'DR',
       ...
       'USA', 'WIL', 'WLK', 'WMR', 'WOB', 'WOL', 'WTE', 'YAG', 'ZAM', 'date'],
      dtype='object', length=175)

In [None]:
ma_columnns = 

In [35]:
team_df = team_df.reset_index().set_index("date").groupby(["Season","TeamID"]).apply(lambda x: x.rolling(window=10).mean())

# Pickle Team DF

In [36]:
team_df.to_pickle("../ncaa_data/team_df.p")

In [40]:
results.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'DayZero'],
      dtype='object')

# Build Game DF

In [79]:
games = results[["WTeamID","LTeamID","Season","DayNum","WLoc","DayZero"]]

In [80]:
games["Team1"] = games[["WTeamID","LTeamID"]].min(axis=1)
games["Team2"] = games[["WTeamID","LTeamID"]].max(axis=1)
# games["Team1Home"] = (games["WLoc"]=="H") & (games["WTeamID"] == games["Team1"])
# games["Team2Home"] = (games["WLoc"]=="H") & (games["WTeamID"] == games["Team2"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [81]:
games = games.drop(["WTeamID","LTeamID","WLoc","DayZero"], axis=1)
games.set_index(["Season","DayNum"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Team1,Team2
Season,DayNum,Unnamed: 2_level_1,Unnamed: 3_level_1
2003,10,1104,1328
2003,10,1272,1393
2003,11,1266,1437
2003,11,1296,1457
2003,11,1208,1400
2003,11,1186,1458
2003,12,1161,1236
2003,12,1186,1457
2003,12,1156,1194
2003,12,1296,1458


In [84]:
team_df

Unnamed: 0_level_0,Season,TeamID,DayNum,DayZero,Score,FGM,FGA,FGM3,FGA3,FTM,...,UPS,USA,WIL,WLK,WMR,WOB,WOL,WTE,YAG,ZAM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2002-12-09,,,,11/4/2002,,,,,,,...,,,,,,,,,,
2002-12-11,,,,11/4/2002,,,,,,,...,,,,,,,,,,
2002-12-16,,,,11/4/2002,,,,,,,...,,,,,,,,,,
2002-12-17,,,,11/4/2002,,,,,,,...,,,,,,,,,,
2002-12-18,,,,11/4/2002,,,,,,,...,,,,,,,,,,
2002-12-23,,,,11/4/2002,,,,,,,...,,,,,,,,,,
2002-12-25,,,,11/4/2002,,,,,,,...,,,,,,,,,,
2002-12-30,,,,11/4/2002,,,,,,,...,,,,,,,,,,
2002-12-31,,,,11/4/2002,,,,,,,...,,,,,,,,,,
2003-01-01,2003.0,1102.0,47.2,11/4/2002,70.2,22.9,42.4,11.4,24.6,13.0,...,,,,,,,,,,


In [82]:
games.join(team_df.set_index(["Season","DayNum","TeamID"]))

TypeError: Cannot compare type 'Timestamp' with type 'int'

# Build Model Inputs

In [131]:
y = results[["WTeamID","LTeamID","Season"]]
y["Team1Wins"] = [1] * y.shape[0]
y2 = y.copy()
tmp = y2["LTeamID"]
y2["LTeamID"] = y2["WTeamID"]
y2["WTeamID"] = tmp
y2["Team1Wins"] = [0] * y2.shape[0]
y = pd.concat([y,y2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [132]:
full_df = y.join(grouped, rsuffix="W", on=["WTeamID","Season"])
full_df = full_df.join(grouped, rsuffix="L", on=["LTeamID","Season"])

In [133]:
for column in ['Score','NumOT', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
'Ast', 'TO', 'Stl', 'Blk', 'PF']:
        full_df["DIFF"+column] = full_df[column] -  full_df[column + "L"]

In [134]:
x = full_df.drop(["WTeamID","LTeamID","Team1Wins"], axis=1)
x_train = x[x["Season"] != 2014]
y_train = y[y["Season"] != 2014]
x_test = x[x["Season"] == 2014]
y_test = y[y["Season"] == 2014]

# Fit Model

In [137]:
model = LogisticRegression()
model.fit(x_train, y_train["Team1Wins"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Predict All Match Ups

In [545]:
pairs_to_predict = pd.read_csv("../ncaa_data/SampleSubmissionStage1.csv")["ID"].str.split("_")
seasons = pairs_to_predict.apply(lambda x: int(x[0]))
team1 = pairs_to_predict.apply(lambda x: int(x[1]))
team2 = pairs_to_predict.apply(lambda x:int(x[2]))

In [546]:
tourney_matchups = pd.DataFrame()
tourney_matchups["team1"] = team1
tourney_matchups["team2"] = team2
tourney_matchups["season"] = seasons

In [547]:
tourney_matchups.shape

(9112, 3)

In [548]:
tourney_matchups = pd.merge(team_df.reset_index(), tourney_matchups, how="inner", left_on=["TeamID","Season"], right_on=["team1","season"], suffixes=("","1"))
tourney_matchups =  pd.merge(team_df.reset_index(), tourney_matchups, how="inner", left_on=["TeamID","Season"], right_on=["team2","season"], suffixes=("","2"))


In [549]:
tourney_matchups = tourney_matchups.drop(["TeamID","TeamID2","Season2","season"],axis=1)
tourney_matchups = tourney_matchups.set_index(["team1","team2"])

In [550]:
for column in ['Score','NumOT', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
'Ast', 'TO', 'Stl', 'Blk', 'PF']:
        tourney_matchups["DIFF"+column] = tourney_matchups[column] -  tourney_matchups[column + "2"]

In [551]:
# all_matchups=  all_matchups.set_index(["TeamID","TeamID2"])
# all_matchups = all_matchups.drop(["Season2"], axis=1)


In [552]:
predicted = [pred[0] for pred in model.predict_proba(tourney_matchups)]

In [553]:
# tourney_matchups=  tourney_matchups.reset_index().set_index(["team1","team2", "Season"])


In [554]:
# all_matchups = all_matchups.reset_index()
# all_matchups = all_matchups[all_matchups["TeamID"] < all_matchups["TeamID2"]]

In [555]:
tourney_matchups = tourney_matchups.reset_index()

In [556]:
csv_file = open("submission.csv", "w+")
team_ids = list(tourney_matchups["team1"])
team_ids2 = list(tourney_matchups["team2"])
seasons = list(tourney_matchups["Season"])
csv_file.write("ID,Pred\n")
for i in range(len(tourney_matchups["Season"])):
    csv_string = str(seasons[i]) + "_" + str(team_ids[i]) + "_" + str(team_ids2[i])  + "," + str(predicted[i]) + "\n"
    csv_file.write(csv_string)

In [557]:
csv_file.close()