In [21]:
import datetime
import random

import numpy as np
import pandas as pd
import scipy
from scipy import stats
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, log_loss
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima_model import ARMA

In [4]:
# Load the pre-built game-level frame from disk.
# NOTE(review): unpickling executes code embedded in the file, so only load
# pickles produced by a trusted pipeline.
full_df_path = "../ncaa_data/full_df.p"
full_df = pd.read_pickle(full_df_path)

# Filter Down to Rows Where Each Team Has Actually Played a Previous Game So We Have Data

In [5]:
# Keep only rows where both the "FGM" and "FGM2" columns are positive; per the
# heading above, that is the marker that each side already has prior-game data.
first_side_has_history = full_df["FGM"] > 0
second_side_has_history = full_df["FGM2"] > 0
populated_df = full_df[first_side_has_history & second_side_has_history]

In [6]:
# Regression target: signed score margin (Team1Score minus Team2Score).
y = populated_df["Team1Score"].sub(populated_df["Team2Score"])

In [7]:
# Columns that must not be used as model inputs: the scores that define the
# target, plus identifier/date bookkeeping columns for both sides.
exclude = [
    "Team1Score", "Team2Score",
    "date", "DayZero", "Score", "DayNum", "Season",
    "WScore", "LScore",
    "Team1", "Team2",
    "DayZero2", "Score2", "DayNum2", "Season2", "date2",
]
# Everything else in the frame is treated as a feature, in original column order.
feature_columns = [column for column in populated_df.columns if column not in exclude]

# Split It Up and Train a Model

### For a random split


In [9]:
# Random 90/10 hold-out split of the populated rows.
# random_state is pinned so that a fresh "Restart & Run All" reproduces the
# same split; without it every run draws a different test set.
# Note: the tournament-based split cell below rebinds x_train/x_test/y_train/
# y_test, so when both cells are run, that later split is the one in effect.
x_train, x_test, y_train, y_test = train_test_split(
    populated_df[feature_columns],
    y,
    test_size=0.1,
    random_state=36,
)

### For a split with only the last tournament as test data


In [10]:
# Hold out the 2017 rows with DayNum >= 134 as the test set (the "last
# tournament" per the heading above); every other row is training data.
# The two conditions are written out separately rather than negating one mask
# so that rows with missing DayNum/Season are treated exactly as before.
day_num = populated_df["DayNum"]
season = populated_df["Season"]

train_df = populated_df[(season != 2017) | (day_num < 134)]
test_df = populated_df[(season == 2017) & (day_num >= 134)]

# Features and score-margin targets for each side of the split.
x_train, x_test = train_df[feature_columns], test_df[feature_columns]
y_train = train_df["Team1Score"].sub(train_df["Team2Score"])
y_test = test_df["Team1Score"].sub(test_df["Team2Score"])


In [12]:
# Load the matchup frame that the submission rows are generated from.
# NOTE(review): pickle files run arbitrary code on load; only read trusted ones.
tourney_matchups_path = "../ncaa_data/tourney_matchups.p"
tourney_matchups = pd.read_pickle(tourney_matchups_path)

In [None]:
# Candidate regressors for the score margin (Team1Score - Team2Score).
# Each entry is (estimator, params); the params dict is a placeholder that the
# loop below does not read.
models = [
    (linear_model.Ridge(), {}),
    (RandomForestRegressor(), {}),
    (linear_model.Lasso(), {}),
    (linear_model.SGDRegressor(), {}),
    (linear_model.BayesianRidge(), {}),
]

for model, _unused_params in models:
    print(model)
    model.fit(x_train.fillna(0), y_train)
    # Impute the test features the same way as the training features. The
    # original predicted on the raw x_test, so any NaN that fillna(0) had
    # hidden during fit would reach predict() un-imputed.
    predicted = model.predict(x_test.fillna(0))
    # Collapse the predicted margin into a fixed win probability:
    # positive margin -> 0.9, negative margin -> 0.1 (an exact 0 is left as is).
    predicted[predicted > 0] = .9
    predicted[predicted < 0] = .1

# After the loop, `model` and `predicted` hold the results of the LAST entry in
# `models`; the submission cell below reads `predicted`.
# Scoring is intentionally left disabled: log_loss(y_test, predicted) would
# compare probabilities against score margins. To evaluate, score against the
# binary outcome instead, e.g. log_loss((y_test > 0).astype(int), predicted).

In [None]:
# Write one "Season_Team1_Team2,Pred" row per matchup to submission.csv.
team_ids = list(tourney_matchups["Team1"])
team_ids2 = list(tourney_matchups["Team2"])
seasons = list(tourney_matchups["Season"])

# NOTE(review): `predicted` was produced from x_test in the model loop, while
# the row identifiers come from tourney_matchups. The rows are paired purely by
# position, so confirm both describe the same games in the same order.
# Fail early (before creating a partial file) when the lengths cannot match.
if len(predicted) != len(seasons):
    raise ValueError(
        "predicted has %d rows but tourney_matchups has %d; "
        "predictions cannot be paired with matchups by position"
        % (len(predicted), len(seasons))
    )

# The context manager guarantees the file is flushed and closed even if a row
# fails to format; the original never closed this handle.
with open("submission.csv", "w+") as csv_file:
    csv_file.write("ID,Pred\n")
    for season, team1, team2, pred in zip(seasons, team_ids, team_ids2, predicted):
        # Team2 is cast to int before formatting, as in the original row format.
        matchup_id = "_".join([str(season), str(team1), str(int(team2))])
        csv_file.write(matchup_id + "," + str(pred) + "\n")

# Final Output

In [162]:
# Final fit: no hold-out here. Every populated row is used for training.
train_df = populated_df

# Feature matrix and score-margin target (Team1Score - Team2Score) for the
# final model.
x_train = train_df[feature_columns]
y_train = train_df["Team1Score"].sub(train_df["Team2Score"])


In [163]:
# Seed for the final forest. `rand` was not defined by any cell in this
# notebook, so this cell failed on a fresh kernel; 36 is the random_state shown
# in this cell's recorded output, so pinning it reproduces that run.
rand = 36

# Fit the final margin regressor on all training rows; missing features are
# imputed with 0, matching how the prediction cell prepares its inputs.
model = RandomForestRegressor(random_state=rand)
model.fit(x_train[feature_columns].fillna(0), y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=36, verbose=0, warm_start=False)

In [164]:
# Load the final matchup frame (this rebinds `tourney_matchups`) and predict a
# score margin for every row, imputing missing features with 0 as in training.
# NOTE(review): pickle files run arbitrary code on load; only read trusted ones.
final_matchups_path = "../ncaa_data/final_matchups.p"
tourney_matchups = pd.read_pickle(final_matchups_path)

final_features = tourney_matchups[feature_columns].fillna(0)
predictions = model.predict(final_features)

In [165]:
# Turn each predicted margin into a probability by treating the true margin as
# Normal(mean=prediction, std=sd) and evaluating its CDF at 0.
#
# The original wrote this as 1 - (1 - cdf), which is algebraically just cdf;
# calling cdf directly also avoids rounding tiny probabilities to exactly 0.
# `stats` is scipy.stats (imported at the top of the notebook).
#
# NOTE(review): `sd` is not defined in any visible cell of this notebook; it
# must be set (the assumed spread of the margin around the prediction) before
# this cell runs.
# NOTE(review): cdf(0) is P(margin <= 0). Since the model predicts
# Team1Score - Team2Score, this is the probability that Team1 does NOT
# outscore Team2. Confirm that is the quantity wanted in the "Pred" column.
probs = list(stats.norm.cdf(0, loc=predictions, scale=sd))

In [167]:
# Write the final submission: one "Season_Team1_Team2,Pred" row per matchup,
# pairing each matchup with the probability at the same position in `probs`.
team_ids = list(tourney_matchups["Team1"])
team_ids2 = list(tourney_matchups["Team2"])
seasons = list(tourney_matchups["Season"])

# Use a context manager so the file is flushed and closed as soon as the rows
# are written, even if formatting a row raises. The original left the handle
# open until a later cell closed it.
with open("linear-submission3-18.csv", "w+") as csv_file:
    csv_file.write("ID,Pred\n")
    for season, team1, team2, prob in zip(seasons, team_ids, team_ids2, probs):
        # Team2 is cast to int before formatting, as in the original row format.
        matchup_id = "_".join([str(season), str(team1), str(int(team2))])
        csv_file.write(matchup_id + "," + str(prob) + "\n")

In [168]:
# Close the `csv_file` handle so any buffered rows are flushed to disk.
# close() on an already-closed file is a no-op, so re-running this is harmless.
csv_file.close()

In [172]:
# Assemble a readable table: probability plus both team ids, one row per matchup.
# (This rebinds `predictions` from the raw model output to a DataFrame.)
predictions = pd.concat(
    [pd.Series(list(probs)), pd.Series(team_ids), pd.Series(team_ids2)],
    axis=1,
    ignore_index=True,
)
predictions.columns = ['Predicted', 'team1', 'team2']

# Team metadata keyed by TeamID, used to attach names to both id columns.
team_data = pd.read_csv("../ncaa_data/womens-machine-learning-competition-2019/WTeams.csv")
team_lookup = team_data.set_index('TeamID')

# First attach the lookup columns for `team2`, then for `team1`.
# Gotcha: lsuffix renames the columns ALREADY in the frame, so after the second
# join the "...team1"-suffixed columns are the ones joined on `team2`, and the
# unsuffixed columns are the ones joined on `team1`.
with_team2_info = predictions.join(team_lookup, on='team2', how='left')
predictions = with_team2_info.join(team_lookup, lsuffix="team1", on='team1', how='left')

In [173]:
# Export the human-readable bracket sheet (the index is written as the first column).
# Because of how the joins that build `predictions` apply lsuffix,
# 'TeamNameteam1' is the name looked up via `team2` and 'TeamName' is the name
# looked up via `team1`.
bracket_columns = ['TeamNameteam1', 'TeamName', 'Predicted']
predictions[bracket_columns].to_csv('linear-brackets.csv')