In [326]:
import sklearn 
from sklearn.linear_model import LogisticRegression, SGDClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import pandas as pd 
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss

In [166]:
full_df = pd.read_pickle("../ncaa_data/full_df.p")

# Filter Down to Rows Where Each Team Has Actually Played a Previous Game So We Have Data

In [140]:
populated_df = full_df[(full_df["FGM"] > 0) & (full_df["FGM2"] > 0)]

In [141]:
y = populated_df["Team1Score"] - populated_df["Team2Score"]

In [142]:
exclude = ["Team1Score","Team2Score", "date","DayZero","Score","DayNum","Season","WScore","LScore","Team1","Team2","DayZero2","Score2","DayNum2","Season2","date2","index","index2"]
feature_columns = populated_df.columns
feature_columns = [item for item in feature_columns if item not in exclude]

In [143]:
y[y>0] = 1
y[y<0] = 0

In [144]:
feature_columns

['Team1Home',
 'Team2Home',
 'Ast',
 'Blk',
 'DR',
 'FGA',
 'FGA3',
 'FGM',
 'FGM3',
 'FGM3_Perc',
 'FGM_Perc',
 'FTA',
 'FTM',
 'FTM_Perc',
 'OR',
 'PF',
 'Stl',
 'TO',
 'WFGM_Perc',
 'Seed',
 '7OT',
 'ACU',
 'ADE',
 'AP',
 'ARG',
 'AUS',
 'BBT',
 'BCM',
 'BD',
 'BIH',
 'BKM',
 'BLS',
 'BNM',
 'BOB',
 'BOW',
 'BP5',
 'BPI',
 'BRZ',
 'BUR',
 'BWE',
 'CJB',
 'CMV',
 'CNG',
 'COL',
 'CPA',
 'CPR',
 'CRO',
 'CRW',
 'CTL',
 'D1A',
 'DAV',
 'DC',
 'DCI',
 'DDB',
 'DES',
 'DII',
 'DOK',
 'DOL',
 'DUN',
 'DWH',
 'EBB',
 'EBP',
 'ECK',
 'ENT',
 'ERD',
 'ESR',
 'FAS',
 'FMG',
 'FSH',
 'GC',
 'GRN',
 'GRS',
 'HAS',
 'HAT',
 'HER',
 'HKB',
 'HKS',
 'HOL',
 'HRN',
 'IMS',
 'INP',
 'ISR',
 'JCI',
 'JEN',
 'JNG',
 'JON',
 'JRT',
 'KBM',
 'KEL',
 'KLK',
 'KMV',
 'KOS',
 'KPI',
 'KPK',
 'KRA',
 'LMC',
 'LOG',
 'LYD',
 'LYN',
 'MAS',
 'MB',
 'MCL',
 'MGY',
 'MIC',
 'MKV',
 'MOR',
 'MPI',
 'MSX',
 'MUZ',
 'MvG',
 'NOL',
 'NOR',
 'OCT',
 'OMY',
 'PEQ',
 'PGH',
 'PH',
 'PIG',
 'PKL',
 'PMC',
 'POM',
 'PPR

# Split It Up and Train a Model On All Games

In [198]:
x_train, x_test, y_train, y_test = train_test_split(populated_df[feature_columns], y, test_size=0.1)

In [450]:
# linear_model = LinearRegression()
# model = LogisticRegression()
# model = RandomForestClassifier(n_estimators=1000)
# model = SVC(probability=True, C=100)
model = GradientBoostingClassifier(n_estimators=1000, max_depth=3)
# model = SGDClassifier(alpha=10, loss='log', penalty='elasticnet')
model.fit(x_train.fillna(0), y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [451]:
predicted = model.predict(x_test.fillna(0))
probs = model.predict_proba(x_test.fillna(0))

In [452]:
predicted[:10]

array([ 1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.])

In [453]:
y_test[:10]

54125    1.0
71860    0.0
50387    0.0
72859    1.0
39822    0.0
49697    0.0
8881     1.0
9898     1.0
55814    1.0
36766    0.0
dtype: float64

In [454]:
predicted

array([ 1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,
        0.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,
        1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,
        1.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,
        0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,
        1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,
        0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        1.,  0.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,
        1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  0.,
        1.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,
        1.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  1

In [455]:
probs = [prob[1] for prob in probs]

In [456]:
f1_score(y_test, predicted)

0.82101167315175094

In [457]:
len(predicted)

517

In [458]:
log_loss(y_test, probs)

0.37267962967667134

In [440]:
[print (item) for item in list(zip(feature_columns,model.coef_[0]))]

('Team1Home', 0.0)
('Team2Home', 0.0)
('Ast', 0.0)
('Blk', 0.0)
('DR', 0.0)
('FGA', 0.0)
('FGA3', 0.0)
('FGM', 0.0)
('FGM3', 0.0)
('FGM3_Perc', 0.0)
('FGM_Perc', 0.0)
('FTA', 0.0)
('FTM', 0.0)
('FTM_Perc', 0.0)
('OR', 0.0)
('PF', 0.0)
('Stl', 0.0)
('TO', 0.0)
('WFGM_Perc', 0.0)
('Seed', 0.0)
('7OT', 0.0)
('ACU', 0.0)
('ADE', 0.0)
('AP', 0.0)
('ARG', 0.0)
('AUS', 0.0)
('BBT', 0.0)
('BCM', 0.0)
('BD', 0.0)
('BIH', 0.0)
('BKM', 0.0)
('BLS', 0.0)
('BNM', 0.0)
('BOB', 0.0)
('BOW', 0.0)
('BP5', 0.0)
('BPI', 0.0)
('BRZ', 0.0)
('BUR', 0.0)
('BWE', 0.0)
('CJB', 0.0)
('CMV', 0.0)
('CNG', 0.0)
('COL', 0.0)
('CPA', 0.0)
('CPR', 0.0)
('CRO', 0.0)
('CRW', 0.0)
('CTL', 0.0)
('D1A', 0.0)
('DAV', 0.0)
('DC', 0.0)
('DCI', 0.0)
('DDB', 0.0)
('DES', 0.0)
('DII', 0.0)
('DOK', 0.0)
('DOL', 0.0)
('DUN', 0.0)
('DWH', 0.0)
('EBB', 0.0)
('EBP', 0.0)
('ECK', 0.0)
('ENT', 0.0)
('ERD', 0.0)
('ESR', 0.0)
('FAS', 0.0)
('FMG', 0.0)
('FSH', 0.0)
('GC', 0.0)
('GRN', 0.0)
('GRS', 0.0)
('HAS', 0.0)
('HAT', 0.0)
('HER', 0

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [399]:
tournament_df

Unnamed: 0,Season,DayNum,WScore,LScore,Team1,Team2,Team1Score,Team2Score,Team1Home,Team2Home,...,UPSdiff,USAdiff,WILdiff,WLKdiff,WMRdiff,WOBdiff,WOLdiff,WTEdiff,YAGdiff,ZAMdiff
4616,2003,134,92.0,84.0,1411,1421,84.0,92.0,False,False,...,-2.842171e-14,0.000000,2.842171e-14,-19.060606,0.0,9.909091,11.545455,-26.727273,-2.842171e-14,-2.842171e-14
4617,2003,136,80.0,51.0,1112,1436,80.0,51.0,False,False,...,2.842171e-14,-22.000000,0.000000e+00,-173.357143,0.0,-177.571429,-186.714286,-182.642857,2.842171e-14,2.842171e-14
4618,2003,136,84.0,71.0,1113,1272,84.0,71.0,False,False,...,-2.842171e-14,-15.000000,-2.842171e-14,-17.000000,0.0,8.000000,6.000000,6.000000,2.842171e-14,2.842171e-14
4619,2003,136,79.0,73.0,1141,1166,79.0,73.0,False,False,...,2.842171e-14,-16.555556,-2.842171e-14,25.428571,0.0,24.952381,34.682540,40.444444,2.842171e-14,2.842171e-14
4620,2003,136,76.0,74.0,1143,1301,76.0,74.0,False,False,...,-5.684342e-14,-4.000000,5.684342e-14,-40.000000,0.0,-34.000000,-29.500000,-26.000000,-5.684342e-14,-5.684342e-14
4621,2003,136,58.0,53.0,1140,1163,53.0,58.0,False,False,...,2.842171e-14,-22.000000,-2.842171e-14,-9.000000,0.0,-14.250000,-12.250000,-8.000000,2.842171e-14,2.842171e-14
4622,2003,136,67.0,57.0,1161,1181,57.0,67.0,False,False,...,0.000000e+00,15.000000,0.000000e+00,122.000000,0.0,114.000000,111.000000,117.000000,0.000000e+00,0.000000e+00
4623,2003,136,74.0,69.0,1153,1211,69.0,74.0,False,False,...,0.000000e+00,1.000000,0.000000e+00,-4.229167,0.0,-3.770833,-2.458333,-16.833333,0.000000e+00,0.000000e+00
4624,2003,136,65.0,60.0,1228,1443,65.0,60.0,False,False,...,2.842171e-14,-8.500000,-2.842171e-14,-47.611111,0.0,-48.944444,-57.000000,-47.777778,2.842171e-14,2.842171e-14
4625,2003,136,64.0,61.0,1242,1429,64.0,61.0,False,False,...,0.000000e+00,-17.000000,-2.842171e-14,-75.000000,0.0,-73.000000,-81.000000,-60.000000,0.000000e+00,0.000000e+00


In [400]:
tourney_matchups = pd.read_pickle("../ncaa_data/tourney_matchups.p")

In [401]:
probs = model.predict_proba(tourney_matchups[feature_columns].fillna(0))

In [402]:
csv_file = open("submission.csv", "w+")
team_ids = list(tourney_matchups["Team1"])
team_ids2 = list(tourney_matchups["Team2"])
seasons = list(tourney_matchups["Season"])
csv_file.write("ID,Pred\n")
for i in range(len(tourney_matchups["Season"])):
    csv_string = str(seasons[i]) + "_" + str(team_ids[i]) + "_" + str(int(team_ids2[i]))  + "," + str(probs[i][1]) + "\n"
    csv_file.write(csv_string)

In [403]:
feature_columns

['Team1Home',
 'Team2Home',
 'Ast',
 'Blk',
 'DR',
 'FGA',
 'FGA3',
 'FGM',
 'FGM3',
 'FGM3_Perc',
 'FGM_Perc',
 'FTA',
 'FTM',
 'FTM_Perc',
 'OR',
 'PF',
 'Stl',
 'TO',
 'WFGM_Perc',
 'Seed',
 '7OT',
 'ACU',
 'ADE',
 'AP',
 'ARG',
 'AUS',
 'BBT',
 'BCM',
 'BD',
 'BIH',
 'BKM',
 'BLS',
 'BNM',
 'BOB',
 'BOW',
 'BP5',
 'BPI',
 'BRZ',
 'BUR',
 'BWE',
 'CJB',
 'CMV',
 'CNG',
 'COL',
 'CPA',
 'CPR',
 'CRO',
 'CRW',
 'CTL',
 'D1A',
 'DAV',
 'DC',
 'DCI',
 'DDB',
 'DES',
 'DII',
 'DOK',
 'DOL',
 'DUN',
 'DWH',
 'EBB',
 'EBP',
 'ECK',
 'ENT',
 'ERD',
 'ESR',
 'FAS',
 'FMG',
 'FSH',
 'GC',
 'GRN',
 'GRS',
 'HAS',
 'HAT',
 'HER',
 'HKB',
 'HKS',
 'HOL',
 'HRN',
 'IMS',
 'INP',
 'ISR',
 'JCI',
 'JEN',
 'JNG',
 'JON',
 'JRT',
 'KBM',
 'KEL',
 'KLK',
 'KMV',
 'KOS',
 'KPI',
 'KPK',
 'KRA',
 'LMC',
 'LOG',
 'LYD',
 'LYN',
 'MAS',
 'MB',
 'MCL',
 'MGY',
 'MIC',
 'MKV',
 'MOR',
 'MPI',
 'MSX',
 'MUZ',
 'MvG',
 'NOL',
 'NOR',
 'OCT',
 'OMY',
 'PEQ',
 'PGH',
 'PH',
 'PIG',
 'PKL',
 'PMC',
 'POM',
 'PPR