In [52]:
import pandas as pd
import numpy as np
from sklearn import *


In [53]:
# datafiles = sorted(glob.glob('../input/**'))
# datafiles = {file.split('/')[-1].split('.')[0]: pd.read_csv(file, encoding='latin-1') for file in datafiles}

# CSV result tables read by this notebook, listed in the order in which they
# are unpacked into names on the assignment below.
_result_files = (
    './DataFiles/NCAATourneyCompactResults.csv',
    './DataFiles/NCAATourneyDetailedResults.csv',
    './DataFiles/RegularSeasonCompactResults.csv',
    './DataFiles/RegularSeasonDetailedResults.csv',
    './DataFiles/SecondaryTourneyCompactResults.csv',
)

(
    NCAA_compact,
    NCAA_detailed,
    reg_compact,
    reg_detailed,
    secondary_compact,
) = [pd.read_csv(path) for path in _result_files]

In [54]:
# Integer codes for the single-letter WLoc flag
# (presumably Away / Home / Neutral -- confirm against the data dictionary).
WLoc = dict([('A', 1), ('H', 2), ('N', 3)])

# Integer codes for the kind of game; the codes are the 1-based positions in
# this tuple, so 'Regular' is 5 and 'NCAA' is 6.
Secondary_Tourney = {
    label: code
    for code, label in enumerate(('NIT', 'CBI', 'CIT', 'V16', 'Regular', 'NCAA'), start=1)
}

## Getting our massive data frame.


In [55]:
# Stack the three result tables into one frame (NCAA tournament rows first,
# then regular season, then secondary tournaments), renumbering the index.
#
# The NCAA and regular-season tables are tagged with their source before
# stacking. The Secondary_Tourney lookup defines codes for 'NCAA' and
# 'Regular', and later cells select NCAA games with `SecondaryTourney == 6`,
# but nothing else in the notebook labels those rows. Without the tag they end
# up as NaN after the mapping step and the NCAA filters match nothing.
# NOTE(review): assumes these two CSVs carry no SecondaryTourney column of
# their own; if they do, .assign overwrites it with the constant label.
# .assign returns new frames, so the loaded tables themselves are not modified.
games = pd.concat(
    (
        NCAA_compact.assign(SecondaryTourney='NCAA'),
        reg_compact.assign(SecondaryTourney='Regular'),
        secondary_compact,
    ),
    axis=0,
    ignore_index=True,
)


In [56]:
# Renumber rows 0..n-1 (rebinding instead of mutating in place) and replace
# the two categorical columns with their integer codes.
games = games.reset_index(drop=True)

# NOTE: Series.map turns every value that is not a key of the lookup dict into
# NaN, so running this cell a second time on already-encoded columns would
# blank them out. Re-run the concat cell first if this cell must be repeated.
games['WLoc'] = games['WLoc'].map(WLoc)
games['SecondaryTourney'] = games['SecondaryTourney'].map(Secondary_Tourney)

# Only the last expression of a cell is displayed, so the former mid-cell
# `games.head()` produced no output and has been dropped.
games.shape

(154285, 9)

## Manipulating the Data / Feature Engineering

In [57]:
# Build the string keys used to join games, seeds and the submission file.
#
#   ID       'Season_LowTeam_HighTeam'
#   IDTeams  'LowTeam_HighTeam'
#   Team1    lower of the two team ids    Team2  higher of the two team ids
#   IDTeam1  'Season_Team1'               IDTeam2  'Season_Team2'
#
# This is done with column-wise operations rather than six row-wise
# DataFrame.apply(axis=1) passes. Apart from being much slower, row-wise apply
# hands each row over as a Series with one common dtype, so integer ids can be
# stringified as floats ('1985.0') when the frame also holds float columns.
# The explicit int casts below keep the keys in plain integer form.
_season_str = games['Season'].astype(int).astype(str)
_team_pair = games[['WTeamID', 'LTeamID']].astype(int)
_low_team = _team_pair.min(axis=1)
_high_team = _team_pair.max(axis=1)
_low_str = _low_team.astype(str)
_high_str = _high_team.astype(str)

# Columns are assigned in the original order so the frame layout is unchanged.
games['ID'] = _season_str + '_' + _low_str + '_' + _high_str

games['IDTeams'] = _low_str + '_' + _high_str

games['Team1'] = _low_team

games['Team2'] = _high_team

games['IDTeam1'] = _season_str + '_' + _low_str

games['IDTeam2'] = _season_str + '_' + _high_str

## Add Seed Data

In [58]:
# Read the seed table, then collapse it into a {'Season_TeamID': seed} lookup.
seed_table = pd.read_csv('./DataFiles/NCAATourneySeeds.csv')

# Each row is unpacked positionally as (season, seed code, team id); the
# numeric seed is taken from characters 1-2 of the seed code.
seeds = {
    '_'.join([str(int(year)), str(team)]): int(code[1:3])
    for year, code, team in seed_table.values
}

# Teams without an entry in the lookup get seed 0.
for _seed_col, _key_col in (('Team1Seed', 'IDTeam1'), ('Team2Seed', 'IDTeam2')):
    games[_seed_col] = games[_key_col].map(seeds).fillna(0)

## Additional Features and some Cleanup

In [59]:
# Winning margin of each game.
games['Score_Differential'] = games['WScore'] - games['LScore']
# Label: 1.0 when the winner is the lower-numbered team of the pair (i.e. the
# team stored as Team1), otherwise 0.0. `<=` reproduces the original
# `sorted([w, l])[0] == w` test exactly, without a Python call per row.
games['Predictions'] = (games['WTeamID'] <= games['LTeamID']).astype(float)

In [60]:
# Margin seen from Team1's side: the winning margin is negated on rows where
# Predictions is 0. The float multiplier keeps the column float64, as the
# row-wise version produced.
_margin_sign = np.where(games['Predictions'] == 0., -1., 1.)
games['Normalized_Score_Diff'] = games['Score_Differential'] * _margin_sign
# Team1 seed minus Team2 seed (seeds default to 0 where no seed was found).
games['Seed_Differential'] = games['Team1Seed'] - games['Team2Seed']

In [61]:
# Replace every NaN still left anywhere in the frame with the sentinel -1.
games = games.fillna(value=-1)

## Train Test Split / Prepping the Test File.

In [31]:
# Load the sample submission and give it the same feature columns as `games`.
submission = pd.read_csv('./SampleSubmissionStage1.csv')
# Every row to predict gets WLoc code 3 ('N' in the WLoc lookup) and
# SecondaryTourney code 6 ('NCAA' in the Secondary_Tourney lookup).
submission['WLoc'] = 3
submission['SecondaryTourney'] = 6

# ID has the form 'Season_Team1_Team2'; split it once instead of re-splitting
# per column. The parts stay strings, exactly as before. The duplicated
# 'Season' assignment has been removed.
_id_parts = submission['ID'].str.split('_', expand=True)
submission['Season'] = _id_parts[0]
submission['Team1'] = _id_parts[1]
submission['Team2'] = _id_parts[2]

# Join keys, built with vectorised string concatenation.
submission['IDTeams'] = submission['Team1'] + '_' + submission['Team2']
submission['IDTeam1'] = submission['Season'] + '_' + submission['Team1']
submission['IDTeam2'] = submission['Season'] + '_' + submission['Team2']

# Seeds come from the {'Season_TeamID': seed} lookup; missing entries become 0.
submission['Team1Seed'] = submission['IDTeam1'].map(seeds).fillna(0)
submission['Team2Seed'] = submission['IDTeam2'].map(seeds).fillna(0)
submission['Seed_Differential'] = submission['Team1Seed'] - submission['Team2Seed']

In [120]:
#Prep the results.

# For every season in the submission file, build:
#   x1    training rows,
#   x2    hold-out rows,
#   test  the submission rows of that season,
# and attach two engineered features to them: the mean historical
# Normalized_Score_Diff per team pair and an 'Interactions' score derived from
# results against common opponents.
results = []

# ---- loop-invariant pieces, built once instead of once per season ----------
_is_ncaa = games['SecondaryTourney'] == 6

# Every game seen from both sides: one row per (Target team, Common opponent).
# The Team2-side rows carry the Predictions flag multiplied by -1, the
# Team1-side rows carry it unchanged.
_team2_view = (
    games[['IDTeam2', 'IDTeam1', 'Season', 'Predictions']]
    .rename(columns={'IDTeam2': 'Target', 'IDTeam1': 'Common'})
    .assign(Predictions=lambda frame: frame['Predictions'] * -1.)
)
_team1_view = games[['IDTeam1', 'IDTeam2', 'Season', 'Predictions']].rename(
    columns={'IDTeam1': 'Target', 'IDTeam2': 'Common'}
)
_pair_outcomes = pd.concat((_team2_view, _team1_view), axis=0, ignore_index=True)

for season in submission['Season'].unique():
    print(season)
    season_num = int(season)

    # Training rows: NCAA games of earlier seasons, plus every non-NCAA game
    # up to and including this season.
    x1 = pd.concat(
        (
            games[(games['Season'] < season_num) & _is_ncaa],
            games[(games['Season'] < season_num + 1) & ~_is_ncaa],
        ),
        axis=0,
        ignore_index=True,
    )
    # Hold-out rows: NCAA games of later seasons. Copied so that the column
    # added further down is not written into a slice of `games`.
    x2 = games[(games['Season'] > season_num) & _is_ncaa].copy()

    test = submission[submission['Season'] == season]

    # Mean historical margin for each team pair; pairs never seen get 0.0.
    sdn = x1.groupby(['IDTeams'], as_index=False)[['Normalized_Score_Diff']].mean()
    test = pd.merge(test, sdn, how='left', on=['IDTeams'])
    test['Normalized_Score_Diff'] = test['Normalized_Score_Diff'].fillna(0.0)

    # Setting up the model / interactions.
    # Keep this season and the one before it, then pair up every two distinct
    # teams that faced the same opponent in the same season.
    recent = _pair_outcomes[
        (_pair_outcomes['Season'] <= season_num)
        & (_pair_outcomes['Season'] > season_num - 2)
    ]
    paired = pd.merge(recent, recent, how='inner', on=['Common', 'Season'])
    paired = paired[paired['Target_x'] != paired['Target_y']]

    # Pair key 'TeamX_TeamY'. Targets look like 'Season_Team', so element 1
    # of the split is the team id. The key must combine Target_x with
    # Target_y: the previous code used Target_x for both halves, which
    # produced keys such as '1101_1101' that can never match an IDTeams value
    # and left the Interactions feature at 0 everywhere.
    team_x = paired['Target_x'].str.split('_').str[1]
    team_y = paired['Target_y'].str.split('_').str[1]
    totals = paired['Predictions_x'].groupby(team_x + '_' + team_y).sum()
    interactions = {pair: int(total) for pair, total in totals.items()}

    x1['Interactions'] = x1['IDTeams'].map(interactions).fillna(0)
    x2['Interactions'] = x2['IDTeams'].map(interactions).fillna(0)
    test['Interactions'] = test['IDTeams'].map(interactions).fillna(0)
    # Feature columns: everything except identifiers, the label and raw
    # per-game result fields.
    cols = [c for c in x1.columns if c not in ['ID', 'Team1', 'Team2', 'IDTeams', 'IDTeam1', 'IDTeam2', 'Predictions', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'NumOT', 'Score_Differential']]
    
#     X = x1[cols]
#     y = x1['Predictions']
#     model = linear_model.HuberRegressor()

#     model.fit(X,y)
#     preds = model.predict(x2[cols]).clip(0.05, 0.95)
#     print('Log Loss:', metrics.log_loss(x2['Predictions'], preds))
    

#     test['Predictions'] = model.predict(test[cols])

#     results.append(test)
                       

# results = pd.concat(results, axis=0, ignore_index=True).reset_index(drop=True)
# results = {k:float(v) for k,v in results[['ID', 'Predictions']].values}
# submission['Predictions'] = submission['ID'].map(results).clip(0.05, 0.95).fillna(0.49)
# submission[['ID', 'Predictions']].to_csv('ridge_submission.csv', index=False)

2014
2015
2016
2017


In [121]:
# Preview the hold-out frame left over from the last loop iteration; show only
# the first rows instead of dumping the whole frame into the notebook output.
x2.head()

Unnamed: 0,DayNum,LScore,LTeamID,NumOT,Season,SecondaryTourney,WLoc,WScore,WTeamID,ID,...,Team2,IDTeam1,IDTeam2,Team1Seed,Team2Seed,Score_Differential,Predictions,Normalized_Score_Diff,Seed_Differential,Interactions
