In [20]:
import pandas as pd
import numpy as np

m = pd.read_csv('data/MasseyOrdinals.csv')
m.tail()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
3492315,2018,133,ZAM,1460,132
3492316,2018,133,ZAM,1461,95
3492317,2018,133,ZAM,1462,14
3492318,2018,133,ZAM,1463,216
3492319,2018,133,ZAM,1464,328


In [3]:
class FeatureFileGenerator(object):
    def __init__(self):
        np.random.seed(0) # seed so we always get same answer
    
    def new_file(self, filename):
        t_df = pd.read_csv('data/NCAATourneyCompactResults.csv')
        t_df = self.randomize_teams(t_df)
        t_df['TourneyGame'] = np.full(t_df.shape[0], 1)
                
        s_df = pd.read_csv('data/RegularSeasonCompactResults.csv')
        s_df = self.randomize_teams(s_df)
        s_df['TourneyGame'] = np.full(s_df.shape[0], 0)
        
        df = pd.concat([t_df, s_df], ignore_index=True)
        df.to_csv(filename, encoding='ascii')
        
    def randomize_teams(self, df):
        df = df[df['Season'] > 2002]
        
        Ngames = df.shape[0]
        df['team0Win'] = np.random.randint(2,size=Ngames)
        
        for feature in [self.team0, self.team1]:
            df[feature.__name__] = df.apply(feature, axis=1)
        
        return df

    def team0(self, game):
        if game['team0Win'] == 1:
            return game['WTeamID']
        else:
            return game['LTeamID']

    def team1(self, game):
        if game['team0Win'] == 1:
            return game['LTeamID']
        else:
            return game['WTeamID']

In [4]:
%%time
f = FeatureFileGenerator()
f.new_file('data/fulldata.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CPU times: user 17.5 s, sys: 292 ms, total: 17.8 s
Wall time: 4.06 s


In [5]:
df = pd.read_csv('data/fulldata.csv', index_col=0)
df.shape

(83089, 12)

In [6]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,team0Win,team0,team1,TourneyGame
0,2003,134,1421,92,1411,84,N,1,0,1411,1421,1
1,2003,136,1112,80,1436,51,N,0,1,1112,1436,1
2,2003,136,1113,84,1272,71,N,0,1,1113,1272,1
3,2003,136,1141,79,1166,73,N,0,0,1166,1141,1
4,2003,136,1143,76,1301,74,N,1,1,1143,1301,1


In [7]:
m = pd.read_csv('data/MasseyOrdinals.csv')
msystems = pd.Series(m['SystemName'].unique())
msystems = msystems.sort_values().values
msystems

array(['7OT', 'ACU', 'ADE', 'AP', 'ARG', 'AUS', 'BBT', 'BCM', 'BD', 'BIH',
       'BKM', 'BLS', 'BNM', 'BNT', 'BOB', 'BOW', 'BP5', 'BPI', 'BRZ',
       'BUR', 'BWE', 'CJB', 'CMV', 'CNG', 'COL', 'COX', 'CPA', 'CPR',
       'CRO', 'CRW', 'CTL', 'D1A', 'DAV', 'DC', 'DC2', 'DCI', 'DDB',
       'DES', 'DII', 'DOK', 'DOL', 'DUN', 'DWH', 'EBB', 'EBP', 'ECK',
       'ENT', 'ERD', 'ESR', 'FAS', 'FMG', 'FSH', 'GC', 'GRN', 'GRS',
       'HAS', 'HAT', 'HER', 'HKB', 'HKS', 'HOL', 'HRN', 'IMS', 'INP',
       'ISR', 'JCI', 'JEN', 'JJK', 'JNG', 'JON', 'JRT', 'KBM', 'KEL',
       'KLK', 'KMV', 'KOS', 'KPI', 'KPK', 'KRA', 'LAB', 'LMC', 'LOG',
       'LYD', 'LYN', 'MAS', 'MB', 'MCL', 'MGY', 'MIC', 'MKV', 'MMG',
       'MOR', 'MPI', 'MSX', 'MUZ', 'MvG', 'NOL', 'NOR', 'OCT', 'OMY',
       'PEQ', 'PGH', 'PH', 'PIG', 'PKL', 'PMC', 'POM', 'PPR', 'PRR',
       'PTS', 'RAG', 'REI', 'REN', 'REW', 'RIS', 'RM', 'ROG', 'ROH',
       'RPI', 'RSE', 'RSL', 'RT', 'RTB', 'RTH', 'RTP', 'RTR', 'SAG',
       'SAP', 'SAU', 

In [8]:
def getOrdinal(season, day, team):
    res = pd.DataFrame(index=msystems)
    res['OrdinalRank'] = np.nan
    
    try:
        cut = m.loc[(m['Season']==season) & (m['RankingDayNum']<day) & (m['TeamID']==team)]
        cut = cut.loc[cut['RankingDayNum'] == cut['RankingDayNum'].max()] # get the index of the latest day closest to game day
        cut = cut.set_index('SystemName')
        res = res.combine_first(cut)['OrdinalRank']
    except:
        pass
    return res.values

def massey_game(game):
    try:
        daynum = game['DayNum']
    except:
        daynum = 134 # submission file games don't have daynum, but are all after day 133 (last day before tourney)

    ordinals0 = getOrdinal(game['Season'], daynum, game['team0'])
    ordinals1 = getOrdinal(game['Season'], daynum, game['team1'])
    
    f0 = pd.Series(ordinals0, index=[system+'0' for system in msystems])   
    f1 = pd.Series(ordinals1, index=[system+'1' for system in msystems])   
    return pd.concat([game,f0,f1])

In [9]:
metadf = pd.DataFrame([massey_game(df.loc[0])])
metadf

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,team0Win,team0,...,USA1,WIL1,WLK1,WMR1,WMV1,WOB1,WOL1,WTE1,YAG1,ZAM1
0,2003,134,1421,92,1411,84,N,1,0,1411,...,,,237.0,,,212.0,220.0,289.0,,


In [None]:
%%time
res = df.apply(massey_game, axis=1)

In [None]:
res.to_csv('data/trainingdata.csv', encoding='ascii')

In [22]:
res = pd.read_csv('data/trainingdata.csv', index_col=0)
res.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,team0Win,team0,...,USA1,WIL1,WLK1,WMR1,WMV1,WOB1,WOL1,WTE1,YAG1,ZAM1
83084,2018,132,1153,56,1222,55,N,0,0,1222,...,8.0,7.0,4.0,,8.0,5.0,4.0,,7.0,6.0
83085,2018,132,1172,58,1348,57,N,0,0,1348,...,,91.0,60.0,,74.0,94.0,91.0,,44.0,39.0
83086,2018,132,1209,74,1426,61,N,0,0,1426,...,,160.0,125.0,,138.0,158.0,143.0,,134.0,117.0
83087,2018,132,1246,77,1397,72,N,0,1,1246,...,12.0,8.0,12.0,,10.0,9.0,12.0,,18.0,13.0
83088,2018,132,1335,68,1217,65,N,0,1,1335,...,,188.0,159.0,,169.0,177.0,147.0,,144.0,166.0


In [25]:
sub = pd.read_csv('data/2016SampleSubmission.csv', index_col=0)
sub.tail()

Unnamed: 0_level_0,Pred
Id,Unnamed: 1_level_1
2016_1455_1462,0.5
2016_1455_1463,0.5
2016_1458_1462,0.5
2016_1458_1463,0.5
2016_1462_1463,0.5


In [34]:
d = sub['Pred'].to_dict()

In [35]:
d['2016_1455_1462']

0.5