In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

m = pd.read_csv('data/MasseyOrdinals.csv')
m.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


In [2]:
class FeatureFileGenerator(object):
    def __init__(self):
        np.random.seed(0) # seed so we always get same answer
    
    def new_file(self, filename):
        t_df = pd.read_csv('data/NCAATourneyCompactResults.csv')
        t_df = self.randomize_teams(t_df)
        t_df['TourneyGame'] = np.full(t_df.shape[0], 1)
                
        s_df = pd.read_csv('data/RegularSeasonCompactResults.csv')
        s_df = self.randomize_teams(s_df)
        s_df['TourneyGame'] = np.full(s_df.shape[0], 0)
        
        df = pd.concat([t_df, s_df], ignore_index=True)
        df.to_csv(filename, encoding='ascii')
        
    def randomize_teams(self, df):
        df = df[df['Season'] > 2002]
        
        Ngames = df.shape[0]
        df['team0Win'] = np.random.randint(2,size=Ngames)
        
        for feature in [self.team0, self.team1]:
            df[feature.__name__] = df.apply(feature, axis=1)
        
        return df

    def team0(self, game):
        if game['team0Win'] == 1:
            return game['WTeamID']
        else:
            return game['LTeamID']

    def team1(self, game):
        if game['team0Win'] == 1:
            return game['LTeamID']
        else:
            return game['WTeamID']

In [3]:
%%time
f = FeatureFileGenerator()
f.new_file('data/fulldata.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CPU times: user 6.55 s, sys: 91.6 ms, total: 6.64 s
Wall time: 3.81 s


In [4]:
df = pd.read_csv('data/fulldata.csv', index_col=0)
df.shape

(83089, 12)

In [5]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,team0Win,team0,team1,TourneyGame
0,2003,134,1421,92,1411,84,N,1,0,1411,1421,1
1,2003,136,1112,80,1436,51,N,0,1,1112,1436,1
2,2003,136,1113,84,1272,71,N,0,1,1113,1272,1
3,2003,136,1141,79,1166,73,N,0,0,1166,1141,1
4,2003,136,1143,76,1301,74,N,1,1,1143,1301,1


In [6]:
m = pd.read_csv('data/MasseyOrdinals.csv')
msystems = pd.Series(m['SystemName'].unique())
msystems = msystems.sort_values().values
msystems

array(['7OT', 'ACU', 'ADE', 'AP', 'ARG', 'AUS', 'BBT', 'BCM', 'BD', 'BIH',
       'BKM', 'BLS', 'BNM', 'BNT', 'BOB', 'BOW', 'BP5', 'BPI', 'BRZ',
       'BUR', 'BWE', 'CJB', 'CMV', 'CNG', 'COL', 'COX', 'CPA', 'CPR',
       'CRO', 'CRW', 'CTL', 'D1A', 'DAV', 'DC', 'DC2', 'DCI', 'DDB',
       'DES', 'DII', 'DOK', 'DOL', 'DUN', 'DWH', 'EBB', 'EBP', 'ECK',
       'ENT', 'ERD', 'ESR', 'FAS', 'FMG', 'FSH', 'GC', 'GRN', 'GRS',
       'HAS', 'HAT', 'HER', 'HKB', 'HKS', 'HOL', 'HRN', 'IMS', 'INP',
       'ISR', 'JCI', 'JEN', 'JJK', 'JNG', 'JON', 'JRT', 'KBM', 'KEL',
       'KLK', 'KMV', 'KOS', 'KPI', 'KPK', 'KRA', 'LAB', 'LMC', 'LOG',
       'LYD', 'LYN', 'MAS', 'MB', 'MCL', 'MGY', 'MIC', 'MKV', 'MMG',
       'MOR', 'MPI', 'MSX', 'MUZ', 'MvG', 'NOL', 'NOR', 'OCT', 'OMY',
       'PEQ', 'PGH', 'PH', 'PIG', 'PKL', 'PMC', 'POM', 'PPR', 'PRR',
       'PTS', 'RAG', 'REI', 'REN', 'REW', 'RIS', 'RM', 'ROG', 'ROH',
       'RPI', 'RSE', 'RSL', 'RT', 'RTB', 'RTH', 'RTP', 'RTR', 'SAG',
       'SAP', 'SAU', 

In [10]:
def getOrdinal(season, day, team):
    res = pd.DataFrame(index=msystems)
    res['OrdinalRank'] = np.nan
    
    try:
        cut = m.loc[(m['Season']==season) & (m['RankingDayNum']<day) & (m['TeamID']==team)]
        cut = cut.loc[cut['RankingDayNum'] == cut['RankingDayNum'].max()] # get the index of the latest day closest to game day
        cut = cut.set_index('SystemName')
        res = res.combine_first(cut)['OrdinalRank']
    except:
        pass
    return res.values

def massey_game(game):
    try:
        daynum = game['DayNum']
    except:
        daynum = 134 # submission file games don't have daynum, but are all after day 133 (last day before tourney)

    ordinals0 = getOrdinal(game['Season'], daynum, game['team0'])
    ordinals1 = getOrdinal(game['Season'], daynum, game['team1'])
    
    f0 = pd.Series(ordinals0, index=[system+'0' for system in msystems])   
    f1 = pd.Series(ordinals1, index=[system+'1' for system in msystems])   
    return pd.concat([game,f0,f1])

In [11]:
metadf = pd.DataFrame([massey_game(df.loc[0])])
metadf

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,team0Win,team0,...,USA1,WIL1,WLK1,WMR1,WMV1,WOB1,WOL1,WTE1,YAG1,ZAM1
0,2003,134,1421,92,1411,84,N,1,0,1411,...,,,237.0,,,212.0,220.0,289.0,,


In [12]:
%%time
ddf = dd.from_pandas(df.head(10), npartitions=2)
res = ddf.apply(massey_game, axis=1, meta=metadf).compute(scheduler='processes')

CPU times: user 13.8 s, sys: 911 ms, total: 14.7 s
Wall time: 16 s


In [14]:
res

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,team0Win,team0,...,USA1,WIL1,WLK1,WMR1,WMV1,WOB1,WOL1,WTE1,YAG1,ZAM1
0,2003,134,1421,92,1411,84,N,1,0,1411,...,,,237.0,,,212.0,220.0,289.0,,
1,2003,136,1112,80,1436,51,N,0,1,1112,...,,,153.0,,,164.0,177.0,178.0,,
2,2003,136,1113,84,1272,71,N,0,1,1113,...,22.0,,21.0,,,18.0,19.0,31.0,,
3,2003,136,1141,79,1166,73,N,0,0,1166,...,,,32.0,,,44.0,49.0,54.0,,
4,2003,136,1143,76,1301,74,N,1,1,1143,...,,,51.0,,,49.0,47.0,81.0,,
5,2003,136,1163,58,1140,53,N,0,1,1163,...,,,25.0,,,26.0,30.0,25.0,,
6,2003,136,1181,67,1161,57,N,0,1,1181,...,,,92.0,,,76.0,76.0,99.0,,
7,2003,136,1211,74,1153,69,N,0,1,1211,...,,,45.0,,,47.0,45.0,33.0,,
8,2003,136,1228,65,1443,60,N,0,1,1228,...,,,53.0,,,57.0,63.0,59.0,,
9,2003,136,1242,64,1429,61,N,0,1,1242,...,,,89.0,,,81.0,95.0,72.0,,


In [101]:
res.to_csv('data/trainingdata.csv', encoding='ascii')